mend

82ea87c0 · Lili Gasser · f279c6bd · 82ea87c0
Commit 82ea87c0 authored 6 years ago by Lili Gasser
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+#%%
+%load_ext autoreload
+%autoreload 2

 import os
 import pickle
@@ -14,6 +17,7 @@ sys.path.append('src/python/')
 import def_classes as defc
 import utils_proc
 import utils_annot
+#%%

 # default input/output paths, needed only when running interactively in Atom; can be ignored otherwise
 input_lastnames = "data/politicians/lastnames/1891_lastnames.pickle"
@@ -22,19 +26,22 @@ input_correctedmeta = "data/AB/1891/03_correctedmeta.tar.gz"
 output_annotatedxml = "data/AB/1891/05_annotatedxml.tar.gz"

 # detect arguments
-input_lastnames = sys.argv[1]
-input_correctedxml = sys.argv[2]
-input_correctedmeta = sys.argv[3]
-output_annotatedxml = sys.argv[4]
+#input_lastnames = sys.argv[1]
+#input_correctedxml = sys.argv[2]
+#input_correctedmeta = sys.argv[3]
+#output_annotatedxml = sys.argv[4]

-# extract suffix, year, folder_database
+#%%
+
+# extract suffixes, year, folder_database
 suffix_tar_correctedxml = input_correctedxml.split('/')[-1].split('.tar.gz')[0]
 suffix_tar_correctedmeta = input_correctedmeta.split('/')[-1].split('.tar.gz')[0]
-year = (input_correctedxml.split('/')[-2])
+year = input_correctedxml.split('/')[-2]
 folder_database = input_correctedxml.split(year)[0]
 suffix_correctedmeta = '_metacorr'
 suffix_correctedxml = '_datacorr'

+#%%
 # print some output
 print(year)
 print(type(year))
@@ -42,7 +49,7 @@ print(input_correctedxml)
 print(input_lastnames)
 print(input_correctedmeta)
 print(folder_database)
-
+#%%
 # pull necessary data
 # TODO pull necessary data from here
 #!!git lfs pull origin -I input_correctedxml
@@ -54,7 +61,7 @@ print(folder_database)

 # get dictionary of discussions
 # -----------------------------
-
+#%%
 start_time_discussions = time.time()
 print('start to identify discussions of the year', year, '\n\n\n')

@@ -71,8 +78,7 @@ list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr'])
 list_stopwords.extend(stopwords.words('french'))
 list_stopwords.extend(['ils', 'les', 'celle'])

-# string for this year
-str_year = str(year)
+#%%
 # initialize empty dictionary for that year
 dict_year = {}

@@ -82,6 +88,7 @@ files_to_process.sort()
 print(files_to_process)

 #meta_files, _ = utils_proc.get_list(year, folder_database, suffix_tar_correctedmeta)
+#%%

 # open dataframe of last names from pickle file
 with open(input_lastnames, 'rb') as f:
@@ -89,25 +96,48 @@ with open(input_lastnames, 'rb') as f:

 print(df_lastnames)
 df_lastnames.columns
+#%%

 file_tarpath = './1891/20026440_datacorr.xml'
+
+file_number = file_tarpath.split('/')[-1][:8]
+metafile_tarpath = './{}/{}{}.xml'.format(year, file_number, suffix_correctedmeta)
+
+# instantiate document object (always from original pdf)
+infile_aux = year + '/' + file_number + '.pdf'
+file_doc = defc.Document(infile_aux, folder_database)
+file_doc
+
+
+if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']):
+    print(file_number + '\n')
+
+
+
+
+
+
+
+
+
+
+
+#%%
 # for each file
 for file_tarpath in files_to_process:
 #    print(file_tarpath)
    file_number = file_tarpath.split('/')[-1][:8]
-    metafile_tarpath = './{}/{}{}.xml'.format(str_year, file_number, suffix_correctedmeta)

    # instantiate document object (always from original pdf)
-    infile_aux = str_year + '/' + file_number + '.pdf'
-    file_doc = defc.Document(infile_aux, folder_database)
-
-
+    file_aux = year + '/' + file_number + '.pdf'
+    file_doc = defc.Document(file_aux, folder_database)

    # TODO: continue here next time
-    #path_number = path_data + str_year + '/' + number + '/'
    # if document is a discussion
    if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']):
        print(file_number + '\n')
+
+#%%
        ## get dictionary with text
        #dict_text = hf.get_text_onefile(path_number + number + '_datacorr.xml')
        ## exclude parts from previous and next document