From 6dffcc06fa680cab337c94b778e9e6c04739c434 Mon Sep 17 00:00:00 2001 From: Lilian Gasser <gasserli@ethz.ch> Date: Fri, 4 Jan 2019 10:01:48 +0100 Subject: [PATCH] WIP extract discussions from bash file --- src/python/run_extract_discussions.py | 67 ++++++++++++--------------- src/sh/extract_discussions_yearly.sh | 2 +- 2 files changed, 31 insertions(+), 38 deletions(-) diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 434c10d1..0a1fd5fd 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -3,8 +3,9 @@ # Code to extract discussions from corrected XML files #%% -%load_ext autoreload -%autoreload 2 +# to work with atom +#%load_ext autoreload +#%autoreload 2 import pickle import time @@ -18,6 +19,7 @@ import utils_proc import utils_annot #%% +# specify input and output files # needed for running in atom, can be ignored input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle" @@ -26,10 +28,10 @@ input_correctedmeta = "data/AB/1893/03_correctedmeta.tar.gz" output_annotatedxml = "data/AB/1893/05_annotatedxml.tar.gz" # detect arguments -#input_lastnames = sys.argv[1] -#input_correctedxml = sys.argv[2] -#input_correctedmeta = sys.argv[3] -#output_annotatedxml = sys.argv[4] +input_lastnames = sys.argv[1] +input_correctedxml = sys.argv[2] +input_correctedmeta = sys.argv[3] +output_annotatedxml = sys.argv[4] #%% # extract suffixes, year, folder_database @@ -40,13 +42,6 @@ folder_database = input_correctedxml.split(year)[0] suffix_correctedmeta = '_metacorr' #suffix_correctedxml = '_datacorr' -# print some output -print(year) -print(type(year)) -#print(input_lastnames) -#print(input_correctedxml) -#print(input_correctedmeta) -print(folder_database) #%% # TODO pull necessary data from here #!!git lfs pull origin -I input_correctedxml @@ -64,14 +59,16 @@ files_to_process.sort() print(files_to_process) # open dataframe of last names from pickle file +# (there is one file of 
lastnames per year) with open(input_lastnames, 'rb') as f: df_lastnames = pickle.load(f) -df_lastnames.columns +print(df_lastnames.columns) + #%% # for each file for file_tarpath in files_to_process[66:]: -# print(file_tarpath) + #print(file_tarpath) id_doc = file_tarpath.split('/')[-1][:8] # instantiate document object (always from original pdf) @@ -93,30 +90,26 @@ utils_proc.compress_tar(output_annotatedxml) #%% - -file_tarpath = './1893/20026537_datacorr.xml' - -id_doc = file_tarpath.split('/')[-1][:8] -metafile_tarpath = './{}/{}{}.xml'.format(year, id_doc, suffix_correctedmeta) - -# instantiate document object (always from original pdf) -infile_aux = year + '/' + id_doc + '.pdf' -file_doc = defc.Document(infile_aux, folder_database) - -if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']): - print(id_doc + '\n') - - file_doc.df_lastnames = df_lastnames - file_doc.annotate_xml() - - name_xml = 'data/AB/1893/id_doc_previewannotated-class.xml' - tree = ET.ElementTree(file_doc.XML_main_annot) - tree.write(name_xml, encoding = 'utf-8') - +## to test for one file +#file_tarpath = './1893/20026538_datacorr.xml' +# +#id_doc = file_tarpath.split('/')[-1][:8] +# +## instantiate document object (always from original pdf) +#infile_aux = year + '/' + id_doc + '.pdf' +#file_doc = defc.Document(infile_aux, folder_database) +# +#if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']): + #print(id_doc + '\n') +# + #file_doc.df_lastnames = df_lastnames + #file_doc.annotate_xml() +# +# #%% -id_doc +#id_doc -len(files_to_process) +#len(files_to_process) diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh index 4df0e302..9e8bef33 100755 --- a/src/sh/extract_discussions_yearly.sh +++ b/src/sh/extract_discussions_yearly.sh @@ -6,5 +6,5 @@ year_end=1891 for year in $(seq $year_start $year_end) do echo $year - python src/python/run_extract_discussions.py 
data/politicians/lastnames/${year}_lastnames.pickle data/AB/$year/04_correctedxml.tar.gz data/AB/$year/03_correctedmeta.tar.gz data/ data/AB/$year/05_annotatedxml.tar.gz + python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/AB/${year}/05_annotatedxml.tar.gz done -- GitLab