From 82ea87c063e19a88e7565a78fc7bcfe24b52b45c Mon Sep 17 00:00:00 2001 From: Lilian Gasser <gasserli@ethz.ch> Date: Wed, 5 Dec 2018 18:11:27 +0100 Subject: [PATCH] mend --- src/python/run_extract_discussions.py | 62 ++++++++++++++++++++------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 4e2c1531..0dc23f86 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +#%% +%load_ext autoreload +%autoreload 2 import os import pickle @@ -14,6 +17,7 @@ sys.path.append('src/python/') import def_classes as defc import utils_proc import utils_annot +#%% # needed for running in atom, can be ignored input_lastnames = "data/politicians/lastnames/1891_lastnames.pickle" @@ -22,19 +26,22 @@ input_correctedmeta = "data/AB/1891/03_correctedmeta.tar.gz" output_annotatedxml = "data/AB/1891/05_annotatedxml.tar.gz" # detect arguments -input_lastnames = sys.argv[1] -input_correctedxml = sys.argv[2] -input_correctedmeta = sys.argv[3] -output_annotatedxml = sys.argv[4] +#input_lastnames = sys.argv[1] +#input_correctedxml = sys.argv[2] +#input_correctedmeta = sys.argv[3] +#output_annotatedxml = sys.argv[4] -# extract suffix, year, folder_database +#%% + +# extract suffixes, year, folder_database suffix_tar_correctedxml = input_correctedxml.split('/')[-1].split('.tar.gz')[0] suffix_tar_correctedmeta = input_correctedmeta.split('/')[-1].split('.tar.gz')[0] -year = (input_correctedxml.split('/')[-2]) +year = input_correctedxml.split('/')[-2] folder_database = input_correctedxml.split(year)[0] suffix_correctedmeta = '_metacorr' suffix_correctedxml = '_datacorr' +#%% # print some output print(year) print(type(year)) @@ -42,7 +49,7 @@ print(input_correctedxml) print(input_lastnames) print(input_correctedmeta) print(folder_database) - +#%% # pull necessary data # TODO pull necessary data from here #!!git lfs pull origin -I input_correctedxml @@ -54,7 +61,7 @@ print(folder_database) # get dictionary of discussions # ----------------------------- - +#%% start_time_discussions = time.time() print('start to identify discussions of the year', year, '\n\n\n') @@ -71,8 +78,7 @@ list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr']) list_stopwords.extend(stopwords.words('french')) list_stopwords.extend(['ils', 'les', 'celle']) -# string for this year -str_year = str(year) +#%% # initialize empty dictionary for that year dict_year = {} @@ -82,6 +88,7 @@ files_to_process.sort() print(files_to_process) #meta_files, _ = utils_proc.get_list(year, folder_database, suffix_tar_correctedmeta) +#%% # open dataframe of last names from pickle file with open(input_lastnames, 'rb') as f: @@ -89,25 +96,48 @@ with open(input_lastnames, 'rb') as f: print(df_lastnames) df_lastnames.columns +#%% file_tarpath = './1891/20026440_datacorr.xml' + +file_number = file_tarpath.split('/')[-1][:8] +metafile_tarpath = './{}/{}{}.xml'.format(year, file_number, suffix_correctedmeta) + +# instantiate document object (always from original pdf) +infile_aux = year + '/' + file_number + '.pdf' +file_doc = defc.Document(infile_aux, folder_database) +file_doc + + +if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']): + print(file_number + '\n') + + + + + + + + + + + +#%% # for each file for file_tarpath in files_to_process: # print(file_tarpath) file_number = file_tarpath.split('/')[-1][:8] - metafile_tarpath = './{}/{}{}.xml'.format(str_year, file_number, suffix_correctedmeta) # instantiate document object (always from original pdf) - infile_aux = str_year + '/' + file_number + '.pdf' - file_doc = defc.Document(infile_aux, folder_database) - - + file_aux = year + '/' + file_number + '.pdf' + file_doc = defc.Document(file_aux, folder_database) # ----> CONTINUE HERE NEXT TIME!!!!!! - #path_number = path_data + str_year + '/' + number + '/' # if document is a discussion if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']): print(file_number + '\n') + +#%% ## get dictionary with text #dict_text = hf.get_text_onefile(path_number + number + '_datacorr.xml') ## exclude parts from previous and next document -- GitLab