From babfb25f4befae6e64b7f279eeee51b959f40e2e Mon Sep 17 00:00:00 2001 From: Lili Gasser <gasserli@ethz.ch> Date: Thu, 7 Feb 2019 10:27:15 +0000 Subject: [PATCH] get rid of print statements --- src/python/def_classes.py | 14 +++++----- src/python/run_extract_discussions.py | 23 +++------------- src/python/utils_annot.py | 38 ++++++++++++++++----------- 3 files changed, 33 insertions(+), 42 deletions(-) diff --git a/src/python/def_classes.py b/src/python/def_classes.py index cb2f6260..96897ce7 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -732,7 +732,7 @@ class Document: utils_proc.call_with_out(command) - def get_council_date(self, name_outmeta = '03_correctedmeta'): + def _get_council_date(self, name_outmeta = '03_correctedmeta'): if 'name_outmeta' not in self.__dict__.keys(): self.name_outmeta = name_outmeta @@ -750,7 +750,7 @@ class Document: # - dict_overlaps: dictionary with overlaps # output: # - (first_entry, last_entry): tuple of first and last textbox id - def get_first_last_textbox(self, dict_overlaps_year): + def _get_first_last_textbox(self, dict_overlaps_year): # initialize to impossible values first_entry = -1 @@ -781,7 +781,7 @@ class Document: self.name_outxml = name_outxml if 'XML_main_corr' not in self.__dict__.keys(): - print('no main corr') + #print('no main corr') name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' if os.path.isfile(name_tar): name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' @@ -799,13 +799,13 @@ class Document: #pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml, #name_outcorrxml = self.name_outcorrxml) - print('we have a main corr XML file') + #print('we have a main corr XML file') # get council and date - self.get_council_date() + self._get_council_date() # get start and end of document - entries = self.get_first_last_textbox(self.dict_overlaps_year) + entries = self._get_first_last_textbox(self.dict_overlaps_year) # update if document starts/ends as on pdf if entries[0] == -1: @@ -841,9 +841,7 @@ class Document: self.name_outannotxml = name_outannotxml self.name_annot_corr = [name_tar, name_xml] -# self._xml_ext(suffix_xml, self.name_outannotxml) command = 'rm -rf ./' + str(self.year) - #print(command) utils_proc.call_with_out(command) print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time))) diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 2ad33fdf..a7689b9b 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -9,16 +9,12 @@ import pickle import time -import xml.etree.ElementTree as ET import sys sys.path.append('src/python/') import def_classes as defc import utils_proc -import utils_annot - -import os from utils_proc import call_with_out @@ -35,7 +31,7 @@ input_overlaps = "data/lists/dict_overlaps.pickle" output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz" #%% -# detect arguments +# detect arguments from sh file input_lastnames = sys.argv[1] input_correctedxml = sys.argv[2] input_correctedmeta = sys.argv[3] @@ -50,33 +46,22 @@ suffix_tar_correctedxml = input_correctedxml.split('/')[-1].split('.tar.gz')[0] #suffix_tar_correctedmeta = input_correctedmeta.split('/')[-1].split('.tar.gz')[0] year = input_correctedxml.split('/')[-2] folder_database = input_correctedxml.split(year)[0] -suffix_correctedmeta = '_metacorr' -#suffix_correctedxml = '_datacorr' #%% -# TODO: make it work! # git lfs pull necessary data +# does not work in atom for lfsfile in [input_correctedxml, input_correctedmeta, input_overlaps]: command = 'git lfs pull -I ' + lfsfile call_with_out(command) - -print(input_lastnames.split(year)[0]) command = 'git lfs pull -I ' + input_lastnames.split(year)[0] call_with_out(command) #%% -# TODO: exclude overlaps --> after annotation - - -#%% -start_time_discussions = time.time() -print('start to identify discussions of the year', year, '\n') - # extract list of files files_to_process, _ = utils_proc.get_list(year, folder_database, suffix_tar_correctedxml) files_to_process.sort() -print('files to process loaded:', files_to_process) +print('files to process loaded') # open dataframe of last names from pickle file # (there is one file of lastnames per year) @@ -112,7 +97,7 @@ for file_tarpath in files_to_process: # if document is a discussion if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']): - print(id_doc + '\n') + #print(id_doc + '\n') file_doc.df_lastnames = df_lastnames file_doc.list_notnames = list_notnames file_doc.dict_overlaps_year = dict_overlaps_year diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index ce4a8856..a6e4c9cf 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -431,8 +431,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ # set flag this_is_speech = True if bln_print: - print('found a name:', text_start, list_oi, str_name, str_role, '\n') - print('found a name:', text_start, list_oi, ind_tl_colon, str_name, str_role, list_uniqueID, '\n') + print('found a name:', text_start, list_oi, ind_tl_colon, str_name, str_role, list_uniqueID, '\n') return XML_new, this_is_speech @@ -513,9 +512,10 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str def get_string(term, df_names, str_name, list_uniqueID): # get name type name_type = df_names['nameType'].loc[df_names['shortName']==term].iloc[0] - if name_type != 'simple': - print(df_names[df_names['shortName']==term]) - print(term, name_type) + if bln_print: + if name_type != 'simple': + print(df_names[df_names['shortName']==term]) + print(term, name_type) # extract uniqueID and complete name for this term list_temp = [] @@ -528,7 +528,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str list_temp = list(df_names.loc[(df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) str_completeName = term + ' (CANTON MISSING)' - print(list_temp, str_completeName) + if bln_print: + print(list_temp, str_completeName) # set or update unique ID and name # if no unique ID and name has been assigned so far @@ -686,7 +687,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str if term in list_: str_canton = term canton_type = type_ - print('!!! is a canton', term, list_oi, str_name, str_role) + if bln_print: + print('!!! is a canton', term, list_oi, str_name, str_role) break # if person was not uniquely identified, check for misspellings @@ -699,7 +701,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str if term_approx: str_canton = term_approx canton_type = type_ - print('!!! is a canton', term, list_oi, str_name, str_role) + if bln_print: + print('!!! is a canton', term, list_oi, str_name, str_role) break # if a canton or similar was found @@ -728,15 +731,18 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str else: - print(canton_type, str_canton, str_name, df_temp) + if bln_print: + print(canton_type, str_canton, str_name, df_temp) list_temp = list(df_temp.loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')]) str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0] - print(list_temp, list_uniqueID, str_completeName) + if bln_print: + print(list_temp, list_uniqueID, str_completeName) if len(list_temp) > 0: list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type) - print(str_completeName) + if bln_print: + print(str_completeName) if 'CANTON MISSING' in str_completeName: str_name = add_to_string('', str_completeName) elif str_completeName.split(' ')[0] == str_name: @@ -745,13 +751,15 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str str_name = add_to_string(str_name, str_completeName) else: - print('could not be identified as a canton:', term, list_oi, str_name, str_role) + if bln_print: + print('could not be identified as a canton:', term, list_oi, str_name, str_role) # if term is first name # needed when people are referenced by FirstName LastName, e.g. Simon Kohler elif term in list_all_firstnames: str_firstname = term - print('found a first name', str_firstname) + if bln_print: + print('found a first name', str_firstname) # if term is not easily mistaken as a name (avoid false positives) elif term not in list_notnames: @@ -780,7 +788,6 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # TODO check for false positives of these procedures if name_type == 'canton': # check if person can be identified from firstname - print(str_firstname) if str_firstname: df_temp = df_names.loc[(df_names['shortName']==str_name.split(' ')[0]) & (df_names['FirstName']==str_firstname)] if df_temp.shape[0] == 1: @@ -831,7 +838,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str else: str_name = add_to_string(str_name, str_completeName) - print(str_date, df_temp.shape, df_temp_before.shape, df_temp_after.shape) + if bln_print: + print(str_date, df_temp.shape, df_temp_before.shape, df_temp_after.shape) # TODO: function to update list unique ID and str_name -- GitLab