From 9934c2f20e97875f4b9f5bacff8ea0cea02604b5 Mon Sep 17 00:00:00 2001 From: Lilian Gasser <gasserli@ethz.ch> Date: Mon, 21 Jan 2019 15:20:30 +0100 Subject: [PATCH] notnames as txt file, start list with wrongly identified speeches/speakers --- data/lists/not_names.txt | 36 ++++++++++++++++++++++ data/lists/wrongly_identified_speakers.txt | 30 ++++++++++++++++++ src/python/def_classes.py | 30 +++++++++--------- src/python/run_extract_discussions.py | 16 +++++++--- src/python/utils_annot.py | 13 +++----- src/sh/extract_discussions_yearly.sh | 4 +-- 6 files changed, 100 insertions(+), 29 deletions(-) create mode 100644 data/lists/not_names.txt create mode 100644 data/lists/wrongly_identified_speakers.txt diff --git a/data/lists/not_names.txt b/data/lists/not_names.txt new file mode 100644 index 00000000..cd2fce55 --- /dev/null +++ b/data/lists/not_names.txt @@ -0,0 +1,36 @@ +Alinea +Alter +Ari +Art +bietet +Fällen +fasse +Gallen +hausen +Herren +Herr +Kasse +nicht +Rath +Seite +selber +Steuer +StGallen +Stimmen +Stimme +stimmt +Hans +Walter +Werner +Wer +autre +Biffer +biffer +poser +cause +dernier +poser +projet +Rédiger +rédiger +vote diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt new file mode 100644 index 00000000..2c33e407 --- /dev/null +++ b/data/lists/wrongly_identified_speakers.txt @@ -0,0 +1,30 @@ + +also check for council: +----------------------- +1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified but there is only one SR Baumann + + +one MP not active in whole year, leads to other not uniquely identified +----------------------------------------------------------------------- +1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!) +1925/20029967: Seiler (in December, the second Seiler already left) --> finds two!) +1925/20029967: Huber (in December, the second Huber already left) --> finds two!) +1925/20029882: Naine (in June, there is only one Naine, in December, another one joins --> finds two!) also in ...96, ...97, etc. + + +identified as speech start but is in text: +------------------------------------------ +1891/20026489: Um jeden Zweifel zu heben, beantrage ich Ihnen folgende Redaktion des Art. 2 : +1925/20029903: Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt: +1925/20029863: ganz gut gehen. Es ist ja wahr, was Herr Brügger gesagt hat: --> finds Lanz and Brügger +1925/20029917: Mögen Sie nun aber denken wie Herr Oberst Brügger oder mögen Sie denken wie ich: --> identified as speech start but is in text +1925/20029917: Herr Hauser sagt: --> identified as speech start but is in text +1925/20029978: Das ist, was Herr Charles Naine gesagt hat. Herr Hugglersagt: +1925/20029981: Brügger möchte ich sagen: --> identified as speech start but is in text + + + + +some other persons wrongly identified as MP +------------------------------------------- +1925/20029833: Sauser-Hall (not a MP)--> Hauser diff --git a/src/python/def_classes.py b/src/python/def_classes.py index cce72822..8e2a480b 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -107,7 +107,7 @@ class Document: else: print('Not saving to tar') name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz' - + self.name_xml = [name_tar, name_xml] if flag_save: h_xml = utils_proc.get_handlerfile(self.name_xml[1], self.folder_database, name_file = name_outxml) @@ -119,10 +119,10 @@ class Document: self.n_pages = np.arange(len(self.XML_main)) command = 'rm -rf ./' + str(self.year) #print(command) - utils_proc.call_with_out(command) - - def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None, - ind_page = 0, textb_textl = 1): + utils_proc.call_with_out(command) + + def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None, + ind_page = 0, textb_textl = 1): # The page refers here to the page of the imgobj, which might not correspond # to the one of the xml. For that reason we use n_pages to obtain the index # for the xml @@ -357,12 +357,12 @@ class Document: if ind_page > (len(self.XML_main) - 1): flag_error = 1 return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error - + flag_central = 1 if self.year > self.limit_year: flag_central = 0 - flag_2col = 1 - + flag_2col = 1 + XML_root = ET.Element('pages') XML_root.append(self.XML_main[ind_abs[0]]) imarray = np.array(self.imgobj[ind_page]) @@ -380,10 +380,10 @@ class Document: XML_enrich = [] if level_proc > 0: - coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, - flag_2col, flag_central) - - if level_proc > 1: + coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, + flag_2col, flag_central) + + if level_proc > 1: _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page) if level_proc > 2: @@ -645,8 +645,8 @@ class Document: name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) + '_page' + str(ind_page) + '.' + format_fig) fig.savefig(name_fig, format = format_fig, dpi = dpi) - plt.close(fig) - + plt.close(fig) + def check_discussion(self): utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) flag_discussion = utils_annot.check_if_discussion(self.name_meta[1]) @@ -748,7 +748,7 @@ class Document: print('we have a main corr XML file') #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml) - XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, bln_print=False) + XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, bln_print=False) self.XML_main_annot = XML_main_annot # save xml file diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 495b23ce..d7fd2393 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -4,8 +4,8 @@ # Code to extract discussions from corrected XML files #%% # to work with atom -%load_ext autoreload -%autoreload 2 +#%load_ext autoreload +#%autoreload 2 import pickle import time @@ -26,10 +26,11 @@ from utils_proc import call_with_out # specify input and output files # needed for running in atom, can be ignored -year = '1891' +year = '1925' input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle" input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz" input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz" +input_notnames = "data/lists/not_names.txt" output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz" #%% @@ -37,7 +38,8 @@ output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz" input_lastnames = sys.argv[1] input_correctedxml = sys.argv[2] input_correctedmeta = sys.argv[3] -output_annotatedxml = sys.argv[4] +input_notnames = sys.argv[4] +output_annotatedxml = sys.argv[5] #%% # extract suffixes, year, folder_database @@ -77,6 +79,11 @@ with open(input_lastnames, 'rb') as f: print('dataframe with lastnames loaded') +with open(input_notnames) as f: + list_notnames = f.readlines() + +list_notnames = [term.rstrip() for term in list_notnames] + #%% # for each file # TODO !!!! get rid of [66:] @@ -92,6 +99,7 @@ for file_tarpath in files_to_process: if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']): print(id_doc + '\n') file_doc.df_lastnames = df_lastnames + file_doc.list_notnames = list_notnames file_doc.annotate_xml() # Commands to get the compressed version of the file diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index 66bedcc9..f9148b8d 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -69,7 +69,7 @@ def get_text(sometext): # function to annotated corrected XML -def get_annotated_xml(XML_root, df_lastnames, bln_print=False): +def get_annotated_xml(XML_root, df_lastnames, list_notnames, bln_print=False): # list of votation terms # TODO: make it work for é, etc. @@ -121,7 +121,7 @@ def get_annotated_xml(XML_root, df_lastnames, bln_print=False): if textbox_texttype in ['text_col1', 'text_col2']: - XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False) + XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, bln_print=False) if this_is_speech: prev_is_speech = True # print('stopped after finding speech start') @@ -222,7 +222,7 @@ def get_complete_text(textbox): # - bln_print: whether to print during execution, default False # output: # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID -def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, bln_print=False): +def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, bln_print=False): # initialize flag this_is_speech = False @@ -270,7 +270,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ # # for every term # for term in list_oi: # if possible, find a name in a list - str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False) + str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, list_notnames, bln_print=False) if bln_print: print('name', str_name, 'role', str_role) @@ -426,7 +426,7 @@ def flatten(l): # - list_tupels: list of tupels containing all types of names # TODO: correctly extract canton! don't do reversed, find name first that might have issue with canton, then look for possible canton # TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer) -def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False): +def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, list_notnames, bln_print=False): def get_string(term, str_name, str_role, list_uniqueID, str_canton): name_type = '' @@ -522,9 +522,6 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, 'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral', 'Vizepräsident'] - list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'Gallen', 'StGallen', - 'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter', 'Biffer', 'biffer', 'Rédiger', 'rédiger', 'Wer', 'Fällen', 'Ari', 'bietet', 'autre'] - list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names) # extract list and array of last names diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh index dbec0daf..e769de17 100755 --- a/src/sh/extract_discussions_yearly.sh +++ b/src/sh/extract_discussions_yearly.sh @@ -1,10 +1,10 @@ #!/bin/bash year_start=1891 -year_end=1893 +year_end=1891 for year in $(seq $year_start $year_end) do echo $year - python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/AB/${year}/05_annotatedxml.tar.gz + python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/lists/not_names.txt data/AB/${year}/05_annotatedxml.tar.gz done -- GitLab