From 9934c2f20e97875f4b9f5bacff8ea0cea02604b5 Mon Sep 17 00:00:00 2001
From: Lilian Gasser <gasserli@ethz.ch>
Date: Mon, 21 Jan 2019 15:20:30 +0100
Subject: [PATCH] notnames as txt file, start list with wrongly identified
 speeches/speakers

---
 data/lists/not_names.txt                   | 36 ++++++++++++++++++++++
 data/lists/wrongly_identified_speakers.txt | 30 ++++++++++++++++++
 src/python/def_classes.py                  | 30 +++++++++---------
 src/python/run_extract_discussions.py      | 16 +++++++---
 src/python/utils_annot.py                  | 13 +++-----
 src/sh/extract_discussions_yearly.sh       |  4 +--
 6 files changed, 100 insertions(+), 29 deletions(-)
 create mode 100644 data/lists/not_names.txt
 create mode 100644 data/lists/wrongly_identified_speakers.txt

diff --git a/data/lists/not_names.txt b/data/lists/not_names.txt
new file mode 100644
index 00000000..cd2fce55
--- /dev/null
+++ b/data/lists/not_names.txt
@@ -0,0 +1,36 @@
+Alinea
+Alter
+Ari
+Art
+bietet
+Fällen
+fasse
+Gallen
+hausen
+Herren
+Herr
+Kasse
+nicht
+Rath
+Seite
+selber
+Steuer
+StGallen
+Stimmen
+Stimme
+stimmt
+Hans
+Walter
+Werner
+Wer
+autre
+Biffer
+biffer
+poser
+cause
+dernier
+poser
+projet
+Rédiger
+rédiger
+vote
diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt
new file mode 100644
index 00000000..2c33e407
--- /dev/null
+++ b/data/lists/wrongly_identified_speakers.txt
@@ -0,0 +1,30 @@
+
+also check for council:
+-----------------------
+1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified but there is only one SR Baumann
+
+
+one MP not active in whole year, leads to other not uniquely identified
+-----------------------------------------------------------------------
+1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!)
+1925/20029967: Seiler (in December, the second Seiler has already left --> finds two!)
+1925/20029967: Huber (in December, the second Huber has already left --> finds two!)
+1925/20029882: Naine (in June, there is only one Naine, in December, another one joins --> finds two!) also in ...96, ...97, etc.
+
+
+identified as speech start but is in text:
+------------------------------------------
+1891/20026489: Um jeden Zweifel zu heben, beantrage ich Ihnen folgende Redaktion des Art. 2 :
+1925/20029903: Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt:
+1925/20029863: ganz gut gehen. Es ist ja wahr, was Herr Brügger gesagt hat: --> finds Lanz and Brügger
+1925/20029917: Mögen Sie nun aber denken wie Herr Oberst Brügger oder mögen Sie denken wie ich: --> identified as speech start but is in text
+1925/20029917: Herr Hauser sagt: --> identified as speech start but is in text
+1925/20029978: Das ist, was Herr Charles Naine gesagt hat. Herr Hugglersagt:
+1925/20029981: Brügger möchte ich sagen: --> identified as speech start but is in text
+
+
+
+
+some other persons wrongly identified as MP
+-------------------------------------------
+1925/20029833: Sauser-Hall (not an MP) --> Hauser
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index cce72822..8e2a480b 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -107,7 +107,7 @@ class Document:
         else:
             print('Not saving to tar')
             name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz'
-            
+
         self.name_xml = [name_tar, name_xml]
         if flag_save:
             h_xml = utils_proc.get_handlerfile(self.name_xml[1], self.folder_database, name_file = name_outxml)
@@ -119,10 +119,10 @@ class Document:
         self.n_pages = np.arange(len(self.XML_main))
         command = 'rm -rf ./' + str(self.year)
         #print(command)
-        utils_proc.call_with_out(command)        
-        
-    def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None, 
-                    ind_page = 0, textb_textl = 1):        
+        utils_proc.call_with_out(command)
+
+    def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None,
+                    ind_page = 0, textb_textl = 1):
         # The page refers here to the page of the imgobj, which might not correspond
         # to the one of the xml. For that reason we use n_pages to obtain the index
         # for the xml
@@ -357,12 +357,12 @@ class Document:
         if ind_page > (len(self.XML_main) - 1):
             flag_error = 1
             return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error
-        
+
         flag_central = 1
         if self.year > self.limit_year:
             flag_central = 0
-        flag_2col = 1        
-        
+        flag_2col = 1
+
         XML_root = ET.Element('pages')
         XML_root.append(self.XML_main[ind_abs[0]])
         imarray = np.array(self.imgobj[ind_page])
@@ -380,10 +380,10 @@ class Document:
         XML_enrich = []
 
         if level_proc > 0:
-            coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, 
-                                                                                   flag_2col, flag_central)  
-            
-        if level_proc > 1:            
+            coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page,
+                                                                                   flag_2col, flag_central)
+
+        if level_proc > 1:
             _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page)
 
         if level_proc > 2:
@@ -645,8 +645,8 @@ class Document:
             name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc)
                         + '_page' + str(ind_page) + '.' + format_fig)
             fig.savefig(name_fig, format = format_fig, dpi = dpi)
-            plt.close(fig)       
-    
+            plt.close(fig)
+
     def check_discussion(self):
         utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
         flag_discussion = utils_annot.check_if_discussion(self.name_meta[1])
@@ -748,7 +748,7 @@ class Document:
 
         print('we have a main corr XML file')
         #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
-        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, bln_print=False)
+        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, bln_print=False)
         self.XML_main_annot = XML_main_annot
 
         # save xml file
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 495b23ce..d7fd2393 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -4,8 +4,8 @@
 # Code to extract discussions from corrected XML files
 #%%
 # to work with atom
-%load_ext autoreload
-%autoreload 2
+#%load_ext autoreload
+#%autoreload 2
 
 import pickle
 import time
@@ -26,10 +26,11 @@ from utils_proc import call_with_out
 # specify input and output files
 
 # needed for running in atom, can be ignored
-year = '1891'
+year = '1925'
 input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
+input_notnames = "data/lists/not_names.txt"
 output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz"
 
 #%%
@@ -37,7 +38,8 @@ output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz"
 input_lastnames = sys.argv[1]
 input_correctedxml = sys.argv[2]
 input_correctedmeta = sys.argv[3]
-output_annotatedxml = sys.argv[4]
+input_notnames = sys.argv[4]
+output_annotatedxml = sys.argv[5]
 
 #%%
 # extract suffixes, year, folder_database
@@ -77,6 +79,11 @@ with open(input_lastnames, 'rb') as f:
 
 print('dataframe with lastnames loaded')
 
+with open(input_notnames) as f:
+    list_notnames = f.readlines()
+
+list_notnames = [term.rstrip() for term in list_notnames]
+
 #%%
 # for each file
 # TODO !!!! get rid of [66:]
@@ -92,6 +99,7 @@ for file_tarpath in files_to_process:
     if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
         print(id_doc + '\n')
         file_doc.df_lastnames = df_lastnames
+        file_doc.list_notnames = list_notnames
         file_doc.annotate_xml()
 
 # Commands to get the compressed version of the file
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 66bedcc9..f9148b8d 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -69,7 +69,7 @@ def get_text(sometext):
 
 
 # function to annotated corrected XML
-def get_annotated_xml(XML_root, df_lastnames, bln_print=False):
+def get_annotated_xml(XML_root, df_lastnames, list_notnames, bln_print=False):
 
     # list of votation terms
     # TODO: make it work for Ã©, etc.
@@ -121,7 +121,7 @@ def get_annotated_xml(XML_root, df_lastnames, bln_print=False):
 
                         if textbox_texttype in ['text_col1', 'text_col2']:
 
-                            XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False)
+                            XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, bln_print=False)
                             if this_is_speech:
                                 prev_is_speech = True
 #                                print('stopped after finding speech start')
@@ -222,7 +222,7 @@ def get_complete_text(textbox):
 # - bln_print: whether to print during execution, default False
 # output:
 # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
-def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, bln_print=False):
+def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, bln_print=False):
 
     # initialize flag
     this_is_speech = False
@@ -270,7 +270,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
 #        # for every term
 #        for term in list_oi:
         # if possible, find a name in a list
-        str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False)
+        str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, list_notnames, bln_print=False)
         if bln_print:
             print('name', str_name, 'role', str_role)
 
@@ -426,7 +426,7 @@ def flatten(l):
 # - list_tupels: list of tupels containing all types of names
 # TODO: correctly extract canton! don't do reversed, find name first that might have issue with canton, then look for possible canton
 # TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
-def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False):
+def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, list_notnames, bln_print=False):
 
     def get_string(term, str_name, str_role, list_uniqueID, str_canton):
         name_type = ''
@@ -522,9 +522,6 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton,
                   'Bundesrat', 'Bundesrath', 'BundesrÃ¤tin', 'conseiller fÃ©dÃ©ral',
                   'VizeprÃ¤sident']
 
-    list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'Gallen', 'StGallen',
-                     'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter', 'Biffer', 'biffer', 'RÃ©diger', 'rÃ©diger', 'Wer', 'FÃ¤llen', 'Ari', 'bietet', 'autre']
-
     list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names)
 
     # extract list and array of last names
diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh
index dbec0daf..e769de17 100755
--- a/src/sh/extract_discussions_yearly.sh
+++ b/src/sh/extract_discussions_yearly.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 
 year_start=1891
-year_end=1893
+year_end=1891
 
 for year in $(seq $year_start $year_end)
 do
     echo $year
-    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/AB/${year}/05_annotatedxml.tar.gz
+    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/lists/not_names.txt data/AB/${year}/05_annotatedxml.tar.gz
 done
-- 
GitLab