From 6d499ef5d5d04ade014e42afcd17afe6179d2bf9 Mon Sep 17 00:00:00 2001
From: Lilian Gasser <gasserli@ethz.ch>
Date: Mon, 4 Feb 2019 18:17:42 +0100
Subject: [PATCH] create output file with speakers

---
 data/lists/wrongly_identified_speakers.txt | 25 ++++++---
 src/python/def_classes.py                  | 16 ++++--
 src/python/run_extract_discussions.py      | 39 +------------
 src/python/utils_annot.py                  | 64 ++++++++++++----------
 4 files changed, 64 insertions(+), 80 deletions(-)

diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt
index 038c497e..972c36c5 100644
--- a/data/lists/wrongly_identified_speakers.txt
+++ b/data/lists/wrongly_identified_speakers.txt
@@ -40,16 +40,10 @@ speaker not uniquely identified when he spoke the second time:
 1940/20033001: Keller (CANTON MISSING) reporter majority [2868, 2871, 2890] 1940-03-27 00:00 16 ['Keller', 'Berichterstatter', 'Mehrheit']
 
 
-identifier is split into two words
+identifier is split into two words --> partly solved by adding further additional-info columns to the csv
 ----------------------------------
-1925/20029945, 1951/20035173: found a name: Schmid-Oberentf elden ['Schmid', 'Oberentf', 'elden'] 0 Schmid (CANTON MISSING)  [4639, 4660]
 1971/20000498: ['M', 'Muf', 'ny', 'rapporteur', 'de', 'la', 'majoritÃ©'] 7 --> finds Muff but is Mugny
-1951/20034978,79,94: found a name: Bringolf- Schaff hausen  ['Bringolf', 'Schaff', 'hausen'] 0 Bringolf (CANTON MISSING)  [707, 706] --> solved by adding Schaff as additional Info
-1941/        : MÃ¼ller Aarb erg
-1956/20036201: Berger (CANTON MISSING) reporter [368, 373, 375] 1956-12-10 00:00 1 ['Berger', 'Neuch', 'Ã tei', 'rapporteur']
 1936/20031984: Keller (CANTON MISSING) reporter majority [2868, 2871, 2890] 1936-01-08 00:00 10 ['Keller', 'Aar', 'Berichterstatter', 'Mehrheit'] Aarau split in Aar au
-1936/20031998: StÃ¤hli (CANTON MISSING)  [4967, 4964] 1936-01-17 00:00 13 ['StÃ¤hli', 'Sieb', 'nen']
-1936/20032015: MÃ¼ller (CANTON MISSING)  [3638, 3645, 3652, 3654, 3658, 3659, 3662] 1936-01-30 00:00 6 ['MÃ¼ller', 'GrosshÃ¶chste', 'tten']
 
 
 identified as speech start but is in text: --> some of these might be solved by only looking at list_oi with less than 9 elements
@@ -78,9 +72,9 @@ list of people in a minority are recognized as speech starts:
 
 misspelled role:
 ----------------
-1936/20031986: Meyer (CANTON MISSING)  [3482, 3483, 3488, 3490] 1936-01-09 00:00 14 ['RundesprÃ¤sident', 'Meyer']
+1936/20031986: Meyer (CANTON MISSING)  [3482, 3483, 3488, 3490] 1936-01-09 00:00 14 ['Rundespräsident', 'Meyer'] --> solved
 1936/20031992: Meyer (CANTON MISSING)  [3482, 3483, 3488, 3490] 1936-01-15 00:00 10 ['ÃŸundesprÃ¤sident', 'Meyer']
-1932/20031299: HÃ¤berlin (CANTON MISSING)  [2290, 2287] 1932-09-21 00:00 6 ['Bimdesrat', 'HÃ¤berlin']
+1932/20031299: Häberlin (CANTON MISSING)  [2290, 2287] 1932-09-21 00:00 6 ['Bimdesrat', 'Häberlin'] --> solved
 
 
 weird layout:
@@ -96,6 +90,8 @@ bad OCR:
 1952/20035242, and some others: reporter [] 1952-03-25 00:00 9 ['Spanier', 'Berichterstatter']  --> SpÃ¼hler not found
 1948/20034315: reporter [] 1948-09-23 00:00 5 ['Statili', 'Berichterstatter'] --> StÃ¤hli not found
 1936/20032015: reporter majority [] 1936-01-30 00:00 2 ['Statili', 'Berichterstatter', 'Mehrheit'] --> StÃ¤hli not found
+1936/20032189: page 4 ['Bundesrat', 'Bautltann'] 1936-12-17 00:00 --> Baumann not found
+1936/20031985: page 1 ['DollîUS', 'rapporteur'] 1936-01-09 00:00  --> Dollfus not found
 
 
 not sure about place:
@@ -193,3 +189,14 @@ solved: wrong entries in xlsx:
 ----------------------
 1931/20030940,49: Scherer (CANTON MISSING)  [4560, 4565] 1931-03-18 00:00 18 ['Scherer'] --> there are two entries for one person
 1971/20000055: DebÃ©taz, was not there as a NR
+
+
+
+solved: identifier is split into two words
+----------------------------------
+1925/20029945, 1951/20035173: found a name: Schmid-Oberentf elden ['Schmid', 'Oberentf', 'elden'] 0 Schmid (CANTON MISSING)  [4639, 4660]
+1951/20034978,79,94: found a name: Bringolf- Schaff hausen  ['Bringolf', 'Schaff', 'hausen'] 0 Bringolf (CANTON MISSING)  [707, 706] --> solved by adding Schaff as additional Info
+1941/        : Müller Aarb erg
+1956/20036201: Berger (CANTON MISSING) reporter [368, 373, 375] 1956-12-10 00:00 1 ['Berger', 'Neuch', 'Ã tei', 'rapporteur']
+1936/20031998: Stähli (CANTON MISSING)  [4967, 4964] 1936-01-17 00:00 13 ['Stähli', 'Sieb', 'nen']
+1936/20032015: Müller (CANTON MISSING)  [3638, 3645, 3652, 3654, 3658, 3659, 3662] 1936-01-30 00:00 6 ['Müller', 'Grosshöchste', 'tten']
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 6ee0909d..041d1b66 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -761,12 +761,20 @@ class Document:
                     #pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml,
                     #name_outcorrxml = self.name_outcorrxml)
 
-        with open('data/lists/notunique.txt', 'a') as f:
-            f.write(' '.join(('\n\n-------', str(self.year), self.id_doc, '\n')))
-
         print('we have a main corr XML file')
+
+        # get council and date
+        (str_council, str_date) = self.get_council_date()
+        self.str_council = str_council
+        self.str_date = str_date
+
+        # file to track speakers
+        self.name_speakers = '_'.join((str(self.year), self.id_doc, 'speakers.txt'))
+        with open('data/lists/speakers/' + self.name_speakers, 'w') as f:
+            f.write(' '.join((str(self.year), self.id_doc, str_date, '\n')))
+
         #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
-        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, bln_print=False)
+        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, self.name_speakers, bln_print=False)
         self.XML_main_annot = XML_main_annot
 
         # save xml file
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 88020f47..512cc94b 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -26,7 +26,7 @@ from utils_proc import call_with_out
 # specify input and output files
 
 # needed for running in atom, can be ignored
-year = '1956'
+year = '1936'
 input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
@@ -98,10 +98,6 @@ for file_tarpath in files_to_process:
         print(id_doc + '\n')
         file_doc.df_lastnames = df_lastnames
         file_doc.list_notnames = list_notnames
-        # TODO: add this to next deeper level
-        (str_council, str_date) = file_doc.get_council_date()
-        file_doc.str_council = str_council
-        file_doc.str_date = str_date
         file_doc.annotate_xml()
 
 # Commands to get the compressegid version of the file
@@ -119,7 +115,7 @@ with open(input_notnames) as f:
 list_notnames = [term.rstrip() for term in list_notnames]
 
 # to test for one file
-file_tarpath = './1956/20036021_datacorr.xml'
+file_tarpath = './1936/20031986_datacorr.xml'
 
 id_doc = file_tarpath.split('/')[-1][:8]
 
@@ -132,10 +128,6 @@ if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20
 
     file_doc.df_lastnames = df_lastnames
     file_doc.list_notnames = list_notnames
-    # TODO: add this to next deeper level
-    (str_council, str_date) = file_doc.get_council_date()
-    file_doc.str_council = str_council
-    file_doc.str_date = str_date
     file_doc.annotate_xml()
 
 
@@ -151,30 +143,3 @@ file_doc.check_discussion()
 str_date = '1925-12-09 08:00'
 import datetime
 datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')
-
-
-
-listilist = ['a', 'b', 'c', 'd']
-'a' in listilist
-listilist[0,2]
-# OPTIMIZE
-
-list_1 = [1, 2,3, 4, 5]
-list_2 = [2,7,8]
-len(set(list_1).intersection(list_2))
-
-
-if 'ab' in 'abc':
-    print('yay')
-
-a = 10
-if a < 7 or a > 9:
-    print(a)
-
-a = 'asdf'
-b = 'asdf'
-a == b == 'asdf'
-
-'a' in a
-
-'as' in a
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 63622c1b..8e8f554f 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -19,11 +19,11 @@ import datetime
 
 # function to check whether a file containts discussions
 # achieved by excluding title pages, table of content, etc.
-# !!! function works well for 1891 - 1900, not checked after that !!!
+# TODO: function works well for 1891 - 1900; still needs to be checked for later years
 def check_if_discussion(path_meta_xml_file,
         list_attributes  = ['TITEL_NORMAL_DE', 'TITEL_NORMAL_FR', 'TITEL_ORIGINAL_DE', 'TITEL_ORIGINAL_FR'],
         list_nondiscussion = ['inhaltsverzeiGGchnis', 'inhaltsverzeichniss', 'jahresinhalt', 'rednerliste',
-            'umschlag', 'sachregister', 'titelblatt', 'numerierung'],
+            'jahres-rednerliste', 'umschlag', 'sachregister', 'titelblatt', 'numerierung'],
         list_nondiscussion2 = ['table', 'matiÃ¨res', 'rÃ©pertoire', 'procÃ¨s-verbaux']):
 
     # parse, get root and then part of interest
@@ -84,10 +84,9 @@ def get_text(sometext):
 
 
 # function to annotated corrected XML
-def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, bln_print=False):
+def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, str_file_speakers, bln_print=False):
 
     # list of votation terms
-    # TODO: make it work for Ã©, etc.
     list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt',
                           'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)',
                           'Votation', 'Vote', 'votation', '(AdoptÃ©s)', 'adoptÃ©s', 'adoptÃ©e', 'rejetÃ©e',
@@ -136,15 +135,13 @@ def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_da
 
                         if textbox_texttype in ['text_col1', 'text_col2']:
 
-                            XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, str_council, str_date, bln_print=False)
+                            XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, str_council, str_date, str_file_speakers, bln_print=False)
                             if this_is_speech:
                                 prev_is_speech = True
-#                                print('stopped after finding speech start')
                                 continue
-                            XML_new, this_is_vote = label_votations(XML_new, ind_p, ind_t, complete_text, list_votationterms, bln_print=False)
+                            XML_new, this_is_vote = label_votations(XML_new, ind_p, ind_t, complete_text, list_votationterms, str_file_speakers, bln_print=False)
                             if this_is_vote:
                                 prev_is_speech = False
-#                                print('stopped after finding vote')
                                 continue
                             if prev_is_speech and (not this_is_vote):
                                 XML_new = label_speechcont(XML_new, ind_p, ind_t)
@@ -237,7 +234,7 @@ def get_complete_text(textbox):
 # - bln_print: whether to print during execution, default False
 # output:
 # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
-def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, str_council, str_date, bln_print=False):
+def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, str_council, str_date, str_file_speakers, bln_print=False):
 
     # lists of roles
     list_roles = ['PrÃ¤sident', 'PrÃ¤sidentin', 'VizeprÃ¤sident', 'PrÃ¤sidium', 'PrÃ©sident', 'PrÃ©sidente', 'prÃ©sident', 'prÃ©sidente',
@@ -264,6 +261,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
         # look at first few terms of that textbox
         text_start = re.sub(r'[\(\)]','',text[:colon_index_text])
         list_oi = tokenizer.tokenize(text_start)
+        list_oi_full = list_oi
 
         if bln_print:
             print('possible speech start: ', list_oi)
@@ -277,10 +275,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
         if (len(list_oi) < 9):
             if (len(list_oi) < 5) or (len(set(list_oi).intersection(list_roles)) > 0):
 
-                with open('data/lists/notunique.txt', 'a') as f:
-                    f.write(' '.join((str(list_oi), str(len(list_oi)), '\n')))
-                flag_print = True
-
                 # remove stopwords
                 list_oi = [term for term in list_oi if term.lower() not in list_stopwords]
 
@@ -300,10 +294,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                 # TODO: maybe exclude I and A to account for Appenzell
                 list_oi = [term for term in list_oi if len(term)>1]
 
-                if len(list_oi) > 4 or flag_print:
-                    with open('data/lists/notunique.txt', 'a') as f:
-                        f.write(' '.join((str(list_oi), str(len(list_oi)), '\n')))
-
                 # if possible, find a name from the list
                 str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str_council, str_date, bln_print=False)
 
@@ -313,10 +303,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                 if str_role == 'federalcouncil' and str_name == '':
                     str_role = ''
 
-                if len(list_uniqueID) > 1 or flag_print:
-                    with open('data/lists/notunique.txt', 'a') as f:
-                        f.write(' '.join((str_name, str_role, str(list_uniqueID), str_date, str(ind_p), str(list_oi), '\n')))
-
                 # get rid of 'PrÃ¤sident stimmt nicht PrÃ©sident ne vote pas'
                 if set(str_role.split()).intersection(set(['PrÃ¤sident', 'PrÃ¤sidentin', 'PrÃ©sident', 'PrÃ©sidente'])) and not str_name:
                     if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi):
@@ -331,6 +317,13 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                             print('get rid of FÃ¼r den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi)
                         str_name = ''
 
+                with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+                    f.write(' '.join(('page', str(ind_p + 1), str(list_oi), '\n')))
+                    f.write(' '.join(('name:', str_name, '\n')))
+                    f.write(' '.join(('role:', str_role, '\n')))
+                    f.write(' '.join(('uniqueID(s):', str(list_uniqueID), '\n')))
+                    f.write(' '.join(('text:', text[colon_index_text+1:colon_index_text+100], '\n\n')))
+
                 # if a name has been found, add it to XML_new
                 if str_name or str_role:
                     # add attribute speech_start to textbox
@@ -416,7 +409,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
 # - bln_print: whether to print during execution, default False
 # output:
 # - XML_new: updated
-def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, bln_print=True):
+def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, str_file_speakers, bln_print=True):
 
     # get first terms of that text
     list_oi = tokenizer.tokenize(text)[:15]
@@ -428,6 +421,11 @@ def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, bln_print=T
 
         # set flag
         this_is_vote = True
+
+        with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+            f.write(' '.join(('page', str(ind_p + 1), text, '\n')))
+            f.write(' '.join(('is a vote', '\n\n')))
+
         if bln_print:
             print('found a vote:', list_oi)
     else:
@@ -584,19 +582,26 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
     # for every term
     for term in list_oi:
 
-        if term in list_roles:
+        term_approx_role = get_approximate_term(term, np.array(list_roles))
+
+        if term in list_roles or term_approx_role:
             # update str_role
             # TODO: also look for similar terms (misspellings)
             # TODO: what with BundesprÃ¤sident?
             # TODO: is Berichterstatter the same as Sprecher?
+            if term_approx_role:
+                term_ = term_approx_role
+            else:
+                term_ = term
+
             # assign role in English
-            if term in ['PrÃ¤sident', 'PrÃ¤sidentin', 'PrÃ¤sidium', 'PrÃ©sident', 'PrÃ©sidente', 'prÃ©sident', 'prÃ©sidente']:
+            if term_ in ['PrÃ¤sident', 'PrÃ¤sidentin', 'PrÃ¤sidium', 'PrÃ©sident', 'PrÃ©sidente', 'prÃ©sident', 'prÃ©sidente']:
                 str_assignedRole = 'president'
-            elif term in ['VizeprÃ¤sident']:
+            elif term_ in ['VizeprÃ¤sident']:
                 str_assignedRole = 'vice-president'
-            elif term in ['Berichterstatter', 'Berichterstatterin', 'rapporteur', 'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole']:
+            elif term_ in ['Berichterstatter', 'Berichterstatterin', 'rapporteur', 'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole']:
                 str_assignedRole = 'reporter'
-            elif term in ['Bundesrat', 'Bundesrath', 'BundesrÃ¤tin', 'conseiller', 'fÃ©dÃ©ral', 'BundesprÃ¤sident', 'BundesprÃ¤sidentin']:
+            elif term_ in ['Bundesrat', 'Bundesrath', 'BundesrÃ¤tin', 'conseiller', 'fÃ©dÃ©ral', 'BundesprÃ¤sident', 'BundesprÃ¤sidentin']:
                 str_assignedRole = 'federalcouncil'
                 str_council = 'Bundesrat'  # needs to be German to be used in dataframe
 
@@ -693,6 +698,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
             else:
                 print('could not be identified as a canton:', term, list_oi, str_name, str_role)
 
+        # if term is first name
+        # needed when people are referenced by FirstName LastName, e.g. Simon Kohler
         elif term in list_all_firstnames:
             str_firstname = term
             print('found a first name', str_firstname)
@@ -709,9 +716,6 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
                     # get correct name and uniqueID, or role, for that term
                     str_name, list_uniqueID, name_type = get_string(term, df_names, str_name, list_uniqueID)
 
-                if bln_print:
-                    print('=== correct name', term)
-
             # if term is not in list_all_names
             else:
                 # look for similar names based on (normalized) Damerau-Levenshtein distance
-- 
GitLab