improve recognizing speakers by citizenship

933b4bab · Lili Gasser · 9f1c770e · 933b4bab · 933b4bab · 933b4bab
Commit 933b4bab authored 6 years ago by Lili Gasser
--- a/data/lists/not_names.txt
+++ b/data/lists/not_names.txt
@@ -12,6 +12,7 @@ Herr
 Kasse
 nicht
 Rath
+Schrit
 Seite
 selber
 Steuer

--- a/data/lists/wrongly_identified_speakers.txt
+++ b/data/lists/wrongly_identified_speakers.txt
@@ -2,6 +2,7 @@
 also check for council:
 -----------------------
 1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified but there is only one SR Baumann
+1925/20029937: Schneider, Berichterstatter --> NR, not SR
 one MP not active in whole year, leads to other not uniquely identified
@@ -14,7 +15,13 @@ one MP not active in whole year, leads to other not uniquely identified
 identified as speech start but is in text:
 ------------------------------------------
+do I really need to look at the first two lines? maybe one is sufficient?
+look for typical terms such as gestellt, gesagt, etc.
+1891/20026455: Ein weiterer Antrag ist derjenige des Hrn. Dufour. Herr Dufour hat den Antrag gestellt:
+1891/20026465: Wir haben nun aber, dem Gedankengang des Hrn. Bühler folgend, die Erklärung gewählt:
 1891/20026489: Um jeden Zweifel zu heben, beantrage ich Ihnen folgende Redaktion des Art. 2 :
+1894/20026607: Müller gegenüber drei anderen durchgedrungen, welche lautete:
 1925/20029903: Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt:
 1925/20029863: ganz gut gehen. Es ist ja wahr, was Herr Brügger gesagt hat: --> finds Lanz and Brügger
 1925/20029917: Mögen Sie nun aber denken wie Herr Oberst Brügger oder mögen Sie denken wie ich: --> identified as speech start but is in text
@@ -23,6 +30,18 @@ identified as speech start but is in text:
 1925/20029981: Brügger möchte ich sagen: --> identified as speech start but is in text
+wrongly spelled city
+--------------------
+1925/20029963: Jenny Ennend (instead of Ennenda)
+1925/20029995,96: Keller Zurich (instead of Zürich)
+Appenzeller
+-----------
+1894/20026597: Sonderegger
+1894/20026618: Sonderegger
 some other persons wrongly identified as MP

--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -4,8 +4,8 @@
 # Code to extract discussions from corrected XML files
 #%%
 # to work with atom
-#%load_ext autoreload
+%load_ext autoreload
-#%autoreload 2
+%autoreload 2
 import pickle
 import time
@@ -26,7 +26,7 @@ from utils_proc import call_with_out
 # specify input and output files
 # needed for running in atom, can be ignored
-year = '1925'
+year = '1894'
 input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
@@ -111,20 +111,21 @@ utils_proc.compress_tar(output_annotatedxml)
 #%%
-## to test for one file
+# to test for one file
-#file_tarpath = './1893/20026526_datacorr.xml'
+file_tarpath = './1925/20029981_datacorr.xml'
-#
-#id_doc = file_tarpath.split('/')[-1][:8]
+id_doc = file_tarpath.split('/')[-1][:8]
-#
-## instantiate document object (always from original pdf)
+# instantiate document object (always from original pdf)
-#infile_aux = year + '/' + id_doc + '.pdf'
+infile_aux = year + '/' + id_doc + '.pdf'
-#file_doc = defc.Document(infile_aux, folder_database)
+file_doc = defc.Document(infile_aux, folder_database)
-#
-#if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
+if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
-    #print(id_doc + '\n')
+    print(id_doc + '\n')
-#
-    #file_doc.df_lastnames = df_lastnames
+    file_doc.df_lastnames = df_lastnames
-    #file_doc.annotate_xml()
+    file_doc.list_notnames = list_notnames
+    file_doc.annotate_xml()
 #%%
@@ -134,3 +135,16 @@ utils_proc.compress_tar(output_annotatedxml)
 #id_doc
 #len(files_to_process)
+list_bla = [1, 2,3]
+list_bla.extend([4, 5])
+list_bla
+if 3 in [1,3, 4]:
+    print('yes')
+#%%
+str_name = 'Blumer' # (CANTON MISSING)'
+print(str_name.split(' '))
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -522,7 +522,7 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton,
                  'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
                  'Vizepräsident']
-    list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names)
+    list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
    # extract list and array of last names
    list_all_names = list(df_names['name_short'])
@@ -537,7 +537,21 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton,
        if bln_print:
            print('now is about: ------', term)
-        if name_type == 'canton':
+        if term in list_roles:
+            # get correct name and uniqueID, or role, for that term
+            str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
+            if bln_print:
+                print('found a role', term)
+            # TODO: also look for similar terms (misspellings)
+        elif term in list_roles_ext:
+            pass
+            # TODO: extract whether it is minority or majority and save that information
+        elif name_type == 'canton':
+            list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
            canton_type = ''
            if term in list_cantonname:
                str_canton = term
@@ -559,20 +573,34 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton,
            else:
                print('might be a canton', term, list_oi, str_name, str_role)
+            # if a canton or similar was found
            if canton_type:
                # get rid of CANTON MISSING
                str_name = str_name.split(' ')[0]
                # extract uniqueID
-#                list_temp = []
+                # if Citizenship, do proper comparison
-                list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+                if canton_type == 'Citizenship':
+                    df_temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name)]
+                    list_citizenship = [term for term in df_temp[canton_type] if str_canton in tokenizer_canton.tokenize(term)]
+                    try:
+                        if len(list_citizenship) == 1:
+                            str_citizenship = list_citizenship[0]
+                    except:
+                        print('found no or more than one person with citizenship', str_canton, str_name)
+                        pass
+                    list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+                else:
+                    list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
                print(list_temp, list_uniqueID)
                list_uniqueID = list_temp
        # if term is not easily mistaken as a name (avoid false positives)
-        if term not in list_notnames:
+        elif term not in list_notnames:
-            # if term is in the list of all names and roles
+            # if term is in the list of all names
-            if term in (list_all_names + list_roles):
+            if term in list_all_names:
                # get correct name and uniqueID, or role, for that term
                str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
@@ -841,9 +869,18 @@ def dict_only_text(dictionary):
 # function to get list of places
-def get_list_cantons(df_names):
+def get_list_cantons(df_names, str_name = ''):
-    df_temp = df_names.loc[df_names['type']=='canton']
+    if str_name:
+        df_temp = df_names.loc[(df_names['type']=='canton') & (df_names['name_short']==str_name)]
+    else:
+        df_temp = df_names.loc[df_names['type']=='canton']
+    print(df_temp)
    list_cantonname = list(df_temp['CantonName'])
+    for canton in ['Basel-Stadt', 'Basel-Landschaft']:
+        if canton in list_cantonname:
+            list_cantonname.extend(['Basel'])
+    if 'Graubünden' in list_cantonname:
+        list_cantonname.extend(['Bünden'])
    list_cantonabbr = list(df_temp['CantonAbbreviation'])
    list_citizenship = list(df_temp['Citizenship'])
    list_citizenship = [city[:-5] for item in list_citizenship for city in item.split(',')]