Commit 86b84bab authored by Lili Gasser

Merge branch 'name-disambiguation' into 'master'

name-disambiguation

See merge request !11
parents 2b97c0f8 8473096a
Alinea
Alter
Ari
Art
besser
bietet
drehen
Fällen
fasse
Ferner
ferner
findet
Gallen
Gründe
hausen
Herren
Herr
immer
Kasse
Kollege
Kollega
komme
Leider
lieber
nehmen
neu
nicht
Rath
Schrit
Seite
selber
Sinne
später
Steuer
StGallen
Stimmen
Stimme
stimmt
tischen
Tunnel
Ueber
Hans
Walter
Werner
weiterer
Wer
wissen
Ziel
autre
Biffer
biffer
cerner
comme
force
cause
dernier
ouvert
peu
pilier
poser
projet
Rédiger
rédiger
tirer
vote
delle
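The terms above act as a veto list: tokens that look like surnames but must never be matched as one. A minimal sketch of how such a list could be loaded and applied — `load_notnames` mirrors the loading done in run_extract_discussions.py further below, while `is_candidate_name` is a hypothetical stand-in for the actual matching logic:

```python
# Sketch, not the project's actual code.
def load_notnames(path="data/lists/not_names.txt"):
    # one term per line, trailing whitespace stripped
    with open(path, encoding="utf-8") as f:
        return [term.rstrip() for term in f if term.strip()]

def is_candidate_name(token, list_notnames):
    # case-sensitive: 'Ganz' (an MP) passes, the adverb 'ganz' does not
    return token not in list_notnames

list_notnames = ["ganz", "Ferner", "Stimme"]  # stand-in for the file contents
print(is_candidate_name("Ganz", list_notnames))    # True
print(is_candidate_name("Ferner", list_notnames))  # False
```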
also check for council:
-----------------------
1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified but there is only one SR Baumann
1925/20029937: Schneider, Berichterstatter --> NR, not SR
one MP not active in the whole year leads to another not being uniquely identified
----------------------------------------------------------------------------------
1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!)
1925/20029967: Seiler (in December, the second Seiler had already left) --> finds two!
1925/20029967: Huber (in December, the second Huber had already left) --> finds two!
1925/20029882: Naine (in June, there is only one Naine, in December, another one joins --> finds two!) also in ...96, ...97, etc.
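These cases suggest checking membership against the session date rather than the year boundaries used by the extractor. A hedged sketch with made-up data (only the column names follow the extractor script below):

```python
import datetime
import pandas as pd

# Made-up rows: the second Keller-Aargau only joins in June 1925.
df = pd.DataFrame({
    'LastName':    ['Keller', 'Keller'],
    'CantonName':  ['Aargau', 'Aargau'],
    'DateJoining': ['1920-01-01', '1925-06-01'],
    'DateLeaving': ['1930-12-31', '1928-12-31'],
})

# Filter on the session date instead of the whole year.
session = datetime.datetime(1925, 3, 15)
active = df[(pd.to_datetime(df['DateJoining']) <= session) &
            (pd.to_datetime(df['DateLeaving']) >= session)]
print(len(active))  # 1 -- only one Keller-Aargau is active in March
```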
identified as speech start but is in text:
------------------------------------------
do I really need to look at the first two lines? maybe one is sufficient?
look for typical terms such as gestellt, gesagt, etc.
1891/20026455: Ein weiterer Antrag ist derjenige des Hrn. Dufour. Herr Dufour hat den Antrag gestellt:
1891/20026465: Wir haben nun aber, dem Gedankengang des Hrn. Bühler folgend, die Erklärung gewählt:
1891/20026489: Um jeden Zweifel zu heben, beantrage ich Ihnen folgende Redaktion des Art. 2 :
1894/20026607: Müller gegenüber drei anderen durchgedrungen, welche lautete:
1925/20029903: Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt:
1925/20029863: ganz gut gehen. Es ist ja wahr, was Herr Brügger gesagt hat: --> finds Lanz and Brügger
1925/20029917: Mögen Sie nun aber denken wie Herr Oberst Brügger oder mögen Sie denken wie ich: --> identified as speech start but is in text
1925/20029917: Herr Hauser sagt:
1925/20029978: Das ist, was Herr Charles Naine gesagt hat. Herr Hugglersagt:
1925/20029981: Brügger möchte ich sagen:
1971/20000663: de MM. Knüsel et Leu (there must be more speech starts, this is from a list of cantons and people inside a speech, !!! Layout)
1971/20000007: La seconde réaction qu'a suscité chez moi l'intervention de M. Weber est le doute:
1971/20000007: Herr Kollega Gut hat es gesagt:
1971/20000007: Noch eine Antwort an Kollege Clottu
1971/20000010: Nun noch etwas zu Richard Müller. Erstens
1971/20000024: Noch ein Wort zu Herrn Ständerat Wenk
1971/20000024: Herr Kollege Heimann stellt sich schliesslich gegen einen Finanzausgleich mit dem Hinweis
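The idea of looking for typical terms can be sketched as a cue-word check; the cue list here is guessed from the examples above ('gesagt', 'gestellt', ...), and the project's real heuristic may differ:

```python
import re

# Cue words suggesting a quotation or reference, not a speech start.
CUES = re.compile(r'\b(gesagt|gestellt|sagt|sagte|folgend)\b')

def looks_like_intext_mention(line):
    # a cue word near the detected name suggests the name is quoted
    # inside a speech rather than opening a new one
    return bool(CUES.search(line))

print(looks_like_intext_mention('Herr Hauser sagt:'))              # True
print(looks_like_intext_mention('Scherrer-St. Gallen: Ich habe'))  # False
```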
wrongly spelled city
--------------------
1925/20029963: Jenny Ennend (instead of Ennenda)
1925/20029995,96: Keller Zurich (instead of Zürich)
1971/? : Berne instead of Bern
doubled double names:
---------------------
1971/20000010: Meyer-Boller
term very similar to one name is actually another name
------------------------------------------------------
1925/20029863: ganz --> finds Lanz, there is a Ganz
1971/20000630 and others: Schweizer --> finds Schneider, there is a Schweizer
term is a name
--------------
1971/20000010: Ganz wenige Einzelfragen --> finds Ganz
1971/20000024: Politisch gesehen ist es doch ganz einfach so --> finds Ganz
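Both false matches are a capitalized common word at the start of a sentence. A crude guard (a sketch only; the word set is a stand-in, and the real not_names list above already contains 'ganz'):

```python
COMMON_WORDS = {'ganz', 'gut', 'leu'}  # hypothetical stand-in list

def accept_match(token, is_sentence_initial):
    # a capitalized common word opening a sentence is most likely the
    # ordinary word, not the MP of the same name
    if is_sentence_initial and token.lower() in COMMON_WORDS:
        return False
    return True

print(accept_match('Ganz', True))   # False: probably the adverb 'ganz'
print(accept_match('Ganz', False))  # True: mid-sentence, may be the MP
```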
French name with special characters
-----------------------------------
1971/20000055: Debétaz
Appenzeller
-----------
1894/20026597: Sonderegger
1894/20026618: Sonderegger
some other persons wrongly identified as MPs
--------------------------------------------
1925/20029833: Sauser-Hall (not an MP) --> Hauser
@@ -107,7 +107,7 @@ class Document:
        else:
            print('Not saving to tar')
        name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz'
        self.name_xml = [name_tar, name_xml]
        if flag_save:
            h_xml = utils_proc.get_handlerfile(self.name_xml[1], self.folder_database, name_file = name_outxml)
@@ -119,10 +119,10 @@ class Document:
        self.n_pages = np.arange(len(self.XML_main))
        command = 'rm -rf ./' + str(self.year)
        #print(command)
        utils_proc.call_with_out(command)

    def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None,
                     ind_page = 0, textb_textl = 1):
        # The page refers here to the page of the imgobj, which might not correspond
        # to the one of the xml. For that reason we use n_pages to obtain the index
        # for the xml
@@ -357,12 +357,12 @@ class Document:
        if ind_page > (len(self.XML_main) - 1):
            flag_error = 1
            return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error
        flag_central = 1
        if self.year > self.limit_year:
            flag_central = 0
        flag_2col = 1
        XML_root = ET.Element('pages')
        XML_root.append(self.XML_main[ind_abs[0]])
        imarray = np.array(self.imgobj[ind_page])
@@ -380,10 +380,10 @@ class Document:
        XML_enrich = []
        if level_proc > 0:
            coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page,
                                                                                   flag_2col, flag_central)
        if level_proc > 1:
            _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page)
        if level_proc > 2:
@@ -645,8 +645,8 @@ class Document:
        name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc)
                    + '_page' + str(ind_page) + '.' + format_fig)
        fig.savefig(name_fig, format = format_fig, dpi = dpi)
        plt.close(fig)

    def check_discussion(self):
        utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
        flag_discussion = utils_annot.check_if_discussion(self.name_meta[1])
@@ -748,7 +748,7 @@ class Document:
        print('we have a main corr XML file')
        #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, bln_print=False)
        self.XML_main_annot = XML_main_annot

        # save xml file
...
@@ -8,7 +8,7 @@ import sys
input_file = sys.argv[1] #'./data/politicians/Ratsmitglieder_1848_DE_corr.xlsx'
output_file_csv = sys.argv[2] #'./data/politicians/MPs_after1890.csv'
output_folder_dict = sys.argv[3]

class MPs_Extractor(object):
@@ -19,12 +19,12 @@ class MPs_Extractor(object):
        self.output_folder_dict = output_folder_dict
        self.range_years = range(years[0], years[1] + 1)
        self.df_exc = df_exc

    # function to get lists of lastnames
    # input:
    # - df_year: dataframe for a year
    # output:
    # - list_names:
    #   contains:
    #   - list of last names that appear only once and cannot be split
    #   - list of last names that are made up of two names such as 'Meier-Müller'
@@ -44,7 +44,7 @@ class MPs_Extractor(object):
        str_comp = 'comp'
        str_canton2 = 'canton'

        # function to split lastname and save meaningful part(s) to list
        def split_lastname(lastname, uniqueID, tpl_canton, str_canton = ''):
            # if last name is a composite name, e.g. 'von Arx' and 'de Stoppani'
            lastname_split = lastname.split()
@@ -70,7 +70,7 @@ class MPs_Extractor(object):
                list_names.append((str_double, lastname, lastname, uniqueID) + tpl_canton)
                # write double name without space into list
                list_names.append((str_double, ''.join(lastname.split('-')), lastname, uniqueID) + tpl_canton)
            else:
                if str_canton:
                    list_names.append((str_canton2, lastname, str_canton, uniqueID) + tpl_canton)
                else:
@@ -82,66 +82,67 @@ class MPs_Extractor(object):
            str_cantonabbr = df_year['CantonAbbreviation'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
            str_citizenship = df_year['Citizenship'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
            str_firstname = df_year['FirstName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
            str_doublename = df_year['DoubleName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
            return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname, str_doublename)
        # create empty list for last names
        list_names = []

        # for every last name
        for lastname in df_year['LastName'].drop_duplicates():
            #print('name', lastname, type(lastname))
            # extract all entries with that last name
            df_temp = df_year.loc[df_after1890['LastName']==lastname]
            #print(df_temp)

            # if there is an extra double name
            if df_temp.iloc[0]['DoubleName'] != '':
                # extract unique index
                uniqueID = df_temp.iloc[0]['uniqueIndex']
                # get canton information for that uniqueID
                tpl_canton = get_canton(df_year, uniqueID)
                #print('double name', df_temp)
                doublename = df_temp.iloc[0]['DoubleName']
                # if last name is a double name, e.g. 'Meier-Müller'
                lastname_split2 = doublename.replace('-', ' ').split()
                if len(lastname_split2) > 1:
                    # write each part of double name into corresponding list
                    for item in lastname_split2:
                        list_names.append((str_double, item, lastname, uniqueID) + tpl_canton)
                    # write double name into list
                    list_names.append((str_double, doublename, lastname, uniqueID) + tpl_canton)
                    # write double name without space into list
                    list_names.append((str_double, ''.join(doublename.split('-')), lastname, uniqueID) + tpl_canton)

            # if only one person with that last name
            if df_temp.drop_duplicates(['uniqueIndex']).shape[0] == 1:
                # extract unique index
                uniqueID = df_temp.iloc[0]['uniqueIndex']
                # get canton information for that uniqueID
                tpl_canton = get_canton(df_year, uniqueID)
                # write complete name to list of last names
                split_lastname(lastname, uniqueID, tpl_canton)
            # if there are several people with the same last name
            else:
                # write last name and canton to correct list
                for idx, row in df_temp.drop_duplicates(['uniqueIndex']).iterrows():
                    # extract unique index
                    uniqueID = df_temp.loc[idx]['uniqueIndex']
                    # get canton information for that uniqueID
                    tpl_canton = get_canton(df_year, uniqueID)
                    # write the lastname to the list
                    split_lastname(lastname, uniqueID, tpl_canton, row['LastName'] + ' (' + row['CantonName'] + ')')

        return list_names
    def extract(self):
@@ -172,7 +173,7 @@ class MPs_Extractor(object):
        # group by first and last name, and date of birth
        grouped = df_after1890.groupby(["LastName", "FirstName", "DateOfBirth"])
        # assign first index to all entries of a person
        for list_index in grouped.groups.values():
            df_after1890.loc[list_index, 'uniqueIndex'] = list_index[0]
@@ -192,15 +193,15 @@ class MPs_Extractor(object):
        df_year = df_after1890[pd.to_datetime(df_after1890['DateJoining']) <= datetime.datetime(year, 12, 31, 0, 0)]
        df_year = df_year[pd.to_datetime(df_year['DateLeaving']) >= datetime.datetime(year, 1, 1, 0, 0)]
        print(year, df_year.shape)

        # write df_year to a yearly csv file
        # str_year = str(year)
        # df_year.to_csv('home/lili/NLP_DemocraSci/nlp-democracy/output/MPs/MPs_' + str_year + '.csv')

        # create a pandas dataframe from list of names
        # !!! list contains errors, see definition of function
        list_lastnames = self.get_list_of_lastnames(df_year, df_after1890)
        df_lastnames = pd.DataFrame(list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName', 'DoubleName'))

        # dump dictionary of last names to a pickle file
        # path = pathlib.
@@ -213,19 +214,18 @@ years = [1891, 2016] #2016
df_exc = pd.DataFrame(columns=['LastName', 'FirstName', 'DoubleName'])
# exception: Konrad H. Cramer is also referred to as Cramer-Frey. Add double name in extra column
df_exc.loc[len(df_exc)] = {'LastName': 'Cramer', 'FirstName': 'Konrad H.', 'DoubleName': 'Cramer-Frey'}
# exception: Johannes Blumer SG is also referred to as Blumer-Egloff. Add double name in extra column
df_exc.loc[len(df_exc)] = {'LastName': 'Blumer', 'FirstName': 'Johannes', 'DoubleName': 'Blumer-Egloff'}
# exception: Adolphe Jordan VD is also referred to as Jordan-Martin. Add double name in extra column
df_exc.loc[len(df_exc)] = {'LastName': 'Jordan', 'FirstName': 'Adolphe', 'DoubleName': 'Jordan-Martin'}
# exception: Jakob Schmid LU is also referred to as Schmid-Ronca. Add double name in extra column
df_exc.loc[len(df_exc)] = {'LastName': 'Schmid', 'FirstName': 'Jakob', 'DoubleName': 'Schmid-Ronca'}
# exception: Eduard Sulzer ZH is also referred to as Sulzer-Ziegler. Add double name in extra column
df_exc.loc[len(df_exc)] = {'LastName': 'Sulzer', 'FirstName': 'Eduard', 'DoubleName': 'Sulzer-Ziegler'}
# exception: Howard Eugster AR is also referred to as Eugster-Züst. Add double name in extra column
df_exc.loc[len(df_exc)] = {'LastName': 'Eugster', 'FirstName': 'Howard', 'DoubleName': 'Eugster-Züst'}
#print(df_exc)

mps_extractor = MPs_Extractor(years, input_file, output_file_csv, output_folder_dict, df_exc)
mps_extractor.extract()
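The variant generation that get_list_of_lastnames performs for double names can be summarized in isolation. This is a sketch of the logic, not the class method itself:

```python
def double_name_variants(doublename):
    # 'Meier-Müller' -> each half, the full form, and the hyphen-free form,
    # all of which map back to the same person in the lookup table
    parts = doublename.replace('-', ' ').split()
    variants = list(parts)
    variants.append(doublename)
    variants.append(''.join(doublename.split('-')))
    return variants

print(double_name_variants('Meier-Müller'))
# ['Meier', 'Müller', 'Meier-Müller', 'MeierMüller']
```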
@@ -26,10 +26,11 @@ from utils_proc import call_with_out
# specify input and output files

# needed for running in atom, can be ignored
year = '1971'
input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle"
input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
input_notnames = "data/lists/not_names.txt"
output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz"
#%%
@@ -37,7 +38,8 @@ output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz"
input_lastnames = sys.argv[1]
input_correctedxml = sys.argv[2]
input_correctedmeta = sys.argv[3]
input_notnames = sys.argv[4]
output_annotatedxml = sys.argv[5]

#%%
# extract suffixes, year, folder_database
@@ -51,6 +53,7 @@ suffix_correctedmeta = '_metacorr'
input_rawmeta = folder_database + '/' + year + '/' + '01_rawmeta.tar.gz'

#%%
# TODO: make it work!
# git lfs pull necessary data
for lfsfile in [input_correctedxml, input_correctedmeta, input_rawmeta]:
    command = 'git lfs pull -I ' + lfsfile
@@ -77,6 +80,11 @@ with open(input_lastnames, 'rb') as f:
print('dataframe with lastnames loaded')

with open(input_notnames) as f:
    list_notnames = f.readlines()
list_notnames = [term.rstrip() for term in list_notnames]

#%%
# for each file
# TODO !!!! get rid of [66:] # TODO !!!! get rid of [66:]
@@ -92,6 +100,7 @@ for file_tarpath in files_to_process:
    if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
        print(id_doc + '\n')
        file_doc.df_lastnames = df_lastnames
        file_doc.list_notnames = list_notnames
        file_doc.annotate_xml()

# Commands to get the compressed version of the file
@@ -103,20 +112,21 @@ utils_proc.compress_tar(output_annotatedxml)

#%%
# to test for one file
file_tarpath = './1971/20000619_datacorr.xml'

id_doc = file_tarpath.split('/')[-1][:8]

# instantiate document object (always from original pdf)
infile_aux = year + '/' + id_doc + '.pdf'
file_doc = defc.Document(infile_aux, folder_database)

if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
    print(id_doc + '\n')

    file_doc.df_lastnames = df_lastnames
    file_doc.list_notnames = list_notnames
    file_doc.annotate_xml()

#%%
...
#!/bin/bash
year_start=1891
year_end=1891

for year in $(seq $year_start $year_end)
do
    echo $year
    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/lists/not_names.txt data/AB/${year}/05_annotatedxml.tar.gz
done