WIP: name disambiguation doubled double names

3a8ce1a1 · Lili Gasser · 8cbe4c86 · 3a8ce1a1
Commit 3a8ce1a1 authored 6 years ago by Lili Gasser
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -274,9 +274,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
        if bln_print:
            print('name', str_name, 'role', str_role)

-        # get rid of doubled double names
-        # TODO
-
        # get rid of 'Präsident stimmt nicht Président ne vote pas'
        if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name:
            if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi):
@@ -423,94 +420,58 @@ def flatten(l):
 # - str_name: string to which name should be attached
 # - list_uniqueID: list with one or several uniqueIDs
 # - list_tupels: list of tupels containing all types of names
-# TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
 def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False):

    def get_string(term, df_names, str_name, list_uniqueID):
        # get name type
        name_type = df_names['nameType'].loc[df_names['shortName']==term].iloc[0]
-        print(df_names[df_names['shortName']==term])
-        print(term)
-        print(name_type)
+        if name_type != 'simple':
+            print(df_names[df_names['shortName']==term])
+        print(term, name_type)

-        # extract uniqueID
+        # extract uniqueID and complete name for this term
        list_temp = []
-        # TODO might lead to doubled double names
        if name_type in ['simple', 'double', 'comp']:
            list_temp = [df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]]
            str_completeName = df_names['completeName'].loc[df_names['shortName']==term].iloc[0]
-            str_name = add_to_string(str_name, str_completeName)

        # TODO: how to handle for people mentioned in text???
        elif name_type in ['canton']:
            list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
-            str_name = add_to_string(str_name, term + ' (CANTON MISSING)')
-
-        print(list_temp)
-        print(str_name)
-
-        ## if it is one of the simple names
-        #if term in list(df_names['shortName'].loc[df_names['nameType']=='simple']):
-            #str_name = add_to_string(str_name, term)
-            #name_type = 'simple'
-        ## if it is a double name
-        #elif term in list(df_names['shortName'].loc[df_names['nameType']=='double']):
-            #if bln_print:
-                #print(5*'\n', 'DOUBLE NAME')
-            ## get correct name
-            #correct_name = df_names.loc[(df_names['nameType']=='double') & (df_names['shortName']== term)].iat[0, df_names.columns.get_loc('name_correct')]
-            #if bln_print:
-                #print('double name', correct_name)
-            ## only add name if it is not there yet
-            ## if a person is referenced by its complete double name, e.g. Meier-Müller, he or she gets two entries
-            #if correct_name not in str_name.split(' '):
-                #str_name = add_to_string(str_name, correct_name)
-            #name_type = 'double'
-        ## if it is a composite name
-        #elif term in list(df_names['shortName'].loc[df_names['nameType']=='comp']):
-            ## get correct name
-            #correct_name = df_names.loc[(df_names['nameType']=='comp') & (df_names['shortName']== term)].iat[0, df_names.columns.get_loc('name_correct')]
-            #if bln_print:
-                #print('composite name', correct_name)
-            #str_name = add_to_string(str_name, correct_name)
-            #name_type = 'comp'
-        ## if it contains a canton
-        ## TODO: how to handle for people mentioned in text???
-        #elif term in list(df_names['shortName'].loc[df_names['nameType']=='canton']):
-            #if bln_print:
-                #print('contains a canton', term)
-#
-            #str_name = add_to_string(str_name, term + ' (CANTON MISSING)')
-            #name_type = 'canton'
-#
-#
-        ## extract uniqueID
-        #list_temp = []
-        #if name_type in ['simple', 'double', 'comp']:
-            #list_temp = [df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]]
-        #elif name_type in ['canton']:
-            #list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
-
-
-        if len(list_temp) > 0:
-            if bln_print:
-                print(list_temp, list_uniqueID)
-                print(type(list_temp), type(list_uniqueID))
+            str_completeName = term + ' (CANTON MISSING)'
+
+        print(list_temp, str_completeName)
+
+        # set or update unique ID and name
+        # if no unique ID and name has been assigned so far
+        if len(list_uniqueID) == 0 and str_name == '':
+            list_uniqueID = list_temp
+            str_name = add_to_string(str_name, str_completeName)

-            list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
+        # if there are already one or several people
        else:
-            print('is this even possible??')
+            # if it is a double name,
+            if name_type == 'double':
+                if list_uniqueID == list_temp:
+                    # do nothing if person has already been found
+                    pass
+                else:
+                    # check whether we found a person with the same first part of the double lastname
+                    # and overwrite if this is the case
+                    # e.g. if we found a Meyer before we found a Meyer-Boller, e.g. 1971/20000010
+                    if str_completeName.split('-')[0] == str_name.split(' ')[0]:
+                        list_uniqueID = list_temp
+                        str_name = add_to_string('', str_completeName)
+
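+            # if we have a new person, we append -- NOTE(review): the check below uses `!= 0` (ID already known); the removed update_list_uniqueID code used `== 0` for a new person, confirm which is intended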
+            elif len(set(list_temp).intersection(set(flatten(list_uniqueID)))) != 0:
+                list_uniqueID.append(list_temp)
+                str_name = add_to_string(str_name, str_completeName)

        return str_name, list_uniqueID, name_type

    def update_list_uniqueID(list_uniqueID, list_temp, name_type):
-        # if no unique ID has been assigned so far
-        if len(list_uniqueID) == 0:
-            list_uniqueID = list_temp
-        # if there are already one or several people and have a new person, we update
-        elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
-            list_uniqueID.append(list_temp)
-        # if name_type is canton
+        # if name_type is canton, we override other entries by correct one
        if name_type == 'canton' and len(list_temp) == 1 and list_temp[0] in list_uniqueID:
            list_uniqueID = list_temp

@@ -595,7 +556,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln

        # cannot happen for the first term in list_oi
        elif name_type == 'canton':
-            list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
+            list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo = get_list_cantons(df_names, str_name.split(' ')[0])
            canton_type = ''
            if term in list_cantonname:
                str_canton = term
@@ -613,14 +574,26 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
                str_canton = term
                canton_type = 'FirstName'
                print('!!! is a canton', term, list_oi, str_name, str_role)
+            elif term in list_additionalInfo:
+                str_canton = term
+                canton_type = 'additionalInfo'
+                print('!!! is a canton', term, list_oi, str_name, str_role)

            else:
-                print('might be a canton', term, list_oi, str_name, str_role)
+                # look for similar names based on (normalized) Damerau-Levenshtein distance
+                # TODO: might need to be extended to fields other than cantonname
+                term_approx = get_approximate_term(term, np.array(list_cantonname))
+                if term_approx:
+                    str_canton = term_approx
+                    canton_type = 'CantonName'
+
+                print('might be a canton:', term, list_oi, str_name, str_role, term_approx)

            # if a canton or similar was found
            if canton_type:
                # get rid of CANTON MISSING
                str_name = str_name.split(' ')[0]
+
                # extract uniqueID
                # if Citizenship, do proper comparison
                if canton_type == 'Citizenship':
@@ -636,16 +609,19 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln

                    list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
                    str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[0]
-                    str_name = add_to_string(str_name, str_completeName)

                else:
                    list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
                    str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[0]
-                    str_name = add_to_string(str_name, str_completeName)
+
                print(list_temp, list_uniqueID)

                if len(list_temp) > 0:
                    list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
+                    if str_completeName.split(' ')[0] == str_name:
+                        str_name = add_to_string('', str_completeName)
+                    else:
+                        str_name = add_to_string(str_name, str_completeName)

        # if term is not easily mistaken as a name (avoid false positives)
        elif term not in list_notnames:
@@ -750,20 +726,27 @@ def get_list_cantons(df_names, str_name = ''):
    else:
        df_temp = df_names.loc[df_names['nameType']=='canton']
    #print(df_temp)
+    # list of cantons
    list_cantonname = list(df_temp['CantonName'])
+    # TODO this will lead to an error!
    for canton in ['Basel-Stadt', 'Basel-Landschaft']:
        if canton in list_cantonname:
            list_cantonname.extend(['Basel'])
-    if 'Graubünden' in list_cantonname:
-        list_cantonname.extend(['Bünden'])
-    if 'Bern' in list_cantonname:    # check how this works!!
-        list_cantonname.extend(['Berne'])
+
+    # list of canton abbreviations
    list_cantonabbr = list(df_temp['CantonAbbreviation'])
+
+    # list of citizenships
    list_citizenship = list(df_temp['Citizenship'])
    list_citizenship = get_cities(list_citizenship)
+
+    # list of first names
    list_firstname = list(df_temp['FirstName'])

-    return list_cantonname, list_cantonabbr, list_citizenship, list_firstname
+    # list of additional information
+    list_additionalInfo = list(df_temp['additionalInfo'])
+
+    return list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo