last name which is both canton and double now also finds person which is...

A last name that is listed both as name type 'canton' and as name type 'double' now also finds the person whose entry is of type 'double' (Schmid-Ruedin Philipp, 1951)

last name which is both canton and double now also finds person which is...
A last name that is listed both as name type 'canton' and as name type 'double' now also finds the person whose entry is of type 'double' (Schmid-Ruedin Philipp, 1951)
00d13ae7 · Lili Gasser · 2ba37261 · 00d13ae7 · 00d13ae7 · 00d13ae7
Commit 00d13ae7 authored 6 years ago by Lili Gasser
--- a/data/lists/not_names.txt
+++ b/data/lists/not_names.txt
@@ -51,6 +51,7 @@ StGallen
 Stimmen
 Stimme
 stimmt
+Studien
 tischen
 Tunnel
 Ueber

--- a/data/lists/wrongly_identified_speakers.txt
+++ b/data/lists/wrongly_identified_speakers.txt
@@ -68,6 +68,7 @@ term very similar to one name is actually another name
 ------------------------------------------------------
 1925/20029863: ganz --> finds Lanz, there is a Ganz
 1971/20000630 and others: Schweizer --> finds Schneider, there is a Schweizer
+1951/20035112: Schweizer --> finds Schwizer
 term is a name
@@ -89,6 +90,10 @@ person has entry date 29.11.71 but is not yet active (presumably):
 1971/20000726: one Muheim starts 29.11.71, discussion is on 8.12.71 --> finds two!
+Firstname before LastName
+-------------------------
+1971/20000592: Simon Kohler rapporteur
 two people with same last name and same citizenship
 ---------------------------------------------------
 1951/20034993: Eggenberger Grabs

--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -467,7 +467,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
        # TODO: how to handle for people mentioned in text???
        elif name_type in ['canton']:
-            list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+            list_temp = list(df_names.loc[(df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
            str_completeName = term + ' (CANTON MISSING)'
        print(list_temp, str_completeName)
@@ -598,8 +598,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
            # if person was not uniquely identified, check for misspellings
            if not canton_type:
                # look for similar names based on (normalized) Damerau-Levenshtein distance
-                # only look at cantonname, citizenship and additionalinfo
+                # only look at cantonname, citizenship, firstname and additionalinfo
-                list_cantons_approx = [list_cantons[i] for i in (0, 2, 4)]
+                list_cantons_approx = [list_cantons[i] for i in (0, 2, 3, 4)]
                for list_, type_ in list_cantons_approx:
                    term_approx = get_approximate_term(term, np.array(list_))
                    if term_approx:
@@ -613,7 +613,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
                # get rid of CANTON MISSING
                str_name = str_name.split(' ')[0]
-                df_temp = get_df_temp_canton(df_names, str_name, str_council)
+                df_temp = get_df_temp(df_names, str_name, str_council)
                # extract uniqueID
                # if Citizenship, get list of cities and compare each to term
                if canton_type == 'Citizenship':
@@ -622,20 +622,20 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
                    str_citizenship = ''
                    if len(list_cities) == 1:
                        str_citizenship = list_cities[0]
-                        list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
+                        list_temp = list(df_temp.loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
-                        str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[0]
+                        str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[0]
                    elif len(list_cities) > 1:
                        print('found more than one person with citizenship', str_canton, str_name, list_cities)
                        # TODO what happens with these:?
-                        list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+                        list_temp = list(df_names.loc[(df_names['shortName']==str_name)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
                        str_completeName = str_name + ' (CANTON MISSING)'
                    else:
                        print('found no person with citizenship', str_canton, str_name, list_cities)
                else:
-                    list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
+                    list_temp = list(df_temp.loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
-                    str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0]
+                    str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0]
                print(list_temp, list_uniqueID, str_completeName)
@@ -799,12 +799,12 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l):
 def get_cities(list_citizenship):
    return [city[:-5] for item in list_citizenship for city in item.split(',')]
-def get_df_temp_canton(df_names, str_name, str_council):
+def get_df_temp(df_names, str_name, str_council):
    if str_council in ['Nationalrat', 'Ständerat']:
-        df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)]
+        df_temp = df_names.loc[(df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)]
    else:
-        df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name)]
+        df_temp = df_names.loc[(df_names['shortName']==str_name)]
    return df_temp
@@ -819,7 +819,7 @@ def get_list_cantons(df_names, str_name, str_council = ''):
    str_additionalInfo = 'additionalInfo'
    # get dataframe
-    df_temp = get_df_temp_canton(df_names, str_name, str_council)
+    df_temp = get_df_temp(df_names, str_name, str_council)
    # list of cantons
    list_cantonname = list(df_temp[str_CantonName])