diff --git a/data/lists/not_names.txt b/data/lists/not_names.txt index 044045dcdb6931137c5a565ec16bc5e6b39e5936..1a44b305e7d8e5267e6140416581f5b397009058 100644 --- a/data/lists/not_names.txt +++ b/data/lists/not_names.txt @@ -51,6 +51,7 @@ StGallen Stimmen Stimme stimmt +Studien tischen Tunnel Ueber diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt index db6bb1a1de5c566d89a730c99ba6aa95b098cef6..c46c3fe19a88256f573d41983299317e502749b8 100644 --- a/data/lists/wrongly_identified_speakers.txt +++ b/data/lists/wrongly_identified_speakers.txt @@ -68,6 +68,7 @@ term very similar to one name is actually another name ------------------------------------------------------ 1925/20029863: ganz --> finds Lanz, there is a Ganz 1971/20000630 and others: Schweizer --> finds Schneider, there is a Schweizer +1951/20035112: Schweizer --> finds Schwizer term is a name @@ -89,6 +90,10 @@ person has entry date 29.11.71 but is not yet active (presumably): 1971/20000726: one Muheim starts 29.11.71, discussion is on 8.12.71 --> finds two! +Firstname before LastName +------------------------- +1971/20000592: Simon Kohler rapporteur + two people with same last name and same citizenship --------------------------------------------------- 1951/20034993: Eggenberger Grabs diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index ceedfc9b8d8c44d20972e5a6d29267f66efa0bb6..8572e209b340f644e120b0fe377fd1885a5ee240 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -467,7 +467,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # TODO: how to handle for people mentioned in text??? elif name_type in ['canton']: - list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) + list_temp = list(df_names.loc[(df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) str_completeName = term + ' (CANTON MISSING)' print(list_temp, str_completeName) @@ -598,8 +598,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # if person was not uniquely identified, check for misspellings if not canton_type: # look for similar names based on (normalized) Damerau-Levenshtein distance - # only look at cantonname, citizenship and additionalinfo - list_cantons_approx = [list_cantons[i] for i in (0, 2, 4)] + # only look at cantonname, citizenship, firstname and additionalinfo + list_cantons_approx = [list_cantons[i] for i in (0, 2, 3, 4)] for list_, type_ in list_cantons_approx: term_approx = get_approximate_term(term, np.array(list_)) if term_approx: @@ -613,7 +613,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # get rid of CANTON MISSING str_name = str_name.split(' ')[0] - df_temp = get_df_temp_canton(df_names, str_name, str_council) + df_temp = get_df_temp(df_names, str_name, str_council) # extract uniqueID # if Citizenship, get list of cities and compare each to term if canton_type == 'Citizenship': @@ -622,20 +622,20 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str str_citizenship = '' if len(list_cities) == 1: str_citizenship = list_cities[0] - list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[:, df_temp.columns.get_loc('uniqueIndex')]) - str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[0] + list_temp = list(df_temp.loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[:, df_temp.columns.get_loc('uniqueIndex')]) + str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[0] elif len(list_cities) > 1: print('found more than one person with citizenship', str_canton, str_name, list_cities) # TODO what happens with these:? - list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) + list_temp = list(df_names.loc[(df_names['shortName']==str_name)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) str_completeName = str_name + ' (CANTON MISSING)' else: print('found no person with citizenship', str_canton, str_name, list_cities) else: - list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')]) - str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0] + list_temp = list(df_temp.loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')]) + str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0] print(list_temp, list_uniqueID, str_completeName) @@ -799,12 +799,12 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l): def get_cities(list_citizenship): return [city[:-5] for item in list_citizenship for city in item.split(',')] -def get_df_temp_canton(df_names, str_name, str_council): +def get_df_temp(df_names, str_name, str_council): if str_council in ['Nationalrat', 'Ständerat']: - df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)] + df_temp = df_names.loc[(df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)] else: - df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name)] + df_temp = df_names.loc[(df_names['shortName']==str_name)] return df_temp @@ -819,7 +819,7 @@ def get_list_cantons(df_names, str_name, str_council = ''): str_additionalInfo = 'additionalInfo' # get dataframe - df_temp = get_df_temp_canton(df_names, str_name, str_council) + df_temp = get_df_temp(df_names, str_name, str_council) # list of cantons list_cantonname = list(df_temp[str_CantonName])