Skip to content
Snippets Groups Projects
Commit 00d13ae7 authored by Lili Gasser
Browse files

last name which is both canton and double now also finds person which is...

last name which is both canton and double now also finds person which is double (Schmid-Ruedin Philipp, 1951)
parent 2ba37261
No related branches found
No related tags found
No related merge requests found
...@@ -51,6 +51,7 @@ StGallen ...@@ -51,6 +51,7 @@ StGallen
Stimmen Stimmen
Stimme Stimme
stimmt stimmt
Studien
tischen tischen
Tunnel Tunnel
Ueber Ueber
......
...@@ -68,6 +68,7 @@ term very similar to one name is actually another name ...@@ -68,6 +68,7 @@ term very similar to one name is actually another name
------------------------------------------------------ ------------------------------------------------------
1925/20029863: ganz --> finds Lanz, there is a Ganz 1925/20029863: ganz --> finds Lanz, there is a Ganz
1971/20000630 and others: Schweizer --> finds Schneider, there is a Schweizer 1971/20000630 and others: Schweizer --> finds Schneider, there is a Schweizer
1951/20035112: Schweizer --> finds Schwizer
term is a name term is a name
...@@ -89,6 +90,10 @@ person has entry date 29.11.71 but is not yet active (presumably): ...@@ -89,6 +90,10 @@ person has entry date 29.11.71 but is not yet active (presumably):
1971/20000726: one Muheim starts 29.11.71, discussion is on 8.12.71 --> finds two! 1971/20000726: one Muheim starts 29.11.71, discussion is on 8.12.71 --> finds two!
Firstname before LastName
-------------------------
1971/20000592: Simon Kohler rapporteur
two people with same last name and same citizenship two people with same last name and same citizenship
--------------------------------------------------- ---------------------------------------------------
1951/20034993: Eggenberger Grabs 1951/20034993: Eggenberger Grabs
......
...@@ -467,7 +467,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str ...@@ -467,7 +467,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
# TODO: how to handle for people mentioned in text??? # TODO: how to handle for people mentioned in text???
elif name_type in ['canton']: elif name_type in ['canton']:
list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) list_temp = list(df_names.loc[(df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
str_completeName = term + ' (CANTON MISSING)' str_completeName = term + ' (CANTON MISSING)'
print(list_temp, str_completeName) print(list_temp, str_completeName)
...@@ -598,8 +598,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str ...@@ -598,8 +598,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
# if person was not uniquely identified, check for misspellings # if person was not uniquely identified, check for misspellings
if not canton_type: if not canton_type:
# look for similar names based on (normalized) Damerau-Levenshtein distance # look for similar names based on (normalized) Damerau-Levenshtein distance
# only look at cantonname, citizenship and additionalinfo # only look at cantonname, citizenship, firstname and additionalinfo
list_cantons_approx = [list_cantons[i] for i in (0, 2, 4)] list_cantons_approx = [list_cantons[i] for i in (0, 2, 3, 4)]
for list_, type_ in list_cantons_approx: for list_, type_ in list_cantons_approx:
term_approx = get_approximate_term(term, np.array(list_)) term_approx = get_approximate_term(term, np.array(list_))
if term_approx: if term_approx:
...@@ -613,7 +613,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str ...@@ -613,7 +613,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
# get rid of CANTON MISSING # get rid of CANTON MISSING
str_name = str_name.split(' ')[0] str_name = str_name.split(' ')[0]
df_temp = get_df_temp_canton(df_names, str_name, str_council) df_temp = get_df_temp(df_names, str_name, str_council)
# extract uniqueID # extract uniqueID
# if Citizenship, get list of cities and compare each to term # if Citizenship, get list of cities and compare each to term
if canton_type == 'Citizenship': if canton_type == 'Citizenship':
...@@ -622,20 +622,20 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str ...@@ -622,20 +622,20 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
str_citizenship = '' str_citizenship = ''
if len(list_cities) == 1: if len(list_cities) == 1:
str_citizenship = list_cities[0] str_citizenship = list_cities[0]
list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[:, df_temp.columns.get_loc('uniqueIndex')]) list_temp = list(df_temp.loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[0] str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[0]
elif len(list_cities) > 1: elif len(list_cities) > 1:
print('found more than one person with citizenship', str_canton, str_name, list_cities) print('found more than one person with citizenship', str_canton, str_name, list_cities)
# TODO what happens with these:? # TODO what happens with these:?
list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) list_temp = list(df_names.loc[(df_names['shortName']==str_name)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
str_completeName = str_name + ' (CANTON MISSING)' str_completeName = str_name + ' (CANTON MISSING)'
else: else:
print('found no person with citizenship', str_canton, str_name, list_cities) print('found no person with citizenship', str_canton, str_name, list_cities)
else: else:
list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')]) list_temp = list(df_temp.loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0] str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0]
print(list_temp, list_uniqueID, str_completeName) print(list_temp, list_uniqueID, str_completeName)
...@@ -799,12 +799,12 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l): ...@@ -799,12 +799,12 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l):
def get_cities(list_citizenship): def get_cities(list_citizenship):
return [city[:-5] for item in list_citizenship for city in item.split(',')] return [city[:-5] for item in list_citizenship for city in item.split(',')]
def get_df_temp_canton(df_names, str_name, str_council): def get_df_temp(df_names, str_name, str_council):
if str_council in ['Nationalrat', 'Ständerat']: if str_council in ['Nationalrat', 'Ständerat']:
df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)] df_temp = df_names.loc[(df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)]
else: else:
df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name)] df_temp = df_names.loc[(df_names['shortName']==str_name)]
return df_temp return df_temp
...@@ -819,7 +819,7 @@ def get_list_cantons(df_names, str_name, str_council = ''): ...@@ -819,7 +819,7 @@ def get_list_cantons(df_names, str_name, str_council = ''):
str_additionalInfo = 'additionalInfo' str_additionalInfo = 'additionalInfo'
# get dataframe # get dataframe
df_temp = get_df_temp_canton(df_names, str_name, str_council) df_temp = get_df_temp(df_names, str_name, str_council)
# list of cantons # list of cantons
list_cantonname = list(df_temp[str_CantonName]) list_cantonname = list(df_temp[str_CantonName])
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment