Skip to content
Snippets Groups Projects
Commit 3a8ce1a1 authored by Lili Gasser's avatar Lili Gasser
Browse files

WIP: name disambiguation doubled double names

parent 8cbe4c86
No related branches found
No related tags found
No related merge requests found
......@@ -274,9 +274,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
if bln_print:
print('name', str_name, 'role', str_role)
# get rid of doubled double names
# TODO
# get rid of 'Präsident stimmt nicht Président ne vote pas'
if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name:
if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi):
......@@ -423,94 +420,58 @@ def flatten(l):
# - str_name: string to which name should be attached
# - list_uniqueID: list with one or several uniqueIDs
# - list_tupels: list of tuples containing all types of names
# TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False):
def get_string(term, df_names, str_name, list_uniqueID):
# get name type
name_type = df_names['nameType'].loc[df_names['shortName']==term].iloc[0]
print(df_names[df_names['shortName']==term])
print(term)
print(name_type)
if name_type != 'simple':
print(df_names[df_names['shortName']==term])
print(term, name_type)
# extract uniqueID
# extract uniqueID and complete name for this term
list_temp = []
# TODO might lead to doubled double names
if name_type in ['simple', 'double', 'comp']:
list_temp = [df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]]
str_completeName = df_names['completeName'].loc[df_names['shortName']==term].iloc[0]
str_name = add_to_string(str_name, str_completeName)
# TODO: how to handle for people mentioned in text???
elif name_type in ['canton']:
list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
str_name = add_to_string(str_name, term + ' (CANTON MISSING)')
print(list_temp)
print(str_name)
## if it is one of the simple names
#if term in list(df_names['shortName'].loc[df_names['nameType']=='simple']):
#str_name = add_to_string(str_name, term)
#name_type = 'simple'
## if it is a double name
#elif term in list(df_names['shortName'].loc[df_names['nameType']=='double']):
#if bln_print:
#print(5*'\n', 'DOUBLE NAME')
## get correct name
#correct_name = df_names.loc[(df_names['nameType']=='double') & (df_names['shortName']== term)].iat[0, df_names.columns.get_loc('name_correct')]
#if bln_print:
#print('double name', correct_name)
## only add name if it is not there yet
## if a person is referenced by its complete double name, e.g. Meier-Müller, he or she gets two entries
#if correct_name not in str_name.split(' '):
#str_name = add_to_string(str_name, correct_name)
#name_type = 'double'
## if it is a composite name
#elif term in list(df_names['shortName'].loc[df_names['nameType']=='comp']):
## get correct name
#correct_name = df_names.loc[(df_names['nameType']=='comp') & (df_names['shortName']== term)].iat[0, df_names.columns.get_loc('name_correct')]
#if bln_print:
#print('composite name', correct_name)
#str_name = add_to_string(str_name, correct_name)
#name_type = 'comp'
## if it contains a canton
## TODO: how to handle for people mentioned in text???
#elif term in list(df_names['shortName'].loc[df_names['nameType']=='canton']):
#if bln_print:
#print('contains a canton', term)
#
#str_name = add_to_string(str_name, term + ' (CANTON MISSING)')
#name_type = 'canton'
#
#
## extract uniqueID
#list_temp = []
#if name_type in ['simple', 'double', 'comp']:
#list_temp = [df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]]
#elif name_type in ['canton']:
#list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
if len(list_temp) > 0:
if bln_print:
print(list_temp, list_uniqueID)
print(type(list_temp), type(list_uniqueID))
str_completeName = term + ' (CANTON MISSING)'
print(list_temp, str_completeName)
# set or update unique ID and name
# if no unique ID and name has been assigned so far
if len(list_uniqueID) == 0 and str_name == '':
list_uniqueID = list_temp
str_name = add_to_string(str_name, str_completeName)
list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
# if there are already one or several people
else:
print('is this even possible??')
# if it is a double name,
if name_type == 'double':
if list_uniqueID == list_temp:
# do nothing if person has already been found
pass
else:
# check whether we found a person with the same first part of the double lastname
# and overwrite if this is the case
# e.g. if we found a Meyer before we found a Meyer-Boller, e.g. 1971/20000010
if str_completeName.split('-')[0] == str_name.split(' ')[0]:
list_uniqueID = list_temp
str_name = add_to_string('', str_completeName)
# if we have a new person, we append
elif len(set(list_temp).intersection(set(flatten(list_uniqueID)))) != 0:
list_uniqueID.append(list_temp)
str_name = add_to_string(str_name, str_completeName)
return str_name, list_uniqueID, name_type
def update_list_uniqueID(list_uniqueID, list_temp, name_type):
# if no unique ID has been assigned so far
if len(list_uniqueID) == 0:
list_uniqueID = list_temp
# if there are already one or several people and we have a new person, we update
elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
list_uniqueID.append(list_temp)
# if name_type is canton
# if name_type is canton, we override other entries by correct one
if name_type == 'canton' and len(list_temp) == 1 and list_temp[0] in list_uniqueID:
list_uniqueID = list_temp
......@@ -595,7 +556,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
# cannot happen for the first term in list_oi
elif name_type == 'canton':
list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo = get_list_cantons(df_names, str_name.split(' ')[0])
canton_type = ''
if term in list_cantonname:
str_canton = term
......@@ -613,14 +574,26 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
str_canton = term
canton_type = 'FirstName'
print('!!! is a canton', term, list_oi, str_name, str_role)
elif term in list_additionalInfo:
str_canton = term
canton_type = 'additionalInfo'
print('!!! is a canton', term, list_oi, str_name, str_role)
else:
print('might be a canton', term, list_oi, str_name, str_role)
# look for similar names based on (normalized) Damerau-Levenshtein distance
# TODO: might need to be extended to fields other than cantonname
term_approx = get_approximate_term(term, np.array(list_cantonname))
if term_approx:
str_canton = term_approx
canton_type = 'CantonName'
print('might be a canton:', term, list_oi, str_name, str_role, term_approx)
# if a canton or similar was found
if canton_type:
# get rid of CANTON MISSING
str_name = str_name.split(' ')[0]
# extract uniqueID
# if Citizenship, do proper comparison
if canton_type == 'Citizenship':
......@@ -636,16 +609,19 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[0]
str_name = add_to_string(str_name, str_completeName)
else:
list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[0]
str_name = add_to_string(str_name, str_completeName)
print(list_temp, list_uniqueID)
if len(list_temp) > 0:
list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
if str_completeName.split(' ')[0] == str_name:
str_name = add_to_string('', str_completeName)
else:
str_name = add_to_string(str_name, str_completeName)
# if term is not easily mistaken as a name (avoid false positives)
elif term not in list_notnames:
......@@ -750,20 +726,27 @@ def get_list_cantons(df_names, str_name = ''):
else:
df_temp = df_names.loc[df_names['nameType']=='canton']
#print(df_temp)
# list of cantons
list_cantonname = list(df_temp['CantonName'])
# TODO this will lead to an error!
for canton in ['Basel-Stadt', 'Basel-Landschaft']:
if canton in list_cantonname:
list_cantonname.extend(['Basel'])
if 'Graubünden' in list_cantonname:
list_cantonname.extend(['Bünden'])
if 'Bern' in list_cantonname: # check how this works!!
list_cantonname.extend(['Berne'])
# list of canton abbreviations
list_cantonabbr = list(df_temp['CantonAbbreviation'])
# list of citizenships
list_citizenship = list(df_temp['Citizenship'])
list_citizenship = get_cities(list_citizenship)
# list of first names
list_firstname = list(df_temp['FirstName'])
return list_cantonname, list_cantonabbr, list_citizenship, list_firstname
# list of additional information
list_additionalInfo = list(df_temp['additionalInfo'])
return list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment