diff --git a/src/python/extractMPs.py b/src/python/extractMPs.py index cd3504da7a302a2f0560644fa1475c0e311e0e99..01f87b7674b82d4895ce39566b243e5629bf0175 100644 --- a/src/python/extractMPs.py +++ b/src/python/extractMPs.py @@ -8,7 +8,7 @@ import sys input_file = sys.argv[1] #'./data/politicians/Ratsmitglieder_1848_DE_corr.xlsx' output_file_csv = sys.argv[2] #'./data/politicians/MPs_after1890.csv' -output_folder_dict = sys.argv[3] +output_folder_dict = sys.argv[3] class MPs_Extractor(object): @@ -19,12 +19,12 @@ class MPs_Extractor(object): self.output_folder_dict = output_folder_dict self.range_years = range(years[0], years[1] + 1) self.df_exc = df_exc - + # function to get lists of lastnames # input: # - df_year: dataframe for a year # output: - # - list_names: + # - list_names: # contains: # - list of last names that appear only once and cannot be split # - list of last name that are made up of two names such as 'Meier-Müller' @@ -44,7 +44,7 @@ class MPs_Extractor(object): str_comp = 'comp' str_canton2 = 'canton' - # function to split lastname and save meaningful part(s) to list + # function to split lastname and save meaningful part(s) to list def split_lastname(lastname, uniqueID, tpl_canton, str_canton = ''): # if last name is a composite name, e.g. 'von Arx' and 'de Stoppani' lastname_split = lastname.split() @@ -70,7 +70,7 @@ class MPs_Extractor(object): list_names.append((str_double, lastname, lastname, uniqueID) + tpl_canton) # write double name without space into list list_names.append((str_double, ''.join(lastname.split('-')), lastname, uniqueID) + tpl_canton) - else: + else: if str_canton: list_names.append((str_canton2, lastname, str_canton, uniqueID) + tpl_canton) else: @@ -82,66 +82,67 @@ class MPs_Extractor(object): str_cantonabbr = df_year['CantonAbbreviation'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] str_citizenship = df_year['Citizenship'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] str_firstname = df_year['FirstName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] - - return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname) + str_doublename = df_year['DoubleName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0] + + return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname, str_doublename) - # create empty lists for last names + # create empty lists for last names list_names = [] - - # for every last name + + # for every last name for lastname in df_year['LastName'].drop_duplicates(): #print('name', lastname, type(lastname)) - - # extract all entries with that last name + + # extract all entries with that last name df_temp = df_year.loc[df_after1890['LastName']==lastname] - #print(df_temp) - - # if there is an extra double name + #print(df_temp) + + # if there is an extra double name if df_temp.iloc[0]['DoubleName'] != '': - # extract unique index + # extract unique index uniqueID = df_temp.iloc[0]['uniqueIndex'] - + # get canton information for that uniqueID tpl_canton = get_canton(df_year, uniqueID) #print('double name', df_temp) doublename = df_temp.iloc[0]['DoubleName'] - # if last name is a double name, e.g. 'Meier-Müller' + # if last name is a double name, e.g. 'Meier-Müller' lastname_split2 = doublename.replace('-', ' ').split() if len(lastname_split2) > 1: - # write each part of double name into corresponding list + # write each part of double name into corresponding list for item in lastname_split2: list_names.append((str_double, item, lastname, uniqueID) + tpl_canton) - # write double name into list + # write double name into list list_names.append((str_double, doublename, lastname, uniqueID) + tpl_canton) - # write double name without space into list + # write double name without space into list list_names.append((str_double, ''.join(doublename.split('-')), lastname, uniqueID) + tpl_canton) - # if only one person with that last name + # if only one person with that last name if df_temp.drop_duplicates(['uniqueIndex']).shape[0] == 1: - # extract unique index + # extract unique index uniqueID = df_temp.iloc[0]['uniqueIndex'] # get canton information for that uniqueID tpl_canton = get_canton(df_year, uniqueID) - # write complete name to list of last names + # write complete name to list of last names split_lastname(lastname, uniqueID, tpl_canton) - - # if there are several people with the same last name + + # if there are several people with the same last name else: - # write last name and canton to correct list + # write last name and canton to correct list for idx, row in df_temp.drop_duplicates(['uniqueIndex']).iterrows(): - # extract unique index + # extract unique index uniqueID = df_temp.loc[idx]['uniqueIndex'] - + # get canton information for that uniqueID tpl_canton = get_canton(df_year, uniqueID) - # write the lastname to the list - split_lastname(lastname, uniqueID, tpl_canton, row['LastName'] + ' (' + row['CantonName'] + ')') - + # write the lastname to the list + split_lastname(lastname, uniqueID, tpl_canton, row['LastName'] + ' (' + row['CantonName'] + ')') + return list_names def extract(self): @@ -172,7 +173,7 @@ class MPs_Extractor(object): # group by first and last name, and date of birth grouped = df_after1890.groupby(["LastName", "FirstName", "DateOfBirth"]) - # assign first index to all entries of a person + # assign first index to all entries of a person for list_index in grouped.groups.values(): df_after1890.loc[list_index, 'uniqueIndex'] = list_index[0] @@ -192,15 +193,15 @@ class MPs_Extractor(object): df_year = df_after1890[pd.to_datetime(df_after1890['DateJoining']) <= datetime.datetime(year, 12, 31, 0, 0)] df_year = df_year[pd.to_datetime(df_year['DateLeaving']) >= datetime.datetime(year, 1, 1, 0, 0)] print(year, df_year.shape) - + # write df_year to a yearly csv file # str_year = str(year) # df_year.to_csv('home/lili/NLP_DemocraSci/nlp-democracy/output/MPs/MPs_' + str_year + '.csv') - + # create a pandas dataframe from list of names # !!! list contains errors, see definition of function list_lastnames = self.get_list_of_lastnames(df_year, df_after1890) - df_lastnames = pd.DataFrame(list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName')) + df_lastnames = pd.DataFrame(list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName', 'DoubleName')) # dump dictionary of last names to a pickle file # path = pathlib. @@ -213,19 +214,18 @@ years = [1891, 2016] #2016 df_exc = pd.DataFrame(columns=['LastName', 'FirstName', 'DoubleName']) # exception: Konrad H. Cramer is also reffered to as Cramer-Frey. Add double name in extra-column -df_exc.loc[len(df_exc)] = {'LastName': 'Cramer', 'FirstName': 'Konrad H.', 'DoubleName': 'Cramer-Frey'} +df_exc.loc[len(df_exc)] = {'LastName': 'Cramer', 'FirstName': 'Konrad H.', 'DoubleName': 'Cramer-Frey'} # exception: Johannes Blumer SG is also reffered to as Blumer-Egloff. Add double name in extra-column -df_exc.loc[len(df_exc)] = {'LastName': 'Blumer', 'FirstName': 'Johannes', 'DoubleName': 'Blumer-Egloff'} +df_exc.loc[len(df_exc)] = {'LastName': 'Blumer', 'FirstName': 'Johannes', 'DoubleName': 'Blumer-Egloff'} # exception: Adolphe Jordan VD is also reffered to as Jordan-Martin. Add double name in extra-column -df_exc.loc[len(df_exc)] = {'LastName': 'Jordan', 'FirstName': 'Adolphe', 'DoubleName': 'Jordan-Martin'} +df_exc.loc[len(df_exc)] = {'LastName': 'Jordan', 'FirstName': 'Adolphe', 'DoubleName': 'Jordan-Martin'} # exception: Jakob Schmid LU is also reffered to as Schmid-Ronca. Add double name in extra-column -df_exc.loc[len(df_exc)] = {'LastName': 'Schmid', 'FirstName': 'Jakob', 'DoubleName': 'Schmid-Ronca'} +df_exc.loc[len(df_exc)] = {'LastName': 'Schmid', 'FirstName': 'Jakob', 'DoubleName': 'Schmid-Ronca'} # exception: Eduard Sulzer ZH is also reffered to as Sulzer-Ziegler. Add double name in extra-column -df_exc.loc[len(df_exc)] = {'LastName': 'Sulzer', 'FirstName': 'Eduard', 'DoubleName': 'Sulzer-Ziegler'} +df_exc.loc[len(df_exc)] = {'LastName': 'Sulzer', 'FirstName': 'Eduard', 'DoubleName': 'Sulzer-Ziegler'} # exception: Howard Eugster AR is also reffered to as Eugster-Züst. Add double name in extra-column -df_exc.loc[len(df_exc)] = {'LastName': 'Eugster', 'FirstName': 'Howard', 'DoubleName': 'Eugster-Züst'} +df_exc.loc[len(df_exc)] = {'LastName': 'Eugster', 'FirstName': 'Howard', 'DoubleName': 'Eugster-Züst'} #print(df_exc) mps_extractor = MPs_Extractor(years, input_file, output_file_csv, output_folder_dict, df_exc) mps_extractor.extract() - diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index e27d825a0c6d1270523ac4e185f9f021f0b5a5bb..66bedcc9ab6636f4ee534db0518f9869fd789c92 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -264,12 +264,13 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ list_oi = [term for term in list_oi if not term.isdigit()] # remove single characters + # TODO: might need to be changed for fractions (some fractions are abbreviated as single letters) list_oi = [term for term in list_oi if len(term)>1] - # for every term - for term in list_oi: - # if possible, find a name in a list - str_name, str_role, list_uniqueID, str_canton = find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False) +# # for every term +# for term in list_oi: + # if possible, find a name in a list + str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False) if bln_print: print('name', str_name, 'role', str_role) @@ -425,7 +426,7 @@ def flatten(l): # - list_tupels: list of tupels containing all types of names # TODO: correctly extract canton! don't do reversed, find name first that might have issue with canton, then look for possible canton # TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer) -def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False): +def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False): def get_string(term, str_name, str_role, list_uniqueID, str_canton): name_type = '' @@ -439,7 +440,7 @@ def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bl # if it is a double name elif term in list(df_names['name_short'].loc[df_names['type']=='double']): if bln_print: - print(20*'\n', 'DOUBLE NAME') + print(5*'\n', 'DOUBLE NAME') # get correct name correct_name = df_names.loc[(df_names['type']=='double') & (df_names['name_short']== term)].iat[0, df_names.columns.get_loc('name_correct')] if bln_print: @@ -463,18 +464,7 @@ def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bl elif term in list(df_names['name_short'].loc[df_names['type']=='canton']): if bln_print: print('contains a canton', term) -# canton_missing = False -# df_temp = df_names.loc[df_names['name_short']==term] -# print('list_correct', df_temp) -# print(str_canton) -# if str_canton: -# str_correct = check_place(df_temp, str_canton) -# if str_correct in ['not found', 'too many']: -# str_name = add_to_string(str_name, term + ' (CANTONT MISSING)') -# canton_missing = True -# else: -# str_name = add_to_string(str_name, str_temp) -# name_type = 'canton' + str_name = add_to_string(str_name, term + ' (CANTON MISSING)') name_type = 'canton' @@ -485,10 +475,7 @@ def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bl list_temp = [df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]] elif name_type in ['canton']: list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) -# if canton_missing: -# temp = tuple(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) -# else: -# temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_correct']==str_correct)].iat[0, df_names.columns.get_loc('uniqueIndex')] + if len(list_temp) > 0: if bln_print: @@ -519,7 +506,7 @@ def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bl #else: #int_uniqueID = (int_uniqueID, temp) - return str_name, str_role, list_uniqueID + return str_name, str_role, list_uniqueID, name_type # small function to add term to str_name def add_to_string(string, term): @@ -538,60 +525,96 @@ def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bl list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'Gallen', 'StGallen', 'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter', 'Biffer', 'biffer', 'Rédiger', 'rédiger', 'Wer', 'Fällen', 'Ari', 'bietet', 'autre'] - list_places = get_list_cantons(df_names) + list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names) - if bln_print: - print('now is about: ------', term) # extract list and array of last names list_all_names = list(df_names['name_short']) array_all_names = np.array(df_names['name_short']) - # if term is not easily mistaken as a name (avoid false positives) - if term not in list_notnames: + # initialize name_type + name_type = '' - # if term is in the list of all names and roles - if term in (list_all_names + list_roles): - # get correct name and uniqueID, or role, for that term - str_name, str_role, list_uniqueID = get_string(term, str_name, str_role, list_uniqueID, str_canton) + # for every term + for term in list_oi: - if bln_print: - print('=== correct name', term) - # if term in list of cantons - elif term in list_places: - str_canton = term - # if term is not in list_all_names - else: - # look for similar names based on (normalized) Damerau-Levenshtein distance - # !!! probably need to improve this procedure - # - find better values .... - if bln_print: - print(term) - array_normalized = array_all_names[normalized_damerau_levenshtein_distance_ndarray(term, array_all_names) <= 0.35] - array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized) - if bln_print: - print(array_normalized, array_normalized_values) - array_absolute = array_all_names[damerau_levenshtein_distance_ndarray(term, array_all_names) <= 2] - array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute) - if bln_print: - print(array_absolute, array_absolute_values) - set_intersection = set(array_normalized).intersection(set(array_absolute)) - # check if a similar name was found - term_approx = '' - if len(set_intersection) == 1: - term_approx = list(set_intersection)[0] + if bln_print: + print('now is about: ------', term) + + if name_type == 'canton': + canton_type = '' + if term in list_cantonname: + str_canton = term + canton_type = 'CantonName' + print('!!! is a canton', term, list_oi, str_name, str_role) + elif term in list_cantonabbr: + str_canton = term + canton_type = 'CantonAbbr' + print('!!! is a canton', term, list_oi, str_name, str_role) + elif term in list_citizenship: + str_canton = term + canton_type = 'Citizenship' + print('!!! is a canton', term, list_oi, str_name, str_role) + elif term in list_firstname: + str_canton = term + canton_type = 'FirstName' + print('!!! is a canton', term, list_oi, str_name, str_role) + + else: + print('might be a canton', term, list_oi, str_name, str_role) + + if canton_type: + # get rid of CANTON MISSING + str_name = str_name.split(' ')[0] + # extract uniqueID +# list_temp = [] + list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) + print(list_temp, list_uniqueID) + list_uniqueID = list_temp + + # if term is not easily mistaken as a name (avoid false positives) + if term not in list_notnames: + + # if term is in the list of all names and roles + if term in (list_all_names + list_roles): + # get correct name and uniqueID, or role, for that term + str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton) + + if bln_print: + print('=== correct name', term) + + # if term is not in list_all_names + else: + # look for similar names based on (normalized) Damerau-Levenshtein distance + # !!! probably need to improve this procedure + # - find better values .... if bln_print: - print('we found the name', set_intersection) - elif len(set_intersection) > 1: - # !!! we only look at normalized values - # !!! we don't account for names with same values !!! - array_min = array_normalized[array_normalized_values.argmin()] - term_approx = array_min#[0] + print(term) + array_normalized = array_all_names[normalized_damerau_levenshtein_distance_ndarray(term, array_all_names) <= 0.35] + array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized) if bln_print: - print('we found several possible names', set_intersection, 'and choose', array_min) - if term_approx: - str_name, str_role, list_uniqueID = get_string(term_approx, str_name, str_role, list_uniqueID, str_canton) + print(array_normalized, array_normalized_values) + array_absolute = array_all_names[damerau_levenshtein_distance_ndarray(term, array_all_names) <= 2] + array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute) if bln_print: - print('*******************', str_name, term_approx) + print(array_absolute, array_absolute_values) + set_intersection = set(array_normalized).intersection(set(array_absolute)) + # check if a similar name was found + term_approx = '' + if len(set_intersection) == 1: + term_approx = list(set_intersection)[0] + if bln_print: + print('we found the name', set_intersection) + elif len(set_intersection) > 1: + # !!! we only look at normalized values + # !!! we don't account for names with same values !!! + array_min = array_normalized[array_normalized_values.argmin()] + term_approx = array_min#[0] + if bln_print: + print('we found several possible names', set_intersection, 'and choose', array_min) + if term_approx: + str_name, str_role, list_uniqueID, name_type = get_string(term_approx, str_name, str_role, list_uniqueID, str_canton) + if bln_print: + print('*******************', str_name, term_approx) return str_name, str_role, list_uniqueID, str_canton @@ -829,4 +852,4 @@ def get_list_cantons(df_names): list_citizenship = [city[:-5] for item in list_citizenship for city in item.split(',')] list_firstname = list(df_temp['FirstName']) - return list_cantonname + list_cantonabbr + list_citizenship + list_firstname + return list_cantonname, list_cantonabbr, list_citizenship, list_firstname