From 8473096a602923d39a826cf28bce071e81023f14 Mon Sep 17 00:00:00 2001 From: Lilian Gasser <gasserli@ethz.ch> Date: Tue, 22 Jan 2019 17:02:27 +0100 Subject: [PATCH] clean utils_annot --- src/python/utils_annot.py | 96 ++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 57 deletions(-) diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index 3b1ab135..b19c0640 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -224,6 +224,14 @@ def get_complete_text(textbox): # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, bln_print=False): + # lists of roles + list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente', + 'Berichterstatter', 'Berichterstatterin', 'rapporteur', + 'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole', + 'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral', + 'Vizepräsident'] + list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission'] + # initialize flag this_is_speech = False @@ -234,7 +242,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ # at the beginning of a textbox and identifiying a name or a role in front # of that colon if ind_tl_colon >= 0: -# if ':' in text[:100]: # extract the index of the colon in the text colon_index_text = text.index(':') @@ -259,12 +266,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ # remove single characters # TODO: might need to be changed for fractions (some fractions are abbreviated as single letters) + # TODO: maybe exclude I and A to account for Appenzell list_oi = [term for term in list_oi if len(term)>1] -# # for every term -# for term in list_oi: - # if possible, find a name in a list - str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, df_names, list_notnames, bln_print=False) + # if possible, find a name from the list + str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False) if bln_print: print('name', str_name, 'role', str_role) @@ -300,14 +306,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ thattext = XML_new[ind_p][ind_t][0].text colon_index = thattext.index(':') -# print(thattext) - try: # write speaker to first line XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend # get start of speech with correct font start -# print(thattext[colon_index+1:]) if thattext[colon_index+1:].startswith('[font'): startspeech = thattext[colon_index+1:] elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]): @@ -317,8 +320,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ else: startspeech = thattext[colon_index+1:] -# print(startspeech) - # write beginning of speech to second line # (create new ET element if necessary) if len(list(XML_new[ind_p][ind_t])) > 1: @@ -343,7 +344,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ colon_index = thattext.index(':') # get start of speech with correct font start -# print(thattext[colon_index+1:]) if thattext[colon_index+1:].startswith('[font'): startspeech = thattext[colon_index+1:] elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]): @@ -424,9 +424,9 @@ def flatten(l): # - list_uniqueID: list with one or several uniqueIDs # - list_tupels: list of tupels containing all types of names # TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer) -def find_names(list_oi, df_names, list_notnames, bln_print=False): +def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False): - def get_string(term, str_name, str_role, list_uniqueID, str_canton): + def get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton): name_type = '' # if it is one of the simple names if term in list(df_names['name_short'].loc[df_names['type']=='simple']): @@ -457,8 +457,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False): str_name = add_to_string(str_name, correct_name) name_type = 'comp' # if it contains a canton - # !!! also pass list_oi to look for canton - # !!! how to handle for people mentioned in text??? + # TODO: how to handle for people mentioned in text??? elif term in list(df_names['name_short'].loc[df_names['type']=='canton']): if bln_print: print('contains a canton', term) @@ -480,46 +479,35 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False): print(list_temp, list_uniqueID) print(type(list_temp), type(list_uniqueID)) print(isinstance(list_uniqueID, list)) - # if no unique ID has been assigned so far - if len(list_uniqueID) == 0: - list_uniqueID = list_temp - # if there are already one or several people and have a new person, we update - elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0: - list_uniqueID.append(list_temp) - - ## if we already have several possible people, e.g. because of canton - #elif isinstance(int_uniqueID, tuple): - #print('I should be here') - ## and refound the uniqueID of one of those, don't update - #if temp in int_uniqueID: - #pass - ## and update if we don't have that uniqueID yet - #else: - #int_uniqueID = (int_uniqueID, temp) - ## if a person with that uniqueID exists already, don't update - #elif isinstance(int(int_uniqueID), int) and temp == int_uniqueID: - #print('but end up here.. not even.....') - #pass - ## if a different unique ID has been assigned already - #else: - #int_uniqueID = (int_uniqueID, temp) + + list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp) return str_name, str_role, list_uniqueID, name_type + def update_list_uniqueID(list_uniqueID, list_temp): + # if no unique ID has been assigned so far + if len(list_uniqueID) == 0: + list_uniqueID = list_temp + # if there are already one or several people and have a new person, we update + elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0: + list_uniqueID.append(list_temp) + + return list_uniqueID + # function to find correct term (in case of misspellings, etc.) - def get_approximate_term(term, array_all_names): + def get_approximate_term(term, array_all): # TODO: probably need to improve this procedure # - find better values .... # initialize string term_approx = '' - # get normalize array - array_normalized = array_all_names[normalized_damerau_levenshtein_distance_ndarray(term, array_all_names) <= 0.35] + # get normalized array + array_normalized = array_all[normalized_damerau_levenshtein_distance_ndarray(term, array_all) <= 0.35] array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized) # get absolute array - array_absolute = array_all_names[damerau_levenshtein_distance_ndarray(term, array_all_names) <= 2] + array_absolute = array_all[damerau_levenshtein_distance_ndarray(term, array_all) <= 2] array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute) if bln_print: print(term) @@ -560,14 +548,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False): str_canton = '' name_type = '' - # lists of roles - list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente', - 'Berichterstatter', 'Berichterstatterin', 'rapporteur', - 'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole', - 'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral', - 'Vizepräsident'] - list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission'] - # extract list and array of last names list_all_names = list(df_names['name_short']) array_all_names = np.array(df_names['name_short']) @@ -580,18 +560,19 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False): if term in list_roles: # get correct name and uniqueID, or role, for that term - str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton) + str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton) if bln_print: print('found a role', term) # TODO: also look for similar terms (misspellings) + # TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter elif term in list_roles_ext: pass # TODO: extract whether it is minority or majority and save that information - # can not happen for the first term + # cannot happen for the first term elif name_type == 'canton': list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0]) canton_type = '' @@ -614,7 +595,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False): else: print('might be a canton', term, list_oi, str_name, str_role) - # TODO: maybe: go to next elif? # if a canton or similar was found if canton_type: @@ -630,7 +610,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False): try: if len(list_cities) == 1: str_citizenship = list_cities[0] - # except: + except: print('found no or more than one person with citizenship', str_canton, str_name) pass @@ -639,7 +619,9 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False): else: list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) print(list_temp, list_uniqueID) - list_uniqueID = list_temp + + if len(list_temp) > 0: + list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp) # if term is not easily mistaken as a name (avoid false positives) elif term not in list_notnames: @@ -647,7 +629,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False): # if term is in the list of all names if term in list_all_names: # get correct name and uniqueID, or role, for that term - str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton) + str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton) if bln_print: print('=== correct name', term) @@ -659,7 +641,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False): # if one was found, get correct name, etc. if term_approx: - str_name, str_role, list_uniqueID, name_type = get_string(term_approx, str_name, str_role, list_uniqueID, str_canton) + str_name, str_role, list_uniqueID, name_type = get_string(term_approx, df_names, str_name, str_role, list_uniqueID, str_canton) if bln_print: print('=== approximate name', str_name, term_approx) -- GitLab