Commit 8473096a authored by Lili Gasser

clean utils_annot

parent 135db555
@@ -224,6 +224,14 @@ def get_complete_text(textbox):
# - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, bln_print=False):
+ # lists of roles
+ list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
+     'Berichterstatter', 'Berichterstatterin', 'rapporteur',
+     'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
+     'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
+     'Vizepräsident']
+ list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
# initialize flag
this_is_speech = False
@@ -234,7 +242,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
# at the beginning of a textbox and identifiying a name or a role in front
# of that colon
if ind_tl_colon >= 0:
- # if ':' in text[:100]:
# extract the index of the colon in the text
colon_index_text = text.index(':')
@@ -259,12 +266,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
# remove single characters
# TODO: might need to be changed for fractions (some fractions are abbreviated as single letters)
+ # TODO: maybe exclude I and A to account for Appenzell
list_oi = [term for term in list_oi if len(term)>1]
- # # for every term
- # for term in list_oi:
- # if possible, find a name in a list
- str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, df_names, list_notnames, bln_print=False)
+ # if possible, find a name from the list
+ str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False)
if bln_print:
print('name', str_name, 'role', str_role)
@@ -300,14 +306,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
thattext = XML_new[ind_p][ind_t][0].text
colon_index = thattext.index(':')
- # print(thattext)
try:
# write speaker to first line
XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend
# get start of speech with correct font start
- # print(thattext[colon_index+1:])
if thattext[colon_index+1:].startswith('[font'):
startspeech = thattext[colon_index+1:]
elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
@@ -317,8 +320,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
else:
startspeech = thattext[colon_index+1:]
- # print(startspeech)
# write beginning of speech to second line
# (create new ET element if necessary)
if len(list(XML_new[ind_p][ind_t])) > 1:
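A note on this splitting step: the snippet below is a minimal, self-contained sketch of how the text after the colon is routed, using placeholder strings. The real code operates on ElementTree textline elements and appends a fontend marker to the speaker line; the body of the [/font]-only branch is not shown in this hunk, so the empty-string fallback and the speaker name "Muster" below are assumptions for illustration.

    import re

    # hypothetical example line; 'Muster' is a made-up speaker name
    thattext = 'Bundesrat Muster:[font face="italic"] Die Kommission beantragt Eintreten.'
    colon_index = thattext.index(':')
    speaker = thattext[:colon_index+1]        # 'Bundesrat Muster:'
    rest = thattext[colon_index+1:]

    if rest.startswith('[font'):              # speech starts with an opening font tag
        startspeech = rest
    elif re.match(r'^[ ]?\[/font\]$', rest):  # only a stray closing tag remains
        startspeech = ''                      # assumption: nothing to carry over
    else:                                     # plain text directly after the colon
        startspeech = rest

    print(speaker)
    print(startspeech)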
@@ -343,7 +344,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
colon_index = thattext.index(':')
# get start of speech with correct font start
- # print(thattext[colon_index+1:])
if thattext[colon_index+1:].startswith('[font'):
startspeech = thattext[colon_index+1:]
elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
@@ -424,9 +424,9 @@ def flatten(l):
# - list_uniqueID: list with one or several uniqueIDs
# - list_tupels: list of tupels containing all types of names
# TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
- def find_names(list_oi, df_names, list_notnames, bln_print=False):
+ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False):
- def get_string(term, str_name, str_role, list_uniqueID, str_canton):
+ def get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton):
name_type = ''
# if it is one of the simple names
if term in list(df_names['name_short'].loc[df_names['type']=='simple']):
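For orientation, here is a hedged, standalone sketch of the "simple name" lookup that get_string performs against df_names, which it now receives explicitly. The two-row frame and the ID 4711 are made up for illustration (20026532 is the ID mentioned in the TODO above); the real function also handles 'comp', 'canton' and double-name cases.

    import pandas as pd

    # minimal stand-in for df_names with the columns used in get_string
    df_names = pd.DataFrame({
        'name_short':  ['Cramer', 'Keller'],
        'type':        ['simple', 'canton'],
        'uniqueIndex': [20026532, 4711],      # 4711 is a made-up ID
    })

    term = 'Cramer'
    if term in list(df_names['name_short'].loc[df_names['type'] == 'simple']):
        # collect all uniqueIndex values recorded for that simple name
        list_temp = list(df_names.loc[(df_names['type'] == 'simple') &
                                      (df_names['name_short'] == term),
                                      'uniqueIndex'])
        print(term, list_temp)                # -> Cramer [20026532]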
@@ -457,8 +457,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
str_name = add_to_string(str_name, correct_name)
name_type = 'comp'
# if it contains a canton
- # !!! also pass list_oi to look for canton
- # !!! how to handle for people mentioned in text???
+ # TODO: how to handle for people mentioned in text???
elif term in list(df_names['name_short'].loc[df_names['type']=='canton']):
if bln_print:
print('contains a canton', term)
@@ -480,46 +479,35 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
print(list_temp, list_uniqueID)
print(type(list_temp), type(list_uniqueID))
print(isinstance(list_uniqueID, list))
- # if no unique ID has been assigned so far
- if len(list_uniqueID) == 0:
- list_uniqueID = list_temp
- # if there are already one or several people and have a new person, we update
- elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
- list_uniqueID.append(list_temp)
- ## if we already have several possible people, e.g. because of canton
- #elif isinstance(int_uniqueID, tuple):
- #print('I should be here')
- ## and refound the uniqueID of one of those, don't update
- #if temp in int_uniqueID:
- #pass
- ## and update if we don't have that uniqueID yet
- #else:
- #int_uniqueID = (int_uniqueID, temp)
- ## if a person with that uniqueID exists already, don't update
- #elif isinstance(int(int_uniqueID), int) and temp == int_uniqueID:
- #print('but end up here.. not even.....')
- #pass
- ## if a different unique ID has been assigned already
- #else:
- #int_uniqueID = (int_uniqueID, temp)
+ list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
return str_name, str_role, list_uniqueID, name_type
+ def update_list_uniqueID(list_uniqueID, list_temp):
+ # if no unique ID has been assigned so far
+ if len(list_uniqueID) == 0:
+ list_uniqueID = list_temp
+ # if there are already one or several people and have a new person, we update
+ elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
+ list_uniqueID.append(list_temp)
+ return list_uniqueID
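As a quick sanity check on this refactoring, here is a hedged, standalone sketch of how update_list_uniqueID accumulates candidates. The flatten() below is a stand-in that tolerates both plain IDs and nested lists (the repo's own flatten(l), defined around line 424, may differ), and the ID 4711 is invented.

    def flatten(l):
        # stand-in for the repo's flatten(): accept plain IDs and nested lists alike
        out = []
        for item in l:
            if isinstance(item, list):
                out.extend(item)
            else:
                out.append(item)
        return out

    def update_list_uniqueID(list_uniqueID, list_temp):
        # first match: take the candidate list as-is
        if len(list_uniqueID) == 0:
            list_uniqueID = list_temp
        # a genuinely new person (no overlap with IDs found so far): append
        elif len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
            list_uniqueID.append(list_temp)
        return list_uniqueID

    ids = []
    ids = update_list_uniqueID(ids, [20026532])   # -> [20026532]
    ids = update_list_uniqueID(ids, [20026532])   # same person refound -> unchanged
    ids = update_list_uniqueID(ids, [4711])       # made-up second person -> [20026532, [4711]]
    print(ids)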
# function to find correct term (in case of misspellings, etc.)
- def get_approximate_term(term, array_all_names):
+ def get_approximate_term(term, array_all):
# TODO: probably need to improve this procedure
# - find better values ....
# initialize string
term_approx = ''
- # get normalize array
- array_normalized = array_all_names[normalized_damerau_levenshtein_distance_ndarray(term, array_all_names) <= 0.35]
+ # get normalized array
+ array_normalized = array_all[normalized_damerau_levenshtein_distance_ndarray(term, array_all) <= 0.35]
array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized)
# get absolute array
- array_absolute = array_all_names[damerau_levenshtein_distance_ndarray(term, array_all_names) <= 2]
+ array_absolute = array_all[damerau_levenshtein_distance_ndarray(term, array_all) <= 2]
array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute)
if bln_print:
print(term)
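The two thresholds used above (normalized distance <= 0.35, absolute distance <= 2) are easiest to see on a toy example. A hedged sketch, assuming a pre-1.7 pyxDamerauLevenshtein release (which still provides the *_ndarray helpers used in this file) and a made-up candidate array:

    import numpy as np
    from pyxdameraulevenshtein import (
        damerau_levenshtein_distance_ndarray,
        normalized_damerau_levenshtein_distance_ndarray,
    )

    # made-up array of last names standing in for array_all
    array_all = np.array(['Meyer', 'Maier', 'Keller', 'Cramer'])
    term = 'Meier'   # hypothetical OCR misspelling

    # candidates within a normalized edit distance of 0.35 ...
    array_normalized = array_all[normalized_damerau_levenshtein_distance_ndarray(term, array_all) <= 0.35]
    # ... and candidates within two absolute edits
    array_absolute = array_all[damerau_levenshtein_distance_ndarray(term, array_all) <= 2]

    print(array_normalized)   # e.g. ['Meyer' 'Maier']
    print(array_absolute)     # e.g. ['Meyer' 'Maier']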
@@ -560,14 +548,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
str_canton = ''
name_type = ''
- # lists of roles
- list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
-     'Berichterstatter', 'Berichterstatterin', 'rapporteur',
-     'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
-     'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
-     'Vizepräsident']
- list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
# extract list and array of last names
list_all_names = list(df_names['name_short'])
array_all_names = np.array(df_names['name_short'])
@@ -580,18 +560,19 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
if term in list_roles:
# get correct name and uniqueID, or role, for that term
- str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
+ str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
if bln_print:
print('found a role', term)
# TODO: also look for similar terms (misspellings)
+ # TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter
elif term in list_roles_ext:
pass
# TODO: extract whether it is minority or majority and save that information
- # can not happen for the first term
+ # cannot happen for the first term
elif name_type == 'canton':
list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
canton_type = ''
@@ -614,7 +595,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
else:
print('might be a canton', term, list_oi, str_name, str_role)
- # TODO: maybe: go to next elif?
# if a canton or similar was found
if canton_type:
@@ -630,7 +610,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
try:
if len(list_cities) == 1:
str_citizenship = list_cities[0]
- # except:
+ except:
print('found no or more than one person with citizenship', str_canton, str_name)
pass
@@ -639,7 +619,9 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
else:
list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
print(list_temp, list_uniqueID)
- list_uniqueID = list_temp
+ if len(list_temp) > 0:
+ list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
# if term is not easily mistaken as a name (avoid false positives)
elif term not in list_notnames:
@@ -647,7 +629,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
# if term is in the list of all names
if term in list_all_names:
# get correct name and uniqueID, or role, for that term
- str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
+ str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
if bln_print:
print('=== correct name', term)
@@ -659,7 +641,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
# if one was found, get correct name, etc.
if term_approx:
- str_name, str_role, list_uniqueID, name_type = get_string(term_approx, str_name, str_role, list_uniqueID, str_canton)
+ str_name, str_role, list_uniqueID, name_type = get_string(term_approx, df_names, str_name, str_role, list_uniqueID, str_canton)
if bln_print:
print('=== approximate name', str_name, term_approx)
...