Skip to content
Snippets Groups Projects
Commit 8473096a authored by Lili Gasser's avatar Lili Gasser
Browse files

clean utils_annot

parent 135db555
No related branches found
No related tags found
No related merge requests found
......@@ -224,6 +224,14 @@ def get_complete_text(textbox):
# - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, bln_print=False):
# lists of roles
list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
'Berichterstatter', 'Berichterstatterin', 'rapporteur',
'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
'Vizepräsident']
list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
# initialize flag
this_is_speech = False
......@@ -234,7 +242,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
# at the beginning of a textbox and identifying a name or a role in front
# of that colon
if ind_tl_colon >= 0:
# if ':' in text[:100]:
# extract the index of the colon in the text
colon_index_text = text.index(':')
......@@ -259,12 +266,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
# remove single characters
# TODO: might need to be changed for fractions (some fractions are abbreviated as single letters)
# TODO: maybe exclude I and A to account for Appenzell
list_oi = [term for term in list_oi if len(term)>1]
# # for every term
# for term in list_oi:
# if possible, find a name in a list
str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, df_names, list_notnames, bln_print=False)
# if possible, find a name from the list
str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False)
if bln_print:
print('name', str_name, 'role', str_role)
......@@ -300,14 +306,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
thattext = XML_new[ind_p][ind_t][0].text
colon_index = thattext.index(':')
# print(thattext)
try:
# write speaker to first line
XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend
# get start of speech with correct font start
# print(thattext[colon_index+1:])
if thattext[colon_index+1:].startswith('[font'):
startspeech = thattext[colon_index+1:]
elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
......@@ -317,8 +320,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
else:
startspeech = thattext[colon_index+1:]
# print(startspeech)
# write beginning of speech to second line
# (create new ET element if necessary)
if len(list(XML_new[ind_p][ind_t])) > 1:
......@@ -343,7 +344,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
colon_index = thattext.index(':')
# get start of speech with correct font start
# print(thattext[colon_index+1:])
if thattext[colon_index+1:].startswith('[font'):
startspeech = thattext[colon_index+1:]
elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
......@@ -424,9 +424,9 @@ def flatten(l):
# - list_uniqueID: list with one or several uniqueIDs
# - list_tupels: list of tuples containing all types of names
# TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
def find_names(list_oi, df_names, list_notnames, bln_print=False):
def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False):
def get_string(term, str_name, str_role, list_uniqueID, str_canton):
def get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton):
name_type = ''
# if it is one of the simple names
if term in list(df_names['name_short'].loc[df_names['type']=='simple']):
......@@ -457,8 +457,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
str_name = add_to_string(str_name, correct_name)
name_type = 'comp'
# if it contains a canton
# !!! also pass list_oi to look for canton
# !!! how to handle for people mentioned in text???
# TODO: how to handle for people mentioned in text???
elif term in list(df_names['name_short'].loc[df_names['type']=='canton']):
if bln_print:
print('contains a canton', term)
......@@ -480,46 +479,35 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
print(list_temp, list_uniqueID)
print(type(list_temp), type(list_uniqueID))
print(isinstance(list_uniqueID, list))
# if no unique ID has been assigned so far
if len(list_uniqueID) == 0:
list_uniqueID = list_temp
# if there are already one or several people and have a new person, we update
elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
list_uniqueID.append(list_temp)
## if we already have several possible people, e.g. because of canton
#elif isinstance(int_uniqueID, tuple):
#print('I should be here')
## and refound the uniqueID of one of those, don't update
#if temp in int_uniqueID:
#pass
## and update if we don't have that uniqueID yet
#else:
#int_uniqueID = (int_uniqueID, temp)
## if a person with that uniqueID exists already, don't update
#elif isinstance(int(int_uniqueID), int) and temp == int_uniqueID:
#print('but end up here.. not even.....')
#pass
## if a different unique ID has been assigned already
#else:
#int_uniqueID = (int_uniqueID, temp)
list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
return str_name, str_role, list_uniqueID, name_type
def update_list_uniqueID(list_uniqueID, list_temp):
    # Merge the newly found IDs (list_temp) into the IDs collected so far.
    # input:
    # - list_uniqueID: IDs collected so far (may contain nested lists)
    # - list_temp: IDs found for the current term
    # output:
    # - list_uniqueID: the updated collection

    # nothing has been assigned so far: the new IDs become the result
    # (the very same list object is handed back, not a copy)
    if len(list_uniqueID) == 0:
        return list_temp

    # compare the new IDs against everything already collected; flatten()
    # is needed because earlier updates append whole lists as single elements
    ids_new = set(list_temp)
    ids_known = set(flatten(list_uniqueID))

    # only a genuinely new person (no ID in common) extends the collection;
    # list_temp is appended as one nested element, in place
    if ids_new.isdisjoint(ids_known):
        list_uniqueID.append(list_temp)

    return list_uniqueID
# function to find correct term (in case of misspellings, etc.)
def get_approximate_term(term, array_all_names):
def get_approximate_term(term, array_all):
# TODO: probably need to improve this procedure
# - find better values ....
# initialize string
term_approx = ''
# get normalize array
array_normalized = array_all_names[normalized_damerau_levenshtein_distance_ndarray(term, array_all_names) <= 0.35]
# get normalized array
array_normalized = array_all[normalized_damerau_levenshtein_distance_ndarray(term, array_all) <= 0.35]
array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized)
# get absolute array
array_absolute = array_all_names[damerau_levenshtein_distance_ndarray(term, array_all_names) <= 2]
array_absolute = array_all[damerau_levenshtein_distance_ndarray(term, array_all) <= 2]
array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute)
if bln_print:
print(term)
......@@ -560,14 +548,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
str_canton = ''
name_type = ''
# lists of roles
list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
'Berichterstatter', 'Berichterstatterin', 'rapporteur',
'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
'Vizepräsident']
list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
# extract list and array of last names
list_all_names = list(df_names['name_short'])
array_all_names = np.array(df_names['name_short'])
......@@ -580,18 +560,19 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
if term in list_roles:
# get correct name and uniqueID, or role, for that term
str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
if bln_print:
print('found a role', term)
# TODO: also look for similar terms (misspellings)
# TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter
elif term in list_roles_ext:
pass
# TODO: extract whether it is minority or majority and save that information
# can not happen for the first term
# cannot happen for the first term
elif name_type == 'canton':
list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
canton_type = ''
......@@ -614,7 +595,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
else:
print('might be a canton', term, list_oi, str_name, str_role)
# TODO: maybe: go to next elif?
# if a canton or similar was found
if canton_type:
......@@ -630,7 +610,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
try:
if len(list_cities) == 1:
str_citizenship = list_cities[0]
# except:
except:
print('found no or more than one person with citizenship', str_canton, str_name)
pass
......@@ -639,7 +619,9 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
else:
list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
print(list_temp, list_uniqueID)
list_uniqueID = list_temp
if len(list_temp) > 0:
list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
# if term is not easily mistaken as a name (avoid false positives)
elif term not in list_notnames:
......@@ -647,7 +629,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
# if term is in the list of all names
if term in list_all_names:
# get correct name and uniqueID, or role, for that term
str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
if bln_print:
print('=== correct name', term)
......@@ -659,7 +641,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
# if one was found, get correct name, etc.
if term_approx:
str_name, str_role, list_uniqueID, name_type = get_string(term_approx, str_name, str_role, list_uniqueID, str_canton)
str_name, str_role, list_uniqueID, name_type = get_string(term_approx, df_names, str_name, str_role, list_uniqueID, str_canton)
if bln_print:
print('=== approximate name', str_name, term_approx)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment