diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index c989533374e94ef57c3dc2290d8f178168b8e97e..21fe6f3150fdc821f41157a74b227bee3d9abe75 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -362,7 +362,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ this_is_speech = True if bln_print: print('found a name:', text_start, list_oi, str_name, str_role, '\n') - print('found a name:', text_start, list_oi, str_name, str_role, list_uniqueID, '\n') + print('found a name:', text_start, list_oi, ind_tl_colon, str_name, str_role, list_uniqueID, '\n') return XML_new, this_is_speech @@ -624,7 +624,10 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False): # if Citizenship, do proper comparison if canton_type == 'Citizenship': df_temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name)] - list_citizenship = [term for term in df_temp[canton_type] if str_canton in tokenizer_canton.tokenize(term)] + list_citizenship = get_cities(list(df_temp[canton_type])) + print(list_citizenship) + list_citizenship = [entry for entry in df_temp[canton_type] if str_canton in get_cities([entry])] + print(list_citizenship) str_citizenship = '' try: if len(list_citizenship) == 1: @@ -733,6 +736,8 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l): return XML_new +def get_cities(list_citizenship): + return [city[:-5] for item in list_citizenship for city in item.split(',')] # function to get list of places def get_list_cantons(df_names, str_name = ''): @@ -751,7 +756,7 @@ def get_list_cantons(df_names, str_name = ''): list_cantonname.extend(['Berne']) list_cantonabbr = list(df_temp['CantonAbbreviation']) list_citizenship = list(df_temp['Citizenship']) - list_citizenship = [city[:-5] for item in list_citizenship for city in item.split(',')] + list_citizenship = get_cities(list_citizenship) list_firstname = list(df_temp['FirstName']) return list_cantonname, list_cantonabbr, list_citizenship, list_firstname