diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 02fdbed765592503a953faa48cafcfe773539096..618f7b2cd2320889e25d4eb1336e1adafc7808f8 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -27,7 +27,7 @@ from utils_proc import call_with_out # needed for running in atom, can be ignored year = '1971' -input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle" +input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle" input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz" input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz" input_notnames = "data/lists/not_names.txt" @@ -113,7 +113,7 @@ utils_proc.compress_tar(output_annotatedxml) #%% # to test for one file -file_tarpath = './1971/20000323_datacorr.xml' +file_tarpath = './1971/20000010_datacorr.xml' id_doc = file_tarpath.split('/')[-1][:8] @@ -132,7 +132,6 @@ if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20 #%% - #id_doc #len(files_to_process) diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index b19c06404c47e65559ddeeb273fa0bbd271a38fd..56f14ce92d7e7fff0f55f814c96febf60b7228b0 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -419,78 +419,100 @@ def flatten(l): # function to find names # input: # - term: term that might be name +# - df_names: yearly dataframe with all MPs # - str_name: string to which name should be attached -# - str_role: string to which role should be attached # - list_uniqueID: list with one or several uniqueIDs # - list_tupels: list of tupels containing all types of names # TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer) def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False): - def get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton): - name_type = '' - # if it is one of the simple names - if term in list(df_names['name_short'].loc[df_names['type']=='simple']): - str_name = add_to_string(str_name, term) - name_type = 'simple' - # if it is a role - elif term in list_roles: - str_role = add_to_string(str_role, term) - # if it is a double name - elif term in list(df_names['name_short'].loc[df_names['type']=='double']): - if bln_print: - print(5*'\n', 'DOUBLE NAME') - # get correct name - correct_name = df_names.loc[(df_names['type']=='double') & (df_names['name_short']== term)].iat[0, df_names.columns.get_loc('name_correct')] - if bln_print: - print('double name', correct_name) - # only add name if it is not there yet - # if a person is referenced by its complete double name, e.g. Meier-Müller, he or she gets two entries - if correct_name not in str_name.split(' '): - str_name = add_to_string(str_name, correct_name) - name_type = 'double' - # if it is a composite name - elif term in list(df_names['name_short'].loc[df_names['type']=='comp']): - # get correct name - correct_name = df_names.loc[(df_names['type']=='comp') & (df_names['name_short']== term)].iat[0, df_names.columns.get_loc('name_correct')] - if bln_print: - print('composite name', correct_name) - str_name = add_to_string(str_name, correct_name) - name_type = 'comp' - # if it contains a canton - # TODO: how to handle for people mentioned in text??? - elif term in list(df_names['name_short'].loc[df_names['type']=='canton']): - if bln_print: - print('contains a canton', term) - - str_name = add_to_string(str_name, term + ' (CANTON MISSING)') - name_type = 'canton' - + def get_string(term, df_names, str_name, list_uniqueID): + # get name type + name_type = df_names['nameType'].loc[df_names['shortName']==term].iloc[0] + print(df_names[df_names['shortName']==term]) + print(term) + print(name_type) # extract uniqueID list_temp = [] + # TODO might lead to doubled double names if name_type in ['simple', 'double', 'comp']: - list_temp = [df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]] + list_temp = [df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]] + str_completeName = df_names['completeName'].loc[df_names['shortName']==term].iloc[0] + str_name = add_to_string(str_name, str_completeName) + + # TODO: how to handle for people mentioned in text??? elif name_type in ['canton']: - list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) + list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) + str_name = add_to_string(str_name, term + ' (CANTON MISSING)') + + print(list_temp) + print(str_name) + + ## if it is one of the simple names + #if term in list(df_names['shortName'].loc[df_names['nameType']=='simple']): + #str_name = add_to_string(str_name, term) + #name_type = 'simple' + ## if it is a double name + #elif term in list(df_names['shortName'].loc[df_names['nameType']=='double']): + #if bln_print: + #print(5*'\n', 'DOUBLE NAME') + ## get correct name + #correct_name = df_names.loc[(df_names['nameType']=='double') & (df_names['shortName']== term)].iat[0, df_names.columns.get_loc('name_correct')] + #if bln_print: + #print('double name', correct_name) + ## only add name if it is not there yet + ## if a person is referenced by its complete double name, e.g. Meier-Müller, he or she gets two entries + #if correct_name not in str_name.split(' '): + #str_name = add_to_string(str_name, correct_name) + #name_type = 'double' + ## if it is a composite name + #elif term in list(df_names['shortName'].loc[df_names['nameType']=='comp']): + ## get correct name + #correct_name = df_names.loc[(df_names['nameType']=='comp') & (df_names['shortName']== term)].iat[0, df_names.columns.get_loc('name_correct')] + #if bln_print: + #print('composite name', correct_name) + #str_name = add_to_string(str_name, correct_name) + #name_type = 'comp' + ## if it contains a canton + ## TODO: how to handle for people mentioned in text??? + #elif term in list(df_names['shortName'].loc[df_names['nameType']=='canton']): + #if bln_print: + #print('contains a canton', term) +# + #str_name = add_to_string(str_name, term + ' (CANTON MISSING)') + #name_type = 'canton' +# +# + ## extract uniqueID + #list_temp = [] + #if name_type in ['simple', 'double', 'comp']: + #list_temp = [df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]] + #elif name_type in ['canton']: + #list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) if len(list_temp) > 0: if bln_print: print(list_temp, list_uniqueID) print(type(list_temp), type(list_uniqueID)) - print(isinstance(list_uniqueID, list)) - list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp) + list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type) + else: + print('is this even possible??') - return str_name, str_role, list_uniqueID, name_type + return str_name, list_uniqueID, name_type - def update_list_uniqueID(list_uniqueID, list_temp): + def update_list_uniqueID(list_uniqueID, list_temp, name_type): # if no unique ID has been assigned so far if len(list_uniqueID) == 0: list_uniqueID = list_temp # if there are already one or several people and have a new person, we update elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0: list_uniqueID.append(list_temp) + # if name_type is canton + if name_type == 'canton' and len(list_temp) == 1 and list_temp[0] in list_uniqueID: + list_uniqueID = list_temp return list_uniqueID @@ -549,8 +571,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln name_type = '' # extract list and array of last names - list_all_names = list(df_names['name_short']) - array_all_names = np.array(df_names['name_short']) + list_all_names = list(df_names['shortName']) + array_all_names = np.array(df_names['shortName']) # for every term for term in list_oi: @@ -559,20 +581,19 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln print('now is about: ------', term) if term in list_roles: - # get correct name and uniqueID, or role, for that term - str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton) + # update str_role + # TODO: also look for similar terms (misspellings) + # TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter + str_role = add_to_string(str_role, term) if bln_print: print('found a role', term) - # TODO: also look for similar terms (misspellings) - # TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter - elif term in list_roles_ext: pass # TODO: extract whether it is minority or majority and save that information - # cannot happen for the first term + # cannot happen for the first term in list_oi elif name_type == 'canton': list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0]) canton_type = '' @@ -603,9 +624,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln # extract uniqueID # if Citizenship, do proper comparison if canton_type == 'Citizenship': - df_temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name)] + df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name)] list_cities = [entry for entry in df_temp[canton_type] if str_canton in get_cities([entry])] - print(list_cities) str_citizenship = '' try: if len(list_cities) == 1: @@ -614,14 +634,18 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln print('found no or more than one person with citizenship', str_canton, str_name) pass - list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) + list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) + str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[0] + str_name = add_to_string(str_name, str_completeName) else: - list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) + list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) + str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[0] + str_name = add_to_string(str_name, str_completeName) print(list_temp, list_uniqueID) if len(list_temp) > 0: - list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp) + list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type) # if term is not easily mistaken as a name (avoid false positives) elif term not in list_notnames: @@ -629,7 +653,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln # if term is in the list of all names if term in list_all_names: # get correct name and uniqueID, or role, for that term - str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton) + str_name, list_uniqueID, name_type = get_string(term, df_names, str_name, list_uniqueID) if bln_print: print('=== correct name', term) @@ -641,7 +665,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln # if one was found, get correct name, etc. if term_approx: - str_name, str_role, list_uniqueID, name_type = get_string(term_approx, df_names, str_name, str_role, list_uniqueID, str_canton) + str_name, list_uniqueID, name_type = get_string(term_approx, df_names, str_name, list_uniqueID) if bln_print: print('=== approximate name', str_name, term_approx) @@ -722,10 +746,10 @@ def get_cities(list_citizenship): # function to get list of places def get_list_cantons(df_names, str_name = ''): if str_name: - df_temp = df_names.loc[(df_names['type']=='canton') & (df_names['name_short']==str_name)] + df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name)] else: - df_temp = df_names.loc[df_names['type']=='canton'] - print(df_temp) + df_temp = df_names.loc[df_names['nameType']=='canton'] + #print(df_temp) list_cantonname = list(df_temp['CantonName']) for canton in ['Basel-Stadt', 'Basel-Landschaft']: if canton in list_cantonname: @@ -744,7 +768,6 @@ def get_list_cantons(df_names, str_name = ''): # tokenizer -# last part \S+ is needed to get colon, \S stands for white space tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+') @@ -815,25 +838,6 @@ def get_last_item(list_tupels): - -# function to get data frame from lists of names -# input: -# - lists_names: lists of names (simple, double, comp, canton) -# output: -# - df: corresponding dataframe -def get_df_from_lists_names(lists_names): - list_types = ['simple', 'double', 'comp', 'canton'] - df = pd.DataFrame() - for i in range(4): - df_temp = pd.DataFrame(lists_names[i], - columns = ('name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName')) - df_temp['type'] = list_types[i] - df = pd.concat([df, df_temp], ignore_index = True) - return df - - - - def tokenize_dictionary(dictionary, tokenizer, only_text=False): dictionary_tokenized = {} # if there is only text, e.g. when we look at all texts of a document at once (level 2 in flattened dictionary)