diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt index 3c29b91aedde34092096ad1d56bd4e17d945d526..fb0fb51b4accb9b421ae4ed7dcb4fe31e1d64688 100644 --- a/data/lists/wrongly_identified_speakers.txt +++ b/data/lists/wrongly_identified_speakers.txt @@ -7,7 +7,7 @@ also check for council: one MP not active in whole year, leads to other not uniquely identified ----------------------------------------------------------------------- -1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!) +1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!) --> solved! 1925/20029967: Seiler (in December, the second Seiler already left) --> finds two!) --> solved! 1925/20029967: Huber (in December, the second Huber already left) --> finds two!) --> solved because only NR! 1925/20029882: Naine (in June, there is only one Naine, in December, another one joins --> finds two!) also in ...96, ...97, etc. --> solved! diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 5bd42697ad85a3950f9935d8e0123d429bc4b45c..a9df20a6ed84accfe02f2f56a084523936fb0b55 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -141,7 +141,6 @@ file_doc.get_council_date() #len(files_to_process) file_doc.check_discussion() -str_date = '22.09.1925' - +str_date = '1925-12-09 08:00' import datetime -datetime.datetime.strptime(str_date, '%d.%m.%Y') +datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M') diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index ae4f0875f1dd0f87b5d2ad35a70d101560e62f80..ec0564efd3123b2ddda2cff43194b03967727672 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -56,11 +56,11 @@ def get_council_and_date(path_meta_xml_file): # parse, get root and then part of interest XML_tree = ET.parse(path_meta_xml_file) XML_root = XML_tree.getroot() - XML_poi = XML_root[0].find('META_FROM_DB') + XML_poi = XML_root[0] # get council and date - str_council = XML_poi.attrib['RAT'] - str_date = XML_poi.attrib['DATUM'] + str_council = XML_poi.find('META_FROM_DB').attrib['RAT'] + str_date = XML_poi.attrib['PUBLIKATIONS_DATUM'] return (str_council, str_date) @@ -570,7 +570,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # cannot happen for the first term in list_oi elif name_type == 'canton': - list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo = get_list_cantons(df_names, str_name.split(' ')[0]) + list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo = get_list_cantons(df_names, str_name.split(' ')[0], str_council) canton_type = '' if term in list_cantonname: str_canton = term @@ -608,10 +608,11 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # get rid of CANTON MISSING str_name = str_name.split(' ')[0] + df_temp = get_df_temp_canton(df_names, str_name, str_council) # extract uniqueID - # if Citizenship, do proper comparison + # if Citizenship, get list of cities and compare each to term if canton_type == 'Citizenship': - df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name)] + # get list of cities list_cities = [entry for entry in df_temp[canton_type] if str_canton in get_cities([entry])] str_citizenship = '' try: @@ -621,12 +622,12 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str print('found no or more than one person with citizenship', str_canton, str_name) pass - list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) - str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[0] + list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[:, df_temp.columns.get_loc('uniqueIndex')]) + str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[0] else: - list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) - str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[0] + list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')]) + str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0] print(list_temp, list_uniqueID) @@ -662,10 +663,12 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # initial checks for not uniquely identified peoples # TODO check for false positives of these procedures if name_type == 'canton': - df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0])] # check if person can be identified from council - list_councils = list(df_temp['CouncilName']) - if list_councils.count(str_council) == 1: + df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)] + if df_temp.shape[0] == 1: + ## check if person can be identified from council + #list_councils = list(df_temp['CouncilName']) + #if list_councils.count(str_council) == 1: list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[0] @@ -675,36 +678,35 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str else: str_name = add_to_string(str_name, str_completeName) - # check if person can be identified from date of discussion - # TODO: is input dataformat always the same? - df_temp_before = df_temp[pd.to_datetime(df_temp['DateJoining']) <= datetime.datetime.strptime(str_date, '%d.%m.%Y')] - # TODO: replace by (or add another condition) if df_temp_before.shape[0] < df_temp.shape[0] - if df_temp_before.shape[0] == 1: - list_temp = list(df_temp_before['uniqueIndex']) - str_completeName = df_temp_before['completeName'].iloc[0] - - list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type) - if str_completeName.split(' ')[0] == str_name.split(' ')[0]: - str_name = add_to_string('', str_completeName) - else: - str_name = add_to_string(str_name, str_completeName) - - df_temp_after = df_temp[pd.to_datetime(df_temp['DateLeaving']) >= datetime.datetime.strptime(str_date, '%d.%m.%Y')] - # TODO: replace by (or add another condition) if df_temp_before.shape[0] < df_temp.shape[0] - if df_temp_after.shape[0] == 1: - list_temp = list(df_temp_after['uniqueIndex']) - str_completeName = df_temp_after['completeName'].iloc[0] - - list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type) - if str_completeName.split(' ')[0] == str_name.split(' ')[0]: - str_name = add_to_string('', str_completeName) - else: - str_name = add_to_string(str_name, str_completeName) - - print(str_date, df_temp_before.shape, df_temp_after.shape) + else: + # check if person can be identified from date of discussion + # exclude people that joined after date of discussion + df_temp_before = df_temp[pd.to_datetime(df_temp['DateJoining']) <= datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')] + if df_temp_before.shape[0] == 1: + list_temp = list(df_temp_before['uniqueIndex']) + str_completeName = df_temp_before['completeName'].iloc[0] + + list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type) + if str_completeName.split(' ')[0] == str_name.split(' ')[0]: + str_name = add_to_string('', str_completeName) + else: + str_name = add_to_string(str_name, str_completeName) + + # exclude people that left before date of discussion + df_temp_after = df_temp[pd.to_datetime(df_temp['DateLeaving']) >= datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')] + if df_temp_after.shape[0] == 1: + list_temp = list(df_temp_after['uniqueIndex']) + str_completeName = df_temp_after['completeName'].iloc[0] + + list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type) + if str_completeName.split(' ')[0] == str_name.split(' ')[0]: + str_name = add_to_string('', str_completeName) + else: + str_name = add_to_string(str_name, str_completeName) + + print(str_date, df_temp.shape, df_temp_before.shape, df_temp_after.shape) - # TODO: does this order make sense? council before date?? # TODO: function to update list unique ID and str_name @@ -783,13 +785,20 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l): def get_cities(list_citizenship): return [city[:-5] for item in list_citizenship for city in item.split(',')] -# function to get list of places -def get_list_cantons(df_names, str_name = ''): - if str_name: - df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name)] +def get_df_temp_canton(df_names, str_name, str_council): + + if str_council in ['Nationalrat', 'Ständerat']: + df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)] else: - df_temp = df_names.loc[df_names['nameType']=='canton'] - #print(df_temp) + df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name)] + + return df_temp + +# function to get list of places +def get_list_cantons(df_names, str_name, str_council = ''): + + df_temp = get_df_temp_canton(df_names, str_name, str_council) + # list of cantons list_cantonname = list(df_temp['CantonName']) # TODO this will lead to an error!