Skip to content
Snippets Groups Projects
Commit 14eb9b77 authored by Lili Gasser's avatar Lili Gasser
Browse files

implemented council and date disambiguation

parent 321563de
No related branches found
No related tags found
No related merge requests found
......@@ -7,7 +7,7 @@ also check for council:
one MP not active for the whole year, which leads to the other one not being uniquely identified
-----------------------------------------------------------------------
1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!)
1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!) --> solved!
1925/20029967: Seiler (in December, the second Seiler had already left --> finds two!) --> solved!
1925/20029967: Huber (in December, the second Huber had already left --> finds two!) --> solved because only NR!
1925/20029882: Naine (in June, there is only one Naine, in December, another one joins --> finds two!) also in ...96, ...97, etc. --> solved!
......
......@@ -141,7 +141,6 @@ file_doc.get_council_date()
#len(files_to_process)
file_doc.check_discussion()
str_date = '22.09.1925'
str_date = '1925-12-09 08:00'
import datetime
datetime.datetime.strptime(str_date, '%d.%m.%Y')
datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')
......@@ -56,11 +56,11 @@ def get_council_and_date(path_meta_xml_file):
# parse, get root and then part of interest
XML_tree = ET.parse(path_meta_xml_file)
XML_root = XML_tree.getroot()
XML_poi = XML_root[0].find('META_FROM_DB')
XML_poi = XML_root[0]
# get council and date
str_council = XML_poi.attrib['RAT']
str_date = XML_poi.attrib['DATUM']
str_council = XML_poi.find('META_FROM_DB').attrib['RAT']
str_date = XML_poi.attrib['PUBLIKATIONS_DATUM']
return (str_council, str_date)
......@@ -570,7 +570,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
# cannot happen for the first term in list_oi
elif name_type == 'canton':
list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo = get_list_cantons(df_names, str_name.split(' ')[0])
list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo = get_list_cantons(df_names, str_name.split(' ')[0], str_council)
canton_type = ''
if term in list_cantonname:
str_canton = term
......@@ -608,10 +608,11 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
# get rid of CANTON MISSING
str_name = str_name.split(' ')[0]
df_temp = get_df_temp_canton(df_names, str_name, str_council)
# extract uniqueID
# if Citizenship, do proper comparison
# if Citizenship, get list of cities and compare each to term
if canton_type == 'Citizenship':
df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name)]
# get list of cities
list_cities = [entry for entry in df_temp[canton_type] if str_canton in get_cities([entry])]
str_citizenship = ''
try:
......@@ -621,12 +622,12 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
print('found no or more than one person with citizenship', str_canton, str_name)
pass
list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[0]
list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[0]
else:
list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[0]
list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0]
print(list_temp, list_uniqueID)
......@@ -662,10 +663,12 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
# initial checks for people that are not uniquely identified
# TODO check for false positives of these procedures
if name_type == 'canton':
df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0])]
# check if person can be identified from council
list_councils = list(df_temp['CouncilName'])
if list_councils.count(str_council) == 1:
df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)]
if df_temp.shape[0] == 1:
## check if person can be identified from council
#list_councils = list(df_temp['CouncilName'])
#if list_councils.count(str_council) == 1:
list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[0]
......@@ -675,36 +678,35 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
else:
str_name = add_to_string(str_name, str_completeName)
# check if person can be identified from date of discussion
# TODO: is input dataformat always the same?
df_temp_before = df_temp[pd.to_datetime(df_temp['DateJoining']) <= datetime.datetime.strptime(str_date, '%d.%m.%Y')]
# TODO: replace by (or add another condition) if df_temp_before.shape[0] < df_temp.shape[0]
if df_temp_before.shape[0] == 1:
list_temp = list(df_temp_before['uniqueIndex'])
str_completeName = df_temp_before['completeName'].iloc[0]
list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
if str_completeName.split(' ')[0] == str_name.split(' ')[0]:
str_name = add_to_string('', str_completeName)
else:
str_name = add_to_string(str_name, str_completeName)
df_temp_after = df_temp[pd.to_datetime(df_temp['DateLeaving']) >= datetime.datetime.strptime(str_date, '%d.%m.%Y')]
# TODO: replace by (or add another condition) if df_temp_before.shape[0] < df_temp.shape[0]
if df_temp_after.shape[0] == 1:
list_temp = list(df_temp_after['uniqueIndex'])
str_completeName = df_temp_after['completeName'].iloc[0]
list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
if str_completeName.split(' ')[0] == str_name.split(' ')[0]:
str_name = add_to_string('', str_completeName)
else:
str_name = add_to_string(str_name, str_completeName)
print(str_date, df_temp_before.shape, df_temp_after.shape)
else:
# check if person can be identified from date of discussion
# exclude people that joined after date of discussion
df_temp_before = df_temp[pd.to_datetime(df_temp['DateJoining']) <= datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')]
if df_temp_before.shape[0] == 1:
list_temp = list(df_temp_before['uniqueIndex'])
str_completeName = df_temp_before['completeName'].iloc[0]
list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
if str_completeName.split(' ')[0] == str_name.split(' ')[0]:
str_name = add_to_string('', str_completeName)
else:
str_name = add_to_string(str_name, str_completeName)
# exclude people that left before date of discussion
df_temp_after = df_temp[pd.to_datetime(df_temp['DateLeaving']) >= datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')]
if df_temp_after.shape[0] == 1:
list_temp = list(df_temp_after['uniqueIndex'])
str_completeName = df_temp_after['completeName'].iloc[0]
list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
if str_completeName.split(' ')[0] == str_name.split(' ')[0]:
str_name = add_to_string('', str_completeName)
else:
str_name = add_to_string(str_name, str_completeName)
print(str_date, df_temp.shape, df_temp_before.shape, df_temp_after.shape)
# TODO: does this order make sense? council before date??
# TODO: function to update list unique ID and str_name
......@@ -783,13 +785,20 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l):
def get_cities(list_citizenship):
    """Return the bare city names contained in a list of citizenship strings.

    Every entry of ``list_citizenship`` is split on commas, and the last five
    characters of each piece are dropped (presumably a canton suffix such as
    " (ZH)" -- TODO confirm against the data).

    Whitespace around each piece is stripped, so an entry written with a
    ", " separator yields "Bern" rather than " Bern". Callers compare the
    result with ``in``, which would otherwise miss every city but the first.

    Returns a flat list of names, in input order.
    """
    return [
        # strip first so trailing blanks do not shift the 5-character cut
        city.strip()[:-5].strip()
        for item in list_citizenship
        for city in item.split(',')
    ]
# function to get list of places
def get_list_cantons(df_names, str_name = ''):
if str_name:
df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name)]
def get_df_temp_canton(df_names, str_name, str_council):
if str_council in ['Nationalrat', 'Ständerat']:
df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)]
else:
df_temp = df_names.loc[df_names['nameType']=='canton']
#print(df_temp)
df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name)]
return df_temp
# function to get list of places
def get_list_cantons(df_names, str_name, str_council = ''):
df_temp = get_df_temp_canton(df_names, str_name, str_council)
# list of cantons
list_cantonname = list(df_temp['CantonName'])
# TODO this will lead to an error!
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment