diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt index b47fdb664c6e0019bb9e3a2b8dac434b96962196..d28e5208b2a30624df07801a66463da0ac6d9919 100644 --- a/data/lists/wrongly_identified_speakers.txt +++ b/data/lists/wrongly_identified_speakers.txt @@ -12,6 +12,7 @@ speaker not identifiable: 1951/20035171: Perrin (CANTON MISSING) rapporteur [3935, 3939] 1951-12-07 00:00 0 ['Perrin', 'rapporteur'] 1931/20031058: Pfister (CANTON MISSING) [3980, 3981, 3984] 1931-09-25 00:00 4 ['Pfister'] 1961/20037310: Berger (CANTON MISSING) rapporteur [368, 373, 375] 1961-09-21 00:00 1 ['Berger', 'rapporteur'] +1956/20036009: Steiner (CANTON MISSING) [5049, 5054] 1956-03-07 00:00 5 ['Steiner'] speaker not uniquely identified when he spoke the second time: @@ -31,6 +32,7 @@ identifier is split into two words 1971/20000498: ['M', 'Muf', 'ny', 'rapporteur', 'de', 'la', 'majorité'] 7 --> finds Muff but is Mugny 1951/20034978,79,94: found a name: Bringolf- Schaff hausen ['Bringolf', 'Schaff', 'hausen'] 0 Bringolf (CANTON MISSING) [707, 706] --> solved by adding Schaff as additional Info 1941/ : Müller Aarb erg +1956/20036201: Berger (CANTON MISSING) reporter [368, 373, 375] 1956-12-10 00:00 1 ['Berger', 'Neuch', 'à tei', 'rapporteur'] identified as speech start but is in text: @@ -74,8 +76,9 @@ weird layout: bad OCR: -------- -1941/20033146: MüHer instead of Müller is not discovered +1941/20033146: MüHer --> Müller is not discovered 1911/20027998: ['UsterijBericbterstatter', 'Kommission'] 2 --> Usteri not found +1956/20036007: reporter [] 1956-03-06 00:00 1 ['Gtliliand', 'rapporteur'] --> Guinand not found not sure about place: @@ -105,12 +108,13 @@ person has been elected but not yet officially started (presumably): 1921/20029265: one Huber starts 5.12.21, discussion is on 6.12.21 --> finds two -Firstname before LastName +Firstname before LastName --> solved (but not for misspelled firstnames) ------------------------- 1971/20000592: Simon Kohler rapporteur 1911/20028008: Frey (CANTON MISSING) [1816, 1828] 1911-06-13 08:00 2 ['Alfred', 'Frey'] 1911/20028010: Eugster (CANTON MISSING) [1571, 1572] 1911-06-22 08:00 15 ['Arthur', 'Eugster'] 1961/20037222: Borel (CANTON MISSING) [590, 591] 1961-03-15 00:00 6 ['Georges', 'Borei'] +1956/20036021,22,23: Borel (CANTON MISSING) reporter [590, 591] 1956-03-16 00:00 1 ['Alfred', 'Borei', 'rapporteur'] two people with same last name and same citizenship diff --git a/data/politicians/MPs_additionalInfo.csv b/data/politicians/MPs_additionalInfo.csv index b76f6c08b99b3ba5795f7743bce0ad2071651bab..feb0543aa4b8ab46ffbf558d614d0068b526da1f 100644 --- a/data/politicians/MPs_additionalInfo.csv +++ b/data/politicians/MPs_additionalInfo.csv @@ -50,3 +50,6 @@ Sonderegger,Johann Jakob,AR,Ausserrhoden Sonderegger,Karl Justin,IR,Innerrhoden Müller,Emil,BL,Baselland König,Walter,BE,Biel +Meier,Ernst,AG,Baden +Gfeller,Hans,BE,Oppligen +Berger,Clause,NE,Neuchâtel diff --git a/data/politicians/Ratsmitglieder_1848_DE_corr.xlsx b/data/politicians/Ratsmitglieder_1848_DE_corr.xlsx index 88627f969d79b2c9c2142c021a01499dd42ba57f..5a23edca35923db696ac475e8601b36687bed29f 100644 Binary files a/data/politicians/Ratsmitglieder_1848_DE_corr.xlsx and b/data/politicians/Ratsmitglieder_1848_DE_corr.xlsx differ diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index c7eb9619e697784b4719599bb8b42087ab81ab56..299ff5782a40da408132ed0b20f3028000d85dbf 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -574,16 +574,16 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str list_uniqueID = [] str_canton = '' name_type = '' - str_council_federal = '' + str_firstname = '' - # extract list and array of last names + # extract lists and arrays of names list_all_names = list(df_names['shortName']) array_all_names = np.array(df_names['shortName']) + list_all_firstnames = list(df_names['FirstName']) # for every term for term in list_oi: - #list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission'] if term in list_roles: # update str_role # TODO: also look for similar terms (misspellings) @@ -625,7 +625,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # cannot happen for the first term in list_oi elif name_type == 'canton': - list_cantons = get_list_cantons(df_names, str_name.split(' ')[0], str_council) + list_cantons = get_list_cantons(df_names, str_name.split(' ')[0], str_council, str_firstname) canton_type = '' for list_, type_ in list_cantons: if term in list_: @@ -691,6 +691,10 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str else: print('could not be identified as a canton:', term, list_oi, str_name, str_role) + elif term in list_all_firstnames: + str_firstname = term + print('found a first name', str_firstname) + # if term is not easily mistaken as a name (avoid false positives) elif term not in list_notnames: @@ -716,11 +720,25 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # initial checks for not uniquely identified peoples # TODO check for false positives of these procedures if name_type == 'canton': + # check if person can be identified from firstname + print(str_firstname) + if str_firstname: + df_temp = df_names.loc[(df_names['shortName']==str_name.split(' ')[0]) & (df_names['FirstName']==str_firstname)] + if df_temp.shape[0] == 1: + list_temp = list(df_temp.loc[(df_temp['shortName']==str_name.split(' ')[0]) & (df_temp['FirstName']==str_firstname)].iloc[:, df_temp.columns.get_loc('uniqueIndex')]) + str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name.split(' ')[0]) & (df_temp['FirstName']==str_firstname)].iloc[0] + + list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type) + if str_completeName.split(' ')[0] == str_name.split(' ')[0] or str_completeName.split(' ')[1] == str_name.split(' ')[0]: + str_name = add_to_string('', str_completeName) + else: + str_name = add_to_string(str_name, str_completeName) + # check if person can be identified from council df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)] if df_temp.shape[0] == 1: - list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) - str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[0] + list_temp = list(df_temp.loc[(df_temp['shortName']==str_name.split(' ')[0]) & (df_temp['CouncilName']==str_council)].iloc[:, df_temp.columns.get_loc('uniqueIndex')]) + str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name.split(' ')[0]) & (df_temp['CouncilName']==str_council)].iloc[0] list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type) if str_completeName.split(' ')[0] == str_name.split(' ')[0] or str_completeName.split(' ')[1] == str_name.split(' ')[0]: @@ -759,6 +777,9 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # TODO: function to update list unique ID and str_name + + + # if a federal council is referenced as "Name Bundesrat", it is not found by the existing procedure if str_council == 'Bundesrat' and 'CANTON MISSING' in str_name: # check if person can be identified from council @@ -849,17 +870,23 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l): def get_cities(list_citizenship): return [city[:-5] for item in list_citizenship for city in item.split(',')] -def get_df_temp(df_names, str_name, str_council): +def get_df_temp(df_names, str_name, str_council = '', str_firstname = ''): - if str_council in ['Nationalrat', 'Ständerat', 'Bundesrat']: - df_temp = df_names.loc[(df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)] + if str_firstname: + if str_council in ['Nationalrat', 'Ständerat', 'Bundesrat']: + df_temp = df_names.loc[(df_names['shortName']==str_name) & (df_names['FirstName']==str_firstname) & (df_names['CouncilName']==str_council)] + else: + df_temp = df_names.loc[(df_names['shortName']==str_name) & (df_names['FirstName']==str_firstname)] else: - df_temp = df_names.loc[(df_names['shortName']==str_name)] + if str_council in ['Nationalrat', 'Ständerat', 'Bundesrat']: + df_temp = df_names.loc[(df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)] + else: + df_temp = df_names.loc[(df_names['shortName']==str_name)] return df_temp # function to get list of places -def get_list_cantons(df_names, str_name, str_council = ''): +def get_list_cantons(df_names, str_name, str_council = '', str_firstname = ''): # specify strings as they are used in Ratsmitglieder_1848_DE_corr.xlsx and therefore in df_names str_CantonName = 'CantonName' @@ -869,7 +896,7 @@ def get_list_cantons(df_names, str_name, str_council = ''): str_additionalInfo = 'additionalInfo' # get dataframe - df_temp = get_df_temp(df_names, str_name, str_council) + df_temp = get_df_temp(df_names, str_name, str_council, str_firstname) # list of cantons list_cantonname = list(df_temp[str_CantonName])