diff --git a/data/lists/not_names.txt b/data/lists/not_names.txt index 3c05f4fa40168b361d648d747d12c3b81e87a6cb..a3d5eaf6a98727e2531b45f0b4d6a6a274fad5e4 100644 --- a/data/lists/not_names.txt +++ b/data/lists/not_names.txt @@ -33,6 +33,7 @@ lassen Leider leider lieber +Lieber liegen Masse Minister diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt index 14cc2e22b1bcb583a4786cb4b7aa39015fbc0098..b47fdb664c6e0019bb9e3a2b8dac434b96962196 100644 --- a/data/lists/wrongly_identified_speakers.txt +++ b/data/lists/wrongly_identified_speakers.txt @@ -11,7 +11,7 @@ speaker not identifiable: 1951/20035991: Dietschi (CANTON MISSING) Berichterstatter [1350, 1351] 1951-10-02 00:00 9 ['Dietschi', 'Berichterstatter'] 1951/20035171: Perrin (CANTON MISSING) rapporteur [3935, 3939] 1951-12-07 00:00 0 ['Perrin', 'rapporteur'] 1931/20031058: Pfister (CANTON MISSING) [3980, 3981, 3984] 1931-09-25 00:00 4 ['Pfister'] -1961/20037310: Berger (CANTON MISSING) rapporteur [368, 373, 375] 1961-09-21 00:00 1 ['Berger', 'rapporteur'] +1961/20037310: Berger (CANTON MISSING) rapporteur [368, 373, 375] 1961-09-21 00:00 1 ['Berger', 'rapporteur'] speaker not uniquely identified when he spoke the second time: @@ -24,11 +24,12 @@ speaker not uniquely identified when he spoke the second time: 1931/20031095: Pfister (CANTON MISSING) Berichterstatter [3980, 3981, 3984] 1931-12-10 00:00 3 ['Pf', 'ister', 'Berichterstatter'] 1921/20029087: Müller (CANTON MISSING) [3663, 3695] 1921-01-20 00:00 6 ['Müller'] + identifier is split into two words ---------------------------------- 1925/20029945, 1951/20035173: found a name: Schmid-Oberentf elden ['Schmid', 'Oberentf', 'elden'] 0 Schmid (CANTON MISSING) [4639, 4660] 1971/20000498: ['M', 'Muf', 'ny', 'rapporteur', 'de', 'la', 'majorité'] 7 --> finds Muff but is Mugny -1951/20034978,79,94: found a name: Bringolf- Schaff hausen ['Bringolf', 'Schaff', 'hausen'] 0 Bringolf (CANTON MISSING) [707, 706] 
+1951/20034978,79,94: found a name: Bringolf- Schaff hausen ['Bringolf', 'Schaff', 'hausen'] 0 Bringolf (CANTON MISSING) [707, 706] --> solved by adding Schaff as additional Info 1941/ : Müller Aarb erg @@ -59,30 +60,22 @@ look for typical terms such as gestellt, gesagt, etc. 1971/20000093: found a name: In zwei wesentlichen Punkten bin ich mit Herrn Kollega Biel absolut einverstanden ['zwei', 'wesentlichen', 'Punkten', 'Kollega', 'Biel', 'absolut', 'einverstanden'] 1 Biel Walter (Zürich ZH) [426] 1971/20000614: Zu Herrn Fischer 1951/20035112: Schmid (CANTON MISSING) [4639, 4646, 4660] 1951-09-26 00:00 27 ['Antrag', 'Schmid'] -1941/20033145: Prof. Böhler erklärt --> finds Bühler -Bundesrat not found: --------------------- -1951/20035017,26: Petitpierre (CANTON MISSING) [3955, 3956] 1951-04-03 00:00 8 ['Petitpierre', 'conseiller', 'fédéral'] -1951/20035018,20,77,83: Rubattel (CANTON MISSING) [4381, 4382] 1951-04-03 00:00 6 ['Rubattel', 'conseiller', 'fédéral'] -1931/20030968: Häberlin (CANTON MISSING) [2290, 2287] 1931-03-24 00:00 6 ['Bundespräsident', 'Häberlin'] Häberlin,Heinrich,TG,Bundespräsident -1931/20031089: Meyer (CANTON MISSING) Bundesrat [3482, 3483, 3490, 3495] 1931-12-08 00:00 1 ['Meyer', 'Bundesrat'] + +President not found: +------------------- 1921/20029085: Müller (CANTON MISSING) Président [3663, 3695] 1921-01-19 00:00 9 ['Mlle', 'Président'] -1911/20027998: Forrer (CANTON MISSING) Bundesrat [1771, 1773] 1911-03-30 09:00 2 ['Bundesrat', 'Forrer'] -1911/20028039: Müller (CANTON MISSING) Bundesrat [3642, 3653, 3663, 3683] 1911-10-05 08:30 11 ['Bundesrat', 'Müller'] weird layout: ------------- 1971/20000663: de MM. Knüsel et Leu (there must be more speech starts, this is from a list of cantons and people inside a speech, !!! 
Layout) -wrong entries in xlsx: ----------------------- -1931/20030940,49: Scherer (CANTON MISSING) [4560, 4565] 1931-03-18 00:00 18 ['Scherer'] --> there are two entries for one person bad OCR: -------- 1941/20033146: MüHer instead of Müller is not discovered +1911/20027998: ['UsterijBericbterstatter', 'Kommission'] 2 --> Usteri not found not sure about place: @@ -104,12 +97,7 @@ term is a name 1971/20000024: Politisch gesehen ist es doch ganz einfach so --> finds Ganz -person not yet in council -------------------------- -1971/20000055: Debétaz - - -person has entry date 29.11.71 but is not yet active (presumably): +person has been elected but not yet officially started (presumably): ------------------------------------------------------------------ 1971/20000587: Tanner Paul starts officiall on 29.11.71, discussion is on 30.11.71 --> finds two! 1971/20000588: one Kohler starts 29.11.71, discussion is on 30.11.71 --> finds two! @@ -124,6 +112,7 @@ Firstname before LastName 1911/20028010: Eugster (CANTON MISSING) [1571, 1572] 1911-06-22 08:00 15 ['Arthur', 'Eugster'] 1961/20037222: Borel (CANTON MISSING) [590, 591] 1961-03-15 00:00 6 ['Georges', 'Borei'] + two people with same last name and same citizenship --------------------------------------------------- 1951/20034993: Eggenberger Grabs @@ -138,9 +127,10 @@ Appenzeller some other persons wrongly identified as MP ------------------------------------------- 1925/20029833: Sauser-Hall (not a MP)--> Hauser +1941/20033145: Prof. Böhler erklärt --> finds Bühler - +====================================================================================== solved: also check for council: ----------------------- 1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified but there is only one SR Baumann --> solved! @@ -165,3 +155,19 @@ solved: wrongly spelled city 1925/20029963: Jenny Ennend (instead of Ennenda) --> solved! 1925/20029995,96: Keller Zurich (instead of Zürich) --> solved! 1971/? 
: Berne instead of Bern --> solved with using get_approximate_term for cantons + + +solved: Bundesrat not found: +-------------------- +1951/20035017,26: Petitpierre (CANTON MISSING) [3955, 3956] 1951-04-03 00:00 8 ['Petitpierre', 'conseiller', 'fédéral'] --> solved! +1951/20035018,20,77,83: Rubattel (CANTON MISSING) [4381, 4382] 1951-04-03 00:00 6 ['Rubattel', 'conseiller', 'fédéral'] --> solved! +1931/20031089: Meyer (CANTON MISSING) Bundesrat [3482, 3483, 3490, 3495] 1931-12-08 00:00 1 ['Meyer', 'Bundesrat'] --> solved! +1931/20030968: Häberlin (CANTON MISSING) [2290, 2287] 1931-03-24 00:00 6 ['Bundespräsident', 'Häberlin'] --> solved! +1911/20027998: Forrer (CANTON MISSING) Bundesrat [1771, 1773] 1911-03-30 09:00 2 ['Bundesrat', 'Forrer'] --> solved +1911/20028039: Müller (CANTON MISSING) Bundesrat [3642, 3653, 3663, 3683] 1911-10-05 08:30 11 ['Bundesrat', 'Müller'] --> solved! + + +solved: wrong entries in xlsx: +---------------------- +1931/20030940,49: Scherer (CANTON MISSING) [4560, 4565] 1931-03-18 00:00 18 ['Scherer'] --> there are two entries for one person +1971/20000055: Debétaz, was not there as a NR diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 15826cc8a065ce58a7f8613c616b9eeb0c9585d8..bf6a1808c7a2e176a6e9262ef7536a1cfec4b2b3 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -26,7 +26,7 @@ from utils_proc import call_with_out # specify input and output files # needed for running in atom, can be ignored -year = '1951' +year = '1911' input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle" input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz" input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz" @@ -106,7 +106,7 @@ for file_tarpath in files_to_process: file_doc.str_date = str_date file_doc.annotate_xml() -# Commands to get the compressed version of the file +# Commands to get the compressed version of the file # 
(compressed file is around 5 times smaller than uncompressed file) #data/AB/${year}/05_annotatedxml.tar.gz utils_proc.compress_tar(output_annotatedxml) @@ -121,7 +121,7 @@ with open(input_notnames) as f: list_notnames = [term.rstrip() for term in list_notnames] # to test for one file -file_tarpath = './1951/20035006_datacorr.xml' +file_tarpath = './1931/20030968_datacorr.xml' id_doc = file_tarpath.split('/')[-1][:8] @@ -160,8 +160,18 @@ listilist = ['a', 'b', 'c', 'd'] listilist[0,2] # OPTIMIZE - +list_1 = [1, 2,3, 4, 5] +list_2 = [2,7,8] +len(set(list_1).intersection(list_2)) if 'ab' in 'abc': print('yay') + +a = 10 +if a < 7 or a > 9: + print(a) + +a = 'asdf' +b = 'asdf' +a == b == 'asdf' diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index daffdb615a55714ee6afab016eb2634627df77f2..c7eb9619e697784b4719599bb8b42087ab81ab56 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -243,8 +243,8 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente', 'Berichterstatter', 'Berichterstatterin', 'rapporteur', 'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole', - 'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller', 'fédéral', 'fédéral' - 'Vizepräsident', 'Bundespräsident'] + 'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller', 'fédéral', 'fédéral', + 'Bundespräsident'] list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission'] # initialize flag @@ -307,6 +307,12 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ # if possible, find a name from the list str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str_council, str_date, bln_print=False) + # get rid of role Bundesrat with no name 
associated to it + # helps to reduce false positives + # TODO: might lead to false negatives, i.e. if a person was not identified by its name but is referenced as federal council + if str_role == 'federalcouncil' and str_name == '': + str_role = '' + if len(list_uniqueID) > 1 or flag_print: with open('data/lists/notunique.txt', 'a') as f: f.write(' '.join((str_name, str_role, str(list_uniqueID), str_date, str(ind_p), str(list_oi), '\n'))) @@ -556,6 +562,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str def add_to_string(string, term): if not string: string = term + elif string == term == 'federalcouncil': + pass else: string += ' ' + term return string @@ -566,6 +574,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str list_uniqueID = [] str_canton = '' name_type = '' + str_council_federal = '' # extract list and array of last names list_all_names = list(df_names['shortName']) @@ -574,21 +583,45 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # for every term for term in list_oi: - if bln_print: - print('now is about: ------', term) - + #list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission'] if term in list_roles: # update str_role # TODO: also look for similar terms (misspellings) - # TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter - str_role = add_to_string(str_role, term) + # TODO: what with Bundespräsident? + # TODO: is Berichterstatter the same as Sprecher? 
+ # assign role in English + if term in ['Präsident', 'Präsidentin', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente']: + str_assignedRole = 'president' + elif term in ['Vizepräsident']: + str_assignedRole = 'vice-president' + elif term in ['Berichterstatter', 'Berichterstatterin', 'rapporteur', 'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole']: + str_assignedRole = 'reporter' + elif term in ['Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller', 'fédéral', 'Bundespräsident', 'Bundespräsidentin']: + str_assignedRole = 'federalcouncil' + str_council = 'Bundesrat' # needs to be German to be used in dataframe + + # update str_role + str_role = add_to_string(str_role, str_assignedRole) if bln_print: - print('found a role', term) + print('found a role', term, str_assignedRole) elif term in list_roles_ext: - pass - # TODO: extract whether it is minority or majority and save that information + + # get more details on reporter + # TODO: could be refined for Minderheit I, II, III, etc... 
+ # TODO: add italian + if term in ['Mehrheit', 'majorité']: + str_assignedRole = 'majority' + elif term in ['Minderheit', 'minorité']: + str_assignedRole = 'minority' + elif term in ['deutscher', 'deutsch', 'allemand', 'allemande']: + str_assignedRole = 'German' + elif term in ['français', 'française', 'französischer', 'französische']: + str_assignedRole = 'French' + + # update str_role + str_role = add_to_string(str_role, str_assignedRole) # cannot happen for the first term in list_oi elif name_type == 'canton': @@ -686,9 +719,6 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # check if person can be identified from council df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)] if df_temp.shape[0] == 1: - ## check if person can be identified from council - #list_councils = list(df_temp['CouncilName']) - #if list_councils.count(str_council) == 1: list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[0] @@ -729,6 +759,20 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # TODO: function to update list unique ID and str_name + # if a federal council is referenced as "Name Bundesrat", it is not found by the existing procedure + if str_council == 'Bundesrat' and 'CANTON MISSING' in str_name: + # check if person can be identified from council + df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)] + if df_temp.shape[0] == 1: + list_temp = list(df_names.loc[(df_names['nameType']==name_type) 
& (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) + str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[0] + + list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type) + if str_completeName.split(' ')[0] == str_name.split(' ')[0] or str_completeName.split(' ')[1] == str_name.split(' ')[0]: + str_name = add_to_string('', str_completeName) + else: + str_name = add_to_string(str_name, str_completeName) + return str_name, str_role, list_uniqueID, str_canton @@ -807,7 +851,7 @@ def get_cities(list_citizenship): def get_df_temp(df_names, str_name, str_council): - if str_council in ['Nationalrat', 'Ständerat']: + if str_council in ['Nationalrat', 'Ständerat', 'Bundesrat']: df_temp = df_names.loc[(df_names['shortName']==str_name) & (df_names['CouncilName']==str_council)] else: df_temp = df_names.loc[(df_names['shortName']==str_name)]