diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt index d28e5208b2a30624df07801a66463da0ac6d9919..3303c2891696b60eaaf254bfbee69fcba8567385 100644 --- a/data/lists/wrongly_identified_speakers.txt +++ b/data/lists/wrongly_identified_speakers.txt @@ -13,6 +13,18 @@ speaker not identifiable: 1931/20031058: Pfister (CANTON MISSING) [3980, 3981, 3984] 1931-09-25 00:00 4 ['Pfister'] 1961/20037310: Berger (CANTON MISSING) rapporteur [368, 373, 375] 1961-09-21 00:00 1 ['Berger', 'rapporteur'] 1956/20036009: Steiner (CANTON MISSING) [5049, 5054] 1956-03-07 00:00 5 ['Steiner'] +1952/20035320: Bühler (CANTON MISSING) reporter [811, 816] 1952-09-17 00:00 7 ['Bühler', 'Berichterstatter'] +1952/20035322: Albrecht (CANTON MISSING) reporter [76, 78] 1952-09-17 00:00 2 ['Albrecht', 'Berichterstatter'] +1948/20034218: Bringolf (CANTON MISSING) [707, 706] 1948-03-11 00:00 2 ['Bringolf'] +1948/20034315: Roth (CANTON MISSING) [4354, 4364] 1948-09-23 00:00 9 ['Rot'] +1948/20034328,29: Bucher (CANTON MISSING) [778, 779] 1948-10-07 00:00 1 ['Bucher'] (there is an Antrag Bucher-Zürich) +1944/20033495: Perrin (CANTON MISSING) reporter majority [3935, 3939] 1944-03-21 00:00 24 ['Perrin', 'rapporteur', 'majorité'] +1944/20033504,28: Schnyder (CANTON MISSING) [4712, 4716] 1944-03-29 00:00 19 ['Schnyder'] +1936/20031982: Bossi (CANTON MISSING) [632, 634] 1936-01-07 00:00 26 ['Bossi'] +1936/2003198x: Keller, Berichterstatter +1936/20032003: Widmer (CANTON MISSING) [5696, 5703] 1936-01-22 00:00 0 ['Widmer'] +1936/20032132: Huber (CANTON MISSING) [2591, 2599] 1936-09-23 00:00 9 ['Huber'] +1936/20032149: Odermatt (CANTON MISSING) [3818, 3819] 1936-10-06 00:00 10 ['Odermatt'] speaker not uniquely identified when he spoke the second time: @@ -24,6 +36,8 @@ speaker not uniquely identified when he spoke the second time: 1951/20034996: Studer 1931/20031095: Pfister (CANTON MISSING) Berichterstatter [3980, 3981, 3984] 1931-12-10 00:00 3 ['Pf', 'ister', 
'Berichterstatter'] 1921/20029087: Müller (CANTON MISSING) [3663, 3695] 1921-01-20 00:00 6 ['Müller'] +1952/20025282,5330: Dietschi (CANTON MISSING) reporter majority [1350, 1351] 1952-06-18 00:00 19 ['Dietschi', 'Berichterstatter', 'Mehrheit'] +1940/20033001: Keller (CANTON MISSING) reporter majority [2868, 2871, 2890] 1940-03-27 00:00 16 ['Keller', 'Berichterstatter', 'Mehrheit'] identifier is split into two words @@ -33,35 +47,33 @@ identifier is split into two words 1951/20034978,79,94: found a name: Bringolf- Schaff hausen ['Bringolf', 'Schaff', 'hausen'] 0 Bringolf (CANTON MISSING) [707, 706] --> solved by adding Schaff as additional Info 1941/ : Müller Aarb erg 1956/20036201: Berger (CANTON MISSING) reporter [368, 373, 375] 1956-12-10 00:00 1 ['Berger', 'Neuch', 'à tei', 'rapporteur'] +1936/20031984: Keller (CANTON MISSING) reporter majority [2868, 2871, 2890] 1936-01-08 00:00 10 ['Keller', 'Aar', 'Berichterstatter', 'Mehrheit'] Aarau split in Aar au +1936/20031998: Stähli (CANTON MISSING) [4967, 4964] 1936-01-17 00:00 13 ['Stähli', 'Sieb', 'nen'] +1936/20032015: Müller (CANTON MISSING) [3638, 3645, 3652, 3654, 3658, 3659, 3662] 1936-01-30 00:00 6 ['Müller', 'Grosshöchste', 'tten'] -identified as speech start but is in text: +identified as speech start but is in text: --> some of these might be solved by only looking at list_oi with less than 9 elements ------------------------------------------ -do I really need to look on the first two lines? maybe one is sufficient? --> no, it needs two lines -look for typical terms such as gestellt, gesagt, etc. - -1891/20026455: Ein weiterer Antrag ist derjenige des Hrn. Dufour. Herr Dufour hat den Antrag gestellt: -1891/20026465: Wir haben nun aber, dem Gedankengang des Hrn. Bühler folgend, die Erklärung gewählt: -1891/20026489: Um jeden Zweifel zu heben, beantrage ich Ihnen folgende Redaktion des Art. 
2 : 1894/20026607: Müller gegenüber drei anderen durchgedrungen, welche lautete: -1925/20029903: Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt: -1925/20029863: ganz gut gehen. Es ist ja wahr, was Herr Brügger gesagt hat: --> finds Lanz and Brügger -1925/20029891: J'en viens enfin à M. Belmont. M. Belmont a posé cette question --> finds Belmont twice -1925/20029917: Mögen Sie nun aber denken wie Herr Oberst Brügger oder mögen Sie denken wie ich: --> identified as speech start but is in text 1925/20029917: Herr Hauser sagt: 1925/20029978: Das ist, was Herr Charles Naine gesagt hat. Herr Hugglersagt: and a second time in the same document with Naine 1925/20029981: Brügger möchte ich sagen: -1971/20000663: de MM. Knüsel et Leu (there must be more speech starts, this is from a list of cantons and people inside a speech, !!! Layout) -1971/20000007: La seconde réaction qu'a suscité chez moi l'intervention de M. Weber est le doute: 1971/20000007: Herr Kollega Gut hat es gesagt: 1971/20000007: Noch eine Antwort an Kollege Clottu 1971/20000010,11: Nun noch etwas zu Richard Müller. Erstens 1971/20000024: Noch ein Wort zu Herrn Ständerat Wenk -1971/20000024: Herr Kollege Heimann stellt sich schliesslich gegen einen Finanzausgleich mit dem Hinweis 1971/20000093: Meine erste Frage an den Bundesrat lautet 1971/20000093: found a name: In zwei wesentlichen Punkten bin ich mit Herrn Kollega Biel absolut einverstanden ['zwei', 'wesentlichen', 'Punkten', 'Kollega', 'Biel', 'absolut', 'einverstanden'] 1 Biel Walter (Zürich ZH) [426] 1971/20000614: Zu Herrn Fischer 1951/20035112: Schmid (CANTON MISSING) [4639, 4646, 4660] 1951-09-26 00:00 27 ['Antrag', 'Schmid'] +1932/20031152: Je comprends M. 
Bossi Bossi (CANTON MISSING) [632, 634] 1932-03-16 00:00 13 ['comprends', 'Bossi'] + + +list of people in a minority are recognized as speech starts: +------------------------------------------------------------- +1936/20031980,85,many more: Schmid (CANTON MISSING) [4639, 4660, 4672] 1936-01-06 00:00 0 ['Schmid', 'Schneider'] + Schmid (CANTON MISSING) [4639, 4660, 4672] 1936-01-06 00:00 0 ['Schmid', 'Soleure', 'Schneider'] +1936/20031990: many, e.g. Widmer (CANTON MISSING) [5696, 5703] 1936-01-14 00:00 0 ['Walter', 'Olten', 'Widmer', 'Rossi'] President not found: @@ -69,6 +81,14 @@ President not found: 1921/20029085: Müller (CANTON MISSING) Président [3663, 3695] 1921-01-19 00:00 9 ['Mlle', 'Président'] +misspelled role: +---------------- +1936/20031986: Meyer (CANTON MISSING) [3482, 3483, 3488, 3490] 1936-01-09 00:00 14 ['Rundespräsident', 'Meyer'] +1936/20031992: Meyer (CANTON MISSING) [3482, 3483, 3488, 3490] 1936-01-15 00:00 10 ['ßundespräsident', 'Meyer'] +1932/20031299: Häberlin (CANTON MISSING) [2290, 2287] 1932-09-21 00:00 6 ['Bimdesrat', 'Häberlin'] + + + weird layout: ------------- 1971/20000663: de MM. Knüsel et Leu (there must be more speech starts, this is from a list of cantons and people inside a speech, !!! 
Layout) @@ -79,12 +99,16 @@ bad OCR: 1941/20033146: MüHer --> Müller is not discovered 1911/20027998: ['UsterijBericbterstatter', 'Kommission'] 2 --> Usteri not found 1956/20036007: reporter [] 1956-03-06 00:00 1 ['Gtliliand', 'rapporteur'] --> Guinand not found +1952/20035242, and some others: reporter [] 1952-03-25 00:00 9 ['Spanier', 'Berichterstatter'] --> Spühler not found +1948/20034315: reporter [] 1948-09-23 00:00 5 ['Statili', 'Berichterstatter'] --> Stähli not found +1936/20032015: reporter majority [] 1936-01-30 00:00 2 ['Statili', 'Berichterstatter', 'Mehrheit'] --> Stähli not found not sure about place: --------------------- 1921/20029090: Seiler (CANTON MISSING) [4810, 4815] 1921-02-24 00:00 5 ['Seiler', 'Sitten'] found no connection between Seiler Hermann and Sitten + term very similar to one name is actually another name ------------------------------------------------------ 1925/20029863: ganz --> finds Lanz, there is a Ganz diff --git a/data/politicians/MPs_additionalInfo.csv b/data/politicians/MPs_additionalInfo.csv index 39e6a2996f12722d58fa9268adb5d4b42c716e3a..73781b6e90785e554538c54fb6aac9ad1371091e 100644 --- a/data/politicians/MPs_additionalInfo.csv +++ b/data/politicians/MPs_additionalInfo.csv @@ -1,55 +1,68 @@ -LastName,FirstName,CantonAbbreviation,Additional -Cramer,Konrad H.,ZH,Frey -Blumer,Johannes,SG,Egloff -Jordan,Adolphe,VD,Martin -Schmid,Jakob,LU,Ronca -Sulzer,Eduard,ZH,Ziegler -Eugster,Howard,AR,Züst -Bratschi,Peter,BE,Matten -Jenny,Johann,BE,Worblaufen -Seiler,Hermann,VS,Zermatt -Steiner,Ferdinand,LU,Malters -Naine,Charles-Théophile,VD,Préverenges -Naine,Charles-Théophile,NE,Préverenges -Fischer,Hanspeter,TG,Weinfelden -Hofer,Walther,BE,Flawil -Weber,Rolf,TG,Arbon -Zweifel,Esaja,GL,Landammann -Zweifel,Peter,GL,Regierungsrath -Bühler,Peter Theophil,GR,Bünden -Welti,Franz,BS,Basel -Schmid,Arthur,AG,Oberentfelden -Schmid,Jacques,SO,Olten -Müller,Alfred,TG,Amriswil -Müller,Hans Gottfried,BE,Aarberg -Müller,Alban,SO,Olten 
-Perrin,Tell,NE,Chaux -Perrin,Paul,VD,Corcelles -Schmid-Ruedin,Philipp,ZH,Philip -Studer,Ernst,BE,Burgdorf -Meier,Christian,GL,Netstal -Kunz,Alois,LU,Hergiswil -Cottier,Henry,VD,Lausanne -Bringolf,Walther,SH,Schaff -Bringolf,Richard,VD,Peilz -Roth,August,TG,Frauenfeld -Bossi,Bixio,TI,Lugano -Bossi,Johann,GR,Chur -Müller,Hans,BE,Grosshöchstetten -Roth,Hans,BE,Interlaken -Bühler,Rolf Theodor,SG,Uzwil -Pfister,Bruno,SG,Gallen -Roth,August,TG,Arbon -Rusca,Giovan-Battista,TI,Locarno -Moser,Arthur,SH,Schaff -Weber,Jakob Rudolf,BE,Grasswil -Weber,Heinrich Otto,SG,Gallen -Sigg,Jean-C.,GE,Genève -Suter,Johannes,BL,Baselland -Sonderegger,Johann Jakob,AR,Ausserrhoden -Sonderegger,Karl Justin,IR,Innerrhoden -Müller,Emil,BL,Baselland -König,Walter,BE,Biel -Meier,Ernst,AG,Baden -Gfeller,Hans,BE,Oppligen -Berger,Clause,NE,Neuchâtel +LastName,FirstName,CantonAbbreviation,Additional,Additional2,Additional3 +Berger,Claude,NE,Neuchâtel,Neuch, +Blumer,Johannes,SG,Egloff,, +Bossi,Bixio,TI,Lugano,, +Bossi,Johann,GR,Chur,, +Bratschi,Peter,BE,Matten,, +Bringolf,Walther,SH,Schaff,, +Bringolf,Richard,VD,Peilz,, +Bühler,René,SG,Uzwil,, +Bühler,Peter Theophil,GR,Bünden,, +Bühler,Rolf Theodor,SG,Uzwil,, +Cottier,Henry,VD,Lausanne,, +Cottier,Fernand,GE,Genève,, +Cramer,Konrad H.,ZH,Frey,, +Eggenberger,Matthias,SG,Niederuzwil,, +Eugster,Howard,AR,Züst,, +Fischer,Hanspeter,TG,Weinfelden,, +Gfeller,Hans,BE,Oppligen,, +Gfeller,Arnold,BS,Basel,, +Hofer,Walther,BE,Flawil,, +Jenny,Johann,BE,Worblaufen,, +Jordan,Adolphe,VD,Martin,, +Kunz,Alois,LU,Hergiswil,, +Kunz,Paul,BE,Thun,, +König,Walter,BE,Biel,, +Meier,Christian,GL,Netstal,, +Meier,Ernst,AG,Baden,, +Moser,Arthur,SH,Schaff,, +Müller,Alfred,TG,Amriswil,, +Müller,Hans Gottfried,BE,Aarberg,Aarb, +Müller,Hans,BE,Grosshöchstetten,höchstetten,Grosshöchste +Müller,Alban,SO,Olten,, +Müller,Emil,BL,Baselland,, +Müller,Johannes,SG,Gallen,, +Naine,Charles-Théophile,VD,Préverenges,, +Naine,Charles-Théophile,NE,Préverenges,, 
+Odermatt,Gottfried,NW,Ennetbürgen,, +Perrin,Tell,NE,Chaux,, +Perrin,Paul,VD,Corcelles,, +Pfister,Bruno,SG,Gallen,, +Pfister,Eduard,TG,Frauenfeld,, +Roth,August,TG,Frauenfeld,Arbon, +Roth,Hans,BE,Interlaken,, +Rusca,Giovan-Battista,TI,Locarno,, +Schirmer,L. August,SG,Gallen,, +Schirmer,August,AG,Baden,, +Schmid,Jakob,LU,Ronca,, +Schmid,Arthur,AG,Oberentfelden,Oberentf,entfelden +Schmid,Jacques,SO,Olten,, +Schmid,Ernst,BE,Dieterswil,, +Schmid-Ruedin,Philipp,ZH,Philip,, +Seiler,Hermann,VS,Zermatt,, +Sigg,Jean-C.,GE,Genève,, +Sonderegger,Johann Jakob,AR,Ausserrhoden,, +Sonderegger,Karl Justin,IR,Innerrhoden,, +Stähli,Fritz,SZ,Siebnen,Sieb, +Steiner,Ferdinand,LU,Malters,, +Studer,Ernst,BE,Burgdorf,, +Sulzer,Eduard,ZH,Ziegler,, +Suter,Johannes,BL,Baselland,, +von Moos,Ludwig,OW,MOOS,, +Weber,Rolf,TG,Arbon,, +Weber,Jakob Rudolf,BE,Grasswil,, +Weber,Heinrich Otto,SG,Gallen,, +Welti,Franz,BS,Basel,, +Welti,Adolf,AG,Rheinfelden,, +Zweifel,Esaja,GL,Landammann,, +Zweifel,Peter,GL,Regierungsrath,, diff --git a/src/python/extractMPs.py b/src/python/extractMPs.py index a246a3ef690e20142f60c1d933432cab3a322d39..ffb22273d02ed619cab3227a6eceb9fa19420d3d 100644 --- a/src/python/extractMPs.py +++ b/src/python/extractMPs.py @@ -172,9 +172,11 @@ class MPs_Extractor(object): _df_after1890 = pd.concat([_df1, _df2]) # generate unique ID for every person - # generate two now columns + # generate new columns _df_after1890 = _df_after1890.assign(uniqueIndex=0) _df_after1890 = _df_after1890.assign(additionalInfo='') + _df_after1890 = _df_after1890.assign(additionalInfo2='') + _df_after1890 = _df_after1890.assign(additionalInfo3='') # group by first and last name, and date of birth _grouped = _df_after1890.groupby(["LastName", "FirstName", "DateOfBirth"]) @@ -187,6 +189,8 @@ class MPs_Extractor(object): df_addInfo = pd.read_csv(self.input_file_addInfo) for row in df_addInfo.itertuples(index=False, name='Pandas'): _df_after1890.loc[(_df_after1890['LastName'] == row[0]) & 
(_df_after1890['FirstName'] == row[1]) & (_df_after1890['CantonAbbreviation'] == row[2]), 'additionalInfo'] = row[3] + _df_after1890.loc[(_df_after1890['LastName'] == row[0]) & (_df_after1890['FirstName'] == row[1]) & (_df_after1890['CantonAbbreviation'] == row[2]), 'additionalInfo2'] = row[4] + _df_after1890.loc[(_df_after1890['LastName'] == row[0]) & (_df_after1890['FirstName'] == row[1]) & (_df_after1890['CantonAbbreviation'] == row[2]), 'additionalInfo3'] = row[5] # write dataframe to csv _df_after1890.to_csv(self.output_file_csv) diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index 299ff5782a40da408132ed0b20f3028000d85dbf..8f81d0723b7af07ac6bf647211768198e70d0f22 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -608,6 +608,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str elif term in list_roles_ext: + str_assignedRole = '' + # get more details on reporter # TODO: could be refined for Minderheit I, II, III, etc... # TODO: add italian @@ -700,8 +702,12 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # if term is in the list of all names if term in list_all_names: - # get correct name and uniqueID, or role, for that term - str_name, list_uniqueID, name_type = get_string(term, df_names, str_name, list_uniqueID) + # if term is not in str_name already, e.g. if second part of double name is also a name + # e.g. 1952/20035242 Widmer-Kunz (it is the same as Widmer) + # TODO: maybe also add to term_approx?? 
+ if term not in str_name: + # get correct name and uniqueID, or role, for that term + str_name, list_uniqueID, name_type = get_string(term, df_names, str_name, list_uniqueID) if bln_print: print('=== correct name', term) @@ -915,11 +921,12 @@ def get_list_cantons(df_names, str_name, str_council = '', str_firstname = ''): list_additionalInfo = list(df_temp[str_additionalInfo]) # generate list of cantons including string + # additionalInfo should come before citizenship (helps to find people who have the same citizenship but a specified additionalInfo) list_cantons = [(list_cantonname, str_CantonName), (list_cantonabbr, str_CantonAbbreviation), + (list_additionalInfo, str_additionalInfo), (list_citizenship, str_Citizenship), (list_firstname, str_FirstName), - (list_additionalInfo, str_additionalInfo), ] # return list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo