diff --git a/data/lists/not_names.txt b/data/lists/not_names.txt index 1a44b305e7d8e5267e6140416581f5b397009058..3c05f4fa40168b361d648d747d12c3b81e87a6cb 100644 --- a/data/lists/not_names.txt +++ b/data/lists/not_names.txt @@ -1,3 +1,4 @@ +Abs Alinea Alter Ari @@ -8,6 +9,7 @@ bietet darin drehen eher +Erster ess Fällen fasse @@ -43,6 +45,7 @@ Recht Schrit Seite selber +sicher Sinne später Ständer @@ -68,6 +71,7 @@ Wort Worten Ziel Zuerst +Zusatz allemand autre Berne @@ -88,6 +92,7 @@ poser projet Rédiger rédiger +sais tirer vote delle diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt index c46c3fe19a88256f573d41983299317e502749b8..14cc2e22b1bcb583a4786cb4b7aa39015fbc0098 100644 --- a/data/lists/wrongly_identified_speakers.txt +++ b/data/lists/wrongly_identified_speakers.txt @@ -10,6 +10,8 @@ speaker not identifiable: 1951/20034996: found a name: Studer ['Studer'] 0 Studer (CANTON MISSING) [5130, 5141] 1951/20035991: Dietschi (CANTON MISSING) Berichterstatter [1350, 1351] 1951-10-02 00:00 9 ['Dietschi', 'Berichterstatter'] 1951/20035171: Perrin (CANTON MISSING) rapporteur [3935, 3939] 1951-12-07 00:00 0 ['Perrin', 'rapporteur'] +1931/20031058: Pfister (CANTON MISSING) [3980, 3981, 3984] 1931-09-25 00:00 4 ['Pfister'] +1961/20037310: Berger (CANTON MISSING) rapporteur [368, 373, 375] 1961-09-21 00:00 1 ['Berger', 'rapporteur'] speaker not uniquely identified when he spoke the second time: @@ -19,12 +21,15 @@ speaker not uniquely identified when he spoke the second time: 1951/20034982: Perrin-Corcelles rapporteur (first time), after: found a name: M. 
Pétrin, rapporteur ['Pétrin', 'rapporteur'] 0 Perrin (CANTON MISSING) rapporteur [3935, 3939] 1951/20034995: after: Kunz (CANTON MISSING) [3017, 3019] 1951-04-03 00:00 21 ['Kunz'] 1951/20034996: Studer +1931/20031095: Pfister (CANTON MISSING) Berichterstatter [3980, 3981, 3984] 1931-12-10 00:00 3 ['Pf', 'ister', 'Berichterstatter'] +1921/20029087: Müller (CANTON MISSING) [3663, 3695] 1921-01-20 00:00 6 ['Müller'] identifier is split into two words ---------------------------------- 1925/20029945, 1951/20035173: found a name: Schmid-Oberentf elden ['Schmid', 'Oberentf', 'elden'] 0 Schmid (CANTON MISSING) [4639, 4660] 1971/20000498: ['M', 'Muf', 'ny', 'rapporteur', 'de', 'la', 'majorité'] 7 --> finds Muff but is Mugny 1951/20034978,79,94: found a name: Bringolf- Schaff hausen ['Bringolf', 'Schaff', 'hausen'] 0 Bringolf (CANTON MISSING) [707, 706] +1941/ : Müller Aarb erg identified as speech start but is in text: @@ -54,21 +59,42 @@ look for typical terms such as gestellt, gesagt, etc. 1971/20000093: found a name: In zwei wesentlichen Punkten bin ich mit Herrn Kollega Biel absolut einverstanden ['zwei', 'wesentlichen', 'Punkten', 'Kollega', 'Biel', 'absolut', 'einverstanden'] 1 Biel Walter (Zürich ZH) [426] 1971/20000614: Zu Herrn Fischer 1951/20035112: Schmid (CANTON MISSING) [4639, 4646, 4660] 1951-09-26 00:00 27 ['Antrag', 'Schmid'] +1941/20033145: Prof. 
Böhler erklärt --> finds Bühler Bundesrat not found: -------------------- 1951/20035017,26: Petitpierre (CANTON MISSING) [3955, 3956] 1951-04-03 00:00 8 ['Petitpierre', 'conseiller', 'fédéral'] 1951/20035018,20,77,83: Rubattel (CANTON MISSING) [4381, 4382] 1951-04-03 00:00 6 ['Rubattel', 'conseiller', 'fédéral'] +1931/20030968: Häberlin (CANTON MISSING) [2290, 2287] 1931-03-24 00:00 6 ['Bundespräsident', 'Häberlin'] Häberlin,Heinrich,TG,Bundespräsident +1931/20031089: Meyer (CANTON MISSING) Bundesrat [3482, 3483, 3490, 3495] 1931-12-08 00:00 1 ['Meyer', 'Bundesrat'] +1921/20029085: Müller (CANTON MISSING) Président [3663, 3695] 1921-01-19 00:00 9 ['Mlle', 'Président'] +1911/20027998: Forrer (CANTON MISSING) Bundesrat [1771, 1773] 1911-03-30 09:00 2 ['Bundesrat', 'Forrer'] +1911/20028039: Müller (CANTON MISSING) Bundesrat [3642, 3653, 3663, 3683] 1911-10-05 08:30 11 ['Bundesrat', 'Müller'] + weird layout: ------------- 1971/20000663: de MM. Knüsel et Leu (there must be more speech starts, this is from a list of cantons and people inside a speech, !!! 
Layout) +wrong entries in xlsx: +---------------------- +1931/20030940,49: Scherer (CANTON MISSING) [4560, 4565] 1931-03-18 00:00 18 ['Scherer'] --> there are two entries for one person + +bad OCR: +-------- +1941/20033146: MüHer instead of Müller is not discovered + + +not sure about place: +--------------------- +1921/20029090: Seiler (CANTON MISSING) [4810, 4815] 1921-02-24 00:00 5 ['Seiler', 'Sitten'] found no connection between Seiler Hermann and Sitten + term very similar to one name is actually another name ------------------------------------------------------ 1925/20029863: ganz --> finds Lanz, there is a Ganz 1971/20000630 and others: Schweizer --> finds Schneider, there is a Schweizer 1951/20035112: Schweizer --> finds Schwizer +1921/20029145: Seiler (CANTON MISSING) [4810, 4815] 1921-04-13 00:00 10 ['Nationalrat', 'Speiser', 'sagte'] finds Seiler for Speiser term is a name @@ -88,11 +114,15 @@ person has entry date 29.11.71 but is not yet active (presumably): 1971/20000587: Tanner Paul starts officiall on 29.11.71, discussion is on 30.11.71 --> finds two! 1971/20000588: one Kohler starts 29.11.71, discussion is on 30.11.71 --> finds two! 1971/20000726: one Muheim starts 29.11.71, discussion is on 8.12.71 --> finds two! 
+1921/20029265: one Huber starts 5.12.21, discussion is on 6.12.21 --> finds two Firstname before LastName ------------------------- 1971/20000592: Simon Kohler rapporteur +1911/20028008: Frey (CANTON MISSING) [1816, 1828] 1911-06-13 08:00 2 ['Alfred', 'Frey'] +1911/20028010: Eugster (CANTON MISSING) [1571, 1572] 1911-06-22 08:00 15 ['Arthur', 'Eugster'] +1961/20037222: Borel (CANTON MISSING) [590, 591] 1961-03-15 00:00 6 ['Georges', 'Borei'] two people with same last name and same citizenship --------------------------------------------------- diff --git a/data/politicians/MPs_additionalInfo.csv b/data/politicians/MPs_additionalInfo.csv index 83d56dafb44dc980eb1db280f9f78938faecaa8d..b76f6c08b99b3ba5795f7743bce0ad2071651bab 100644 --- a/data/politicians/MPs_additionalInfo.csv +++ b/data/politicians/MPs_additionalInfo.csv @@ -46,3 +46,7 @@ Weber,Jakob Rudolf,BE,Grasswil Weber,Heinrich Otto,SG,Gallen Sigg,Jean-C.,GE,Genève Suter,Johannes,BL,Baselland +Sonderegger,Johann Jakob,AR,Ausserrhoden +Sonderegger,Karl Justin,IR,Innerrhoden +Müller,Emil,BL,Baselland +König,Walter,BE,Biel diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index 8572e209b340f644e120b0fe377fd1885a5ee240..daffdb615a55714ee6afab016eb2634627df77f2 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -243,8 +243,8 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente', 'Berichterstatter', 'Berichterstatterin', 'rapporteur', 'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole', - 'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral', - 'Vizepräsident'] + 'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller', 'fédéral', 'fédéral', + 'Vizepräsident', 'Bundespräsident'] list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 
'commission'] # initialize flag @@ -265,132 +265,138 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ text_start = re.sub(r'[\(\)]','',text[:colon_index_text]) list_oi = tokenizer.tokenize(text_start) - if len(list_oi) > 4: - with open('data/lists/notunique.txt', 'a') as f: - f.write(' '.join((str(list_oi), str(len(list_oi)), '\n'))) - flag_print = True - if bln_print: print('possible speech start: ', list_oi) - # remove stopwords - list_oi = [term for term in list_oi if term.lower() not in list_stopwords] + # to avoid false positives, the number of elements in list_oi is checked + # - if it is too long, it is part of a speech and not a speech start + # - for intermediate lengths between 5 and 8, it can either be a speech + # start (if it contains a role) or part of a speech + # - short lengths typically indicate a speech start, but not always. These + # false positives cannot be avoided with this procedure. + if (len(list_oi) < 9): + if (len(list_oi) < 5) or (len(set(list_oi).intersection(list_roles)) > 0): - # remove punctuation - list_oi = [''.join(c for c in s if c not in string.punctuation) for s in list_oi] - list_oi = [s for s in list_oi if s] + with open('data/lists/notunique.txt', 'a') as f: + f.write(' '.join((str(list_oi), str(len(list_oi)), '\n'))) + flag_print = True - # remove lower case terms -# list_oi = [term for term in list_oi if not term.islower()] + # remove stopwords + list_oi = [term for term in list_oi if term.lower() not in list_stopwords] - # remove numbers - list_oi = [term for term in list_oi if not term.isdigit()] + # remove punctuation + list_oi = [''.join(c for c in s if c not in string.punctuation) for s in list_oi] + list_oi = [s for s in list_oi if s] - # remove single characters - # TODO: might need to be changed for fractions (some fractions are abbreviated as single letters) - # TODO: needs to be changed to include 'I' for Minderheit I 1891/20000093 - # TODO: maybe exclude I and A to account 
for Appenzell - list_oi = [term for term in list_oi if len(term)>1] + # remove lower case terms + # list_oi = [term for term in list_oi if not term.islower()] - if len(list_oi) > 4 or flag_print: - with open('data/lists/notunique.txt', 'a') as f: - f.write(' '.join((str(list_oi), str(len(list_oi)), '\n'))) + # remove numbers + list_oi = [term for term in list_oi if not term.isdigit()] - # if possible, find a name from the list - str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str_council, str_date, bln_print=False) - if bln_print: - print('name', str_name, 'role', str_role) + # remove single characters + # TODO: might need to be changed for fractions (some fractions are abbreviated as single letters) + # TODO: needs to be changed to include 'I' for Minderheit I 1891/20000093 + # TODO: maybe exclude I and A to account for Appenzell + list_oi = [term for term in list_oi if len(term)>1] - if len(list_uniqueID) > 1 or flag_print: - with open('data/lists/notunique.txt', 'a') as f: - f.write(' '.join((str_name, str_role, str(list_uniqueID), str_date, str(ind_p), str(list_oi), '\n'))) + if len(list_oi) > 4 or flag_print: + with open('data/lists/notunique.txt', 'a') as f: + f.write(' '.join((str(list_oi), str(len(list_oi)), '\n'))) - # get rid of 'Präsident stimmt nicht Président ne vote pas' - if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name: - if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi): - if bln_print: - print('get rid of Präsident stimmt nicht, Président ne vote pas', list_oi) - str_role = '' + # if possible, find a name from the list + str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str_council, str_date, bln_print=False) - # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":' - if str_name: - if 
len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1: - if bln_print: - print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi) - str_name = '' - - # if a name has been found, add it to XML_new - if str_name or str_role: - # add attribute speech_start to textbox - XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start' - - # add speaker as attribute to first textline - XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, list_uniqueID, str_canton) - - # update text of XML (speaker is on first line, actual speech start on second line of speech_start textbox) - # if colon is on first line - if ind_tl_colon == 0: - # get text of that line and colon index - thattext = XML_new[ind_p][ind_t][0].text - colon_index = thattext.index(':') - - try: - # write speaker to first line - XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend - - # get start of speech with correct font start - if thattext[colon_index+1:].startswith('[font'): - startspeech = thattext[colon_index+1:] - elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]): - startspeech = '' - elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]): - startspeech = thattext[colon_index+8:] - else: - startspeech = thattext[colon_index+1:] + if len(list_uniqueID) > 1 or flag_print: + with open('data/lists/notunique.txt', 'a') as f: + f.write(' '.join((str_name, str_role, str(list_uniqueID), str_date, str(ind_p), str(list_oi), '\n'))) - # write beginning of speech to second line - # (create new ET element if necessary) - if len(list(XML_new[ind_p][ind_t])) > 1: - XML_new[ind_p][ind_t][1].text = startspeech + ' ' + XML_new[ind_p][ind_t][1].text - else: - XML_new[ind_p][ind_t].append(copy.deepcopy(XML_new[ind_p][ind_t][0])) - XML_new[ind_p][ind_t][1].attrib.pop('speaker') - XML_new[ind_p][ind_t][1].text = startspeech - except: - print('error in self.input_file when splitting speaker') - #print(thattext) - 
#print(len(list(XML_new[ind_p][ind_t]))) - #print(list(XML_new[ind_p][ind_t])) - #print(XML_new[ind_p][ind_t]) - #print('gefundener Name:', str_name, str_role) - pass + # get rid of 'Präsident stimmt nicht Président ne vote pas' + if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name: + if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi): + if bln_print: + print('get rid of Präsident stimmt nicht, Président ne vote pas', list_oi) + str_role = '' - # if colon is on second line - if ind_tl_colon == 1: - # get text of that line and colon index - thattext = XML_new[ind_p][ind_t][1].text - colon_index = thattext.index(':') - - # get start of speech with correct font start - if thattext[colon_index+1:].startswith('[font'): - startspeech = thattext[colon_index+1:] - elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]): - startspeech = '' - elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]): - startspeech = thattext[colon_index+8:] - else: - startspeech = thattext[colon_index+1:] + # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":' + if str_name: + if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1: + if bln_print: + print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi) + str_name = '' + + # if a name has been found, add it to XML_new + if str_name or str_role: + # add attribute speech_start to textbox + XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start' + + # add speaker as attribute to first textline + XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, list_uniqueID, str_canton) + + # update text of XML (speaker is on first line, actual speech start on second line of speech_start textbox) + # if colon is on first line + if ind_tl_colon == 0: + # get text of that line and colon index + thattext = XML_new[ind_p][ind_t][0].text + colon_index = 
thattext.index(':') + + try: + # write speaker to first line + XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend + + # get start of speech with correct font start + if thattext[colon_index+1:].startswith('[font'): + startspeech = thattext[colon_index+1:] + elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]): + startspeech = '' + elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]): + startspeech = thattext[colon_index+8:] + else: + startspeech = thattext[colon_index+1:] + + # write beginning of speech to second line + # (create new ET element if necessary) + if len(list(XML_new[ind_p][ind_t])) > 1: + XML_new[ind_p][ind_t][1].text = startspeech + ' ' + XML_new[ind_p][ind_t][1].text + else: + XML_new[ind_p][ind_t].append(copy.deepcopy(XML_new[ind_p][ind_t][0])) + XML_new[ind_p][ind_t][1].attrib.pop('speaker') + XML_new[ind_p][ind_t][1].text = startspeech + except: + print('error in self.input_file when splitting speaker') + #print(thattext) + #print(len(list(XML_new[ind_p][ind_t]))) + #print(list(XML_new[ind_p][ind_t])) + #print(XML_new[ind_p][ind_t]) + #print('gefundener Name:', str_name, str_role) + pass + + # if colon is on second line + if ind_tl_colon == 1: + # get text of that line and colon index + thattext = XML_new[ind_p][ind_t][1].text + colon_index = thattext.index(':') + + # get start of speech with correct font start + if thattext[colon_index+1:].startswith('[font'): + startspeech = thattext[colon_index+1:] + elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]): + startspeech = '' + elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]): + startspeech = thattext[colon_index+8:] + else: + startspeech = thattext[colon_index+1:] - # write speaker to first line - XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1] + fontend - # write beginning of speech to second line - XML_new[ind_p][ind_t][1].text = startspeech + # write speaker to first line + XML_new[ind_p][ind_t][0].text = 
XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1] + fontend + # write beginning of speech to second line + XML_new[ind_p][ind_t][1].text = startspeech - # set flag - this_is_speech = True - if bln_print: - print('found a name:', text_start, list_oi, str_name, str_role, '\n') - print('found a name:', text_start, list_oi, ind_tl_colon, str_name, str_role, list_uniqueID, '\n') + # set flag + this_is_speech = True + if bln_print: + print('found a name:', text_start, list_oi, str_name, str_role, '\n') + print('found a name:', text_start, list_oi, ind_tl_colon, str_name, str_role, list_uniqueID, '\n') return XML_new, this_is_speech @@ -687,7 +693,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name.split(' ')[0]) & (df_names['CouncilName']==str_council)].iloc[0] list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type) - if str_completeName.split(' ')[0] == str_name.split(' ')[0]: + if str_completeName.split(' ')[0] == str_name.split(' ')[0] or str_completeName.split(' ')[1] == str_name.split(' ')[0]: str_name = add_to_string('', str_completeName) else: str_name = add_to_string(str_name, str_completeName)