diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt index fb0fb51b4accb9b421ae4ed7dcb4fe31e1d64688..b2bd1b7706cfbab542b97012fb2727aaa9b3f2b8 100644 --- a/data/lists/wrongly_identified_speakers.txt +++ b/data/lists/wrongly_identified_speakers.txt @@ -1,21 +1,24 @@ +speaker not identifiable: +------------------------- +1891/20026455: Dufour (CANTON MISSING) [1420, 1421] 1891-06-22 15:00 7 twice in same document +1891/20026465: Zweifel one time not identified --> is it a different one (not the Landammann) or was he already mentioned before? +1925/20029836,37,87: Seiler (CANTON MISSING) Berichterstatter [4810, 4815] 1925-03-28 00:00 9 +1925/20029943: Welti (CANTON MISSING) [5655, 5656] 1925-09-29 00:00 6 -also check for council: ------------------------ -1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified but there is only one SR Baumann --> solved! -1925/20029937: Schneider, Berichterstatter --> NR, not SR --> solved! +speaker not uniquely identified when he spoke the second time: +-------------------------------------------------------------- +1925/20029924: Keller-Aargau Berichterstatter (first time), Keller Berichterstatter (after) + 1925/20029928,29: Keller Berichterstatter (also first time), maybe check title of document... -one MP not active in whole year, leads to other not uniquely identified ------------------------------------------------------------------------ -1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!) --> solved! -1925/20029967: Seiler (in December, the second Seiler already left) --> finds two!) --> solved! -1925/20029967: Huber (in December, the second Huber already left) --> finds two!) --> solved because only NR! -1925/20029882: Naine (in June, there is only one Naine, in December, another one joins --> finds two!) also in ...96, ...97, etc. --> solved! 
+identifier is split into two words +---------------------------------- +1925/20029945: found a name: Schmid-Oberentf elden ['Schmid', 'Oberentf', 'elden'] 0 Schmid (CANTON MISSING) [4639, 4660] identified as speech start but is in text: ------------------------------------------ -do I really need to look on the first two lines? maybe one is sufficient? +do I really need to look on the first two lines? maybe one is sufficient? --> no, it needs two lines look for typical terms such as gestellt, gesagt, etc. 1891/20026455: Ein weiterer Antrag ist derjenige des Hrn. Dufour. Herr Dufour hat den Antrag gestellt: @@ -24,9 +27,10 @@ look for typical terms such as gestellt, gesagt, etc. 1894/20026607: Müller gegenüber drei anderen durchgedrungen, welche lautete: 1925/20029903: Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt: 1925/20029863: ganz gut gehen. Es ist ja wahr, was Herr Brügger gesagt hat: --> finds Lanz and Brügger +1925/20029891: J'en viens enfin à M. Belmont. M. Belmont a posé cette question --> finds Belmont twice 1925/20029917: Mögen Sie nun aber denken wie Herr Oberst Brügger oder mögen Sie denken wie ich: --> identified as speech start but is in text 1925/20029917: Herr Hauser sagt: -1925/20029978: Das ist, was Herr Charles Naine gesagt hat. Herr Hugglersagt: +1925/20029978: Das ist, was Herr Charles Naine gesagt hat. Herr Hugglersagt: and a second time in the same document with Naine 1925/20029981: Brügger möchte ich sagen: 1971/20000663: de MM. Knüsel et Leu (there must be more speech starts, this is from a list of cantons and people inside a speech, !!! Layout) 1971/20000007: La seconde réaction qu'a suscité chez moi l'intervention de M. Weber est le doute: @@ -37,18 +41,6 @@ look for typical terms such as gestellt, gesagt, etc. 
1971/20000024: Herr Kollege Heimann stellt sich schliesslich gegen einen Finanzausgleich mit dem Hinweis -wrongly spelled city --------------------- -1925/20029963: Jenny Ennend (instead of Ennenda) -1925/20029995,96: Keller Zurich (instead of Zürich) -1971/? : Berne instead of Bern --> solved with using get_approximate_term for cantons - - -doubled double names: ---------------------- -1971/20000010: Meyer-Boller --> solved! - - term very similar to one name is actually another name ------------------------------------------------------ 1925/20029863: ganz --> finds Lanz, there is a Ganz @@ -57,6 +49,7 @@ term very similar to one name is actually another name term is a name -------------- +1891/20026489: Um jeden Zweifel zu heben, beantrage ich Ihnen folgende Redaktion des Art. 2 --> finds Zweifel 1971/20000010: Ganz wenige Einzelfragen --> finds Ganz 1971/20000024: Politisch gesehen ist es doch ganz einfach so --> finds Ganz @@ -72,7 +65,33 @@ Appenzeller 1894/20026618: Sonderegger - some other persons wrongly identified as MP ------------------------------------------- 1925/20029833: Sauser-Hall (not a MP)--> Hauser + + + +solved: also check for council: +----------------------- +1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified but there is only one SR Baumann --> solved! +1925/20029937: Schneider, Berichterstatter --> NR, not SR --> solved! + + +solved: one MP not active in whole year, leads to other not uniquely identified +----------------------------------------------------------------------- +1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau, in June, another one joins --> finds two!) --> solved! +1925/20029967: Seiler (in December, the second Seiler already left) --> finds two!) --> solved! +1925/20029967: Huber (in December, the second Huber already left) --> finds two!) --> solved because only NR! 
+1925/20029882: Naine (in June, there is only one Naine, in December, another one joins --> finds two!) also in ...96, ...97, etc. --> solved! + + +solved: doubled double names: +--------------------- +1971/20000010: Meyer-Boller --> solved! + + +solved: wrongly spelled city +-------------------- +1925/20029963: Jenny Ennend (instead of Ennenda) --> solved! +1925/20029995,96: Keller Zurich (instead of Zürich) --> solved! +1971/? : Berne instead of Bern --> solved with using get_approximate_term for cantons diff --git a/src/python/def_classes.py b/src/python/def_classes.py index b572503a90d9df3b4d93ca89809386f790f951c1..03355186e36e6c17bca2cb0cfa94da83253d56e3 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -761,6 +761,8 @@ class Document: #pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml, #name_outcorrxml = self.name_outcorrxml) + with open('data/lists/notunique.txt', 'a') as f: + f.write(' '.join((str(self.year), self.id_doc, '\n'))) print('we have a main corr XML file') #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml) diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 23747f0fbb6a806184581477f67ae316b98665c7..0bb90c94bdaa6425eb85826f40cccdf34d336a9f 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -116,7 +116,7 @@ utils_proc.compress_tar(output_annotatedxml) #%% # to test for one file -file_tarpath = './1925/20029967_datacorr.xml' +file_tarpath = './1925/20029981_datacorr.xml' id_doc = file_tarpath.split('/')[-1][:8] diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index 0c19077eedca544c6b616c4cf2c01612dfcefc0e..525eeb8288f2d24b8f9a2e0c6a4c4f9114d13faf 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -263,6 +263,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ # look at first few terms of 
that textbox text_start = re.sub(r'[\(\)]','',text[:colon_index_text]) list_oi = tokenizer.tokenize(text_start) + + if len(list_oi) > 5: + with open('data/lists/notunique.txt', 'a') as f: + f.write(' '.join((str(list_oi), '\n'))) + if bln_print: print('possible speech start: ', list_oi) @@ -289,6 +294,10 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ if bln_print: print('name', str_name, 'role', str_role) + if len(list_uniqueID) > 1: + with open('data/lists/notunique.txt', 'a') as f: + f.write(' '.join((str_name, str_role, str(list_uniqueID), str_date, str(ind_p), '\n'))) + # get rid of 'Präsident stimmt nicht Président ne vote pas' if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name: if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi): @@ -570,38 +579,27 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str # cannot happen for the first term in list_oi elif name_type == 'canton': - list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo = get_list_cantons(df_names, str_name.split(' ')[0], str_council) + list_cantons = get_list_cantons(df_names, str_name.split(' ')[0], str_council) canton_type = '' - if term in list_cantonname: - str_canton = term - canton_type = 'CantonName' - print('!!! is a canton', term, list_oi, str_name, str_role) - elif term in list_cantonabbr: - str_canton = term - canton_type = 'CantonAbbr' - print('!!! is a canton', term, list_oi, str_name, str_role) - elif term in list_citizenship: - str_canton = term - canton_type = 'Citizenship' - print('!!! is a canton', term, list_oi, str_name, str_role) - elif term in list_firstname: - str_canton = term - canton_type = 'FirstName' - print('!!! is a canton', term, list_oi, str_name, str_role) - elif term in list_additionalInfo: - str_canton = term - canton_type = 'additionalInfo' - print('!!! 
is a canton', term, list_oi, str_name, str_role) - - else: + for list_, type_ in list_cantons: + if term in list_: + str_canton = term + canton_type = type_ + print('!!! is a canton', term, list_oi, str_name, str_role) + break + + # if the term matched none of the lists exactly, check for misspellings + if not canton_type: # look for similar names based on (normalized) Damerau-Levenshtein distance - # TODO: might needs to be extended for other than cantonname - term_approx = get_approximate_term(term, np.array(list_cantonname)) - if term_approx: - str_canton = term_approx - canton_type = 'CantonName' - - print('might be a canton:', term, list_oi, str_name, str_role, term_approx) + # only look at cantonname, citizenship and additionalinfo + list_cantons_approx = [list_cantons[i] for i in (0, 2, 4)] + for list_, type_ in list_cantons_approx: + term_approx = get_approximate_term(term, np.array(list_)) + if term_approx: + str_canton = term_approx + canton_type = type_ + print('!!! is a canton', term, list_oi, str_name, str_role) + break # if a canton or similar was found if canton_type: @@ -638,6 +636,9 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str else: str_name = add_to_string(str_name, str_completeName) + else: + print('could not be identified as a canton:', term, list_oi, str_name, str_role) + # if term is not easily mistaken as a name (avoid false positives) elif term not in list_notnames: @@ -681,7 +682,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str else: # check if person can be identified from date of discussion # exclude people that joined after date of discussion - df_temp_before = df_temp[pd.to_datetime(df_temp['DateJoining']) <= datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')] + df_temp_before = df_temp[pd.to_datetime(df_temp['DateJoining'], format='%d.%m.%Y') <= datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')] if df_temp_before.shape[0] == 1: list_temp = 
list(df_temp_before['uniqueIndex']) str_completeName = df_temp_before['completeName'].iloc[0] @@ -693,7 +694,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str str_name = add_to_string(str_name, str_completeName) # exclude people that left before date of discussion - df_temp_after = df_temp[pd.to_datetime(df_temp['DateLeaving']) >= datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')] + df_temp_after = df_temp[pd.to_datetime(df_temp['DateLeaving'], format='%d.%m.%Y') >= datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')] if df_temp_after.shape[0] == 1: list_temp = list(df_temp_after['uniqueIndex']) str_completeName = df_temp_after['completeName'].iloc[0] @@ -797,25 +798,42 @@ def get_df_temp_canton(df_names, str_name, str_council): # function to get list of places def get_list_cantons(df_names, str_name, str_council = ''): + # specify strings as they are used in Ratsmitglieder_1848_DE_corr.xlsx and therefore in df_names + str_CantonName = 'CantonName' + str_CantonAbbreviation = 'CantonAbbreviation' + str_Citizenship = 'Citizenship' + str_FirstName = 'FirstName' + str_additionalInfo = 'additionalInfo' + + # get dataframe df_temp = get_df_temp_canton(df_names, str_name, str_council) # list of cantons - list_cantonname = list(df_temp['CantonName']) + list_cantonname = list(df_temp[str_CantonName]) # list of canton abbreviations - list_cantonabbr = list(df_temp['CantonAbbreviation']) + list_cantonabbr = list(df_temp[str_CantonAbbreviation]) # list of citizenships - list_citizenship = list(df_temp['Citizenship']) + list_citizenship = list(df_temp[str_Citizenship]) list_citizenship = get_cities(list_citizenship) # list of first names - list_firstname = list(df_temp['FirstName']) + list_firstname = list(df_temp[str_FirstName]) # list of additional information - list_additionalInfo = list(df_temp['additionalInfo']) - - return list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo + 
list_additionalInfo = list(df_temp[str_additionalInfo]) + + # pair each list with the name of the df_temp column it was built from + list_cantons = [(list_cantonname, str_CantonName), + (list_cantonabbr, str_CantonAbbreviation), + (list_citizenship, str_Citizenship), + (list_firstname, str_FirstName), + (list_additionalInfo, str_additionalInfo), + ] + +# return list_cantonname, list_cantonabbr, list_citizenship, list_firstname, list_additionalInfo + return list_cantons