From 6d499ef5d5d04ade014e42afcd17afe6179d2bf9 Mon Sep 17 00:00:00 2001
From: Lilian Gasser <gasserli@ethz.ch>
Date: Mon, 4 Feb 2019 18:17:42 +0100
Subject: [PATCH] create output file with speakers

---
 data/lists/wrongly_identified_speakers.txt | 25 ++++++---
 src/python/def_classes.py                  | 16 ++++--
 src/python/run_extract_discussions.py      | 39 +------------
 src/python/utils_annot.py                  | 64 ++++++++++++----------
 4 files changed, 64 insertions(+), 80 deletions(-)

diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt
index 038c497e..972c36c5 100644
--- a/data/lists/wrongly_identified_speakers.txt
+++ b/data/lists/wrongly_identified_speakers.txt
@@ -40,16 +40,10 @@ speaker not uniquely identified when he spoke the second time:
 1940/20033001: Keller (CANTON MISSING) reporter majority [2868, 2871, 2890] 1940-03-27 00:00 16 ['Keller', 'Berichterstatter', 'Mehrheit']
 
-identifier is split into two words
+identifier is split into two words --> partly solved by adding more additional info columns to csv
 ----------------------------------
-1925/20029945, 1951/20035173: found a name: Schmid-Oberentf elden ['Schmid', 'Oberentf', 'elden'] 0 Schmid (CANTON MISSING) [4639, 4660]
 1971/20000498: ['M', 'Muf', 'ny', 'rapporteur', 'de', 'la', 'majorité'] 7 --> finds Muff but is Mugny
-1951/20034978,79,94: found a name: Bringolf- Schaff hausen ['Bringolf', 'Schaff', 'hausen'] 0 Bringolf (CANTON MISSING) [707, 706] --> solved by adding Schaff as additional Info
-1941/ : Müller Aarb erg
-1956/20036201: Berger (CANTON MISSING) reporter [368, 373, 375] 1956-12-10 00:00 1 ['Berger', 'Neuch', 'à tei', 'rapporteur']
 1936/20031984: Keller (CANTON MISSING) reporter majority [2868, 2871, 2890] 1936-01-08 00:00 10 ['Keller', 'Aar', 'Berichterstatter', 'Mehrheit'] Aarau split in Aar au
-1936/20031998: Stähli (CANTON MISSING) [4967, 4964] 1936-01-17 00:00 13 ['Stähli', 'Sieb', 'nen']
-1936/20032015: Müller (CANTON MISSING) [3638, 3645, 3652, 3654, 3658, 3659, 3662] 1936-01-30 00:00 6 ['Müller', 'Grosshöchste', 'tten']
 
 identified as speech start but is in text:
 --> some of these might be solved by only looking at list_oi with less than 9 elements
@@ -78,9 +72,9 @@ list of people in a minority are recognized as speech starts:
 
 misspelled role:
 ----------------
-1936/20031986: Meyer (CANTON MISSING) [3482, 3483, 3488, 3490] 1936-01-09 00:00 14 ['Rundespräsident', 'Meyer']
+1936/20031986: Meyer (CANTON MISSING) [3482, 3483, 3488, 3490] 1936-01-09 00:00 14 ['Rundespräsident', 'Meyer'] ---> solved
 1936/20031992: Meyer (CANTON MISSING) [3482, 3483, 3488, 3490] 1936-01-15 00:00 10 ['ßundespräsident', 'Meyer']
-1932/20031299: Häberlin (CANTON MISSING) [2290, 2287] 1932-09-21 00:00 6 ['Bimdesrat', 'Häberlin']
+1932/20031299: Häberlin (CANTON MISSING) [2290, 2287] 1932-09-21 00:00 6 ['Bimdesrat', 'Häberlin'] --> solved
 
 weird layout:
@@ -96,6 +90,8 @@ bad OCR:
 1952/20035242, and some others: reporter [] 1952-03-25 00:00 9 ['Spanier', 'Berichterstatter'] --> Spühler not found
 1948/20034315: reporter [] 1948-09-23 00:00 5 ['Statili', 'Berichterstatter'] --> Stähli not found
 1936/20032015: reporter majority [] 1936-01-30 00:00 2 ['Statili', 'Berichterstatter', 'Mehrheit'] --> Stähli not found
+1936/20032189: page 4 ['Bundesrat', 'Bautltann'] 1936-12-17 00:00 --> Baumann not found
+1936/20031985: page 1 ['DollîUS', 'rapporteur'] 1936-01-09 00:00 --> Dollfus not found
 
 not sure about place:
@@ -193,3 +189,14 @@ solved:
 wrong entries in xlsx:
 ----------------------
 1931/20030940,49: Scherer (CANTON MISSING) [4560, 4565] 1931-03-18 00:00 18 ['Scherer'] --> there are two entries for one person
 1971/20000055: Debétaz, was not there as a NR
+
+
+
+solved: identifier is split into two words
+----------------------------------
+1925/20029945, 1951/20035173: found a name: Schmid-Oberentf elden ['Schmid', 'Oberentf', 'elden'] 0 Schmid (CANTON MISSING) [4639, 4660]
+1951/20034978,79,94: found a name: Bringolf- Schaff hausen ['Bringolf', 'Schaff', 'hausen'] 0 Bringolf (CANTON MISSING) [707, 706] --> solved by adding Schaff as additional Info
+1941/ : Müller Aarb erg
+1956/20036201: Berger (CANTON MISSING) reporter [368, 373, 375] 1956-12-10 00:00 1 ['Berger', 'Neuch', 'à tei', 'rapporteur']
+1936/20031998: Stähli (CANTON MISSING) [4967, 4964] 1936-01-17 00:00 13 ['Stähli', 'Sieb', 'nen']
+1936/20032015: Müller (CANTON MISSING) [3638, 3645, 3652, 3654, 3658, 3659, 3662] 1936-01-30 00:00 6 ['Müller', 'Grosshöchste', 'tten']
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 6ee0909d..041d1b66 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -761,12 +761,20 @@ class Document:
         #pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml,
         #name_outcorrxml = self.name_outcorrxml)
 
-        with open('data/lists/notunique.txt', 'a') as f:
-            f.write(' '.join(('\n\n-------', str(self.year), self.id_doc, '\n')))
-
         print('we have a main corr XML file')
+
+        # get council and date
+        (str_council, str_date) = self.get_council_date()
+        self.str_council = str_council
+        self.str_date = str_date
+
+        # file to track speakers
+        self.name_speakers = '_'.join((str(self.year), self.id_doc, 'speakers.txt'))
+        with open('data/lists/speakers/' + self.name_speakers, 'w') as f:
+            f.write(' '.join((str(self.year), self.id_doc, str_date, '\n')))
+
         #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
-        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, bln_print=False)
+        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, self.name_speakers, bln_print=False)
         self.XML_main_annot = XML_main_annot
 
         # save xml file
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 88020f47..512cc94b 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -26,7 +26,7 @@ from utils_proc import call_with_out
 # specify input and output files
 
 # needed for running in atom, can be ignored
-year = '1956'
+year = '1936'
 input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
@@ -98,10 +98,6 @@ for file_tarpath in files_to_process:
         print(id_doc + '\n')
         file_doc.df_lastnames = df_lastnames
         file_doc.list_notnames = list_notnames
-        # TODO: add this to next deeper level
-        (str_council, str_date) = file_doc.get_council_date()
-        file_doc.str_council = str_council
-        file_doc.str_date = str_date
         file_doc.annotate_xml()
 
 # Commands to get the compressegid version of the file
@@ -119,7 +115,7 @@ with open(input_notnames) as f:
     list_notnames = [term.rstrip() for term in list_notnames]
 
 # to test for one file
-file_tarpath = './1956/20036021_datacorr.xml'
+file_tarpath = './1936/20031986_datacorr.xml'
 
 id_doc = file_tarpath.split('/')[-1][:8]
 
@@ -132,10 +128,6 @@ if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20
     file_doc.df_lastnames = df_lastnames
     file_doc.list_notnames = list_notnames
-    # TODO: add this to next deeper level
-    (str_council, str_date) = file_doc.get_council_date()
-    file_doc.str_council = str_council
-    file_doc.str_date = str_date
     file_doc.annotate_xml()
 
 
@@ -151,30 +143,3 @@ file_doc.check_discussion()
 str_date = '1925-12-09 08:00'
 import datetime
 datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')
-
-
-
-listilist = ['a', 'b', 'c', 'd']
-'a' in listilist
-listilist[0,2]
-# OPTIMIZE
-
-list_1 = [1, 2,3, 4, 5]
-list_2 = [2,7,8]
-len(set(list_1).intersection(list_2))
-
-
-if 'ab' in 'abc':
-    print('yay')
-
-a = 10
-if a < 7 or a > 9:
-    print(a)
-
-a = 'asdf'
-b = 'asdf'
-a == b == 'asdf'
-
-'a' in a
-
-'as' in a
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 63622c1b..8e8f554f 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -19,11 +19,11 @@ import datetime
 
 # function to check whether a file containts discussions
 # achieved by excluding title pages, table of content, etc.
-# !!! function works well for 1891 - 1900, not checked after that !!!
+# TODO: function works well for 1891 - 1900, not checked after that !!!
 def check_if_discussion(path_meta_xml_file,
                         list_attributes = ['TITEL_NORMAL_DE', 'TITEL_NORMAL_FR', 'TITEL_ORIGINAL_DE', 'TITEL_ORIGINAL_FR'],
                         list_nondiscussion = ['inhaltsverzeiGGchnis', 'inhaltsverzeichniss', 'jahresinhalt', 'rednerliste',
-                                              'umschlag', 'sachregister', 'titelblatt', 'numerierung'],
+                                              'jahres-rednerliste', 'umschlag', 'sachregister', 'titelblatt', 'numerierung'],
                         list_nondiscussion2 = ['table', 'matières', 'répertoire', 'procès-verbaux']):
 
     # parse, get root and then part of interest
@@ -84,10 +84,9 @@ def get_text(sometext):
 
 # function to annotated corrected XML
-def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, bln_print=False):
+def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, str_file_speakers, bln_print=False):
 
     # list of votation terms
-    # TODO: make it work for é, etc.
     list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt', 'Einverstanden',
                           'Stimmen', '(Eintreten)', '(Nichteintreten)', 'Votation', 'Vote', 'votation', '(Adoptés)', 'adoptés', 'adoptée', 'rejetée',
@@ -136,15 +135,13 @@ def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_da
             if textbox_texttype in ['text_col1', 'text_col2']:
 
-                XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, str_council, str_date, bln_print=False)
+                XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, str_council, str_date, str_file_speakers, bln_print=False)
                 if this_is_speech:
                     prev_is_speech = True
-#                    print('stopped after finding speech start')
                     continue
-                XML_new, this_is_vote = label_votations(XML_new, ind_p, ind_t, complete_text, list_votationterms, bln_print=False)
+                XML_new, this_is_vote = label_votations(XML_new, ind_p, ind_t, complete_text, list_votationterms, str_file_speakers, bln_print=False)
                 if this_is_vote:
                     prev_is_speech = False
-#                    print('stopped after finding vote')
                     continue
                 if prev_is_speech and (not this_is_vote):
                     XML_new = label_speechcont(XML_new, ind_p, ind_t)
 
@@ -237,7 +234,7 @@ def get_complete_text(textbox):
 # - bln_print: whether to print during execution, default False
 # output:
 # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
-def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, str_council, str_date, bln_print=False):
+def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, str_council, str_date, str_file_speakers, bln_print=False):
 
     # lists of roles
     list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
@@ -264,6 +261,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
         # look at first few terms of that textbox
         text_start = re.sub(r'[\(\)]','',text[:colon_index_text])
         list_oi = tokenizer.tokenize(text_start)
+        list_oi_full = list_oi
 
         if bln_print:
             print('possible speech start: ', list_oi)
@@ -277,10 +275,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
         if (len(list_oi) < 9):
             if (len(list_oi) < 5) or (len(set(list_oi).intersection(list_roles)) > 0):
-                with open('data/lists/notunique.txt', 'a') as f:
-                    f.write(' '.join((str(list_oi), str(len(list_oi)), '\n')))
-                flag_print = True
-
                 # remove stopwords
                 list_oi = [term for term in list_oi if term.lower() not in list_stopwords]
 
@@ -300,10 +294,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                 # TODO: maybe exclude I and A to account for Appenzell
                 list_oi = [term for term in list_oi if len(term)>1]
 
-                if len(list_oi) > 4 or flag_print:
-                    with open('data/lists/notunique.txt', 'a') as f:
-                        f.write(' '.join((str(list_oi), str(len(list_oi)), '\n')))
-
                 # if possible, find a name from the list
                 str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str_council, str_date, bln_print=False)
 
@@ -313,10 +303,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                 if str_role == 'federalcouncil' and str_name == '':
                     str_role = ''
 
-                if len(list_uniqueID) > 1 or flag_print:
-                    with open('data/lists/notunique.txt', 'a') as f:
-                        f.write(' '.join((str_name, str_role, str(list_uniqueID), str_date, str(ind_p), str(list_oi), '\n')))
-
                 # get rid of 'Präsident stimmt nicht Président ne vote pas'
                 if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name:
                     if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi):
@@ -331,6 +317,13 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                         print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi)
                         str_name = ''
 
+                with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+                    f.write(' '.join(('page', str(ind_p + 1), str(list_oi), '\n')))
+                    f.write(' '.join(('name:', str_name, '\n')))
+                    f.write(' '.join(('role:', str_role, '\n')))
+                    f.write(' '.join(('uniqueID(s):', str(list_uniqueID), '\n')))
+                    f.write(' '.join(('text:', text[colon_index_text+1:colon_index_text+100], '\n\n')))
+
                 # if a name has been found, add it to XML_new
                 if str_name or str_role:
                     # add attribute speech_start to textbox
@@ -416,7 +409,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
 # - bln_print: whether to print during execution, default False
 # output:
 # - XML_new: updated
-def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, bln_print=True):
+def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, str_file_speakers, bln_print=True):
 
     # get first terms of that text
     list_oi = tokenizer.tokenize(text)[:15]
@@ -428,6 +421,11 @@ def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, bln_print=T
 
         # set flag
         this_is_vote = True
+
+        with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+            f.write(' '.join(('page', str(ind_p + 1), text, '\n')))
+            f.write(' '.join(('is a vote', '\n\n')))
+
         if bln_print:
             print('found a vote:', list_oi)
     else:
@@ -584,19 +582,26 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
 
     # for every term
     for term in list_oi:
-        if term in list_roles:
+        term_approx_role = get_approximate_term(term, np.array(list_roles))
+
+        if term in list_roles or term_approx_role:
             # update str_role
             # TODO: also look for similar terms (misspellings)
             # TODO: what with Bundespräsident?
             # TODO: is Berichterstatter the same as Sprecher?
+            if term_approx_role:
+                term_ = term_approx_role
+            else:
+                term_ = term
+
             # assign role in English
-            if term in ['Präsident', 'Präsidentin', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente']:
+            if term_ in ['Präsident', 'Präsidentin', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente']:
                 str_assignedRole = 'president'
-            elif term in ['Vizepräsident']:
+            elif term_ in ['Vizepräsident']:
                 str_assignedRole = 'vice-president'
-            elif term in ['Berichterstatter', 'Berichterstatterin', 'rapporteur', 'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole']:
+            elif term_ in ['Berichterstatter', 'Berichterstatterin', 'rapporteur', 'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole']:
                 str_assignedRole = 'reporter'
-            elif term in ['Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller', 'fédéral', 'Bundespräsident', 'Bundespräsidentin']:
+            elif term_ in ['Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller', 'fédéral', 'Bundespräsident', 'Bundespräsidentin']:
                 str_assignedRole = 'federalcouncil'
                 str_council = 'Bundesrat'   # needs to be German to be used in dataframe
 
@@ -693,6 +698,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
             else:
                 print('could not be identified as a canton:', term, list_oi, str_name, str_role)
 
+        # if term is first name
+        # needed when people are referenced by FirstName LastName, e.g. Simon Kohler
         elif term in list_all_firstnames:
             str_firstname = term
             print('found a first name', str_firstname)
@@ -709,9 +716,6 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
             # get correct name and uniqueID, or role, for that term
             str_name, list_uniqueID, name_type = get_string(term, df_names, str_name, list_uniqueID)
 
-            if bln_print:
-                print('=== correct name', term)
-
         # if term is not in list_all_names
         else:
            # look for similar names based on (normalized) Damerau-Levenshtein distance
-- 
GitLab
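
Note (not part of the patch): the per-document speaker log written above follows a simple plain-text convention. annotate_xml() creates data/lists/speakers/<year>_<id_doc>_speakers.txt with a header line, and label_speechstart() / label_votations() append one block per detected textbox. The sketch below restates that convention as two standalone helpers; the function names create_speaker_log and append_speaker_entry, and the folder argument, are illustrative only and do not exist in the repository.

import os

def create_speaker_log(folder, year, id_doc, str_date):
    # mirrors annotate_xml(): '<year>_<id_doc>_speakers.txt', opened in 'w' mode with a header line
    name_speakers = '_'.join((str(year), id_doc, 'speakers.txt'))
    with open(os.path.join(folder, name_speakers), 'w') as f:
        f.write(' '.join((str(year), id_doc, str_date, '\n')))
    return name_speakers

def append_speaker_entry(folder, name_speakers, ind_p, list_oi, str_name, str_role, list_uniqueID, text_after_colon):
    # mirrors the block label_speechstart() appends for every detected speech start
    with open(os.path.join(folder, name_speakers), 'a') as f:
        f.write(' '.join(('page', str(ind_p + 1), str(list_oi), '\n')))
        f.write(' '.join(('name:', str_name, '\n')))
        f.write(' '.join(('role:', str_role, '\n')))
        f.write(' '.join(('uniqueID(s):', str(list_uniqueID), '\n')))
        f.write(' '.join(('text:', text_after_colon[:100], '\n\n')))

if __name__ == '__main__':
    # toy example with values taken from the notes file above
    name = create_speaker_log('.', 1936, '20031986', '1936-01-09 00:00')
    append_speaker_entry('.', name, 0, ['Meyer'], 'Meyer', 'president', [3482], 'Wir fahren in der Beratung weiter.')

Appending per-document files under data/lists/speakers/ keeps the output reproducible per run ('w' mode truncates the header file), unlike the single notunique.txt that the patch removes, which only ever grew across runs.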
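Note (not part of the patch): find_names() now calls get_approximate_term(term, np.array(list_roles)), but that helper is defined elsewhere in utils_annot.py and is not shown in this diff. As a rough illustration of the intended behaviour — return the canonical role for a close OCR misspelling, or an empty string so the `or` test stays falsy — here is a stand-in built on difflib from the standard library. The surrounding code mentions a (normalized) Damerau-Levenshtein distance for similar names, so the real helper presumably works along those lines; the name approximate_role_sketch and the 0.75 cutoff are assumptions made only for this example.

import difflib

def approximate_role_sketch(term, list_known, cutoff=0.75):
    # return the closest known term if it is similar enough, otherwise '' (falsy)
    matches = difflib.get_close_matches(term, list_known, n=1, cutoff=cutoff)
    return matches[0] if matches else ''

# OCR errors taken from wrongly_identified_speakers.txt
list_roles_demo = ['Präsident', 'Bundespräsident', 'Bundesrat', 'Berichterstatter', 'rapporteur']
print(approximate_role_sketch('Rundespräsident', list_roles_demo))  # -> Bundespräsident
print(approximate_role_sketch('Bimdesrat', list_roles_demo))        # -> Bundesrat
print(approximate_role_sketch('Keller', list_roles_demo))           # -> '' (a surname, not a role)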