diff --git a/data/lists/not_names.txt b/data/lists/not_names.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8205b48ecd010c778db9a790dc49ace82d0d1c40
--- /dev/null
+++ b/data/lists/not_names.txt
@@ -0,0 +1,66 @@
+Alinea
+Alter
+Ari
+Art
+besser
+bietet
+drehen
+Fällen
+fasse
+Ferner
+ferner
+findet
+Gallen
+Gründe
+hausen
+Herren
+Herr
+immer
+Kasse
+Kollege
+Kollega
+komme
+Leider
+lieber
+nehmen
+neu
+nicht
+Rath
+Schrit
+Seite
+selber
+Sinne
+später
+Steuer
+StGallen
+Stimmen
+Stimme
+stimmt
+tischen
+Tunnel
+Ueber
+Hans
+Walter
+Werner
+weiterer
+Wer
+wissen
+Ziel
+autre
+Biffer
+biffer
+cerner
+comme
+force
+cause
+dernier
+ouvert
+peu
+pilier
+poser
+projet
+Rédiger
+rédiger
+tirer
+vote
+delle
diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt
new file mode 100644
index 0000000000000000000000000000000000000000..38f3fd03a1853c6f6872ae916bbd851380ede921
--- /dev/null
+++ b/data/lists/wrongly_identified_speakers.txt
@@ -0,0 +1,78 @@
+
+also check for council:
+-----------------------
+1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified, but there is only one SR Baumann
+1925/20029937: Schneider, Berichterstatter --> NR, not SR
+
+
+one MP not active in whole year leads to another not uniquely identified
+-------------------------------------------------------------------------
+1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau; in June, another one joins --> finds two!)
+1925/20029967: Seiler (in December, the second Seiler had already left --> finds two!)
+1925/20029967: Huber (in December, the second Huber had already left --> finds two!)
+1925/20029882: Naine (in June, there is only one Naine; in December, another one joins --> finds two!) also in ...96, ...97, etc.
+
+
+identified as speech start but is in text:
+------------------------------------------
+do I really need to look at the first two lines? maybe one is sufficient?
+look for typical terms such as gestellt, gesagt, etc.
+
+1891/20026455: Ein weiterer Antrag ist derjenige des Hrn. Dufour. Herr Dufour hat den Antrag gestellt:
+1891/20026465: Wir haben nun aber, dem Gedankengang des Hrn. Bühler folgend, die Erklärung gewählt:
+1891/20026489: Um jeden Zweifel zu heben, beantrage ich Ihnen folgende Redaktion des Art. 2 :
+1894/20026607: Müller gegenüber drei anderen durchgedrungen, welche lautete:
+1925/20029903: Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt:
+1925/20029863: ganz gut gehen. Es ist ja wahr, was Herr Brügger gesagt hat: --> finds Lanz and Brügger
+1925/20029917: Mögen Sie nun aber denken wie Herr Oberst Brügger oder mögen Sie denken wie ich: --> identified as speech start but is in text
+1925/20029917: Herr Hauser sagt:
+1925/20029978: Das ist, was Herr Charles Naine gesagt hat. Herr Hugglersagt:
+1925/20029981: Brügger möchte ich sagen:
+1971/20000663: de MM. Knüsel et Leu (there must be more speech starts; this is from a list of cantons and people inside a speech, !!! layout)
+1971/20000007: La seconde réaction qu'a suscité chez moi l'intervention de M. Weber est le doute:
+1971/20000007: Herr Kollega Gut hat es gesagt:
+1971/20000007: Noch eine Antwort an Kollege Clottu
+1971/20000010: Nun noch etwas zu Richard Müller. Erstens
+1971/20000024: Noch ein Wort zu Herrn Ständerat Wenk
+1971/20000024: Herr Kollege Heimann stellt sich schliesslich gegen einen Finanzausgleich mit dem Hinweis
+
+
+wrongly spelled city
+--------------------
+1925/20029963: Jenny Ennend (instead of Ennenda)
+1925/20029995,96: Keller Zurich (instead of Zürich)
+1971/? : Berne instead of Bern
+
+
+doubled double names:
+---------------------
+1971/20000010: Meyer-Boller
+
+
+term very similar to one name is actually another name
+------------------------------------------------------
+1925/20029863: ganz --> finds Lanz, but there is a Ganz
+1971/20000630 and others: Schweizer --> finds Schneider, but there is a Schweizer
+
+
+term is a name
+--------------
+1971/20000010: Ganz wenige Einzelfragen --> finds Ganz
+1971/20000024: Politisch gesehen ist es doch ganz einfach so --> finds Ganz
+
+
+French name with special characters
+-----------------------------------
+1971/20000055: Debétaz
+
+
+Appenzeller
+-----------
+1894/20026597: Sonderegger
+1894/20026618: Sonderegger
+
+
+
+some other persons wrongly identified as MP
+-------------------------------------------
+1925/20029833: Sauser-Hall (not an MP) --> Hauser
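Several of the failure modes above ('ganz' finding Lanz, everyday words like 'Kollege' matching MP names) are exactly what data/lists/not_names.txt is meant to catch: candidate tokens are vetted against it before any fuzzy name matching. A minimal sketch of that vetting step, assuming tokens are taken from the text in front of the colon; the sample line and the short stand-in lists are invented, while the real pipeline loads the full lists from not_names.txt and NLTK's stopwords:

import re

# stand-ins; the pipeline reads these from data/lists/not_names.txt
# and the NLTK stopword lists
list_notnames = ['Kollege', 'Herr', 'nicht', 'immer', 'vote']
list_stopwords = ['der', 'die', 'und', 'le', 'la']

def candidate_terms(line):
    # tokenize roughly like the RegexpTokenizer in utils_annot.py,
    # looking only at the text before the colon
    terms = re.findall(r"\w+(?:-\w+)*", line.split(':')[0])
    # drop digits and single characters, as in label_speechstart
    terms = [t for t in terms if not t.isdigit() and len(t) > 1]
    # drop stopwords and known non-names to avoid false positives
    return [t for t in terms if t not in list_stopwords and t not in list_notnames]

print(candidate_terms('Herr Kollege Meier: Ich stimme nicht zu'))  # -> ['Meier']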
diff --git a/requirements.txt b/requirements.txt
index ae3be3df42108fd3054cad7466f082091af00f57..dec9aa297abfa705fc6e700baca1769ffe3d6f10 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-numpy
+numpy<1.16
 scipy
 pandas
 xlrd
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index cce7282200b409284b6b30a108244026631e6ab1..8e2a480bd129c7676056182cfea23f0029b1a19e 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -107,7 +107,7 @@ class Document:
         else:
             print('Not saving to tar')
             name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz'
-
+
         self.name_xml = [name_tar, name_xml]
         if flag_save:
             h_xml = utils_proc.get_handlerfile(self.name_xml[1], self.folder_database, name_file = name_outxml)
@@ -119,10 +119,10 @@ class Document:
         self.n_pages = np.arange(len(self.XML_main))
         command = 'rm -rf ./' + str(self.year)
         #print(command)
-        utils_proc.call_with_out(command)
-
-    def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None,
-                     ind_page = 0, textb_textl = 1):
+        utils_proc.call_with_out(command)
+
+    def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None,
+                     ind_page = 0, textb_textl = 1):
         # The page refers here to the page of the imgobj, which might not correspond
         # to the one of the xml. For that reason we use n_pages to obtain the index
         # for the xml
@@ -357,12 +357,12 @@ class Document:
         if ind_page > (len(self.XML_main) - 1):
             flag_error = 1
             return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error
-
+
         flag_central = 1
         if self.year > self.limit_year:
             flag_central = 0
-        flag_2col = 1
-
+        flag_2col = 1
+
         XML_root = ET.Element('pages')
         XML_root.append(self.XML_main[ind_abs[0]])
         imarray = np.array(self.imgobj[ind_page])
@@ -380,10 +380,10 @@ class Document:
         XML_enrich = []
 
         if level_proc > 0:
-            coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page,
-                                                                                   flag_2col, flag_central)
-
-        if level_proc > 1:
+            coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page,
+                                                                                   flag_2col, flag_central)
+
+        if level_proc > 1:
             _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page)
 
         if level_proc > 2:
@@ -645,8 +645,8 @@ class Document:
             name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) + '_page' + str(ind_page) + '.' + format_fig)
             fig.savefig(name_fig, format = format_fig, dpi = dpi)
-            plt.close(fig)
-
+            plt.close(fig)
+
     def check_discussion(self):
         utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
         flag_discussion = utils_annot.check_if_discussion(self.name_meta[1])
@@ -748,7 +748,7 @@ class Document:
             print('we have a main corr XML file')
 
             #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
-            XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, bln_print=False)
+            XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, bln_print=False)
             self.XML_main_annot = XML_main_annot
 
             # save xml file
diff --git a/src/python/extractMPs.py b/src/python/extractMPs.py
index cd3504da7a302a2f0560644fa1475c0e311e0e99..01f87b7674b82d4895ce39566b243e5629bf0175 100644
--- a/src/python/extractMPs.py
+++ b/src/python/extractMPs.py
@@ -8,7 +8,7 @@ import sys
 
 input_file = sys.argv[1]          #'./data/politicians/Ratsmitglieder_1848_DE_corr.xlsx'
 output_file_csv = sys.argv[2]     #'./data/politicians/MPs_after1890.csv'
-output_folder_dict = sys.argv[3]
+output_folder_dict = sys.argv[3]
 
 class MPs_Extractor(object):
 
@@ -19,12 +19,12 @@ class MPs_Extractor(object):
         self.output_folder_dict = output_folder_dict
         self.range_years = range(years[0], years[1] + 1)
         self.df_exc = df_exc
-
+
     # function to get lists of lastnames
    # input:
    #    - df_year: dataframe for a year
    # output:
-    #    - list_names:
+    #    - list_names:
    #      contains:
    #        - list of last names that appear only once and cannot be split
    #        - list of last names that are made up of two names such as 'Meier-Müller'
@@ -44,7 +44,7 @@ class MPs_Extractor(object):
             str_comp = 'comp'
             str_canton2 = 'canton'
 
-            # function to split lastname and save meaningful part(s) to list
+            # function to split lastname and save meaningful part(s) to list
             def split_lastname(lastname, uniqueID, tpl_canton, str_canton = ''):
                 # if last name is a composite name, e.g. 'von Arx' and 'de Stoppani'
                 lastname_split = lastname.split()
@@ -70,7 +70,7 @@ class MPs_Extractor(object):
                         list_names.append((str_double, lastname, lastname, uniqueID) + tpl_canton)
                         # write double name without space into list
                         list_names.append((str_double, ''.join(lastname.split('-')), lastname, uniqueID) + tpl_canton)
-                else:
+                else:
                     if str_canton:
                         list_names.append((str_canton2, lastname, str_canton, uniqueID) + tpl_canton)
                     else:
@@ -82,66 +82,67 @@ class MPs_Extractor(object):
                 str_cantonabbr = df_year['CantonAbbreviation'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
                 str_citizenship = df_year['Citizenship'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
                 str_firstname = df_year['FirstName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
-
-                return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname)
+                str_doublename = df_year['DoubleName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
+
+                return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname, str_doublename)
 
-            # create empty lists for last names
+            # create empty lists for last names
             list_names = []
-
-            # for every last name
+
+            # for every last name
             for lastname in df_year['LastName'].drop_duplicates():
                 #print('name', lastname, type(lastname))
-
-                # extract all entries with that last name
+
+                # extract all entries with that last name
                 df_temp = df_year.loc[df_after1890['LastName']==lastname]
-                #print(df_temp)
-
-                # if there is an extra double name
+                #print(df_temp)
+
+                # if there is an extra double name
                 if df_temp.iloc[0]['DoubleName'] != '':
-                    # extract unique index
+                    # extract unique index
                     uniqueID = df_temp.iloc[0]['uniqueIndex']
-
+
                     # get canton information for that uniqueID
                     tpl_canton = get_canton(df_year, uniqueID)
 
                     #print('double name', df_temp)
                     doublename = df_temp.iloc[0]['DoubleName']
 
-                    # if last name is a double name, e.g. 'Meier-Müller'
+                    # if last name is a double name, e.g. 'Meier-Müller'
                     lastname_split2 = doublename.replace('-', ' ').split()
                     if len(lastname_split2) > 1:
-                        # write each part of double name into corresponding list
+                        # write each part of double name into corresponding list
                         for item in lastname_split2:
                             list_names.append((str_double, item, lastname, uniqueID) + tpl_canton)
-                        # write double name into list
+                        # write double name into list
                         list_names.append((str_double, doublename, lastname, uniqueID) + tpl_canton)
-                        # write double name without space into list
+                        # write double name without space into list
                         list_names.append((str_double, ''.join(doublename.split('-')), lastname, uniqueID) + tpl_canton)
 
-                # if only one person with that last name
+                # if only one person with that last name
                 if df_temp.drop_duplicates(['uniqueIndex']).shape[0] == 1:
-                    # extract unique index
+                    # extract unique index
                     uniqueID = df_temp.iloc[0]['uniqueIndex']
 
                     # get canton information for that uniqueID
                     tpl_canton = get_canton(df_year, uniqueID)
 
-                    # write complete name to list of last names
+                    # write complete name to list of last names
                     split_lastname(lastname, uniqueID, tpl_canton)
-
-                # if there are several people with the same last name
+
+                # if there are several people with the same last name
                 else:
-                    # write last name and canton to correct list
+                    # write last name and canton to correct list
                     for idx, row in df_temp.drop_duplicates(['uniqueIndex']).iterrows():
-                        # extract unique index
+                        # extract unique index
                         uniqueID = df_temp.loc[idx]['uniqueIndex']
-
+
                         # get canton information for that uniqueID
                         tpl_canton = get_canton(df_year, uniqueID)
 
-                        # write the lastname to the list
-                        split_lastname(lastname, uniqueID, tpl_canton, row['LastName'] + ' (' + row['CantonName'] + ')')
-
+                        # write the lastname to the list
+                        split_lastname(lastname, uniqueID, tpl_canton, row['LastName'] + ' (' + row['CantonName'] + ')')
+
             return list_names
 
     def extract(self):
@@ -172,7 +173,7 @@ class MPs_Extractor(object):
         # group by first and last name, and date of birth
         grouped = df_after1890.groupby(["LastName", "FirstName", "DateOfBirth"])
 
-        # assign first index to all entries of a person
+        # assign first index to all entries of a person
         for list_index in grouped.groups.values():
             df_after1890.loc[list_index, 'uniqueIndex'] = list_index[0]
 
@@ -192,15 +193,15 @@ class MPs_Extractor(object):
             df_year = df_after1890[pd.to_datetime(df_after1890['DateJoining']) <= datetime.datetime(year, 12, 31, 0, 0)]
             df_year = df_year[pd.to_datetime(df_year['DateLeaving']) >= datetime.datetime(year, 1, 1, 0, 0)]
             print(year, df_year.shape)
-
+
             # write df_year to a yearly csv file
 #            str_year = str(year)
 #            df_year.to_csv('home/lili/NLP_DemocraSci/nlp-democracy/output/MPs/MPs_' + str_year + '.csv')
-
+
             # create a pandas dataframe from list of names
             # !!! list contains errors, see definition of function
             list_lastnames = self.get_list_of_lastnames(df_year, df_after1890)
-            df_lastnames = pd.DataFrame(list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName'))
+            df_lastnames = pd.DataFrame(list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName', 'DoubleName'))
 
             # dump dictionary of last names to a pickle file
 #            path = pathlib.
@@ -213,19 +214,18 @@
 years = [1891, 2016]   #2016
 
 df_exc = pd.DataFrame(columns=['LastName', 'FirstName', 'DoubleName'])
 # exception: Konrad H. Cramer is also referred to as Cramer-Frey. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Cramer', 'FirstName': 'Konrad H.', 'DoubleName': 'Cramer-Frey'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Cramer', 'FirstName': 'Konrad H.', 'DoubleName': 'Cramer-Frey'}
 # exception: Johannes Blumer SG is also referred to as Blumer-Egloff. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Blumer', 'FirstName': 'Johannes', 'DoubleName': 'Blumer-Egloff'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Blumer', 'FirstName': 'Johannes', 'DoubleName': 'Blumer-Egloff'}
 # exception: Adolphe Jordan VD is also referred to as Jordan-Martin. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Jordan', 'FirstName': 'Adolphe', 'DoubleName': 'Jordan-Martin'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Jordan', 'FirstName': 'Adolphe', 'DoubleName': 'Jordan-Martin'}
 # exception: Jakob Schmid LU is also referred to as Schmid-Ronca. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Schmid', 'FirstName': 'Jakob', 'DoubleName': 'Schmid-Ronca'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Schmid', 'FirstName': 'Jakob', 'DoubleName': 'Schmid-Ronca'}
 # exception: Eduard Sulzer ZH is also referred to as Sulzer-Ziegler. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Sulzer', 'FirstName': 'Eduard', 'DoubleName': 'Sulzer-Ziegler'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Sulzer', 'FirstName': 'Eduard', 'DoubleName': 'Sulzer-Ziegler'}
 # exception: Howard Eugster AR is also referred to as Eugster-Züst. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Eugster', 'FirstName': 'Howard', 'DoubleName': 'Eugster-Züst'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Eugster', 'FirstName': 'Howard', 'DoubleName': 'Eugster-Züst'}
 #print(df_exc)
 
 mps_extractor = MPs_Extractor(years, input_file, output_file_csv, output_folder_dict, df_exc)
 mps_extractor.extract()
-
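With the extra DoubleName column now carried through get_canton, each double-name exception above expands into four rows of df_lastnames: both parts, the hyphenated form, and the joined form. A sketch of the rows the Cramer/Cramer-Frey entry produces, following get_list_of_lastnames; the uniqueIndex and canton values are placeholders, not taken from the data:

import pandas as pd

# row pattern from get_list_of_lastnames for a DoubleName exception;
# uid and the canton tuple are invented for illustration
columns = ('type', 'name_short', 'name_correct', 'uniqueIndex',
           'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName', 'DoubleName')
uid = 4711
canton = ('Zürich', 'ZH', 'Zürich (ZH)', 'Konrad H.', 'Cramer-Frey')
rows = [('double', short, 'Cramer', uid) + canton
        for short in ('Cramer', 'Frey', 'Cramer-Frey', 'CramerFrey')]
df_lastnames = pd.DataFrame(rows, columns=columns)
print(df_lastnames[['type', 'name_short', 'name_correct', 'DoubleName']])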
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 495b23ceb7f8b38e2849c5d16119e76cf177d48f..86ae417f57d8d0c4e6e08cd1f1f2c91bcf618f44 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -26,10 +26,11 @@ from utils_proc import call_with_out
 
 # specify input and output files
 # needed for running in atom, can be ignored
-year = '1891'
+year = '1971'
 input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
+input_notnames = "data/lists/not_names.txt"
 output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz"
 
 #%%
@@ -37,7 +38,8 @@
 input_lastnames = sys.argv[1]
 input_correctedxml = sys.argv[2]
 input_correctedmeta = sys.argv[3]
-output_annotatedxml = sys.argv[4]
+input_notnames = sys.argv[4]
+output_annotatedxml = sys.argv[5]
 
 #%%
 # extract suffixes, year, folder_database
@@ -51,6 +53,7 @@
 suffix_correctedmeta = '_metacorr'
 input_rawmeta = folder_database + '/' + year + '/' + '01_rawmeta.tar.gz'
 
 #%%
+# TODO: make it work!
 # git lfs pull necessary data
 for lfsfile in [input_correctedxml, input_correctedmeta, input_rawmeta]:
     command = 'git lfs pull -I ' + lfsfile
@@ -77,6 +80,11 @@
 
 print('dataframe with lastnames loaded')
 
+with open(input_notnames) as f:
+    list_notnames = f.readlines()
+
+list_notnames = [term.rstrip() for term in list_notnames]
+
 #%%
 # for each file
 # TODO !!!! get rid of [66:]
@@ -92,6 +100,7 @@
     if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
         print(id_doc + '\n')
 
         file_doc.df_lastnames = df_lastnames
+        file_doc.list_notnames = list_notnames
         file_doc.annotate_xml()
 
 # Commands to get the compressed version of the file
@@ -103,20 +112,21 @@ utils_proc.compress_tar(output_annotatedxml)
 
 
 #%%
-## to test for one file
-#file_tarpath = './1893/20026526_datacorr.xml'
-#
-#id_doc = file_tarpath.split('/')[-1][:8]
-#
-## instantiate document object (always from original pdf)
-#infile_aux = year + '/' + id_doc + '.pdf'
-#file_doc = defc.Document(infile_aux, folder_database)
-#
-#if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
-    #print(id_doc + '\n')
-#
-    #file_doc.df_lastnames = df_lastnames
-    #file_doc.annotate_xml()
+# to test for one file
+file_tarpath = './1971/20000619_datacorr.xml'
+
+id_doc = file_tarpath.split('/')[-1][:8]
+
+# instantiate document object (always from original pdf)
+infile_aux = year + '/' + id_doc + '.pdf'
+file_doc = defc.Document(infile_aux, folder_database)
+
+if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
+    print(id_doc + '\n')
+
+    file_doc.df_lastnames = df_lastnames
+    file_doc.list_notnames = list_notnames
+    file_doc.annotate_xml()
 
 #%%
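Regarding the '# TODO: make it work!' above: one possible direction is to shell out with subprocess and fail loudly when a pull goes wrong, instead of silently continuing with LFS pointer files. This is a sketch only, not the repo's call_with_out helper; the input_* names are the variables defined in the script above:

import subprocess

# possible replacement for the lfs loop: abort on failure so later
# stages never see unpulled pointer files
for lfsfile in [input_correctedxml, input_correctedmeta, input_rawmeta]:
    result = subprocess.run(['git', 'lfs', 'pull', '-I', lfsfile],
                            capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError('git lfs pull failed for ' + lfsfile + ': ' + result.stderr)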
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index e41d934ac7914234ab6fae1f425123999971c413..b19c06404c47e65559ddeeb273fa0bbd271a38fd 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -21,7 +21,7 @@ import collections
 
 # !!! function works well for 1891 - 1900, not checked after that !!!
 def check_if_discussion(path_meta_xml_file,
                         list_attributes = ['TITEL_NORMAL_DE', 'TITEL_NORMAL_FR', 'TITEL_ORIGINAL_DE', 'TITEL_ORIGINAL_FR'],
-                        list_nondiscussion = ['inhaltsverzeichnis', 'inhaltsverzeichniss', 'jahresinhalt', 'rednerliste',
+                        list_nondiscussion = ['inhaltsverzeichnis', 'inhaltsverzeichniss', 'jahresinhalt', 'rednerliste',
                                               'umschlag', 'sachregister', 'titelblatt', 'numerierung'],
                         list_nondiscussion2 = ['table', 'matières', 'répertoire', 'procès-verbaux']):
@@ -69,13 +69,13 @@ def get_text(sometext):
 
 
 # function to annotate corrected XML
-def get_annotated_xml(XML_root, df_lastnames, bln_print=False):
+def get_annotated_xml(XML_root, df_lastnames, list_notnames, bln_print=False):
 
     # list of votation terms
     # TODO: make it work for é, etc.
     list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen',
                           'Abgelehnt', 'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)',
-                          'Votation', 'Vote', 'votation',   #'(Adopt�s)', 'adopt�s', 'adopt�e', 'rejet�e',
+                          'Votation', 'Vote', 'votation', '(Adoptés)', 'adoptés', 'adoptée', 'rejetée',
                           "D'accord", 'voix']
 
     # list of stopwords
@@ -121,7 +121,7 @@
 
             if textbox_texttype in ['text_col1', 'text_col2']:
 
-                XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False)
+                XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, bln_print=False)
                 if this_is_speech:
                     prev_is_speech = True
 #                    print('stopped after finding speech start')
@@ -207,7 +207,7 @@ def get_complete_text(textbox):
         complete_text += thattext
 
         # in first two textlines of textbox, check for colon
-        if ind_tl < 3:
+        if ind_tl < 2:
             if ':' in thattext:
                 ind_tl_colon = ind_tl
 
@@ -222,17 +222,19 @@
 # - bln_print: whether to print during execution, default False
 # output:
 # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
-def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, bln_print=False):
+def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, bln_print=False):
+
+    # lists of roles
+    list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
+                  'Berichterstatter', 'Berichterstatterin', 'rapporteur',
+                  'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
+                  'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
+                  'Vizepräsident']
+    list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
 
     # initialize flag
     this_is_speech = False
 
-    # initialize strings and ID
-    str_name = ''
-    str_role = ''
-    list_uniqueID = []
-    str_canton = ''
-
     # font text end
     fontend = '[/font]'
 
@@ -240,7 +242,6 @@
     # at the beginning of a textbox and identifying a name or a role in front
     # of that colon
     if ind_tl_colon >= 0:
-#    if ':' in text[:100]:
 
         # extract the index of the colon in the text
         colon_index_text = text.index(':')
@@ -264,12 +265,12 @@
             list_oi = [term for term in list_oi if not term.isdigit()]
 
             # remove single characters
+            # TODO: might need to be changed for fractions (some fractions are abbreviated as single letters)
+            # TODO: maybe exclude I and A to account for Appenzell
             list_oi = [term for term in list_oi if len(term)>1]
 
-            # for every term
-            for term in list_oi:
-                # if possible, find a name in a list
-                str_name, str_role, list_uniqueID, str_canton = find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False)
+            # if possible, find a name from the list
+            str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False)
 
             if bln_print:
                 print('name', str_name, 'role', str_role)
@@ -305,9 +306,6 @@
             thattext = XML_new[ind_p][ind_t][0].text
             colon_index = thattext.index(':')
 
-            # get last font information of thattext
-            fontstart = re.findall('\[font.*?\]', thattext)[-1]
-
             try:
                 # write speaker to first line
                 XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend
@@ -315,10 +313,12 @@
                 # get start of speech with correct font start
                 if thattext[colon_index+1:].startswith('[font'):
                     startspeech = thattext[colon_index+1:]
-                elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+                elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
                     startspeech = ''
+                elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+                    startspeech = thattext[colon_index+8:]
                 else:
-                    startspeech = fontstart + thattext[colon_index+1:]
+                    startspeech = thattext[colon_index+1:]
 
                 # write beginning of speech to second line
                 # (create new ET element if necessary)
@@ -343,16 +343,15 @@
             thattext = XML_new[ind_p][ind_t][1].text
             colon_index = thattext.index(':')
 
-            # get last font information of thattext
-            fontstart = re.findall('\[font.*?\]', thattext)[-1]
-
             # get start of speech with correct font start
             if thattext[colon_index+1:].startswith('[font'):
                 startspeech = thattext[colon_index+1:]
-            elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+            elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
                 startspeech = ''
+            elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+                startspeech = thattext[colon_index+8:]
             else:
-                startspeech = fontstart + thattext[colon_index+1:]
+                startspeech = thattext[colon_index+1:]
 
             # write speaker to first line
             XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1] + fontend
@@ -362,7 +361,8 @@
         # set flag
         this_is_speech = True
         if bln_print:
-            print('found a name:', list_oi, str_name, str_role, '\n')
+            print('found a name:', text_start, list_oi, str_name, str_role, '\n')
+            print('found a name:', text_start, list_oi, ind_tl_colon, str_name, str_role, list_uniqueID, '\n')
 
     return XML_new, this_is_speech
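The new branches above distinguish three shapes of text after the colon: the speech starts with its own [font ...] tag; only a closing [/font] remains; or a [/font] is followed by more text, in which case colon_index+8 skips ':[/font]' (note this offset assumes no blank before the tag; with the optional leading space the regex allows, a stray ']' would survive). A toy run-through, with invented markup in the pipeline's bracket style:

import re

# minimal sketch of the three startspeech branches
def split_speech(thattext):
    colon_index = thattext.index(':')
    rest = thattext[colon_index+1:]
    if rest.startswith('[font'):
        return rest                      # speech carries its own font tag
    elif re.match(r'^[ ]?\[/font\]$', rest):
        return ''                        # nothing left but a closing tag
    elif re.match(r'^[ ]?\[/font\]', rest):
        return thattext[colon_index+8:]  # skip ':[/font]'
    else:
        return rest

print(split_speech('[font=9]Meier:[/font] Ich beantrage Ablehnung.'))
# -> ' Ich beantrage Ablehnung.'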
@@ -423,11 +423,10 @@
 # - str_role: string to which role should be attached
 # - list_uniqueID: list with one or several uniqueIDs
 # - list_tupels: list of tupels containing all types of names
-# TODO: correctly extract canton! don't do reversed, find name first that might have issue with canton, then look for possible canton
 # TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
-def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False):
+def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False):
 
-    def get_string(term, str_name, str_role, list_uniqueID, str_canton):
+    def get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton):
         name_type = ''
         # if it is one of the simple names
         if term in list(df_names['name_short'].loc[df_names['type']=='simple']):
@@ -439,7 +438,7 @@
         # if it is a double name
         elif term in list(df_names['name_short'].loc[df_names['type']=='double']):
             if bln_print:
-                print(20*'\n', 'DOUBLE NAME')
+                print(5*'\n', 'DOUBLE NAME')
             # get correct name
             correct_name = df_names.loc[(df_names['type']=='double') & (df_names['name_short']== term)].iat[0, df_names.columns.get_loc('name_correct')]
             if bln_print:
@@ -458,23 +457,11 @@
             str_name = add_to_string(str_name, correct_name)
             name_type = 'comp'
         # if it contains a canton
-        # !!! also pass list_oi to look for canton
-        # !!! how to handle for people mentioned in text???
+        # TODO: how to handle for people mentioned in text???
         elif term in list(df_names['name_short'].loc[df_names['type']=='canton']):
             if bln_print:
                 print('contains a canton', term)
-#            canton_missing = False
-#            df_temp = df_names.loc[df_names['name_short']==term]
-#            print('list_correct', df_temp)
-#            print(str_canton)
-#            if str_canton:
-#                str_correct = check_place(df_temp, str_canton)
-#                if str_correct in ['not found', 'too many']:
-#                    str_name = add_to_string(str_name, term + ' (CANTONT MISSING)')
-#                    canton_missing = True
-#                else:
-#                    str_name = add_to_string(str_name, str_temp)
-#                name_type = 'canton'
+
             str_name = add_to_string(str_name, term + ' (CANTON MISSING)')
             name_type = 'canton'
@@ -485,41 +472,66 @@
         # get uniqueID
         if name_type in ['simple', 'double', 'comp']:
             list_temp = [df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]]
         elif name_type in ['canton']:
             list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
-#            if canton_missing:
-#                temp = tuple(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
-#            else:
-#                temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_correct']==str_correct)].iat[0, df_names.columns.get_loc('uniqueIndex')]
+
         if len(list_temp) > 0:
             if bln_print:
                 print(list_temp, list_uniqueID)
                 print(type(list_temp), type(list_uniqueID))
                 print(isinstance(list_uniqueID, list))
-            # if no unique ID has been assigned so far
-            if len(list_uniqueID) == 0:
-                list_uniqueID = list_temp
-            # if there are already one or several people and have a new person, we update
-            elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
-                list_uniqueID.append(list_temp)
-
-            ## if we already have several possible people, e.g. because of canton
-            #elif isinstance(int_uniqueID, tuple):
-                #print('I should be here')
-                ## and refound the uniqueID of one of those, don't update
-                #if temp in int_uniqueID:
-                    #pass
-                ## and update if we don't have that uniqueID yet
-                #else:
-                    #int_uniqueID = (int_uniqueID, temp)
-            ## if a person with that uniqueID exists already, don't update
-            #elif isinstance(int(int_uniqueID), int) and temp == int_uniqueID:
-                #print('but end up here.. not even.....')
-                #pass
-            ## if a different unique ID has been assigned already
-            #else:
-                #int_uniqueID = (int_uniqueID, temp)
-
-        return str_name, str_role, list_uniqueID
+
+            list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
+
+        return str_name, str_role, list_uniqueID, name_type
+
+    def update_list_uniqueID(list_uniqueID, list_temp):
+
+        # if no unique ID has been assigned so far
+        if len(list_uniqueID) == 0:
+            list_uniqueID = list_temp
+        # if there are already one or several people and have a new person, we update
+        elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
+            list_uniqueID.append(list_temp)
+
+        return list_uniqueID
+
+    # function to find correct term (in case of misspellings, etc.)
+    def get_approximate_term(term, array_all):
+        # TODO: probably need to improve this procedure
+        #       - find better values ....
+
+        # initialize string
+        term_approx = ''
+
+        # get normalized array
+        array_normalized = array_all[normalized_damerau_levenshtein_distance_ndarray(term, array_all) <= 0.35]
+        array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized)
+
+        # get absolute array
+        array_absolute = array_all[damerau_levenshtein_distance_ndarray(term, array_all) <= 2]
+        array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute)
+
+        if bln_print:
+            print(term)
+            print(array_normalized, array_normalized_values)
+            print(array_absolute, array_absolute_values)
+
+        # intersection
+        set_intersection = set(array_normalized).intersection(set(array_absolute))
+
+        # if a similar name was found
+        if len(set_intersection) == 1:
+            term_approx = list(set_intersection)[0]
+
+        # or several
+        elif len(set_intersection) > 1:
+            # !!! we only look at normalized values
+            # !!! we don't account for names with same values !!!
+            array_min = array_normalized[array_normalized_values.argmin()]
+            term_approx = array_min
+            if bln_print:
+                print('we found several possible names', set_intersection, 'and choose', array_min)
+
+        return term_approx
+
     # small function to add term to str_name
     def add_to_string(string, term):
@@ -529,70 +541,109 @@
             string += ' ' + term
         return string
 
-    list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
-                  'Berichterstatter', 'Berichterstatterin', 'rapporteur',
-                  'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
-                  'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
-                  'Vizepräsident']
-
-    list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'Gallen', 'StGallen',
-                     'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter', 'Biffer', 'biffer', 'Rédiger', 'rédiger', 'Wer', 'Fällen']
-
-    list_places = get_list_cantons(df_names)
+    # initialize strings and ID
+    str_name = ''
+    str_role = ''
+    list_uniqueID = []
+    str_canton = ''
+    name_type = ''
 
-    if bln_print:
-        print('now is about: ------', term)
 
     # extract list and array of last names
     list_all_names = list(df_names['name_short'])
     array_all_names = np.array(df_names['name_short'])
 
-    # if term is not easily mistaken as a name (avoid false positives)
-    if term not in list_notnames:
+    # for every term
+    for term in list_oi:
 
-        # if term is in the list of all names and roles
-        if term in (list_all_names + list_roles):
+        if bln_print:
+            print('now is about: ------', term)
+
+        if term in list_roles:
             # get correct name and uniqueID, or role, for that term
-            str_name, str_role, list_uniqueID = get_string(term, str_name, str_role, list_uniqueID, str_canton)
+            str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
             if bln_print:
-                print('=== correct name', term)
-        # if term in list of cantons
-        elif term in list_places:
-            str_canton = term
-        # if term is not in list_all_names
-        else:
-            # look for similar names based on (normalized) Damerau-Levenshtein distance
-            # !!! probably need to improve this procedure
-            #     - find better values ....
-            if bln_print:
-                print(term)
-            array_normalized = array_all_names[normalized_damerau_levenshtein_distance_ndarray(term, array_all_names) <= 0.35]
-            array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized)
-            if bln_print:
-                print(array_normalized, array_normalized_values)
-            array_absolute = array_all_names[damerau_levenshtein_distance_ndarray(term, array_all_names) <= 2]
-            array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute)
-            if bln_print:
-                print(array_absolute, array_absolute_values)
-            set_intersection = set(array_normalized).intersection(set(array_absolute))
-            # check if a similar name was found
-            term_approx = ''
-            if len(set_intersection) == 1:
-                term_approx = list(set_intersection)[0]
-                if bln_print:
-                    print('we found the name', set_intersection)
-            elif len(set_intersection) > 1:
-                # !!! we only look at normalized values
-                # !!! we don't account for names with same values !!!
-                array_min = array_normalized[array_normalized_values.argmin()]
-                term_approx = array_min#[0]
-                if bln_print:
-                    print('we found several possible names', set_intersection, 'and choose', array_min)
-            if term_approx:
-                str_name, str_role, list_uniqueID = get_string(term_approx, str_name, str_role, list_uniqueID, str_canton)
+                print('found a role', term)
+
+            # TODO: also look for similar terms (misspellings)
+            # TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter
+
+        elif term in list_roles_ext:
+            pass
+            # TODO: extract whether it is minority or majority and save that information
+
+        # cannot happen for the first term
+        elif name_type == 'canton':
+            list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
+            canton_type = ''
+            if term in list_cantonname:
+                str_canton = term
+                canton_type = 'CantonName'
+                print('!!! is a canton', term, list_oi, str_name, str_role)
+            elif term in list_cantonabbr:
+                str_canton = term
+                canton_type = 'CantonAbbr'
+                print('!!! is a canton', term, list_oi, str_name, str_role)
+            elif term in list_citizenship:
+                str_canton = term
+                canton_type = 'Citizenship'
+                print('!!! is a canton', term, list_oi, str_name, str_role)
+            elif term in list_firstname:
+                str_canton = term
+                canton_type = 'FirstName'
+                print('!!! is a canton', term, list_oi, str_name, str_role)
+            else:
+                print('might be a canton', term, list_oi, str_name, str_role)
+
+            # if a canton or similar was found
+            if canton_type:
+                # get rid of CANTON MISSING
+                str_name = str_name.split(' ')[0]
+                # extract uniqueID
+                # if Citizenship, do proper comparison
+                if canton_type == 'Citizenship':
+                    df_temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name)]
+                    list_cities = [entry for entry in df_temp[canton_type] if str_canton in get_cities([entry])]
+                    print(list_cities)
+                    str_citizenship = ''
+                    try:
+                        if len(list_cities) == 1:
+                            str_citizenship = list_cities[0]
+                    except:
+                        print('found no or more than one person with citizenship', str_canton, str_name)
+                        pass
+
+                    list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+
+                else:
+                    list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+                print(list_temp, list_uniqueID)
+
+                if len(list_temp) > 0:
+                    list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
+
+        # if term is not easily mistaken as a name (avoid false positives)
+        elif term not in list_notnames:
+
+            # if term is in the list of all names
+            if term in list_all_names:
+                # get correct name and uniqueID, or role, for that term
+                str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
 
                 if bln_print:
-                    print('*******************', str_name, term_approx)
+                    print('=== correct name', term)
+
+            # if term is not in list_all_names
+            else:
+                # look for similar names based on (normalized) Damerau-Levenshtein distance
+                term_approx = get_approximate_term(term, array_all_names)
+
+                # if one was found, get correct name, etc.
+                if term_approx:
+                    str_name, str_role, list_uniqueID, name_type = get_string(term_approx, df_names, str_name, str_role, list_uniqueID, str_canton)
+                    if bln_print:
+                        print('=== approximate name', str_name, term_approx)
 
     return str_name, str_role, list_uniqueID, str_canton
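get_approximate_term only accepts a candidate that survives both cut-offs: normalized Damerau-Levenshtein distance <= 0.35 and absolute distance <= 2, with ties resolved by the smallest normalized value. A small demonstration of why both thresholds are needed, using the Schneider/Schweizer confusion documented in wrongly_identified_speakers.txt; the misspelled term is invented, and the ndarray helpers are those of the pyxdameraulevenshtein version the module already imports:

import numpy as np
from pyxdameraulevenshtein import (
    damerau_levenshtein_distance_ndarray,
    normalized_damerau_levenshtein_distance_ndarray,
)

# toy name array; the real one is df_names['name_short']
array_all = np.array(['Schneider', 'Schweizer', 'Meier'])
term = 'Schwcizer'  # invented OCR error

# 'Schneider' passes the normalized cut-off (3/9 ~ 0.33) but not the
# absolute one (distance 3), so only 'Schweizer' survives both
near_norm = array_all[normalized_damerau_levenshtein_distance_ndarray(term, array_all) <= 0.35]
near_abs = array_all[damerau_levenshtein_distance_ndarray(term, array_all) <= 2]
print(set(near_norm) & set(near_abs))  # -> {'Schweizer'}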
@@ -665,14 +716,36 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l):
 
     return XML_new
 
 
+def get_cities(list_citizenship):
+
+    return [city[:-5] for item in list_citizenship for city in item.split(',')]
+
+
+# function to get list of places
+def get_list_cantons(df_names, str_name = ''):
+
+    if str_name:
+        df_temp = df_names.loc[(df_names['type']=='canton') & (df_names['name_short']==str_name)]
+    else:
+        df_temp = df_names.loc[df_names['type']=='canton']
+        print(df_temp)
+
+    list_cantonname = list(df_temp['CantonName'])
+    for canton in ['Basel-Stadt', 'Basel-Landschaft']:
+        if canton in list_cantonname:
+            list_cantonname.extend(['Basel'])
+    if 'Graubünden' in list_cantonname:
+        list_cantonname.extend(['Bünden'])
+    if 'Bern' in list_cantonname:   # check how this works!!
+        list_cantonname.extend(['Berne'])
+    list_cantonabbr = list(df_temp['CantonAbbreviation'])
+    list_citizenship = list(df_temp['Citizenship'])
+    list_citizenship = get_cities(list_citizenship)
+    list_firstname = list(df_temp['FirstName'])
+
+    return list_cantonname, list_cantonabbr, list_citizenship, list_firstname
 
-
-
-
+# tokenizer: keeps words (including hyphenated ones) and dollar amounts;
+# the commented-out variant ending in |\S+ would also keep the colon
+tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+')
@@ -724,11 +797,6 @@ def exclude_overlaps(dict_text, dict_overlaps):
 
     return dict_text
 
-# tokenizer
-tokenizer_canton = RegexpTokenizer(r'\w+') # only leaves words
-#tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+|\S+')
-# last part \S+ is needed to get colon, \S stands for white space
-tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+')
 
 
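A quick check of the relocated tokenizer above: the '-\w+' alternation keeps hyphenated double names as a single token, while punctuation such as the colon is dropped (the commented-out variant ending in |\S+ would keep it). The sample line is invented:

from nltk.tokenize import RegexpTokenizer

# same pattern as the module-level tokenizer above
tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+')
print(tokenizer.tokenize('Meier-Müller, Berichterstatter: Ich beantrage Zustimmung.'))
# -> ['Meier-Müller', 'Berichterstatter', 'Ich', 'beantrage', 'Zustimmung']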
@@ -766,13 +834,6 @@ def get_df_from_lists_names(lists_names):
 
 
 
-
-
-
-
-
-
-
 def tokenize_dictionary(dictionary, tokenizer, only_text=False):
     dictionary_tokenized = {}
     # if there is only text, e.g. when we look at all texts of a document at once (level 2 in flattened dictionary)
@@ -815,18 +876,3 @@ def dict_only_text(dictionary):
         dictionary_only_text[key] = tupel[1]
 
     return dictionary_only_text
-
-
-
-
-# function to get list of places
-def get_list_cantons(df_names):
-
-    df_temp = df_names.loc[df_names['type']=='canton']
-    list_cantonname = list(df_temp['CantonName'])
-    list_cantonabbr = list(df_temp['CantonAbbreviation'])
-    list_citizenship = list(df_temp['Citizenship'])
-    list_citizenship = [city[:-5] for item in list_citizenship for city in item.split(',')]
-    list_firstname = list(df_temp['FirstName'])
-
-    return list_cantonname + list_cantonabbr + list_citizenship + list_firstname
diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh
index dbec0daf596094cf95f50dad868b250320eab8b9..e769de17cdb4db4b746d143be02e283bcf0e9e50 100755
--- a/src/sh/extract_discussions_yearly.sh
+++ b/src/sh/extract_discussions_yearly.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 
 year_start=1891
-year_end=1893
+year_end=1891
 
 for year in $(seq $year_start $year_end)
 do
     echo $year
-    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/AB/${year}/05_annotatedxml.tar.gz
+    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/lists/not_names.txt data/AB/${year}/05_annotatedxml.tar.gz
 done