diff --git a/src/python/bla_tryreadxml.py b/src/python/bla_tryreadxml.py
index 082289b305219f6495943e8098e924dab8ea6723..e0a855fc6689cf31e6d37306737d0bb85b01801c 100644
--- a/src/python/bla_tryreadxml.py
+++ b/src/python/bla_tryreadxml.py
@@ -4,422 +4,22 @@
 %load_ext autoreload
 %autoreload 2
-import xml.etree.ElementTree as ET
-import re
 import pickle
-import string
-from nltk.corpus import stopwords
-from nltk.tokenize import RegexpTokenizer
-import copy
-import sys
-sys.path.append('src/python/')
-import utils_annot
-tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+')
-
-xml_file = 'data/AB/1893/1893/20026528_datacorr.xml'
-input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle"
-
-XML_tree = ET.parse(xml_file)
-XML_root = XML_tree.getroot()
-
-# list of stopwords
-list_stopwords = stopwords.words('german')
-list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr'])
-list_stopwords.extend(stopwords.words('french'))
-list_stopwords.extend(['ils', 'les', 'celle'])
-
-# add a few terms to list_stopwords that are easily mistaken as last names
-list_stopwords.extend(['art', 'rath', 'alinea', 'stimmen', 'stimme', 'hans', 'walter', 'werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'gallen', 'stgallen',
-                       'kasse', 'fasse', 'sitten', 'herren', 'herr', 'alter'])
-
-# list of votation terms
-# TODO: make it work for é, etc.
-list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt',
-                      'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)',
-                      'Votation', 'Vote', 'votation', #'(Adoptés)', 'adoptés', 'adoptée', 'rejetée',
-                      "D'accord", 'voix']
-
-# open dataframe of last names from pickle file
-with open(input_lastnames, 'rb') as f:
-    df_lastnames = pickle.load(f)
-
-#%%
-# create new XML as a copy of the corrected one
-XML_new = copy.deepcopy(XML_root)
-
-# initialize flags to distinguish speeches from votes
-this_is_speech = False
-prev_is_speech = False
-this_is_vote = False
-
-# for every page
-for ind_p, page in enumerate(XML_root):
-    print(page.tag, page.attrib)
-    # for every textbox on that page
-    for ind_t, textbox in enumerate(page):
-        if (textbox.tag == 'textbox'):
-            if 'type_textbox' in textbox.attrib.keys():
-                if (textbox.attrib['type_textbox'] == 'text'):
-                    print(textbox.tag, textbox.attrib)
-
-                    # get complete text of that textbox
-                    complete_text = get_complete_text(textbox)
-
-                    # identify and label language in XML
-                    dict_lang = identify_language(complete_text)
-                    XML_new = label_language(XML_new, dict_lang)
-
-                    # get texttype of that textbox by majority vote
-                    textbox_texttype = get_textbox_type(textbox)
-                    print(textbox_texttype)
-
-                    if textbox_texttype in ['text_col1', 'text_col2']:
-
-                        print(complete_text)
-                        XML_new, this_is_speech = label_speechstart(XML_new, complete_text, df_lastnames, list_stopwords, bln_print=False)
-                        if this_is_speech:
-                            prev_is_speech = True
-                            print('stopped after finding speech start')
-                            continue
-                        XML_new, this_is_vote = label_votations(XML_new, complete_text, list_votationterms, bln_print=False)
-                        if this_is_vote:
-                            prev_is_speech = False
-                            print('stopped after finding vote')
-                            continue
-                        if prev_is_speech and (not this_is_vote):
-                            XML_new = label_speechcont(XML_new)
-
-                    print('go to next textbox \n')
-
-
-name_xml = 'data/AB/1893/id_doc_previewannotated.xml'
-tree = ET.ElementTree(XML_new)
-tree.write(name_xml, encoding = 'utf-8')
-
-
-#%%
-sometext = '[font face="8.071" size="Times-Bold"]Für die Bedaktion verantwortlich :[/font][font face="7.973" size="Times-BoldItalic"] Sud. SdMarst[/font][font face="8.071" size="Times-Bold"] —• Druck und Expedition von[/font][font face="7.973" size="Times-BoldItalic"] Jmi è Éeineft[/font][font face="8.071" size="Times-Bold"] fa[/font][font face="7.973" size="Times-BoldItalic"] Seìrit. [/font]'
size="Times-BoldItalic"] Sud. SdMarst[/font][font face="8.071" size="Times-Bold"] —• Druck und Expedition von[/font][font face="7.973" size="Times-BoldItalic"] Jmi è Éeineft[/font][font face="8.071" size="Times-Bold"] fa[/font][font face="7.973" size="Times-BoldItalic"] Seìrit. [/font]' - -#re.split('[ | ]', sometext) -def get_text(sometext): - newtext = '' - for text in re.findall('\].*?\[',sometext): - #print(text) - if text.startswith(']') and text.endswith('['): - newtext += text[1:-1] - #print(newtext) - return newtext -get_text(sometext) #%% +path_data = '/home/lili/NLP_DemocraSci/data_from_nlp-democracy/results_overlap/' +# open dictionary of overlaps +with open(path_data + 'DictOverlap1891to1930.pkl', 'rb') as f: + dict_overlaps_1 = pickle.load(f) +with open(path_data + 'DictOverlap1931to1995.pkl', 'rb') as f: + dict_overlaps_2 = pickle.load(f) +with open(path_data + 'DictOverlap1991to1995.pkl', 'rb') as f: + dict_overlaps_3 = pickle.load(f) +dict_overlaps = {**dict_overlaps_1, **dict_overlaps_2, **dict_overlaps_3} -# helper function to get type of textbox_type -# corresponds to majority vote of types of textlines -# input: -# - textbox -# output: -# - textbox_type: string -def get_textbox_type(textbox): - - # initialize empty dictionary - dict_type = {} - - # for every textline in that textbox - for ind_tl, textline in enumerate(textbox): - if textline.tag == 'textline': -# print(textline.tag, textline.attrib) - - # count types - if textline.attrib['type'] not in dict_type.keys(): - dict_type[textline.attrib['type']] = 1 - else: - dict_type[textline.attrib['type']] += 1 - -# print(dict_type) - # list of all types with maximum count - list_types = [type for type, count in dict_type.items() if count == max(dict_type.values())] -# print(list_types) - # if only one with maximum value - if len(list_types) == 1: - textbox_type = list_types[0] - # if several with same maximum value - else: - textbox_type = 'notdistinct' - return textbox_type -#%% - -# helper function to get complete text of a textbox -# input: -# - textbox -# output: -# - complete_text: string -def get_complete_text(textbox): - - # helper function to get text without font information - def get_text(sometext): - newtext = '' - for text in re.findall('\].*?\[',sometext): - #print(text) - if text.startswith(']') and text.endswith('['): - newtext += text[1:-1] - #print(newtext) - return newtext - # initialize empty string - complete_text = '' - - # for every textline in that textbox - for ind_tl, textline in enumerate(textbox): - if textline.tag == 'textline': - # append text to string - complete_text += get_text(textline.text) - - return complete_text - - -#%% - -# function to label speech starts -# input: -# - text: stringt to be analyzed -# - df_names: dataframe of politicians -# - list_stopwords: list of german and french stopwords -# - bln_print: whether to print during execution, default False -# output: -# - (str_name, str_role, int_uniqueID, str_canton): tuple with strings and ID -# TODO: speakers with double get recognized twice (1893, 20026528, p2, Scherrer-Füllemann) -def label_speechstart(XML_new, text, df_names, list_stopwords, bln_print=False): - - # initialize strings and ID - str_name = '' - str_role = '' - int_uniqueID = int(0) - str_canton = '' - - # very consistently, a speaker can be identified by looking for a colon - # at the beginning of a textbox and identifiying a name or a role in front - # of that colon - if ':' in text[:100]: - # extract the index of the colon in the text - colon_index_text = 
-# helper function to get type of textbox_type
-# corresponds to majority vote of types of textlines
-# input:
-# - textbox
-# output:
-# - textbox_type: string
-def get_textbox_type(textbox):
-
-    # initialize empty dictionary
-    dict_type = {}
-
-    # for every textline in that textbox
-    for ind_tl, textline in enumerate(textbox):
-        if textline.tag == 'textline':
-#            print(textline.tag, textline.attrib)
-
-            # count types
-            if textline.attrib['type'] not in dict_type.keys():
-                dict_type[textline.attrib['type']] = 1
-            else:
-                dict_type[textline.attrib['type']] += 1
-
-#    print(dict_type)
-    # list of all types with maximum count
-    list_types = [type for type, count in dict_type.items() if count == max(dict_type.values())]
-#    print(list_types)
-    # if only one with maximum value
-    if len(list_types) == 1:
-        textbox_type = list_types[0]
-    # if several with same maximum value
-    else:
-        textbox_type = 'notdistinct'
-    return textbox_type
-#%%
-
-# helper function to get complete text of a textbox
-# input:
-# - textbox
-# output:
-# - complete_text: string
-def get_complete_text(textbox):
-
-    # helper function to get text without font information
-    def get_text(sometext):
-        newtext = ''
-        for text in re.findall('\].*?\[',sometext):
-            #print(text)
-            if text.startswith(']') and text.endswith('['):
-                newtext += text[1:-1]
-        #print(newtext)
-        return newtext
-
-    # initialize empty string
-    complete_text = ''
-
-    # for every textline in that textbox
-    for ind_tl, textline in enumerate(textbox):
-        if textline.tag == 'textline':
-            # append text to string
-            complete_text += get_text(textline.text)
-
-    return complete_text
-
-
-#%%
-
-# function to label speech starts
-# input:
-# - text: string to be analyzed
-# - df_names: dataframe of politicians
-# - list_stopwords: list of german and french stopwords
-# - bln_print: whether to print during execution, default False
-# output:
-# - (str_name, str_role, int_uniqueID, str_canton): tuple with strings and ID
-# TODO: speakers with double names get recognized twice (1893, 20026528, p2, Scherrer-Füllemann)
-def label_speechstart(XML_new, text, df_names, list_stopwords, bln_print=False):
-
-    # initialize strings and ID
-    str_name = ''
-    str_role = ''
-    int_uniqueID = int(0)
-    str_canton = ''
-
-    # very consistently, a speaker can be identified by looking for a colon
-    # at the beginning of a textbox and identifying a name or a role in front
-    # of that colon
-    if ':' in text[:100]:
-        # extract the index of the colon in the text
-        colon_index_text = text.index(':')
-
-        # look at first few terms of that textbox
-        text_start = re.sub(r'[\(\)]','',text[:colon_index_text])
-        list_oi = tokenizer.tokenize(text_start)
-        print('possible speech start: ', list_oi)
-
-        # remove stopwords
-        list_oi = [term for term in list_oi if term.lower() not in list_stopwords]
-
-        # remove punctuation
-        list_oi = [''.join(c for c in s if c not in string.punctuation) for s in list_oi]
-        list_oi = [s for s in list_oi if s]
-
-        # remove lower case terms
-#        list_oi = [term for term in list_oi if not term.islower()]
-
-        # remove numbers
-        list_oi = [term for term in list_oi if not term.isdigit()]
-
-        # remove single characters
-        list_oi = [term for term in list_oi if len(term)>1]
-
-        # for every term, reversed finds canton before it finds name
-        for term in reversed(list_oi):
-            # if possible, find a name in a list
-            str_name, str_role, int_uniqueID, str_canton = utils_annot.find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=True)
-        print('name', str_name, 'role', str_role)
-
-        # get rid of doubled double names
-
-        # get rid of 'Präsident stimmt nicht Président ne vote pas'
-        if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name:
-            if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi):
-                print('get rid of Präsident stimmt nicht, Président ne vote pas', list_oi)
-                str_role = ''
-
-        # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":'
-        if str_name:
-            if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1:
-                print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi)
-                str_name = ''
-
-        # if a name has been found, add it to XML_new
-        if str_name or str_role:
-            # add attribute speech_start to textbox
-            XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start'
-
-            # add speaker to first textline
-            XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, int_uniqueID, str_canton)
-            # TODO: split speaker from text (check on which line and split that line accordingly)
-#            dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text],
-#                                                                              text[colon_index_text+1:])
-
-            # set flag
-            this_is_speech = True
-            if bln_print:
-                print('found a name:', list_oi, str_name, str_role, '\n')
-        else:
-            # set flag
-            this_is_speech = False
-
-    return XML_new, this_is_speech
-# %%
-
-# function to extract votation paragraphs
-# !!! error prone, possible improvements see notebook extract_discussions
-# input:
-# - XML_new:
-# - text: string
-# - list_votationterms: list of votation terms
-# - bln_print: whether to print during execution, default False
-# output:
-# - XML_new: updated
-def label_votations(XML_new, text, list_votationterms, bln_print=True):
-
-    # get first terms of that text
-    list_oi = tokenizer.tokenize(text)[:15]
-#    if len(set(list_oi).intersection(set(list_votationterms))) > 1:
-    # if there is an overlap with typical votation terms:
-    if set(list_oi).intersection(set(list_votationterms)):
-        # add attribute vote to textbox
-        XML_new[ind_p][ind_t].attrib['text_type'] = 'vote'
-
-        # set flag
-        this_is_vote = True
-        if bln_print:
-            print('found a vote:', list_oi)
-    else:
-        #pass
-        # set flag
-        this_is_vote = False
-        if bln_print:
-            print('not a vote', list_oi)
-
-    return XML_new, this_is_vote
-
-#%%
-
-
-def label_speechcont(XML_new):
-
-    XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_cont'
-
-    return XML_new
-
-#%%
-# two functions for language identification
-# Author: Luis Salamanca
-# small modifications by Lili Gasser
-# Using stopwords
-# input:
-# - text: string
-# - valid_lang: tuple of valid languages
-# output:
-# - dict_language_counts: dictionary of stopword counts for each valid language
-def identify_language(text, valid_lang = ('german', 'french', 'italian')):
-
-    # tokenize
-    tokens = text.split(' ')
-    # all lowercase
-    test_words = [word.lower() for word in tokens]
-    # make a set
-    test_words_set = set(test_words)
-
-    # initialize dictionary of language elements
-    dict_language_counts = {}
-
-    # iterate through languages of stopwords
-    for language in stopwords.fileids():
-        if language in valid_lang:
-            # get stopword set
-            stopwords_set = set(stopwords.words(language))
-            # get intersection between text of interest and stopword set for this language
-            common_elements = test_words_set.intersection(stopwords_set)
-            # save number of common elements to dictionary
-            dict_language_counts[language] = len(common_elements)
-
-    return dict_language_counts
-
-
-# Simply, given the number of occurrences of the stopwords, it assigns a label
-# to a specific textbox, also considering the possibility of textboxes
-# mixing languages. For this case, the value ratio_similar is intended
-# input:
-# - XML_new: XML file to update
-# - aux_dict_l: corresponds to dict_language_counts
-# output:
-# - lang_max: string
-def label_language(XML_new, aux_dict_l):
-
-    # specify a similarity ratio
-    ratio_similar = 0.8
-    # if there are counts, determine language
-    if sum(aux_dict_l.values()):
-        aux_dict_l_norm = {k: v / total for total in (sum(aux_dict_l.values()),) for k, v in aux_dict_l.items()}
-        lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key]))
-        lang_max = ''
-        count_l = 0
-        for lang in aux_dict_l_norm.keys():
-            if (aux_dict_l_norm[lang] > aux_dict_l_norm[lang_max_aux] * ratio_similar):
-                if count_l > 0:
-                    lang_max += '_'
-                lang_max += lang
-                count_l += 1
-        if count_l > 1:
-            lang_max = 'mixed_' + lang_max
-    else:
-        lang_max = 'languageNotIdentified'
-
-    # add attribute to textbox
-    XML_new[ind_p][ind_t].attrib['language'] = lang_max
-
-    return XML_new
-
-
-#%%
-
-int_uniqueID = (123, 123)
-print(type(int_uniqueID))
-print(isinstance(int_uniqueID, tuple))
-
-
-tpl = ()
-tpl2 = (tpl, 2)
-tpl2
-tpl.append(2)
-
-
-lst = []
-list2 = [lst, 2]
-list2
-lst.append(2)
-lst
-
-
-lst
-tuple(lst)
-(1, 2, lst, 3)
-('a', 'b', 'c', lst)
-
-[2]
-[1, 2, 3].append([4])
-lst = [1,2,3]
-lst
-lst.append(3)
-lst
-lst.append([4, 5, 6])
-lst
-len(lst)
-set(lst)
-
-
-lst = [2, 3]
+print(dict_overlaps.keys())
+print(dict_overlaps[1891])
-list_temptemp = []
-for item in lst:
-    list_temptemp.extend(item)
+with open(path_data + 'dict_overlaps.pickle', 'wb') as f:
+    pickle.dump(dict_overlaps, f)
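+# the combined dictionary saved here is read back by Document.annotate_xml
+# (see the def_classes.py changes below)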
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 041d1b66a7ca485b37934aab52aacf3a45d4acf7..4e55da1317e3683ec3799e015085be48a3b57f55 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -724,13 +724,40 @@ class Document:
         self.name_outmeta = name_outmeta
 
         utils_proc.tar_extractfile(self.name_meta_corr[1], self.folder_database, name_file = self.name_outmeta)
-        (str_council, str_date) = utils_annot.get_council_and_date(self.name_meta_corr[1])
+        (self.str_council, self.str_date) = utils_annot.get_council_and_date(self.name_meta_corr[1])
 
         command = 'rm -rf ./' + str(self.year)
         #print(command)
         utils_proc.call_with_out(command)
 
-        return (str_council, str_date)
+
+    # function to find the first and last textbox of a document (used to exclude textboxes overlapping with neighboring documents)
+    # input:
+    # - dict_overlaps: dictionary with overlaps
+    # output:
+    # - (first_entry, last_entry): tuple of first and last textbox id
+    def get_first_last_textbox(self, dict_overlaps):
+
+        # get yearly dictionary
+        dict_overlaps_year = dict_overlaps[self.year]
+
+        # initialize to sentinel values used when no overlap is found
+        first_entry = -1
+        last_entry = 1000
+
+        # get index of textbox from first and last page
+        # the overlap dictionary only contains an entry if an overlap was detected
+        if self.id_doc in dict_overlaps_year.keys():
+            for entry, array in dict_overlaps_year[self.id_doc].items():
+                if entry == 'first':
+                    first_entry = int(array[0])
+                if entry == 'last':
+                    last_entry = int(array[0])
+
+        return (first_entry, last_entry)
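+
+    # assumed layout of dict_overlaps, as consumed above (values illustrative):
+    #   dict_overlaps[1893]['20026528'] == {'first': array([3]), 'last': array([41])}
+    # i.e. per year and document id, an array holding the id of the overlapping
+    # textbox on the document's first respectively last page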
+
+
 
 
     def annotate_xml(self, flag_save = 1, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', name_outannotxml='05_annotatedxml'):
@@ -764,17 +791,25 @@
             print('we have a main corr XML file')
 
             # get council and date
-            (str_council, str_date) = self.get_council_date()
-            self.str_council = str_council
-            self.str_date = str_date
+            self.get_council_date()
+
+            # get start and end of document
+            path_data = '/home/lili/NLP_DemocraSci/data_from_nlp-democracy/results_overlap/'
+            with open(path_data + 'dict_overlaps.pickle', 'rb') as f:
+                dict_overlaps = pickle.load(f)
+            self.entries = self.get_first_last_textbox(dict_overlaps)
+            print(self.entries)
 
             # file to track speakers
             self.name_speakers = '_'.join((str(self.year), self.id_doc, 'speakers.txt'))
-            with open('data/lists/speakers/' + self.name_speakers, 'w') as f:
-                f.write(' '.join((str(self.year), self.id_doc, str_date, '\n')))
+            path_speakers = 'data/lists/speakers/'
+            if not os.path.exists(path_speakers):
+                os.makedirs(path_speakers)
+            with open(path_speakers + self.name_speakers, 'w') as f:
+                f.write(' '.join((str(self.year), self.id_doc, self.str_date, '\n')))
 
             #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
-            XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, self.name_speakers, bln_print=False)
+            XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, self.name_speakers, self.entries, bln_print=False)
             self.XML_main_annot = XML_main_annot
 
             # save xml file
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 512cc94b0289e23292c75a59926ed6008973b885..812e0ff5ee2419f17bcd58fbb2868f64ae4fb1c5 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -115,7 +115,7 @@ with open(input_notnames) as f:
     list_notnames = [term.rstrip() for term in list_notnames]
 
 # to test for one file
-file_tarpath = './1936/20031986_datacorr.xml'
+file_tarpath = './1936/20031982_datacorr.xml'
 
 id_doc = file_tarpath.split('/')[-1][:8]
 
@@ -123,6 +123,7 @@ id_doc = file_tarpath.split('/')[-1][:8]
 
 infile_aux = year + '/' + id_doc + '.pdf'
 file_doc = defc.Document(infile_aux, folder_database)
 
+
 if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
     print(id_doc + '\n')
 
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 8e8f554f55a3104cd32874d39502269394382d49..a4cf53ff2146fbc49add9c4fb28657e71c39c3cc 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -84,7 +84,7 @@ def get_text(sometext):
 
 
 # function to annotate corrected XML
-def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, str_file_speakers, bln_print=False):
+def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, str_file_speakers, entries, bln_print=False):
 
     # list of votation terms
     list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt',
@@ -100,6 +100,7 @@ def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_da
 
     # create new XML as a copy of the corrected one
    XML_new = copy.deepcopy(XML_root)
+    last_page = len(XML_root)
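+    # index bookkeeping: the 'last' overlap entry is only checked on the final
+    # page (ind_p == last_page - 1) in the page loop below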
 
     # initialize flags to distinguish speeches from votes
     this_is_speech = False
@@ -108,10 +109,24 @@
 
     # for every page
     for ind_p, page in enumerate(XML_root):
+        print('page index:', ind_p)
         if bln_print:
             print(page.tag, page.attrib)
+
         # for every textbox on that page
         for ind_t, textbox in enumerate(page):
+
+            try:
+                if ind_p == 0 and entries[0] == int(textbox.attrib['id']):
+                    with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+                        f.write(' '.join(('<<<=====================', 'the document starts here', '\n\n')))
+
+                if ind_p == last_page - 1 and entries[1] == int(textbox.attrib['id']):
+                    with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+                        f.write(' '.join(('=====================>>>', 'the document ends here', '\n\n')))
+            except KeyError:
+                pass
+
             if (textbox.tag == 'textbox'):
                 if 'type_textbox' in textbox.attrib.keys():
                     if (textbox.attrib['type_textbox'] == 'text'):
diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh
index e769de17cdb4db4b746d143be02e283bcf0e9e50..9495dc345a9e7425d8adfc2de7e9bd62206e128a 100755
--- a/src/sh/extract_discussions_yearly.sh
+++ b/src/sh/extract_discussions_yearly.sh
@@ -6,5 +6,6 @@ year_end=1891
 
 for year in $(seq $year_start $year_end)
 do
     echo $year
+    # renku run --isolation python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/lists/not_names.txt data/AB/${year}/05_annotatedxml.tar.gz
 done