diff --git a/src/python/bla_tryreadxml.py b/src/python/bla_tryreadxml.py index dcee82d9c9b6c507aafaa81df11b0b16335c710f..dfeaa6f14cdb06002c1cc87e67ef2b40fd53f7d1 100644 --- a/src/python/bla_tryreadxml.py +++ b/src/python/bla_tryreadxml.py @@ -1,14 +1,58 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- #%% +%load_ext autoreload +%autoreload 2 + import xml.etree.ElementTree as ET import re +import pickle +import string +from nltk.corpus import stopwords +from nltk.tokenize import RegexpTokenizer +import copy + +import sys +sys.path.append('src/python/') +import utils_annot -xml_file = '/home/lili/Downloads/1891/20026448_datacorr.xml' +tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+') + +xml_file = 'data/AB/1893/1893/20026528_datacorr.xml' +input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle" XML_tree = ET.parse(xml_file) XML_root = XML_tree.getroot() +# list of stopwords +list_stopwords = stopwords.words('german') +list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr']) +list_stopwords.extend(stopwords.words('french')) +list_stopwords.extend(['ils', 'les', 'celle']) + +# add a few terms to list_stopwords that are easily mistaken as last names +list_stopwords.extend(['art', 'rath', 'alinea', 'stimmen', 'stimme', 'hans', 'walter', 'werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'gallen', 'stgallen', + 'kasse', 'fasse', 'sitten', 'herren', 'herr', 'alter']) + +# list of votation terms +# TODO: make it work for é, etc. +list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt', + 'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)', + 'Votation', 'Vote', 'votation', #'(Adopt�s)', 'adopt�s', 'adopt�e', 'rejet�e', + "D'accord", 'voix'] + +# open dataframe of last names from pickle file +with open(input_lastnames, 'rb') as f: + df_lastnames = pickle.load(f) + +#%% +# create new XML as a copy of the corrected one +XML_new = copy.deepcopy(XML_root) + +# initialize flags +this_is_speech = False +prev_is_speech = False +this_is_vote = False # for every page for ind_p, page in enumerate(XML_root): @@ -20,12 +64,34 @@ for ind_p, page in enumerate(XML_root): if (textbox.attrib['type_textbox'] == 'text'): print(textbox.tag, textbox.attrib) - # for every textline in that textbox - for ind_tl, textline in enumerate(textbox): - if textline.tag == 'textline': - print(textline.tag, textline.attrib) + textbox_texttype = get_textbox_type(textbox) + print(textbox_texttype) + + if textbox_texttype in ['text_col1', 'text_col2']: + + complete_text = get_complete_text(textbox) + print(complete_text) + XML_new, this_is_speech = label_speechstart(XML_new, complete_text, df_lastnames, list_stopwords, bln_print=False) + if this_is_speech: + prev_is_speech = True + print('stopped after finding speech start') + continue + XML_new, this_is_vote = label_votations(XML_new, complete_text, list_votationterms, bln_print=False) + if this_is_vote: + prev_is_speech = False + print('stopped after finding vote') + continue + if prev_is_speech and (not this_is_vote): + XML_new = label_speechcont(XML_new) + + print('go to next textbox \n') + + +name_xml = 'data/AB/1893/id_doc_previewannotated.xml' +tree = ET.ElementTree(XML_new) +tree.write(name_xml, encoding = 'utf-8') + - print(get_text(textline.text)) #%% sometext = '[font face="8.071" size="Times-Bold"]Für die Bedaktion verantwortlich :[/font][font face="7.973" size="Times-BoldItalic"] Sud. SdMarst[/font][font face="8.071" size="Times-Bold"] —• Druck und Expedition von[/font][font face="7.973" size="Times-BoldItalic"] Jmi è Éeineft[/font][font face="8.071" size="Times-Bold"] fa[/font][font face="7.973" size="Times-BoldItalic"] Seìrit. [/font]' @@ -42,7 +108,193 @@ def get_text(sometext): get_text(sometext) #%% +# helper function to get type of textbox_type +# corresponds to majority vote of types of textlines +# input: +# - textbox +# output: +# - textbox_type: string +def get_textbox_type(textbox): + + # initialize empty dictionary + dict_type = {} + + # for every textline in that textbox + for ind_tl, textline in enumerate(textbox): + if textline.tag == 'textline': +# print(textline.tag, textline.attrib) + + # count types + if textline.attrib['type'] not in dict_type.keys(): + dict_type[textline.attrib['type']] = 1 + else: + dict_type[textline.attrib['type']] += 1 + +# print(dict_type) + # list of all types with maximum count + list_types = [type for type, count in dict_type.items() if count == max(dict_type.values())] +# print(list_types) + # if only one with maximum value + if len(list_types) == 1: + textbox_type = list_types[0] + # if several with same maximum value + else: + textbox_type = 'notdistinct' + return textbox_type +#%% + +# helper function to get complete text of a textbox +# input: +# - textbox +# output: +# - complete_text: string +def get_complete_text(textbox): + + # helper function to get text without font information + def get_text(sometext): + newtext = '' + for text in re.findall('\].*?\[',sometext): + #print(text) + if text.startswith(']') and text.endswith('['): + newtext += text[1:-1] + #print(newtext) + return newtext + # initialize empty string + complete_text = '' + + # for every textline in that textbox + for ind_tl, textline in enumerate(textbox): + if textline.tag == 'textline': + # append text to string + complete_text += get_text(textline.text) + + return complete_text + + +#%% + +# function to label speech starts +# input: +# - text: stringt to be analyzed +# - df_names: dataframe of politicians +# - list_stopwords: list of german and french stopwords +# - bln_print: whether to print during execution, default False +# output: +# - (str_name, str_role, int_uniqueID, str_canton): tuple with strings and ID +def label_speechstart(XML_new, text, df_names, list_stopwords, bln_print=False): + + # initialize strings and ID + str_name = '' + str_role = '' + int_uniqueID = int(0) + str_canton = '' + + # very consistently, a speaker can be identified by looking for a colon + # at the beginning of a textbox and identifiying a name or a role in front + # of that colon + if ':' in text[:100]: + # extract the index of the colon in the text + colon_index_text = text.index(':') + + # look at first few terms of that textbox + text_start = re.sub(r'[\(\)]','',text[:colon_index_text]) + list_oi = tokenizer.tokenize(text_start) + print('possible speech start: ', list_oi) + + # remove stopwords + list_oi = [term for term in list_oi if term.lower() not in list_stopwords] + + # remove punctuation + list_oi = [''.join(c for c in s if c not in string.punctuation) for s in list_oi] + list_oi = [s for s in list_oi if s] + + # remove lower case terms +# list_oi = [term for term in list_oi if not term.islower()] + + # remove numbers + list_oi = [term for term in list_oi if not term.isdigit()] + + # remove single characters + list_oi = [term for term in list_oi if len(term)>1] + + # for every term, reversed finds canton before it finds name + for term in reversed(list_oi): + # if possible, find a name in a list + str_name, str_role, int_uniqueID, str_canton = utils_annot.find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=True) + print('name', str_name, 'role', str_role) + + # get rid of 'Präsident stimmt nicht Président ne vote pas' + if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name: + if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi): + print('get rid of Präsident stimmt nicht, Président ne vote pas', list_oi) + str_role = '' + + # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":' + if str_name: + if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1: + print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi) + str_name = '' + + # if a name has been found, add it to XML_new + if str_name or str_role: + # add attribute speech_start to textbox + XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start' + + # add speaker to first textline + XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, int_uniqueID, str_canton) + # TODO: split speaker from text (check on which line and split that line accordingly) +# dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text], +# text[colon_index_text+1:]) + + # set flag + this_is_speech = True + if bln_print: + print('found a name:', list_oi, str_name, str_role, '\n') + else: + # set flag + this_is_speech = False + + return XML_new, this_is_speech +# %% + +# function to extract votation paragraphs +# !!! error prone, possible improvements see notebook extract_discussions +# input: +# - XML_new: +# - text: string +# - list_votationterms: list of votation terms +# - bln_print: whether to print during execution, default False +# output: +# - XML_new: updated +def label_votations(XML_new, text, list_votationterms, bln_print=True): + + # get first terms of that text + list_oi = tokenizer.tokenize(text)[:15] +# if len(set(list_oi).intersection(set(list_votationterms))) > 1: + # if there is an overlap with typical votation terms: + if set(list_oi).intersection(set(list_votationterms)): + # add attribute vote to textbox + XML_new[ind_p][ind_t].attrib['text_type'] = 'vote' + + # set flag + this_is_vote = True + if bln_print: + print('found a vote:', list_oi) + else: + #pass + # set flag + this_is_vote = False + if bln_print: + print('not a vote', list_oi) + + return XML_new, this_is_vote + +#%% + + + +def label_speechcont(XML_new): + + XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_cont' - # for every text (actually just a letter) -# for ind_ch, text in enumerate(textline): -# print(ind_ch, text.text) #, len(tex0tline), len(XML_new[ind_p][ind_t][ind_tl])) + return XML_new diff --git a/src/python/def_classes.py b/src/python/def_classes.py index afe3f906d197b500d5f2317e974b888178689967..379fb44ac7257b7fcd6a1760b31af3cf11c34401 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -176,7 +176,7 @@ class Document: def correct_xml(self, flag_plots = 1, flag_parallel = 0, flag_save_figs = 1, pages = 'all', suffix_xml = '_data', name_outxml = '02_extractedxml', name_outcorrxml = '04_correctedxml', flag_save = 1): - + if 'name_outxml' not in self.__dict__.keys(): self.name_outxml = name_outxml @@ -204,8 +204,8 @@ class Document: XML_new = ET.Element('pages') - for ind_abs, ind_page in enumerate(self.n_pages): - + for ind_abs, ind_page in enumerate(self.n_pages): + XML_root = ET.Element('pages') #print(ind_abs,len(self.XML_main)) XML_root.append(self.XML_main[ind_abs]) @@ -237,7 +237,7 @@ class Document: # Top and bottom line ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), coord_horz.astype(np.uint32)) - #print(info_font_sizes) + #print(info_font_sizes) ##### # Label the textboxes based on a set of simple rules that make use of # the margins and the fontsizes @@ -263,7 +263,7 @@ class Document: im_met2 = plot_tools.plot_horzvertlines(imarray_textblock, coord_horz, coord_vert_def) im_met2 = plot_tools.plot_margins(im_met2, margins, ind_limits, gap_line = 1) im_met3, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) - im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) + im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) im_met5 = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page) # Create figure with 4 subplots, for showing all results @@ -308,13 +308,13 @@ class Document: name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml' tree = ET.ElementTree(XML_new) tree.write(name_xml, encoding = 'utf-8') - + if flag_save: name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outcorrxml) else: print('Not saving to tar') - name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' - + name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' + self.name_outcorrxml = name_outcorrxml self.name_xml_corr = [name_tar, name_xml] self._xml_ext(suffix_xml, self.name_outcorrxml) @@ -330,7 +330,7 @@ class Document: def _plot_generic_open(self, ind_page, suffix_xml, level_proc = 0, name_outxml = '02_extractedxml'): # ind_page has to be a scalar - + if 'imgobj' not in self.__dict__.keys(): self.pdf2imgobj() if 'XML_main' not in self.__dict__.keys(): @@ -342,8 +342,8 @@ class Document: XML_tree = ET.parse(h_xml) self.XML_main = XML_tree.getroot() else: - self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) - ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) + self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) + ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) else: #print('Run this') self._get_pages() @@ -351,11 +351,11 @@ class Document: #print(ind_abs, type(ind_abs)) #print(self.XML_main, len(self.imgobj)) - + if ind_page > (len(self.XML_main) - 1): flag_error = 1 return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error - + XML_root = ET.Element('pages') XML_root.append(self.XML_main[ind_abs[0]]) imarray = np.array(self.imgobj[ind_page]) @@ -392,32 +392,32 @@ class Document: if level_proc > 4: label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \ - preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) - - if level_proc > 5: - set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, - list_allcoords_textlines, margins) - - if level_proc > 6: - XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, - rescale_factor, centrall_ord, ind_page, dim_img) - + preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) + + if level_proc > 5: + set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, + list_allcoords_textlines, margins) + + if level_proc > 6: + XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, + rescale_factor, centrall_ord, ind_page, dim_img) + # The last value returned is only to say that there was not any error during the execution. Before, if there are too many pages, we # send a 1 instead flag_error = 0 return imarray, margins, ind_limits, label_textlines, list_allcoords_textlines, \ set_of_blocks, XML_enrich, bbox_page, XML_root, ind_abs, flag_error - + def _plot_obtainfromxml(self, ind_page, suffix_xml, name_outcorrxml = '04_correctedxml'): - + if 'imgobj' not in self.__dict__.keys(): self.pdf2imgobj() - if 'XML_main_corr' not in self.__dict__.keys(): + if 'XML_main_corr' not in self.__dict__.keys(): name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' if os.path.isfile(name_tar): name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' #print(name_xml) - if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: + if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: #print('Run this') h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) XML_tree = ET.parse(h_xml) @@ -426,13 +426,13 @@ class Document: print('You need to have the tar file to use flag_compute = 0!') flag_error = 1 return 0, 0, 0, 0, 0, 0, flag_error - #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) + #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) else: print('You need to have the tar file to use flag_compute = 0!') flag_error = 1 return 0, 0, 0, 0, 0, 0, flag_error - #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) - ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) + #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) + ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) else: #print('Run this') self._get_pages() @@ -440,19 +440,19 @@ class Document: #print(ind_abs, type(ind_abs)) #print(self.XML_main, len(self.imgobj)) - + if ind_page > (len(self.XML_main_corr) - 1): flag_error = 1 return 0, 0, 0, 0, 0, 0, flag_error - + XML_root = ET.Element('pages') XML_root.append(self.XML_main_corr[ind_abs[0]]) imarray = np.array(self.imgobj[ind_page]) - + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - dim_img = imarray.shape[:2] - - ###### + dim_img = imarray.shape[:2] + + ###### # For obtaining label_textlines, list_allcoords_textlines coord_textline = np.array([]).reshape((4,0)) label_textlines = dict() @@ -465,7 +465,7 @@ class Document: if 'type' in XML_root[0][ind_el][ind_line].attrib: coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1) - + type_textl = XML_root[0][ind_el][ind_line].attrib['type'] #print(ind_el) if XML_root[0][ind_el].attrib['type_textbox'] == 'line': @@ -482,18 +482,18 @@ class Document: aux_type = np.array([count]) label_textlines[type_textl] = aux_type count += 1 - + coord_textline, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, coord_textline, bbox_page) - - ##### + + ##### # To obtain set_of_blocks. This variable simply contains the coordinates, and - # then a final row indicating the order (here are already ordered), and if it - # is a line, which is indicated with a -1 + # then a final row indicating the order (here are already ordered), and if it + # is a line, which is indicated with a -1 set_of_blocks_aux = np.concatenate((coord_textline, np.array(vec_textline_lines).reshape((1,-1))), axis = 0) set_of_blocks = dict() set_of_blocks[0] = set_of_blocks_aux #print(set_of_blocks.shape) - + # The last is the flag_error #print(imarray.shape, len(label_textlines), coord_textline.shape, len(set_of_blocks), # len(XML_root), bbox_page.shape) @@ -501,7 +501,7 @@ class Document: return imarray, label_textlines, coord_textline, set_of_blocks, XML_root, bbox_page, flag_error # imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error # imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error - + def plot_orig_textb(self, range_pages = range(1), suffix_xml = '_data', @@ -519,8 +519,8 @@ class Document: else: imarray_textblock, _ = self._draw_textbl(imarray = imarray, XML_root = XML_root) - self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, - flag_plot, flag_save_figs) + self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, + flag_plot, flag_save_figs) self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, flag_plot, flag_save_figs) @@ -537,24 +537,24 @@ class Document: name_outxml = self.name_outxml) if flag_error: print(str(ind_page) + ': non existing page!') - else: + else: im_met = plot_tools.plot_margins(imarray, margins, ind_limits, gap_line = 1) self._plot_save(im_met, 'Page margins', 'Margins', ind_page, self.path_file, - flag_plot, flag_save_figs) - - def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data', + flag_plot, flag_save_figs) + + def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_legend = 1): - + if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml + self.name_outxml = name_outxml if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - + self.name_outcorrxml = name_outcorrxml + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): + if flag_compute or not os.path.isfile(name_tar): imarray, _, _, label_textlines, list_allcoords_textlines, _, _, _, _, _, flag_error = \ self._plot_generic_open(ind_page, suffix_xml, level_proc = 5, name_outxml = self.name_outxml) @@ -562,70 +562,70 @@ class Document: else: imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) #print(len(array_elements)) - + if flag_error: print(str(ind_page) + ': non existing page!') - else: - im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) + else: + im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) self._plot_save_labels(im_met, 'Textlines labelled', 'TextlLabel', ind_page, groups, colors, self.path_file, flag_plot, flag_save_figs, flag_legend) - - - def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data', + + + def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', name_outcorrxml = '04_correctedxml', flag_compute = 0): - + if 'name_outxml' not in self.__dict__.keys(): self.name_outxml = name_outxml if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - + self.name_outcorrxml = name_outcorrxml + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): + if flag_compute or not os.path.isfile(name_tar): imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error = \ self._plot_generic_open(ind_page, suffix_xml, level_proc = 6, name_outxml = self.name_outxml) - else: + else: imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error \ - = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) - + = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) + #print(set_of_blocks) if flag_error: print(str(ind_page) + ': non existing page!') - else: - im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) + else: + im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) self._plot_save(im_met, 'Textlines ordered', 'TextlOrder', ind_page, self.path_file, - flag_plot, flag_save_figs) - - def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data', + flag_plot, flag_save_figs) + + def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_lines_textl = 1): # flag_lines_textl, if 1, plots lines and textboxes, if 2, only lines, if 3, only textboxes if 'name_outxml' not in self.__dict__.keys(): self.name_outxml = name_outxml if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - + self.name_outcorrxml = name_outcorrxml + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): + if flag_compute or not os.path.isfile(name_tar): imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error = \ self._plot_generic_open(ind_page, suffix_xml, level_proc = 7, name_outxml = self.name_outxml) - else: + else: imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_enrich, bbox_page, flag_error \ - = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) - + = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) + if flag_error: print(str(ind_page) + ': non existing page!') - else: + else: im_met = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page, flag_lines_textl) self._plot_save(im_met, 'XML corrected', 'XMLcorrect', ind_page, self.path_file, flag_plot, flag_save_figs) - + def _plot_save(self, im_met, str_title, str_name, ind_page, folder_save = '', flag_plot = 1, flag_save_figs = 0, dpi = 200): if flag_plot: @@ -638,8 +638,8 @@ class Document: name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) + '_page' + str(ind_page) + '.' + format_fig) fig.savefig(name_fig, format = format_fig, dpi = dpi) - plt.close(fig) - + plt.close(fig) + def _plot_save_labels(self, im_met, str_title, str_name, ind_page, groups, colors, folder_save = '', flag_plot = 1, flag_save_figs = 0, flag_legend = 1, dpi = 200): #print(groups) @@ -654,17 +654,17 @@ class Document: coords[0] = in_coord coords[1] += int(im_met.shape[1]/1.5) coords[2] = in_coord + 10 - coords[3] += int(im_met.shape[1]/1.5) - im_met = plot_tools.lines_box(im_met, coords, colors[ind_g], thick_line = 6) + coords[3] += int(im_met.shape[1]/1.5) + im_met = plot_tools.lines_box(im_met, coords, colors[ind_g], thick_line = 6) coords[0] += inc_page coords[2] += inc_page - + if flag_plot: fig, axes = plt.subplots(1, 1, figsize=(8, 10)) axes.axis('off') - axes.imshow(im_met) + axes.imshow(im_met) plt.title(str_title) - + if flag_legend: coords = in_coord + np.array([0, 0, 10, 10]) flag_notinto = 1 @@ -674,20 +674,20 @@ class Document: coords[0] = in_coord coords[1] += int(im_met.shape[1]/1.5) coords[2] = in_coord + 10 - coords[3] += int(im_met.shape[1]/1.5) - plt.text(coords[1] + 10, coords[2], i_g, fontsize = 10, va = 'bottom', ha = 'left') + coords[3] += int(im_met.shape[1]/1.5) + plt.text(coords[1] + 10, coords[2], i_g, fontsize = 10, va = 'bottom', ha = 'left') coords[0] += inc_page - coords[2] += inc_page - + coords[2] += inc_page + if flag_save_figs: format_fig = 'png' - name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) + name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) + '_page' + str(ind_page) + '.' + format_fig) fig.savefig(name_fig, format = format_fig, dpi = dpi) - plt.close(fig) - - - + plt.close(fig) + + + def check_discussion(self): @@ -732,6 +732,7 @@ class Document: print('we have a main corr XML file here') #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml) XML_corr = utils_annot.get_text_corrected(self.XML_main_corr) + self.XML_corr = XML_corr diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 37ca9694c284a5c9bdbde04f189188147e31c6a6..b5cde5d7b26a4758a412c9174a23877f0c5cac27 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -20,10 +20,10 @@ import utils_annot #%% # needed for running in atom, can be ignored -input_lastnames = "data/politicians/lastnames/1891_lastnames.pickle" -input_correctedxml = "/home/lili/Downloads/1891/04_correctedxml.tar.gz" -input_correctedmeta = "data/AB/1891/03_correctedmeta.tar.gz" -output_annotatedxml = "data/AB/1891/05_annotatedxml.tar.gz" +input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle" +input_correctedxml = "data/AB/1893/04_correctedxml.tar.gz" +input_correctedmeta = "data/AB/1893/03_correctedmeta.tar.gz" +output_annotatedxml = "data/AB/1893/05_annotatedxml.tar.gz" # detect arguments #input_lastnames = sys.argv[1] @@ -98,21 +98,21 @@ with open(input_lastnames, 'rb') as f: df_lastnames.columns #%% -file_tarpath = './1891/20026448_datacorr.xml' +file_tarpath = './1893/20026528_datacorr.xml' -file_number = file_tarpath.split('/')[-1][:8] -metafile_tarpath = './{}/{}{}.xml'.format(year, file_number, suffix_correctedmeta) +id_doc = file_tarpath.split('/')[-1][:8] +metafile_tarpath = './{}/{}{}.xml'.format(year, id_doc, suffix_correctedmeta) # instantiate document object (always from original pdf) -infile_aux = year + '/' + file_number + '.pdf' +infile_aux = year + '/' + id_doc + '.pdf' file_doc = defc.Document(infile_aux, folder_database) -file_doc - +file_doc.name_xml_corr if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']): - print(file_number + '\n') + print(id_doc + '\n') file_doc.annotate_speakers() + file_doc.XML_corr[0][3] #%% diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index a9f27fa6c8b620bf4a6af2d208998e849dd32c92..ebc3804fe80337d63b72299cc981ed7122bf3e15 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -78,9 +78,8 @@ def get_text_corrected(XML_root): for ind_tl, textline in enumerate(textbox): if textline.tag == 'textline': print(textline.tag, textline.attrib) - print(len(textline.text)) print(get_text(textline.text)) -# print('this is text', textline.text) + XML_new[ind_p][ind_t][ind_tl].text = get_text(textline.text) @@ -425,7 +424,7 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln print('*******************', str_name, term_approx) - return str_name, str_role, int_uniqueID, find_names + return str_name, str_role, int_uniqueID, str_canton