diff --git a/.gitattributes b/.gitattributes index 69c57ff2c0190283158eabd6960d27fe9cffc6ed..c9a687eeb7fb05ad93c356eb8fe3c2e6c148236e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -166,3 +166,5 @@ data/AB/1967/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1968/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1969/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/train_NER/20190109_train_NER.tar.gz filter=lfs diff=lfs merge=lfs -text +data/AB/1970/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text +data/AB/1971/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 81899c1deb45d25dae2346fbaba8fcb57610eabe..f17696717cfd74906dad9a8b441cc8f59bf68623 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -24,7 +24,7 @@ image_build: dot: stage: build - image: renku/renku-python:0.2.0 + image: renku/renku-python:v0.3.3 script: - renku log --format dot $(git ls-files --no-empty-directory --recurse-submodules) > graph.dot artifacts: diff --git a/.renku/workflow/3c8d4b7f4a2e4742b98b1e1cbd1aa493_python.cwl b/.renku/workflow/3c8d4b7f4a2e4742b98b1e1cbd1aa493_python.cwl new file mode 100644 index 0000000000000000000000000000000000000000..752abc673e19029f2e3ee7aa0b3083755eba0533 --- /dev/null +++ b/.renku/workflow/3c8d4b7f4a2e4742b98b1e1cbd1aa493_python.cwl @@ -0,0 +1,51 @@ +arguments: [] +baseCommand: +- python +class: CommandLineTool +cwlVersion: v1.0 +hints: [] +inputs: + input_1: + default: + class: File + path: ../../src/python/run_correctxml.py + inputBinding: + position: 1 + separate: true + shellQuote: true + streamable: false + type: File + input_2: + default: + class: File + path: ../../data/AB/1970/02_extractedxml.tar.gz + inputBinding: + position: 2 + separate: true + shellQuote: true + streamable: false + type: File + input_3: + default: data/AB/1970/04_correctedxml.tar.gz + inputBinding: + position: 3 + separate: true + shellQuote: true + streamable: false + type: string +outputs: + output_0: + outputBinding: + glob: $(inputs.input_3) + streamable: false + type: File +permanentFailCodes: [] +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entry: '$({"listing": [], "class": "Directory"})' + entryname: data/AB/1970 + writable: true +successCodes: [] +temporaryFailCodes: [] diff --git a/.renku/workflow/bc7b832a372149e2986b571e0e8fd144_python.cwl b/.renku/workflow/bc7b832a372149e2986b571e0e8fd144_python.cwl new file mode 100644 index 0000000000000000000000000000000000000000..db26d337488a99b3b883ef2a729eefaed948aa48 --- /dev/null +++ b/.renku/workflow/bc7b832a372149e2986b571e0e8fd144_python.cwl @@ -0,0 +1,51 @@ +arguments: [] +baseCommand: +- python +class: CommandLineTool +cwlVersion: v1.0 +hints: [] +inputs: + input_1: + default: + class: File + path: ../../src/python/run_correctxml.py + inputBinding: + position: 1 + separate: true + shellQuote: true + streamable: false + type: File + input_2: + default: + class: File + path: ../../data/AB/1971/02_extractedxml.tar.gz + inputBinding: + position: 2 + separate: true + shellQuote: true + streamable: false + type: File + input_3: + default: data/AB/1971/04_correctedxml.tar.gz + inputBinding: + position: 3 + separate: true + shellQuote: true + streamable: false + type: string +outputs: + output_0: + outputBinding: + glob: $(inputs.input_3) + streamable: false + type: File +permanentFailCodes: [] +requirements: +- class: InlineJavascriptRequirement +- class: 
InitialWorkDirRequirement + listing: + - entry: '$({"listing": [], "class": "Directory"})' + entryname: data/AB/1971 + writable: true +successCodes: [] +temporaryFailCodes: [] diff --git a/data/AB/1970/04_correctedxml.tar.gz b/data/AB/1970/04_correctedxml.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..66ffa3eb3585962e60062c84e964b6c29d1a892e --- /dev/null +++ b/data/AB/1970/04_correctedxml.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcfe774bc163f3d2507bdd42e18ed08efc1c80e601bfb579e7e7ee2496b1cf87 +size 9925496 diff --git a/data/AB/1971/04_correctedxml.tar.gz b/data/AB/1971/04_correctedxml.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..66cc5664e06f6b82cf70446543bc85ba5154dfc3 --- /dev/null +++ b/data/AB/1971/04_correctedxml.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8a222f55fcfcf9f0d28e0e27cd9e804f9401adfc8fd01ec31b6d2651598f34e +size 26675254 diff --git a/src/python/bla_tryreadxml.py b/src/python/bla_tryreadxml.py new file mode 100644 index 0000000000000000000000000000000000000000..082289b305219f6495943e8098e924dab8ea6723 --- /dev/null +++ b/src/python/bla_tryreadxml.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +#%% +%load_ext autoreload +%autoreload 2 + +import xml.etree.ElementTree as ET +import re +import pickle +import string +from nltk.corpus import stopwords +from nltk.tokenize import RegexpTokenizer +import copy + +import sys +sys.path.append('src/python/') +import utils_annot + +tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+') + +xml_file = 'data/AB/1893/1893/20026528_datacorr.xml' +input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle" + +XML_tree = ET.parse(xml_file) +XML_root = XML_tree.getroot() + +# list of stopwords +list_stopwords = stopwords.words('german') +list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr']) +list_stopwords.extend(stopwords.words('french')) +list_stopwords.extend(['ils', 'les', 'celle']) + +# add a few terms to list_stopwords that are easily mistaken as last names +list_stopwords.extend(['art', 'rath', 'alinea', 'stimmen', 'stimme', 'hans', 'walter', 'werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'gallen', 'stgallen', + 'kasse', 'fasse', 'sitten', 'herren', 'herr', 'alter']) + +# list of votation terms +# TODO: make it work for é, etc. 
+list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt', + 'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)', + 'Votation', 'Vote', 'votation', #'(Adopt�s)', 'adopt�s', 'adopt�e', 'rejet�e', + "D'accord", 'voix'] + +# open dataframe of last names from pickle file +with open(input_lastnames, 'rb') as f: + df_lastnames = pickle.load(f) + +#%% +# create new XML as a copy of the corrected one +XML_new = copy.deepcopy(XML_root) + +# initialize flags to distinguish speeches from votes +this_is_speech = False +prev_is_speech = False +this_is_vote = False + +# for every page +for ind_p, page in enumerate(XML_root): + print(page.tag, page.attrib) + # for every textbox on that page + for ind_t, textbox in enumerate(page): + if (textbox.tag == 'textbox'): + if 'type_textbox' in textbox.attrib.keys(): + if (textbox.attrib['type_textbox'] == 'text'): + print(textbox.tag, textbox.attrib) + + # get complete text of that textbox + complete_text = get_complete_text(textbox) + + # identify and label language in XML + dict_lang = identify_language(complete_text) + XML_new = label_language(XML_new, dict_lang) + + # get texttype of that textbox by majority vote + textbox_texttype = get_textbox_type(textbox) + print(textbox_texttype) + + if textbox_texttype in ['text_col1', 'text_col2']: + + print(complete_text) + XML_new, this_is_speech = label_speechstart(XML_new, complete_text, df_lastnames, list_stopwords, bln_print=False) + if this_is_speech: + prev_is_speech = True + print('stopped after finding speech start') + continue + XML_new, this_is_vote = label_votations(XML_new, complete_text, list_votationterms, bln_print=False) + if this_is_vote: + prev_is_speech = False + print('stopped after finding vote') + continue + if prev_is_speech and (not this_is_vote): + XML_new = label_speechcont(XML_new) + + print('go to next textbox \n') + + +name_xml = 'data/AB/1893/id_doc_previewannotated.xml' +tree = ET.ElementTree(XML_new) +tree.write(name_xml, encoding = 'utf-8') + + + +#%% +sometext = '[font face="8.071" size="Times-Bold"]Für die Bedaktion verantwortlich :[/font][font face="7.973" size="Times-BoldItalic"] Sud. SdMarst[/font][font face="8.071" size="Times-Bold"] —• Druck und Expedition von[/font][font face="7.973" size="Times-BoldItalic"] Jmi è Éeineft[/font][font face="8.071" size="Times-Bold"] fa[/font][font face="7.973" size="Times-BoldItalic"] Seìrit. 
[/font]' + +#re.split('[ | ]', sometext) +def get_text(sometext): + newtext = '' + for text in re.findall('\].*?\[',sometext): + #print(text) + if text.startswith(']') and text.endswith('['): + newtext += text[1:-1] + #print(newtext) + return newtext +get_text(sometext) +#%% + +# helper function to get type of textbox_type +# corresponds to majority vote of types of textlines +# input: +# - textbox +# output: +# - textbox_type: string +def get_textbox_type(textbox): + + # initialize empty dictionary + dict_type = {} + + # for every textline in that textbox + for ind_tl, textline in enumerate(textbox): + if textline.tag == 'textline': +# print(textline.tag, textline.attrib) + + # count types + if textline.attrib['type'] not in dict_type.keys(): + dict_type[textline.attrib['type']] = 1 + else: + dict_type[textline.attrib['type']] += 1 + +# print(dict_type) + # list of all types with maximum count + list_types = [type for type, count in dict_type.items() if count == max(dict_type.values())] +# print(list_types) + # if only one with maximum value + if len(list_types) == 1: + textbox_type = list_types[0] + # if several with same maximum value + else: + textbox_type = 'notdistinct' + return textbox_type +#%% + +# helper function to get complete text of a textbox +# input: +# - textbox +# output: +# - complete_text: string +def get_complete_text(textbox): + + # helper function to get text without font information + def get_text(sometext): + newtext = '' + for text in re.findall('\].*?\[',sometext): + #print(text) + if text.startswith(']') and text.endswith('['): + newtext += text[1:-1] + #print(newtext) + return newtext + # initialize empty string + complete_text = '' + + # for every textline in that textbox + for ind_tl, textline in enumerate(textbox): + if textline.tag == 'textline': + # append text to string + complete_text += get_text(textline.text) + + return complete_text + + +#%% + +# function to label speech starts +# input: +# - text: stringt to be analyzed +# - df_names: dataframe of politicians +# - list_stopwords: list of german and french stopwords +# - bln_print: whether to print during execution, default False +# output: +# - (str_name, str_role, int_uniqueID, str_canton): tuple with strings and ID +# TODO: speakers with double get recognized twice (1893, 20026528, p2, Scherrer-Füllemann) +def label_speechstart(XML_new, text, df_names, list_stopwords, bln_print=False): + + # initialize strings and ID + str_name = '' + str_role = '' + int_uniqueID = int(0) + str_canton = '' + + # very consistently, a speaker can be identified by looking for a colon + # at the beginning of a textbox and identifiying a name or a role in front + # of that colon + if ':' in text[:100]: + # extract the index of the colon in the text + colon_index_text = text.index(':') + + # look at first few terms of that textbox + text_start = re.sub(r'[\(\)]','',text[:colon_index_text]) + list_oi = tokenizer.tokenize(text_start) + print('possible speech start: ', list_oi) + + # remove stopwords + list_oi = [term for term in list_oi if term.lower() not in list_stopwords] + + # remove punctuation + list_oi = [''.join(c for c in s if c not in string.punctuation) for s in list_oi] + list_oi = [s for s in list_oi if s] + + # remove lower case terms +# list_oi = [term for term in list_oi if not term.islower()] + + # remove numbers + list_oi = [term for term in list_oi if not term.isdigit()] + + # remove single characters + list_oi = [term for term in list_oi if len(term)>1] + + # for every term, reversed finds canton before 
it finds name + for term in reversed(list_oi): + # if possible, find a name in a list + str_name, str_role, int_uniqueID, str_canton = utils_annot.find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=True) + print('name', str_name, 'role', str_role) + + # get rid of doubled double names + + + # get rid of 'Präsident stimmt nicht Président ne vote pas' + if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name: + if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi): + print('get rid of Präsident stimmt nicht, Président ne vote pas', list_oi) + str_role = '' + + # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":' + if str_name: + if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1: + print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi) + str_name = '' + + # if a name has been found, add it to XML_new + if str_name or str_role: + # add attribute speech_start to textbox + XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start' + + # add speaker to first textline + XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, int_uniqueID, str_canton) + # TODO: split speaker from text (check on which line and split that line accordingly) +# dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text], +# text[colon_index_text+1:]) + + # set flag + this_is_speech = True + if bln_print: + print('found a name:', list_oi, str_name, str_role, '\n') + else: + # set flag + this_is_speech = False + + return XML_new, this_is_speech +# %% + +# function to extract votation paragraphs +# !!! error prone, possible improvements see notebook extract_discussions +# input: +# - XML_new: +# - text: string +# - list_votationterms: list of votation terms +# - bln_print: whether to print during execution, default False +# output: +# - XML_new: updated +def label_votations(XML_new, text, list_votationterms, bln_print=True): + + # get first terms of that text + list_oi = tokenizer.tokenize(text)[:15] +# if len(set(list_oi).intersection(set(list_votationterms))) > 1: + # if there is an overlap with typical votation terms: + if set(list_oi).intersection(set(list_votationterms)): + # add attribute vote to textbox + XML_new[ind_p][ind_t].attrib['text_type'] = 'vote' + + # set flag + this_is_vote = True + if bln_print: + print('found a vote:', list_oi) + else: + #pass + # set flag + this_is_vote = False + if bln_print: + print('not a vote', list_oi) + + return XML_new, this_is_vote + +#%% + + + +def label_speechcont(XML_new): + + XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_cont' + + return XML_new + +#%% +# two functions for language identification +# Author: Luis Salamanca +# small modifications by Lili Gasser +# Using stopwords +# input: +# - text: string +# - valid_lang: tuple of valid languages +# output: +# - dict_language_counts: dictionary of stopword counts for each valid language +def identify_language(text, valid_lang = ('german', 'french', 'italian')): + + # tokenize + tokens = text.split(' ') + # all lowercase + test_words = [word.lower() for word in tokens] + # make a set + test_words_set = set(test_words) + + # initialize dictionary of language elements + dict_language_counts = {} + + # iterate through languages of stopwords + for language in stopwords.fileids(): + if language in valid_lang: + # get stopword set + stopwords_set = set(stopwords.words(language)) + 
# get intersection between text of interest and stopword set for this language + common_elements = test_words_set.intersection(stopwords_set) + # save number of common elements to dictionary + dict_language_counts[language] = len(common_elements) + + return dict_language_counts + + +# Simply, given the number of ocurrences of the stopwords, it assigns a label +# to a specific textbox, also considering the possibility of textboxes +# mixing languages. For this case, the value ratio_similar is intended +# input: +# - XML_new: XML file to update +# - aux_dict_l: corresponds to dict_language_counts +# output: +# - lang_max: string +def label_language(XML_new, aux_dict_l): + + # specify a similarity ratio + ratio_similar = 0.8 + # if there are counts, determine language + if sum(aux_dict_l.values()): + aux_dict_l_norm = {k: v / total for total in (sum(aux_dict_l.values()),) for k, v in aux_dict_l.items()} + lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key])) + lang_max = '' + count_l = 0 + for lang in aux_dict_l_norm.keys(): + if (aux_dict_l_norm[lang] > aux_dict_l_norm[lang_max_aux] * ratio_similar): + if count_l > 0: + lang_max += '_' + lang_max += lang + count_l += 1 + if count_l > 1: + lang_max = 'mixed_' + lang_max + else: + lang_max = 'languageNotIdentified' + + # add attribute to textbox + XML_new[ind_p][ind_t].attrib['language'] = lang_max + + return XML_new + + + +#%% + +int_uniqueID = (123, 123) +print(type(int_uniqueID)) +print(isinstance(int_uniqueID, tuple)) + + +tpl = () +tpl2 = (tpl, 2) +tpl2 +tpl.append(2) + + +lst = [] +list2 = [lst, 2] +list2 +lst.append(2) +lst + + +lst +tuple(lst) +(1, 2, lst, 3) +('a', 'b', 'c', lst) + +[2] +[1, 2, 3].append([4]) +lst = [1,2,3] +lst +lst.append(3) +lst +lst.append([4, 5, 6]) +lst +len(lst) +set(lst) + + +lst = [2, 3] + +list_temptemp = [] +for item in lst: + list_temptemp.extend(item) diff --git a/src/python/def_classes.py b/src/python/def_classes.py index 8808fb4602673e53ee2b98b6f238c937df9630ac..cce7282200b409284b6b30a108244026631e6ab1 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -15,7 +15,7 @@ import matplotlib.pyplot as plt import numpy as np import xml.etree.ElementTree as ET import copy -import time +import time import tarfile import pickle @@ -33,12 +33,12 @@ import preproc_docs # Definition of classes and methods associated class Document: - + limit_year = 1950 flag_end_run = 1 name_inpdf = '00_rawpdfs' name_inmeta = '01_rawmeta' - + def __init__(self, input_file, folder_database): self.year = int(input_file.split('/')[-2]) self.id_doc = input_file.split('/')[-1].split('.')[0] @@ -48,14 +48,22 @@ class Document: self.name_wo_ext = os.path.splitext(self.name_file)[0] self.folder_database = folder_database self._meta_ext() - + self._xml_ext() + def _meta_ext(self): # Both for the correction and the extraction of the metadata information name_file = str(self.year) + '/' + self.id_doc + '.xml' name_file_db = str(self.year) + '/' + self.id_doc + '.db' name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz' - self.name_meta = [name_tar, name_file, name_file_db] - + self.name_meta = [name_tar, name_file, name_file_db] + + def _xml_ext(self, suffix_xml = '_data', name_outcorrxml = '04_correctedxml'): + # For the extraction, correction and annotation of the xmls + # TODO for extraction and annotation + name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml' + name_tar = self.folder_database + str(self.year) + '/' + name_outcorrxml + 
'.tar.gz' + self.name_xml_corr = [name_tar, name_xml] + def meta_correct(self, name_outmeta = '03_correctedmeta'): utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) utils_proc.tar_extractfile(self.name_meta[2], self.folder_database, name_file = self.name_inmeta) @@ -65,13 +73,13 @@ class Document: command = 'rm -rf ./' + str(self.year) #print(command) utils_proc.call_with_out(command) - + def pdf2imgobj(self, resolution = 100): - + self.resolution = resolution utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf) self.imgobj = convert_from_path(self.input_file, dpi = resolution) - command = 'rm -rf ./' + str(self.year) + command = 'rm -rf ./' + str(self.year) utils_proc.call_with_out(command) def _get_pages(self, pages = 'all'): @@ -82,7 +90,7 @@ class Document: elif isinstance(pages,str): self.n_pages = np.array(pages.split(',')).astype(np.uint32) else: - self.n_pages = np.array(pages) + self.n_pages = np.array(pages) def pdf2xml(self, pages = 'all', suffix_xml = '_data', flag_save = 1, name_outxml = '02_extractedxml'): @@ -90,7 +98,7 @@ class Document: if 'imgobj' not in self.__dict__.keys(): self.pdf2imgobj() self._get_pages(pages = pages) - + utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf) name_xml = utils_proc.pdf2xml(self.input_file, page_n = self.n_pages + 1, suffix_str = suffix_xml, flag_end = self.flag_end_run) @@ -126,58 +134,58 @@ class Document: imarray = np.array(self.imgobj[ind_page]) else: return print('Not possible! - You need to convert first the pdf to image\n') - + if XML_root == None: XML_root = ET.Element('pages') ind_abs = np.argwhere(self.n_pages == ind_page) XML_root.append(XML_main[ind_abs]) - + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - + imarray_textb = np.copy(imarray) - + if textb_textl == 1: coord_textboxes = np.array([]).reshape((4,0)) for ind_el in range(0, len(XML_root[0])): if XML_root[0][ind_el].tag == 'textbox': coord_textbox_aux = np.array(XML_root[0][ind_el].attrib['bbox'].split(',')).astype(np.float64) coord_textboxes = np.concatenate((coord_textboxes, np.array(coord_textbox_aux).reshape((4,1))), axis = 1) - imarray_textb = plot_tools.highlight_text(imarray_textb, coord_textbox_aux, - bbox_page, color_vec = 'blue', alpha = True, - filled = False, thick_line = 6) + imarray_textb = plot_tools.highlight_text(imarray_textb, coord_textbox_aux, + bbox_page, color_vec = 'blue', alpha = True, + filled = False, thick_line = 6) return imarray_textb, coord_textboxes - elif textb_textl == 2: + elif textb_textl == 2: imarray_textl = np.copy(imarray) coord_textline = np.array([]).reshape((4,0)) - all_font_sizes = np.array([]) + all_font_sizes = np.array([]) for ind_el in range(0, len(XML_root[0])): for ind_line in range(0, len(XML_root[0][ind_el])): if XML_root[0][ind_el][ind_line].tag == 'textline': coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) if len(XML_root[0][ind_el][ind_line]): - all_font_sizes = np.concatenate((all_font_sizes, + all_font_sizes = np.concatenate((all_font_sizes, np.array([XML_root[0][ind_el][ind_line][0].attrib['size']]).astype(np.float64))) coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1) - imarray_textl = plot_tools.highlight_text(imarray_textl, coord_textline_aux, bbox_page, - color_vec = 'red', alpha = True, filled = False, thick_line = 6) - - 
all_font_sizes, counts_all_font_sizes = np.unique(all_font_sizes, return_counts=True) + imarray_textl = plot_tools.highlight_text(imarray_textl, coord_textline_aux, bbox_page, + color_vec = 'red', alpha = True, filled = False, thick_line = 6) + + all_font_sizes, counts_all_font_sizes = np.unique(all_font_sizes, return_counts=True) info_font_sizes = np.concatenate((all_font_sizes.reshape((1,all_font_sizes.shape[0])), - counts_all_font_sizes.reshape((1,all_font_sizes.shape[0])).astype(np.float64))) - - return imarray_textb, coord_textline, all_font_sizes, info_font_sizes - + counts_all_font_sizes.reshape((1,all_font_sizes.shape[0])).astype(np.float64))) + + return imarray_textb, coord_textline, all_font_sizes, info_font_sizes + def correct_xml(self, flag_plots = 1, flag_parallel = 0, flag_save_figs = 1, pages = 'all', suffix_xml = '_data', name_outxml = '02_extractedxml', name_outcorrxml = '04_correctedxml', flag_save = 1): - + if 'name_outxml' not in self.__dict__.keys(): self.name_outxml = name_outxml - + start_time = time.time() if 'imgobj' not in self.__dict__.keys(): self.pdf2imgobj() - + if 'XML_main' not in self.__dict__.keys(): name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outxml + '.tar.gz' if os.path.isfile(name_tar): @@ -189,141 +197,142 @@ class Document: else: # TODO if already exists 02_extractedxml self.pdf2xml(pages = pages, suffix_xml = suffix_xml) - + self._get_pages(pages = pages) flag_central = 1 if self.year > self.limit_year: flag_central = 0 flag_2col = 1 - + XML_new = ET.Element('pages') - - for ind_abs, ind_page in enumerate(self.n_pages): - + + for ind_abs, ind_page in enumerate(self.n_pages): + XML_root = ET.Element('pages') #print(ind_abs,len(self.XML_main)) XML_root.append(self.XML_main[ind_abs]) imarray = np.array(self.imgobj[ind_page]) - + if XML_root[0][0].tag == 'textbox': bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) dim_img = imarray.shape[:2] _, rescale_factor = plot_tools.adapt_coordtoimg(imarray, bbox_page, bbox_page) - + # Image with textboxes highlighted imarray_textblock, coord_textboxes = self._draw_textbl(imarray = imarray, XML_root = XML_root) - - # Image with textlines highlighted, BUT also, array with all textlines + + # Image with textlines highlighted, BUT also, array with all textlines # coordinates, and the fontsizes, required for later _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root, - textb_textl = 2) - + textb_textl = 2) + ##### # Central vertical line and horizontal lines, through Hough transform - coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, - flag_2col, flag_central) - + coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, + flag_2col, flag_central) + ##### # Obtain lateral margins - margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), coord_horz.astype(np.uint32)) - + # Top and bottom line - ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), coord_horz.astype(np.uint32)) - #print(info_font_sizes) + #print(info_font_sizes) ##### - # Label the textboxes based on a set of simple rules that make use of + # Label 
the textboxes based on a set of simple rules that make use of # the margins and the fontsizes label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \ preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) # info_font_sizes_est - + ##### # Order the textlines, taken all them together, in order to later merge # in a single textbox textlines that so far form different textboxes - set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, + set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textlines, margins) - - # Given the ordered textlines, group them in new textboxes, creating a + + # Given the ordered textlines, group them in new textboxes, creating a # XML, This uses some criteria of distance between paragraphs - XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, + XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, rescale_factor, centrall_ord, ind_page, dim_img) - + # Append to the new XML XML_new.append(XML_enrich[0]) - - + + if flag_plots: im_met2 = plot_tools.plot_horzvertlines(imarray_textblock, coord_horz, coord_vert_def) im_met2 = plot_tools.plot_margins(im_met2, margins, ind_limits, gap_line = 1) im_met3, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) - im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) + im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) im_met5 = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page) - + # Create figure with 4 subplots, for showing all results if flag_save_figs: path_output_img = self.path_file + '/previews' if flag_save_figs: if not os.path.exists(path_output_img): os.makedirs(path_output_img) - + if flag_parallel: if flag_save_figs: name_pickle = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.pkl' with open(name_pickle, 'wb') as f: # Python 3: open(..., 'wb') pickle.dump([im_met2, im_met3, im_met4, im_met5], f) - + else: fig, axes = plt.subplots(1, 4, figsize=(30, 10)) ax = axes.ravel() ax[0].axis('off') - ax[0].imshow(im_met2) + ax[0].imshow(im_met2) ax[1].axis('off') ax[1].imshow(im_met3) ax[2].axis('off') - ax[2].imshow(im_met4) + ax[2].imshow(im_met4) ax[3].axis('off') ax[3].imshow(im_met5) - + if flag_save_figs: format_fig = 'png' name_fig = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.' 
+ format_fig fig.savefig(name_fig, format = format_fig, dpi = 200) plt.close(fig) - - name_xml_prev = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corrprev.xml' - + + name_xml_prev = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corrprev.xml' + tree = ET.ElementTree(XML_new) self.XML_main_corr = XML_new if not os.path.exists('./' + str(self.year)): os.makedirs('./' + str(self.year)) tree.write(name_xml_prev, encoding = 'utf-8') XML_new = preproc_docs.get_text_onefile(self.XML_main_corr) - name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml' + name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml' tree = ET.ElementTree(XML_new) tree.write(name_xml, encoding = 'utf-8') - + if flag_save: name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outcorrxml) else: print('Not saving to tar') - name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' - + name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' + self.name_outcorrxml = name_outcorrxml self.name_xml_corr = [name_tar, name_xml] + self._xml_ext(suffix_xml, self.name_outcorrxml) command = 'rm -rf ./' + str(self.year) #print(command) - utils_proc.call_with_out(command) - - print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time))) + utils_proc.call_with_out(command) + + print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time))) #XML_tree = ET.parse(name_xml) #self.XML_main = XML_tree.getroot() - + def _plot_generic_open(self, ind_page, suffix_xml, level_proc = 0, name_outxml = '02_extractedxml'): # ind_page has to be a scalar - + if 'imgobj' not in self.__dict__.keys(): self.pdf2imgobj() if 'XML_main' not in self.__dict__.keys(): @@ -335,8 +344,8 @@ class Document: XML_tree = ET.parse(h_xml) self.XML_main = XML_tree.getroot() else: - self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) - ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) + self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) + ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) else: #print('Run this') self._get_pages() @@ -344,7 +353,7 @@ class Document: #print(ind_abs, type(ind_abs)) #print(self.XML_main, len(self.imgobj)) - + if ind_page > (len(self.XML_main) - 1): flag_error = 1 return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error @@ -357,65 +366,65 @@ class Document: XML_root = ET.Element('pages') XML_root.append(self.XML_main[ind_abs[0]]) imarray = np.array(self.imgobj[ind_page]) - + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) dim_img = imarray.shape[:2] - + _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root, - textb_textl = 2) - margins = [] - ind_limits = [] + textb_textl = 2) + margins = [] + ind_limits = [] label_textlines = [] list_allcoords_textlines = [] set_of_blocks = [] XML_enrich = [] - + if level_proc > 0: coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, flag_2col, flag_central) if level_proc > 1: _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page) - - if level_proc > 2: + + if level_proc > 2: ##### # Obtain lateral margins - margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), - coord_horz.astype(np.uint32)) - - if level_proc > 3: + margins = 
preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + coord_horz.astype(np.uint32)) + + if level_proc > 3: # Top and bottom line - ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), coord_horz.astype(np.uint32)) - - if level_proc > 4: + + if level_proc > 4: label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \ - preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) - - if level_proc > 5: - set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, - list_allcoords_textlines, margins) - - if level_proc > 6: - XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, - rescale_factor, centrall_ord, ind_page, dim_img) - + preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) + + if level_proc > 5: + set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, + list_allcoords_textlines, margins) + + if level_proc > 6: + XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, + rescale_factor, centrall_ord, ind_page, dim_img) + # The last value returned is only to say that there was not any error during the execution. Before, if there are too many pages, we # send a 1 instead flag_error = 0 return imarray, margins, ind_limits, label_textlines, list_allcoords_textlines, \ set_of_blocks, XML_enrich, bbox_page, XML_root, ind_abs, flag_error - + def _plot_obtainfromxml(self, ind_page, suffix_xml, name_outcorrxml = '04_correctedxml'): - + if 'imgobj' not in self.__dict__.keys(): self.pdf2imgobj() - if 'XML_main_corr' not in self.__dict__.keys(): + if 'XML_main_corr' not in self.__dict__.keys(): name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' if os.path.isfile(name_tar): name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' #print(name_xml) - if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: + if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: #print('Run this') h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) XML_tree = ET.parse(h_xml) @@ -424,13 +433,13 @@ class Document: print('You need to have the tar file to use flag_compute = 0!') flag_error = 1 return 0, 0, 0, 0, 0, 0, flag_error - #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) + #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) else: print('You need to have the tar file to use flag_compute = 0!') flag_error = 1 return 0, 0, 0, 0, 0, 0, flag_error - #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) - ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) + #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) + ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) else: #print('Run this') self._get_pages() @@ -438,19 +447,19 @@ class Document: #print(ind_abs, type(ind_abs)) #print(self.XML_main, len(self.imgobj)) - + if ind_page > (len(self.XML_main_corr) - 1): flag_error = 1 return 0, 0, 0, 0, 0, 0, flag_error - + XML_root = 
ET.Element('pages') XML_root.append(self.XML_main_corr[ind_abs[0]]) imarray = np.array(self.imgobj[ind_page]) - + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - dim_img = imarray.shape[:2] - - ###### + dim_img = imarray.shape[:2] + + ###### # For obtaining label_textlines, list_allcoords_textlines coord_textline = np.array([]).reshape((4,0)) label_textlines = dict() @@ -463,7 +472,7 @@ class Document: if 'type' in XML_root[0][ind_el][ind_line].attrib: coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1) - + type_textl = XML_root[0][ind_el][ind_line].attrib['type'] #print(ind_el) if XML_root[0][ind_el].attrib['type_textbox'] == 'line': @@ -480,18 +489,18 @@ class Document: aux_type = np.array([count]) label_textlines[type_textl] = aux_type count += 1 - + coord_textline, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, coord_textline, bbox_page) - - ##### + + ##### # To obtain set_of_blocks. This variable simply contains the coordinates, and - # then a final row indicating the order (here are already ordered), and if it - # is a line, which is indicated with a -1 + # then a final row indicating the order (here are already ordered), and if it + # is a line, which is indicated with a -1 set_of_blocks_aux = np.concatenate((coord_textline, np.array(vec_textline_lines).reshape((1,-1))), axis = 0) set_of_blocks = dict() set_of_blocks[0] = set_of_blocks_aux #print(set_of_blocks.shape) - + # The last is the flag_error #print(imarray.shape, len(label_textlines), coord_textline.shape, len(set_of_blocks), # len(XML_root), bbox_page.shape) @@ -499,58 +508,60 @@ class Document: return imarray, label_textlines, coord_textline, set_of_blocks, XML_root, bbox_page, flag_error # imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error # imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error - - def plot_orig_textb(self, range_pages = range(1), suffix_xml = '_data', + + + def plot_orig_textb(self, range_pages = range(1), suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): - + if 'name_outxml' not in self.__dict__.keys(): self.name_outxml = name_outxml - + for ind_page in range_pages: imarray, margins, ind_limits, _, _, \ _, _, _, XML_root, _, flag_error = self._plot_generic_open(ind_page, suffix_xml, level_proc = 0, name_outxml = self.name_outxml) - if flag_error: print(str(ind_page) + ': non existing page!') else: imarray_textblock, _ = self._draw_textbl(imarray = imarray, XML_root = XML_root) - self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, - flag_plot, flag_save_figs) + self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, + flag_plot, flag_save_figs) - def plot_margins_doc(self, range_pages = range(1), suffix_xml = '_data', + self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, + flag_plot, flag_save_figs) + + def plot_margins_doc(self, range_pages = range(1), suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): - + if 'name_outxml' not in self.__dict__.keys(): self.name_outxml = name_outxml - + for ind_page in range_pages: imarray, margins, ind_limits, _, _, \ _, _, _, _, _, flag_error= self._plot_generic_open(ind_page, suffix_xml, level_proc = 4, name_outxml = 
self.name_outxml) - if flag_error: print(str(ind_page) + ': non existing page!') - else: + else: im_met = plot_tools.plot_margins(imarray, margins, ind_limits, gap_line = 1) self._plot_save(im_met, 'Page margins', 'Margins', ind_page, self.path_file, - flag_plot, flag_save_figs) - - def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data', + flag_plot, flag_save_figs) + + def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_legend = 1): - + if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml + self.name_outxml = name_outxml if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - + self.name_outcorrxml = name_outcorrxml + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): + if flag_compute or not os.path.isfile(name_tar): imarray, _, _, label_textlines, list_allcoords_textlines, _, _, _, _, _, flag_error = \ self._plot_generic_open(ind_page, suffix_xml, level_proc = 5, name_outxml = self.name_outxml) @@ -558,80 +569,80 @@ class Document: else: imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) #print(len(array_elements)) - + if flag_error: print(str(ind_page) + ': non existing page!') - else: - im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) + else: + im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) self._plot_save_labels(im_met, 'Textlines labelled', 'TextlLabel', ind_page, groups, colors, self.path_file, flag_plot, flag_save_figs, flag_legend) - - - def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data', + + + def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', name_outcorrxml = '04_correctedxml', flag_compute = 0): - + if 'name_outxml' not in self.__dict__.keys(): self.name_outxml = name_outxml if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - + self.name_outcorrxml = name_outcorrxml + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): + if flag_compute or not os.path.isfile(name_tar): imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error = \ self._plot_generic_open(ind_page, suffix_xml, level_proc = 6, name_outxml = self.name_outxml) - else: + else: imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error \ - = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) - + = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) + #print(set_of_blocks) if flag_error: print(str(ind_page) + ': non existing page!') - else: - im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) + else: + im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) self._plot_save(im_met, 'Textlines ordered', 'TextlOrder', ind_page, self.path_file, - 
flag_plot, flag_save_figs) - - def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data', + flag_plot, flag_save_figs) + + def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data', flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_lines_textl = 1): # flag_lines_textl, if 1, plots lines and textboxes, if 2, only lines, if 3, only textboxes if 'name_outxml' not in self.__dict__.keys(): self.name_outxml = name_outxml if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - + self.name_outcorrxml = name_outcorrxml + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): + if flag_compute or not os.path.isfile(name_tar): imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error = \ self._plot_generic_open(ind_page, suffix_xml, level_proc = 7, name_outxml = self.name_outxml) - else: + else: imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_enrich, bbox_page, flag_error \ - = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) - + = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) + if flag_error: print(str(ind_page) + ': non existing page!') - else: + else: im_met = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page, flag_lines_textl) self._plot_save(im_met, 'XML corrected', 'XMLcorrect', ind_page, self.path_file, flag_plot, flag_save_figs) - + def _plot_save(self, im_met, str_title, str_name, ind_page, folder_save = '', flag_plot = 1, flag_save_figs = 0, dpi = 200): if flag_plot: fig, axes = plt.subplots(1, 1, figsize=(8, 10)) axes.axis('off') - axes.imshow(im_met) + axes.imshow(im_met) plt.title(str_title) if flag_save_figs: format_fig = 'png' - name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) + name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) + '_page' + str(ind_page) + '.' + format_fig) fig.savefig(name_fig, format = format_fig, dpi = dpi) plt.close(fig) @@ -659,17 +670,17 @@ class Document: coords[0] = in_coord coords[1] += int(im_met.shape[1]/1.5) coords[2] = in_coord + 10 - coords[3] += int(im_met.shape[1]/1.5) - im_met = plot_tools.lines_box(im_met, coords, colors[ind_g], thick_line = 6) + coords[3] += int(im_met.shape[1]/1.5) + im_met = plot_tools.lines_box(im_met, coords, colors[ind_g], thick_line = 6) coords[0] += inc_page coords[2] += inc_page - + if flag_plot: fig, axes = plt.subplots(1, 1, figsize=(8, 10)) axes.axis('off') - axes.imshow(im_met) + axes.imshow(im_met) plt.title(str_title) - + if flag_legend: coords = in_coord + np.array([0, 0, 10, 10]) flag_notinto = 1 @@ -679,17 +690,90 @@ class Document: coords[0] = in_coord coords[1] += int(im_met.shape[1]/1.5) coords[2] = in_coord + 10 - coords[3] += int(im_met.shape[1]/1.5) - plt.text(coords[1] + 10, coords[2], i_g, fontsize = 10, va = 'bottom', ha = 'left') + coords[3] += int(im_met.shape[1]/1.5) + plt.text(coords[1] + 10, coords[2], i_g, fontsize = 10, va = 'bottom', ha = 'left') coords[0] += inc_page - coords[2] += inc_page - + coords[2] += inc_page + if flag_save_figs: format_fig = 'png' - name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) + name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) + '_page' + str(ind_page) + '.' 
+ format_fig) fig.savefig(name_fig, format = format_fig, dpi = dpi) - plt.close(fig) - - - \ No newline at end of file + plt.close(fig) + + + + + + def check_discussion(self): + utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) + flag_discussion = utils_annot.check_if_discussion(self.name_meta[1]) + command = 'rm -rf ./' + str(self.year) + #print(command) + utils_proc.call_with_out(command) + + return flag_discussion + + + + def annotate_xml(self, flag_save = 1, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', name_outannotxml='05_annotatedxml'): + + start_time = time.time() + if 'name_outcorrxml' not in self.__dict__.keys(): + self.name_outcorrxml = name_outcorrxml + + if 'name_outxml' not in self.__dict__.keys(): + self.name_outxml = name_outxml + + if 'XML_main_corr' not in self.__dict__.keys(): + print('no main corr') + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' + if os.path.isfile(name_tar): + name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' + if name_xml in utils_proc.get_list(self.year, self.folder_database, self.name_outcorrxml)[0]: + h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, self.name_outcorrxml) + XML_tree = ET.parse(h_xml) + self.XML_main_corr = XML_tree.getroot() + #else: + #self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, + #pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml, + #name_outcorrxml = self.name_outcorrxml) + #else: + ## TODO if already exists 02_extractedxml + #self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, + #pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml, + #name_outcorrxml = self.name_outcorrxml) + + + print('we have a main corr XML file') + #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml) + XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, bln_print=False) + self.XML_main_annot = XML_main_annot + + # save xml file + name_xml = './' + str(self.year) + '/' + self.name_wo_ext + '.xml' + tree = ET.ElementTree(XML_main_annot) + if not os.path.exists('./' + str(self.year)): + os.makedirs('./' + str(self.year)) + tree.write(name_xml, encoding = 'utf-8') + + if flag_save: + name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outannotxml) + else: + print('Not saving to tar') + name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outannotxml + '.tar.gz' + + self.name_outannotxml = name_outannotxml + self.name_annot_corr = [name_tar, name_xml] +# self._xml_ext(suffix_xml, self.name_outannotxml) + command = 'rm -rf ./' + str(self.year) + #print(command) + utils_proc.call_with_out(command) + + print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time))) + + + command = 'rm -rf ./' + str(self.year) + #print(command) +# utils_proc.call_with_out(command) diff --git a/src/python/preproc_docs.py b/src/python/preproc_docs.py index 8b4349f037ab126cf60cad2098616713d79a2a0d..b9c9bbb0fb8f88e699daad571d980044a24063bb 100644 --- a/src/python/preproc_docs.py +++ b/src/python/preproc_docs.py @@ -6,7 +6,7 @@ Created on Fri Sep 28 13:39:10 2018 @author: luissalamanca """ -# File for all the functions used for preprocessing. +# File for all the functions used for preprocessing. 
import numpy as np import os @@ -46,42 +46,42 @@ import tables HEIGHT_CHAR = 12 WIDTH_CHAR = 6 -def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, +def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, flag_2col, flag_central = 1): # Using the coordinates of the boxes, we put the rest to 0, and then estimate # the central line - # Here, since we use the image, we have to rely again on a ref00 in topleft, and + # Here, since we use the image, we have to rely again on a ref00 in topleft, and # the corners in topleftbottomright - # We also look for horizontal lines + # We also look for horizontal lines # We assume that we will only have one vertical line, and then many horizontal # lines, either spanning the whole image, or at both sides of the central line - + coord, rescale_factor = adapt_coordtoimg(img, coord, dim_bbox_page) img_aux = np.abs(255 - img[:,:,0]) img_aux[img_aux < 20] = 0 img_aux[img_aux >= 20] = 255 img_aux_in = np.copy(img_aux) - - + + width_resc = WIDTH_CHAR * rescale_factor[0,1] height_resc = HEIGHT_CHAR * rescale_factor[0,1] gap_central = int(4 * width_resc) top_bbox_red = 0 #int(height_resc/2) - + for ind in range(coord.shape[1]): img_aux[(coord[0,ind] + top_bbox_red):coord[2,ind],coord[1,ind]:coord[3,ind]] = 0 - + # Also remove possible mark and artefacts in the edges img_aux[:,:int(img_aux.shape[1]/20)] = 0 img_aux[:int(img_aux.shape[0]/20),:] = 0 img_aux[int(19 * img_aux.shape[0]/20):,:] = 0 - img_aux[:,int(19 * img_aux.shape[1]/20):] = 0 - + img_aux[:,int(19 * img_aux.shape[1]/20):] = 0 + img_prev = np.copy(img_aux) - + img_aux_rem = remove_small_objects(label(img_aux), 2 * width_resc) #img_aux = dilation(img_aux_rem, selem = np.ones((11,11))) - img_aux = dilation(img_aux_rem, selem = np.ones((5,5))) + img_aux = dilation(img_aux_rem, selem = np.ones((5,5))) max_val = np.max(img_aux) if max_val > 0: img_aux_norm = (255 * img_aux/max_val).astype(np.uint8) @@ -90,13 +90,13 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, else: img_aux[:] = 0 #print(np.unique(img_aux)) - + # Remove big objects, like the shields and other logos #img_label = label(img_aux) edges = canny(img_aux, 2, 1, 25) #img_cent = np.copy(img_aux) - + if flag_2col: if flag_central: img_cent = np.copy(img_prev) @@ -108,37 +108,37 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, #lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = 2 * width_resc, # line_gap = width_resc) lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = int(2 * width_resc), - line_gap = int(width_resc)) + line_gap = int(width_resc)) else: sum_img_aux_in = np.sum(img_aux_in, axis = 0) sum_img_aux_in = sum_img_aux_in[int(2*img_aux.shape[1]/5):int(3*img_aux.shape[1]/5)] - + #plt.plot(sum_img_aux_in) #sum_img_aux_in[sum_img_aux_in < np.max(sum_img_aux_in)/10] = 0 # We need to substract the baseline value, in order to account for # central headers and stuff like that sum_img_aux_in = sum_img_aux_in - np.min(sum_img_aux_in) - #not_end_vect = 1 + #not_end_vect = 1 #while not_end_vect: ind_min_start = np.argwhere((sum_img_aux_in) < np.mean(sum_img_aux_in)/10) - ind_min_end = int(2*img_aux.shape[1]/5) + np.max(ind_min_start) + ind_min_end = int(2*img_aux.shape[1]/5) + np.max(ind_min_start) ind_min_start = int(2*img_aux.shape[1]/5) + np.min(ind_min_start) ind_central = int((ind_min_start + ind_min_end)/2) - coord_vert_def = np.array([1, ind_central - int(width_resc/2), + coord_vert_def = np.array([1, ind_central - int(width_resc/2), 
img_aux_in.shape[0], ind_central + int(width_resc/2)]) - #print(lines_vert,img_aux.shape) - - theta = np.linspace(-5*pi/8, -3* pi/8,num = 90) - #theta = np.linspace(-9*pi/16, -7*pi/16,num = 90) + #print(lines_vert,img_aux.shape) + + theta = np.linspace(-5*pi/8, -3* pi/8,num = 90) + #theta = np.linspace(-9*pi/16, -7*pi/16,num = 90) #lines_horz = probabilistic_hough_line(edges, theta = theta, line_length = 2 * width_resc, - # line_gap = width_resc) + # line_gap = width_resc) lines_horz = probabilistic_hough_line(edges, theta = theta, line_length = int(2 * width_resc), - line_gap = int(width_resc)) - - # These lines are given in a standard xy coordinate, with the corner in the - # bottom left + line_gap = int(width_resc)) + + # These lines are given in a standard xy coordinate, with the corner in the + # bottom left lines_horz = np.transpose(np.asarray(lines_horz).reshape((len(lines_horz),4))) - + lines_horz = np.concatenate((np.minimum(lines_horz[1,:],lines_horz[3,:]).reshape((1,lines_horz.shape[1])), np.minimum(lines_horz[0,:],lines_horz[2,:]).reshape((1,lines_horz.shape[1])), @@ -150,15 +150,15 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, np.minimum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])), np.maximum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])), np.maximum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])))).astype(np.int32) - - + + #lines_horz = transform_coord(lines_horz, dim_page = img_aux.shape, invert_xy = True) - #lines_vert = transform_coord(lines_vert, dim_page = img_aux.shape, invert_xy = True) - + #lines_vert = transform_coord(lines_vert, dim_page = img_aux.shape, invert_xy = True) + # First clean the vertical from unexpected outliers if flag_central: - sum_rows = np.sum(img_cent, axis = 0)/255 - ind_central = int(2*img.shape[1]/5) + np.argmax(sum_rows[int(2*img.shape[1]/5):int(3*img.shape[1]/5)]) + sum_rows = np.sum(img_cent, axis = 0)/255 + ind_central = int(2*img.shape[1]/5) + np.argmax(sum_rows[int(2*img.shape[1]/5):int(3*img.shape[1]/5)]) ind_valid = np.intersect1d(np.argwhere([(ind_central - gap_central) < aux_l1 < (ind_central + gap_central) for aux_l1 in lines_vert[1,:]]), np.argwhere([(ind_central - gap_central) < aux_l2 < (ind_central + gap_central) for aux_l2 in lines_vert[3,:]])) if len(ind_valid): @@ -169,19 +169,19 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, coord_vert_def = np.array([0, img_aux.shape[1]/2 - width_resc, height_resc, img_aux.shape[1]/2 + width_resc]) #ind_central = np.mean(coord_vert_def[[1,3]]) - + # And now, just iterate over the horizontal lines, merging them if required. 
return clean_horz_vert_lines(lines_horz, coord_vert_def, width_resc, height_resc, ind_central, gap_central, img_aux.shape) - - + + def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_resc, ind_central, gap_central, dim_page): # We just iterate over all the horizontal lines, merging them if required coord_horz = np.array([]).reshape((4,0)).astype(np.int32) min_length_line = 2 * width_resc - - while coord_horz_pre.size > 3: + + while coord_horz_pre.size > 3: if coord_horz_pre.shape[1] == 1: coord_horz = np.concatenate((coord_horz, coord_horz_pre[:,0].reshape((4,1))), axis = 1) coord_horz_pre = np.array([]) @@ -190,33 +190,33 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res #print(coord_horz_curr) coord_horz_check = coord_horz_pre[:,1:] flag_stay = 1 - while flag_stay: + while flag_stay: # Boxes to the right ind_val1 = np.intersect1d(np.argwhere((abs(coord_horz_check[1,:] - coord_horz_curr[3]) < (width_resc * 10))), np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc)))) # Boxes to the left ind_val2 = np.intersect1d(np.argwhere((abs(coord_horz_check[3,:] - coord_horz_curr[1]) < (width_resc * 10))), - np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc)))) - + np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc)))) + ind_val = np.unique(np.concatenate((ind_val1,ind_val2))) if len(ind_val) > 0: for i_b in range(len(ind_val)): - coord_horz_curr = np.array([np.min((coord_horz_curr[0],coord_horz_check[0,ind_val[i_b]])), + coord_horz_curr = np.array([np.min((coord_horz_curr[0],coord_horz_check[0,ind_val[i_b]])), np.min((coord_horz_curr[1],coord_horz_check[1,ind_val[i_b]])), - np.max((coord_horz_curr[2],coord_horz_check[2,ind_val[i_b]])), + np.max((coord_horz_curr[2],coord_horz_check[2,ind_val[i_b]])), np.max((coord_horz_curr[3],coord_horz_check[3,ind_val[i_b]]))]) coord_horz_check = coord_horz_check[:,np.setdiff1d(np.arange(coord_horz_check.shape[1]), ind_val)] - #coord_horz_check = np.delete(coord_horz_check, ind_val, 1) + #coord_horz_check = np.delete(coord_horz_check, ind_val, 1) if coord_horz_check.shape[1] == 0: flag_stay = 0 coord_horz = np.concatenate((coord_horz, coord_horz_curr.reshape((4,1))), axis = 1) coord_horz_pre = np.array([]) - else: + else: flag_stay = 0 coord_horz = np.concatenate((coord_horz, coord_horz_curr.reshape((4,1))), axis = 1) coord_horz_pre = coord_horz_check[:,:] - + # Remove overlapping boxes coord_horz_def = np.array([]).reshape((4,0)) while coord_horz.size > 3: @@ -226,15 +226,15 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res np.argwhere((width_resc/2 + coord_horz_curr[2]) > coord_horz[2,:]), np.argwhere((width_resc/2 + coord_horz_curr[3]) > coord_horz[3,:]))) ind_overlap = np.setdiff1d(ind_overlap,0) - + coord_horz_def = np.concatenate((coord_horz_def, coord_horz_curr.reshape((4,1))), axis = 1) coord_horz = coord_horz[:,np.setdiff1d(np.arange(1,coord_horz.shape[1]),ind_overlap)] #coord_horz = np.delete(coord_horz, ind_overlap, 1) - + if coord_horz.size == 4: coord_horz_def = np.concatenate((coord_horz_def, coord_horz.reshape((4,1))), axis = 1) coord_horz = np.array([0]) - + ind_val_long = np.argwhere((coord_horz_def[3,:] - coord_horz_def[1,:]) > (3 * (coord_horz_def[2,:] - coord_horz_def[0,:]))) coord_horz_def = coord_horz_def[:,ind_val_long].reshape((4,ind_val_long.shape[0])) @@ -245,35 +245,35 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res coord_horz_def = 
coord_horz_def[:, ind_val].reshape((4,ind_val.shape[0])) else: coord_horz_def = np.array([]).reshape((4,0)) - - + + # To identify the topline ''' ind_topline = identify_topline(coord_horz_def, width_resc, dim_page) - if str_page == 'firsts': + if str_page == 'firsts': # We correct the top of the vertical line in case it is cutting some of the horizontal lines ind_val_horz = reduce(np.intersect1d, (np.argwhere(coord_horz_def[1,:] < (ind_central - gap_central)), np.argwhere(coord_horz_def[3,:] > (ind_central + gap_central)), np.argwhere(coord_horz_def[0,:] > coord_vert_def[0]))) ind_val_horz = np.setdiff1d(ind_val_horz, ind_topline) - + coord_vert_def = np.array([np.max(np.concatenate((np.array([coord_vert_def[0]]),coord_horz_def[2,ind_val_horz]))),coord_vert_def[1], - coord_vert_def[2],coord_vert_def[3]]) + coord_vert_def[2],coord_vert_def[3]]) elif str_page == 'lasts': - # We correct the bottom of the vertical line in case it is cutting some of the horizontal lines + # We correct the bottom of the vertical line in case it is cutting some of the horizontal lines ind_val_horz = reduce(np.intersect1d, (np.argwhere(coord_horz_def[1,:] < (ind_central - gap_central)), np.argwhere(coord_horz_def[3,:] > (ind_central + gap_central)), - np.argwhere(coord_horz_def[2,:] < coord_vert_def[2]))) + np.argwhere(coord_horz_def[2,:] < coord_vert_def[2]))) ind_val_horz = np.setdiff1d(ind_val_horz, ind_topline) - + coord_vert_def = np.array([coord_vert_def[0],coord_vert_def[1], - np.min(np.concatenate((np.array([coord_vert_def[2]]),coord_horz_def[0,ind_val_horz]))),coord_vert_def[3]]) + np.min(np.concatenate((np.array([coord_vert_def[2]]),coord_horz_def[0,ind_val_horz]))),coord_vert_def[3]]) ''' - + coord_vert_def[1] = np.max((coord_vert_def[1], int(ind_central - width_resc))) coord_vert_def[3] = np.min((coord_vert_def[3], int(ind_central + width_resc))) - - # Finally, remove short central lines, likely artefacts of the calculation + + # Finally, remove short central lines, likely artefacts of the calculation # of the central vertical line length_lines = coord_horz_def[3,:] - coord_horz_def[1,:] ind_wrong = reduce(np.intersect1d, (np.argwhere(length_lines < 2* min_length_line), @@ -283,98 +283,98 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res if len(ind_val): coord_horz_def = coord_horz_def[:, ind_val].reshape((4,ind_val.shape[0])) else: - coord_horz_def = np.array([]).reshape((4,0)) - - return coord_vert_def, coord_horz_def + coord_horz_def = np.array([]).reshape((4,0)) + + return coord_vert_def, coord_horz_def def identify_topline(coord_horz, width_resc, dim_page): # Two rules for identifying the top line ind_topline = reduce(np.intersect1d, (np.argwhere(coord_horz[2,:] < dim_page[0]/8), np.argwhere((coord_horz[3,:] - coord_horz[1,:]) > width_resc * 60))) - + return ind_topline def lateral_margins(img, dim_bbox_page, coord_vert, coord_horz): - + coord, rescale_factor = adapt_coordtoimg(img, dim_bbox_page, dim_bbox_page) width_resc = WIDTH_CHAR * rescale_factor[0,1] gap_central = int(3 * width_resc) thres_margin = 0.1 - + img_aux = np.abs(255 - img[:,:,0]) for ind in range(coord_horz.shape[1]): img_aux[coord_horz[0,ind]:coord_horz[2,ind],coord_horz[1,ind]:coord_horz[3,ind]] = 0 - + img_aux[coord_vert[0]:coord_vert[2],coord_vert[1]:coord_vert[3]] = 0 central_line = (coord_vert[1] + coord_vert[3])/2 - + # Also remove possible mark and artefacts in the edges img_aux[:,:gap_central] = 0 img_aux[:int(gap_central/2),:] = 0 img_aux[(img_aux.shape[1] - gap_central):,:] = 0 - 
img_aux[:,(img_aux.shape[1] - int(gap_central/2)):] = 0 - + img_aux[:,(img_aux.shape[1] - int(gap_central/2)):] = 0 + sum_imarray_aux = np.sum(img_aux, axis = 0) sum_imarray_aux = 1000*sum_imarray_aux.astype(np.float64)/np.max(sum_imarray_aux) mean_val_rows_left = np.mean(sum_imarray_aux[:int(central_line - gap_central)]) mean_val_rows_right = np.mean(sum_imarray_aux[int(central_line + gap_central):]) - + left_margin = np.min(np.argwhere(sum_imarray_aux > thres_margin * mean_val_rows_left)) right_margin = np.max(np.argwhere(sum_imarray_aux > thres_margin * mean_val_rows_right)) - + return left_margin, right_margin, left_margin/rescale_factor[0,1], right_margin/rescale_factor[0,1] def bottomtop_margins(img, dim_bbox_page, coord_vert, coord_horz): val_thres = 300 # In this case we don't use the mean of sum_cols because we have - + coord, rescale_factor = adapt_coordtoimg(img, dim_bbox_page, dim_bbox_page) img_aux = np.abs(255 - img[:,:,0]) - + height_resc = HEIGHT_CHAR * rescale_factor[0,1] width_resc = WIDTH_CHAR * rescale_factor[0,1] gap_central = int(3 * width_resc) - + for ind in range(coord_horz.shape[1]): img_aux[coord_horz[0,ind]:coord_horz[2,ind],coord_horz[1,ind]:coord_horz[3,ind]] = 0 - + img_aux[coord_vert[0]:coord_vert[2],coord_vert[1]:coord_vert[3]] = 0 - + sum_cols = np.sum(img_aux, axis = 1)/255 sum_cols = 1000 * sum_cols/np.max(sum_cols) - + # Now, limit by using the horizontal lines ind_topline = identify_topline(coord_horz, width_resc, img_aux.shape) - + if len(ind_topline) > 0: ind_min_textbox = np.max(coord_horz[2,ind_topline]) sum_cols[:ind_min_textbox] = 0 - + #plt.figure() #plt.plot(sum_cols) - ind_limits = np.array([np.min(np.argwhere(sum_cols > val_thres)), + ind_limits = np.array([np.min(np.argwhere(sum_cols > val_thres)), np.max(np.argwhere(sum_cols > val_thres))]) - - return ind_limits + + return ind_limits def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_font_sizes): - + # In xml_page the levels are: xml_page[i][j][k], i for blocks, j for textlines # and k for characters - + coord, rescale_factor = adapt_coordtoimg(img, bbox_page, bbox_page) list_coords_blocks = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_page[:-2]]).astype(np.float64)) list_coords_blocks, rescale_factor = adapt_coordtoimg(img, list_coords_blocks, bbox_page) - + font_main_block = info_font_sizes[0, np.argmax(info_font_sizes[1,:])] thres_font = font_main_block/5 # To compensate for error in the fontsize between columns width_resc = WIDTH_CHAR * rescale_factor[0,1] height_resc = HEIGHT_CHAR * rescale_factor[0,1] gap_central = int(2 * width_resc) indentation = int(4 * width_resc) - + ind_central = (coord_vert_def[3] + coord_vert_def[1])/2 - + # First pass just to discover main blocks list_col1 = list() list_col2 = list() @@ -383,25 +383,25 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon all_mean_heights = np.array([]).reshape((1,0)) list_allcoords_textlines = np.array([]).reshape((4,0)) relative_ref_textline = np.array([], dtype = np.uint32).reshape((3,0)) - + count_text = 0 - + for ind_block in range(len(xml_page)-2): xml_block = xml_page[ind_block] list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:] if 'bbox' in o.attrib]).astype(np.float64)) - #list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:]]).astype(np.float64)) + #list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in 
xml_block[:]]).astype(np.float64)) if len(list_coords_textline)>3: list_coords_textline list_coords_textline, rescale_factor = adapt_coordtoimg(img, list_coords_textline, bbox_page) list_allcoords_textlines = np.concatenate((list_allcoords_textlines, list_coords_textline), axis = 1) relative_ref_textline_aux = np.zeros((3,list_coords_textline.shape[1])) - + relative_ref_textline_aux[0,:] = count_text + np.arange(list_coords_textline.shape[1]) relative_ref_textline_aux[1,:] = ind_block relative_ref_textline_aux[2,:] = np.arange(list_coords_textline.shape[1]) relative_ref_textline = np.concatenate((relative_ref_textline,relative_ref_textline_aux.astype(np.uint32)), axis = 1) - + for ind_textl in range(list_coords_textline.shape[1]): all_heights = np.array([]) xml_textline = xml_block[ind_textl] @@ -412,10 +412,10 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon all_heights = np.append(all_heights, float(xml_text.attrib['size'])) #fontsize = fontsize_fromtextline(img[bbox_textline[0]:bbox_textline[2], # bbox_textline[1]:bbox_textline[3],0]) - + fontsize = np.average(all_heights) all_mean_heights = np.append(all_mean_heights, fontsize) - + # Normal font #if ((font_main_block - thres_font) < mean_height < (font_main_block + thres_font)): if ((font_main_block - thres_font) < fontsize < (font_main_block + thres_font)): @@ -426,18 +426,18 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon if len(xml_block[0]) < 12: list_pagen.append(count_text) else: - list_textinheader.append(count_text) + list_textinheader.append(count_text) elif ((bbox_textline[1] > (margins[0] - indentation)) and (bbox_textline[3] < (ind_central + gap_central))): list_col1.append(count_text) # Right side of the central line - elif ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central))): + elif ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central))): list_col2.append(count_text) count_text += 1 - + discovered_blocks = np.concatenate((np.array(list_col1),np.array(list_col2), np.array(list_pagen),np.array(list_textinheader))) blocks_left = np.setdiff1d(np.arange(list_allcoords_textlines.shape[1]),discovered_blocks) - + if len(list_col1): bbox_col1 = np.array([np.min(list_allcoords_textlines[0,list_col1]), np.min(list_allcoords_textlines[1,list_col1]), @@ -445,17 +445,17 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon np.max(list_allcoords_textlines[3,list_col1])]) else: bbox_col1 = np.array([0,0,10,10]) # Dummy value - + if len(list_col2): bbox_col2 = np.array([np.min(list_allcoords_textlines[0,list_col2]), np.min(list_allcoords_textlines[1,list_col2]), np.max(list_allcoords_textlines[2,list_col2]), np.max(list_allcoords_textlines[3,list_col2])]) else: - bbox_col2 = np.array([0,0,10,10]) # Dummy value + bbox_col2 = np.array([0,0,10,10]) # Dummy value list_header = list() - list_header_singlecol = list() + list_header_singlecol = list() list_footnote = list() list_notidentified = list() for ind_textline in blocks_left: @@ -463,7 +463,7 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon if xml_textline.tag == 'textline': bbox_textline = list_allcoords_textlines[:,ind_textline] # Small fontsize and below current bboxes of main blocks - if ((all_mean_heights[ind_textline] < (font_main_block - thres_font)) and + if ((all_mean_heights[ind_textline] < (font_main_block - thres_font)) and (bbox_textline[2] > 
bbox_col1[2]) and (bbox_textline[2] > bbox_col2[2])): list_footnote.append(ind_textline) # Large fontsizes @@ -473,7 +473,7 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon list_header.append(ind_textline) # To the left or right of the central line elif (((bbox_textline[1] > (margins[0] - indentation)) and (bbox_textline[3] < (ind_central + gap_central))) or - ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central)))): + ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central)))): list_header_singlecol.append(ind_textline) # Standard fontsize elif ((font_main_block - thres_font) < all_mean_heights[ind_textline] < (font_main_block + thres_font)): @@ -483,13 +483,13 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon list_col1.append(ind_textline) # Contained into the bbox of the right column elif (((bbox_col2[0] - height_resc) < bbox_textline[0]) and ((bbox_col2[1] - width_resc) < bbox_textline[1]) - and ((bbox_col2[2] + height_resc) > bbox_textline[2]) and ((bbox_col2[3] + width_resc) > bbox_textline[3])): + and ((bbox_col2[2] + height_resc) > bbox_textline[2]) and ((bbox_col2[3] + width_resc) > bbox_textline[3])): list_col2.append(ind_textline) else: list_notidentified.append(ind_textline) - + label_textlines = dict() - label_textlines['text_col1'] = list_col1 + label_textlines['text_col1'] = list_col1 label_textlines['text_col2'] = list_col2 label_textlines['footnote'] = list_footnote label_textlines['pagen'] = list_pagen @@ -497,8 +497,8 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon label_textlines['header'] = list_header label_textlines['header_singlecol'] = list_header_singlecol label_textlines['notidentified'] = list_notidentified - - vec_labels_textline = np.zeros(list_allcoords_textlines.shape[1]).astype(np.str) + + vec_labels_textline = np.zeros(list_allcoords_textlines.shape[1]).astype(np.str) vec_labels_textline[list_col1] = 'text_col1' vec_labels_textline[list_col2] = 'text_col2' vec_labels_textline[list_footnote] = 'footnote' @@ -507,56 +507,56 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon vec_labels_textline[list_header] = 'header' vec_labels_textline[list_header_singlecol] = 'header_singlecol' vec_labels_textline[list_notidentified] = 'notidentified' - - # relative_ref_textline: three rows with the following, the aboslute reference + + # relative_ref_textline: three rows with the following, the aboslute reference # for the textline, the number of the block, and the number of the textline inside # that block return label_textlines, list_allcoords_textlines, relative_ref_textline, all_mean_heights, vec_labels_textline - + def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textlines, margins): - # Two steps, first ordering the textlines, grouping them in big blocks separated + # Two steps, first ordering the textlines, grouping them in big blocks separated # by horizontal lines. Then, inside these groups, we group them in textboxes, # incorporating this to the XML height_resc = HEIGHT_CHAR * rescale_factor[0,1] widht_resc = WIDTH_CHAR * rescale_factor[0,1] - + gap_central = 3 * widht_resc gap_row = height_resc/2 - - # This parameters is intended for removing artefacts such as small dots in the + + # This parameters is intended for removing artefacts such as small dots in the # text. 
But we have to be careful, as we can remove valuable characters. # I first set a value of 3 * width_resc/4 - min_width_textl = 6 * widht_resc/4 - + min_width_textl = 6 * widht_resc/4 + central_line = (coord_vert_def[3] + coord_vert_def[1])/2 array_coords_textl = np.concatenate((list_allcoords_textlines[:,:], np.arange(list_allcoords_textlines.shape[1]).reshape((1,list_allcoords_textlines.shape[1])))) - + # Clean from to thin lines, thatn are just probably artefacts all_widths = array_coords_textl[3,:] - array_coords_textl[1,:] ind_valid = np.argwhere(all_widths > min_width_textl) array_coords_textl = array_coords_textl[:,ind_valid].reshape((5,len(ind_valid))) - + ind_centralines = np.intersect1d(np.argwhere(coord_horz[1,:] < (central_line - gap_central)), np.argwhere(coord_horz[3,:] > (central_line + gap_central))) ind_sepfootnotes = np.intersect1d(np.argwhere(coord_horz[1,:] < (margins[0] + 2 * widht_resc)), - np.argwhere(coord_horz[3,:] < (central_line - gap_central))) + np.argwhere(coord_horz[3,:] < (central_line - gap_central))) ind_centralines = np.union1d(ind_centralines,ind_sepfootnotes) ind_collines = np.setdiff1d(np.arange(coord_horz.shape[1]),ind_centralines) - + array_coords_centrall = coord_horz[:,ind_centralines] array_coords_coll = coord_horz[:,ind_collines] array_coords_coll = np.concatenate((array_coords_coll, -1 * np.ones(array_coords_coll.shape[1]).reshape((1,array_coords_coll.shape[1])))) - - not_visited = 1 + + not_visited = 1 toprow = 0 count_b = 0 set_of_blocks = dict() array_coords_centrall_ord = np.array([]).reshape((4,0)) while not_visited: - + if array_coords_centrall.size > 3: bottomrow = np.min(array_coords_centrall[0,:]) ind_bottomrow = np.argmin(array_coords_centrall[0,:]) @@ -573,47 +573,47 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl coord_cat = np.concatenate((array_coords_textl[:,ind_textl_proc].reshape(5,len(ind_textl_proc)), array_coords_coll[:,ind_lines_proc].reshape(5,len(ind_lines_proc))), axis = 1) - + if coord_cat.size > 0: flag_col = 1 - ind_currcord = topmost_left_box(coord_cat, gap_row, max_col = central_line) + ind_currcord = topmost_left_box(coord_cat, gap_row, max_col = central_line) if ind_currcord == -1: - ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line) + ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line) flag_col = 2 - + order_coords = np.array([]).reshape(5,0) while coord_cat.size > 4: order_coords = np.concatenate((order_coords,coord_cat[:,ind_currcord].reshape(5,1)), axis = 1) curr_coord = coord_cat[:,ind_currcord] coord_cat = np.delete(coord_cat,ind_currcord,1) if coord_cat.size > 4: - if flag_col == 1: + if flag_col == 1: ind_currcord = next_textline_samerow(coord_cat, gap_row, curr_coord, max_col = central_line) - + if ind_currcord == -1: ind_currcord = next_textline_samecol(coord_cat, gap_row, max_col = central_line) - + if ind_currcord == -1 : - ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line) + ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line) flag_col = 2 - + elif flag_col == 2: ind_currcord = next_textline_samerow(coord_cat, gap_row, curr_coord, min_col = central_line) - + if ind_currcord == -1: ind_currcord = next_textline_samecol(coord_cat, gap_row, min_col = central_line) - + if ind_currcord == -1 : flag_col = 1 ind_currcord = 0 - + else: - order_coords = np.array([]).reshape(5,0) - + order_coords = np.array([]).reshape(5,0) + toprow = np.copy(bottomrow) set_of_blocks[count_b] = 
order_coords count_b += 1 - + return set_of_blocks, array_coords_centrall_ord def topmost_left_box(coords, gap_row, min_col = 0, max_col = 10000): @@ -629,13 +629,13 @@ def topmost_left_box(coords, gap_row, min_col = 0, max_col = 10000): return curr_ind else: return -1 - + def next_textline_samerow(coords, gap_row, curr_coord, min_col = 0, max_col = 10000): curr_row = curr_coord[2] #ind_valid = np.intersect1d(np.argwhere(coords[1,:] < max_col), - # np.argwhere(coords[3,:] > min_col)) + # np.argwhere(coords[3,:] > min_col)) ind_valid = np.intersect1d(np.argwhere(coords[3,:] < (max_col + gap_row)), - np.argwhere(coords[1,:] > (min_col - gap_row))) + np.argwhere(coords[1,:] > (min_col - gap_row))) if len(ind_valid): min_row = np.intersect1d(np.argwhere(coords[2,ind_valid] > (curr_row - gap_row)), np.argwhere(coords[2,ind_valid] < (curr_row + gap_row))) @@ -646,41 +646,41 @@ def next_textline_samerow(coords, gap_row, curr_coord, min_col = 0, max_col = 10 else: return -1 else: - return -1 - + return -1 + def next_textline_samecol(coords, gap_row, min_col = 0, max_col = 10000): #print(coords, max_col, min_col) #ind_valid = np.intersect1d(np.argwhere(coords[1,:] < max_col), - # np.argwhere(coords[3,:] > min_col)) + # np.argwhere(coords[3,:] > min_col)) ind_valid = np.intersect1d(np.argwhere(coords[3,:] < (max_col + gap_row)), - np.argwhere(coords[1,:] > (min_col - gap_row))) + np.argwhere(coords[1,:] > (min_col - gap_row))) if len(ind_valid): min_row = np.min(coords[2,ind_valid]) min_row = np.intersect1d(np.argwhere(coords[2,ind_valid] > (min_row - gap_row)), - np.argwhere(coords[2,ind_valid] < (min_row + gap_row))) + np.argwhere(coords[2,ind_valid] < (min_row + gap_row))) ind_valid_min = ind_valid[min_row] ind_next_textl = ind_valid_min[np.argmin(coords[1,ind_valid_min])] return ind_next_textl else: - return -1 + return -1 def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescale_factor, centrall_ord, ind_page, dim_img): - + height_resc = HEIGHT_CHAR * rescale_factor[0,1] widht_resc = WIDTH_CHAR * rescale_factor[0,1] - + max_inrow_sep = 4 * widht_resc - max_incol_sep = 1 * height_resc + max_incol_sep = 1 * height_resc gap_row = height_resc/2 similarity_fonts = 0.95 indentation = 2 * widht_resc - - centrall_ord_trans = transform_coord_toorig(centrall_ord, dim_page = dim_img, invert_xy = True, + + centrall_ord_trans = transform_coord_toorig(centrall_ord, dim_page = dim_img, invert_xy = True, rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft', refCorners = 'topleftbottomright') - + # Start creating the xml xml_e = [] xml_e = ET.Element('pages') @@ -689,21 +689,21 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal page_el.attrib['bbox'] = xml_t[0].attrib['bbox'] page_el.attrib['rotate'] = '0' xml_e.append(page_el) - + val_type_col1 = {'text_col1', 'notidentified', 'header_singlecol', 'text_inheader'} val_type_col2 = {'text_col2', 'notidentified', 'header_singlecol', 'text_inheader'} - - + + count_b = 0 text_b = ET.SubElement(page_el, 'textbox') - text_b.attrib['id'] = str(count_b) - text_b.attrib['block'] = '0' + text_b.attrib['id'] = str(count_b) + text_b.attrib['block'] = '0' for ind_b in range(len(set_of_blocks)): all_el = set_of_blocks[ind_b].astype(np.int64) all_bbox = np.array([]).reshape((4,0)) for ind_c in range(all_el.shape[1]): - curr_el = all_el[:,ind_c] - flag_copy_textb = 1 + curr_el = all_el[:,ind_c] + flag_copy_textb = 1 # If it is a textline with text if curr_el[4] > -1: all_bbox = 
np.concatenate((all_bbox, curr_el[:4].reshape((4,1))), axis = 1) @@ -713,12 +713,12 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal text_l.attrib['type'] = type_textl text_b.append(text_l) type_textbox = 'text' - + # To check if it satisfies the conditions for being a new textbox if ind_c < (all_el.shape[1] - 1): - next_el = all_el[:,ind_c + 1] - if next_el[4] > -1: - if (((type_textl in val_type_col1) and (labels_textl[int(next_el[4])] in val_type_col1)) + next_el = all_el[:,ind_c + 1] + if next_el[4] > -1: + if (((type_textl in val_type_col1) and (labels_textl[int(next_el[4])] in val_type_col1)) or ((type_textl in val_type_col2) and (labels_textl[int(next_el[4])] in val_type_col2)) or (type_textl == labels_textl[int(next_el[4])])): # Object to the right or beneath @@ -728,33 +728,33 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal # Accounting for footnotes or other stuff curr_fontsize = curr_el[3] - curr_el[1] next_fontsize = next_el[3] - next_el[1] - if ((curr_fontsize - next_fontsize * similarity_fonts) < curr_fontsize < + if ((curr_fontsize - next_fontsize * similarity_fonts) < curr_fontsize < (curr_fontsize + next_fontsize * similarity_fonts)): # Finally, account for indentation if ((np.min(all_bbox[1,:]) + indentation) > next_el[1]): flag_copy_textb = 0 - - # Attributes and stuff in case we need to store as textbox + + # Attributes and stuff in case we need to store as textbox if flag_copy_textb: bbox_text_b = np.array([np.min(all_bbox[0,:]),np.min(all_bbox[1,:]), np.max(all_bbox[2,:]),np.max(all_bbox[3,:])]) - bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, + bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft', refCorners = 'topleftbottomright') all_bbox = np.array([]).reshape((4,0)) - # Instead, if we have a line + # Instead, if we have a line else: bbox_text_b = curr_el[:4] text_l = ET.SubElement(text_b, 'textline') text_l.attrib['type'] = 'col_lines' - bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, + bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft', refCorners = 'topleftbottomright') text_l.attrib['bbox'] = np.array2string(bbox_text_b[:].reshape((1,4)), precision = 3, separator = ',')[2:-2] type_textbox = 'line' - + # Creating the new textbox - if flag_copy_textb: + if flag_copy_textb: text_b.attrib['bbox'] = np.array2string(bbox_text_b[:].reshape((1,4)), precision = 3, separator = ',')[2:-2] text_b.attrib['type_textbox'] = type_textbox count_b += 1 @@ -762,10 +762,10 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal pass else: text_b = ET.SubElement(page_el, 'textbox') - text_b.attrib['id'] = str(count_b) - text_b.attrib['block'] = str(ind_b) + text_b.attrib['id'] = str(count_b) + text_b.attrib['block'] = str(ind_b) all_bbox = np.array([]).reshape((4,0)) - + if (ind_b < (len(set_of_blocks) - 1)): text_l = ET.SubElement(text_b, 'textline') text_l.attrib['type'] = 'central_lines' @@ -775,20 +775,20 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal text_b.attrib['type_textbox'] = 'line' count_b += 1 text_b = ET.SubElement(page_el, 'textbox') - text_b.attrib['id'] = str(count_b) 
- text_b.attrib['block'] = str(ind_b) + text_b.attrib['id'] = str(count_b) + text_b.attrib['block'] = str(ind_b) all_bbox = np.array([]).reshape((4,0)) - - - # Just add the two final elements from the original xml + + + # Just add the two final elements from the original xml page_el.append(xml_t[0][-2]) # Figure page_el.append(xml_t[0][-2]) # Layout - + return xml_e - + def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_col2')): - + # helper function to clean text # !!! so far only removing new lines and primitive dehyphenation def clean_text(text): @@ -796,10 +796,11 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c text = text.replace('\n', ' ') # account for hyphenation (not completely correct...) + # TODO: needs to be improved text = text.replace('- ', '') return text - + # initialize textbox count and empty dictionary XML_new = copy.deepcopy(XML_root) @@ -814,7 +815,7 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c if (textbox.tag == 'textbox'): if 'type_textbox' in textbox.attrib.keys(): if (textbox.attrib['type_textbox'] == 'text'): - + # initialize string #print(textbox.tag, textbox.attrib) @@ -827,7 +828,7 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c if textline.tag == 'textline': #print(textline.tag, textline.attrib) # for every text (actually just a letter) - + for ind_ch, text in enumerate(textline): #print(ind_ch, text.text, len(textline), len(XML_new[ind_p][ind_t][ind_tl])) # extend string @@ -847,7 +848,6 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c complete_text += '[/font]' complete_text = clean_text(complete_text) XML_new[ind_p][ind_t][ind_tl].text = complete_text - - + + return XML_new - \ No newline at end of file diff --git a/src/python/run_correct_meta.py b/src/python/run_correct_meta.py index 6fa7429475d93fb1b5f1bad4c16ea43e85e6d5f6..0a3788677845ea3fc351755623e82f8353f127a7 100755 --- a/src/python/run_correct_meta.py +++ b/src/python/run_correct_meta.py @@ -38,7 +38,7 @@ files_proc, _ = utils_proc.get_list(year_tocomp, folder_database, name_tar_file) list_proc = list() for infile in files_proc: - + # 8 is always the length of the id code infile_aux = year_tocomp + '/' + infile.split('/')[-1][:8] + '.pdf' if infile_aux not in list_proc: @@ -50,5 +50,5 @@ for infile in files_proc: print('Meta corrected %s' % infile) except: print("Meta to correct %s prompted an error" % infile) - + print('Total time for correcting meta of year %d: %f' % (int(year_tocomp) ,(time.time() - t1))) diff --git a/src/python/run_correctxml.py b/src/python/run_correctxml.py index 45543847c34d783fb8792afb882c159f7a69ed5a..ff0710cb6d532daa9e60fb5a5d9e5c25923e7507 100644 --- a/src/python/run_correctxml.py +++ b/src/python/run_correctxml.py @@ -38,9 +38,9 @@ files_proc, _ = utils_proc.get_list(year_tocomp, folder_database, name_tar_file) list_proc = list() for infile in files_proc: - + # 8 is always the length of the id code - infile_aux = year_tocomp + '/' + infile.split('/')[-1][:8] + '.pdf' + infile_aux = year_tocomp + '/' + infile.split('/')[-1][:8] + '.pdf' if infile_aux not in list_proc: list_proc.append(infile_aux) d1 = defc.Document(infile_aux, folder_database) @@ -54,5 +54,5 @@ for infile in files_proc: # Commands to get the compressed version of the file #data/AB/${year}/02_extractedxml.tar.gz utils_proc.compress_tar(output_file) - -print('Total time for correction of year %d: %f' % (int(year_tocomp) ,(time.time() - 
t1))) \ No newline at end of file + +print('Total time for correction of year %d: %f' % (int(year_tocomp) ,(time.time() - t1))) diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 2a5821f533a54c6293413f396ec4fa859bcb6bf9..495b23ceb7f8b38e2849c5d16119e76cf177d48f 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -1,199 +1,128 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import os +# Code to extract discussions from corrected XML files +#%% +# to work with atom +%load_ext autoreload +%autoreload 2 + import pickle -import re -import pandas as pd -from nltk.corpus import stopwords import time +import xml.etree.ElementTree as ET -import hf_extractdiscussions as hf +import sys +sys.path.append('src/python/') -# specify input values -years = [1891, 1995] -range_years = range(years[0], years[1] + 1) +import def_classes as defc +import utils_proc +import utils_annot -# paths -#path_start = '/home/lili/NLP_DemocraSci/nlp-democracy/' -path_data = '/data/complete_data/AB/' -path_output = '/data/output/' +import os -# open dictionary of last names from pickle file -with open('/home/lili/nlp-democracy/output/MPs/MPs_lastnames.pickle', 'rb') as f: - dict_lastnames = pickle.load(f) +from utils_proc import call_with_out + +#%% +# specify input and output files + +# needed for running in atom, can be ignored +year = '1891' +input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle" +input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz" +input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz" +output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz" + +#%% +# detect arguments +input_lastnames = sys.argv[1] +input_correctedxml = sys.argv[2] +input_correctedmeta = sys.argv[3] +output_annotatedxml = sys.argv[4] + +#%% +# extract suffixes, year, folder_database +suffix_tar_correctedxml = input_correctedxml.split('/')[-1].split('.tar.gz')[0] +#suffix_tar_correctedmeta = input_correctedmeta.split('/')[-1].split('.tar.gz')[0] +year = input_correctedxml.split('/')[-2] +folder_database = input_correctedxml.split(year)[0] +suffix_correctedmeta = '_metacorr' +#suffix_correctedxml = '_datacorr' +# needed to instantiate object for every document +input_rawmeta = folder_database + '/' + year + '/' + '01_rawmeta.tar.gz' + +#%% +# git lfs pull necessary data +for lfsfile in [input_correctedxml, input_correctedmeta, input_rawmeta]: + command = 'git lfs pull -I ' + lfsfile + #print(command) + call_with_out(command) + +#%% +# TODO: exclude overlaps --> after annotation + + +#%% +start_time_discussions = time.time() +print('start to identify discussions of the year', year, '\n') -# open dictionary of overlaps -with open('/data/complete_data/Results_overlap/DictOverlap1891to1930.pkl', 'rb') as f: - dict_overlaps_1 = pickle.load(f) -with open('/data/complete_data/Results_overlap/DictOverlap1931to1995.pkl', 'rb') as f: - dict_overlaps_2 = pickle.load(f) -with open('/data/complete_data/Results_overlap/DictOverlap1991to1995.pkl', 'rb') as f: - dict_overlaps_3 = pickle.load(f) -dict_overlaps = {**dict_overlaps_1, **dict_overlaps_2, **dict_overlaps_3} -print(dict_overlaps.keys()) +# extract list of files +files_to_process, _ = utils_proc.get_list(year, folder_database, suffix_tar_correctedxml) +files_to_process.sort() +print('files to process loaded:', files_to_process) -# get dictionary of discussions -# ----------------------------- +# open dataframe of last names from pickle file 
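call_with_out is imported from the project's utils_proc module and its implementation is not part of this diff. A rough stand-in (an assumption about its behaviour, for illustration only) that runs the same 'git lfs pull -I <path>' command through subprocess and captures its output could look like this:

import subprocess

def run_lfs_pull(path):
    # hypothetical helper, not the repository's call_with_out: pull one LFS-tracked file
    command = 'git lfs pull -I ' + path
    result = subprocess.run(command.split(), capture_output=True, text=True)
    return result.returncode, result.stdout + result.stderr

returncode, output = run_lfs_pull('data/AB/1891/04_correctedxml.tar.gz')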
+# (there is one file of lastnames per year) +with open(input_lastnames, 'rb') as f: + df_lastnames = pickle.load(f) -start_time_discussions = time.time() +print('dataframe with lastnames loaded') + +#%% +# for each file +# TODO !!!! get rid of [66:] +for file_tarpath in files_to_process: + #print(file_tarpath) + id_doc = file_tarpath.split('/')[-1][:8] + + # instantiate document object (always from original pdf) + file_aux = year + '/' + id_doc + '.pdf' + file_doc = defc.Document(file_aux, folder_database) + + # if document is a discussion + if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']): + print(id_doc + '\n') + file_doc.df_lastnames = df_lastnames + file_doc.annotate_xml() + +# Commands to get the compressed version of the file +# (compressed file is around 5 times smaller than uncompressed file) +#data/AB/${year}/05_annotatedxml.tar.gz +utils_proc.compress_tar(output_annotatedxml) + + + + +#%% +## to test for one file +#file_tarpath = './1893/20026526_datacorr.xml' +# +#id_doc = file_tarpath.split('/')[-1][:8] +# +## instantiate document object (always from original pdf) +#infile_aux = year + '/' + id_doc + '.pdf' +#file_doc = defc.Document(infile_aux, folder_database) +# +#if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']): + #print(id_doc + '\n') +# + #file_doc.df_lastnames = df_lastnames + #file_doc.annotate_xml() + + +#%% + + + +#id_doc -# list of votation terms -list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt', - 'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)', - 'Votation', 'Vote', 'votation', #'(AdoptÃs)', 'adoptÃs', 'adoptÃe', 'rejetÃe', - "D'accord", 'voix'] - -# list of stopwords -list_stopwords = stopwords.words('german') -list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr']) -list_stopwords.extend(stopwords.words('french')) -list_stopwords.extend(['ils', 'les', 'celle']) - -print('start to identify discussions of the years', years, '\n\n\n') - -# initialize empty dictionary for all documents -dict_docs = {} - -# for every year -for year in range_years: - start_time = time.time() - - str_year = str(year) - print(year, '\n', 30*'=') - # initialize empty dictionary for that year - dict_year = {} - # extract list of numbers for that year - list_numbers = next(os.walk(path_data + str_year))[1]#os.listdir(path_data + str_year) - list_numbers.sort() - # extract list of lastnames for that year and generate dataframe from it - lists_lastnames = dict_lastnames[int(year)] - df_lastnames = hf.get_df_from_lists_names(lists_lastnames) - # extract overlaps of that year - dict_overlaps_year = dict_overlaps[year] - # for each number, i.e. 
document - for number in list_numbers: - path_number = path_data + str_year + '/' + number + '/' - # if document is a discussion - if (hf.check_if_discussion(path_number + number + '.xml')) and (number not in ['20032463', '20032952', '20014332']): - print(number + '\n') - # get dictionary with text - dict_text = hf.get_text_onefile(path_number + number + '_datacorr.xml') - # exclude parts from previous and next document - if number in dict_overlaps_year: - dict_text = hf.exclude_overlaps(dict_text, dict_overlaps_year[number]) - # get all discussionstarts - dict_discussionstarts = hf.get_discussion_starts(dict_text, df_lastnames, list_stopwords, bln_print=True) - # get votation paragraphs - dict_votations = hf.get_votations(dict_text, list_votationterms) - # put all discussions together in dictionary - dict_discussions, list_keys = hf.get_discussions(dict_text, dict_discussionstarts, dict_votations) - # save that discussions dictionary to the yearly dictionary - dict_year[number] = dict_discussions - - #print('\n\n') - # save that yearly dictionary to the dictionary for all documents - dict_docs[year] = dict_year - # dump that discussions dictionary in the yearly folder - path_year = path_output + 'AB/' + str_year + '/' - os.makedirs(path_year, exist_ok=True) - with open(path_year + 'dict_discussions.pickle', 'wb') as f: - pickle.dump(dict_year, f) - - print("Time to extract discussions for year %s: %s minutes\n" % (year, (time.time() - start_time)/60)) - - -# dump dictionary of documents to a pickle file -year_start = str(list(dict_docs.keys())[0]) -year_end = str(list(dict_docs.keys())[-1]) -with open(path_output + 'dict_discussions_' + year_start + '-' + year_end + '.pickle', 'wb') as f: - pickle.dump(dict_docs, f) - -print("Time to extract all discussions: %s minutes\n" % ((time.time() - start_time_discussions)/60)) - - -# Language identification with Luis' method -# ----------------------------------------- - -print('start to identify languages of the years', years, '\n\n\n') -start_time_languages = time.time() - - -# initialize empty dictionaries -dict_languages = {} -dict_german = {} -dict_french = {} -dict_italian = {} - -# for every year -for year in range_years: - str_year = str(year) - start_time = time.time() - - # initialize empty dictionaries for that year - dict_year = {} - dict_year_german = {} - dict_year_french = {} - dict_year_italian = {} - print(year) - - # load pickle dump for that year - with open(path_output + 'AB/' + str_year + '/dict_discussions.pickle', 'rb') as f: - dict_disc_year = pickle.load(f) - - # for every document in that year - for number in dict_disc_year: #dict_docs[year]: - # initiaze empty dictionaries for that document - dict_number = {} - dict_number_german = {} - dict_number_french = {} - dict_number_italian = {} - print(number) - - # tokenize discussion - dict_tokenized = hf.tokenize_dictionary(dict_disc_year[number], hf.tokenizer) - - # identify language - dict_lang = hf.identify_lang(dict_tokenized) - #print(dict_lang) - - # assign language - for tupel, value in dict_lang.items(): - #print(tupel) - lang = hf.label_language(value) - dict_number[tupel] = lang - if lang == 'german': - dict_number_german[tupel] = dict_disc_year[number][tupel] - elif lang == 'french': - dict_number_french[tupel] = dict_disc_year[number][tupel] - elif lang == 'italian': - dict_number_italian[tupel] = dict_disc_year[number][tupel] - else: - pass - #print(lang, value, dict_docs[year][number][tupel]) - - # add to dictionaries of that year - dict_year[number] = 
dict_number - dict_year_german[number] = dict_number_german - dict_year_french[number] = dict_number_french - dict_year_italian[number] = dict_number_italian - - # add to overall dictionaries - dict_languages[year] = dict_year - dict_german[year] = dict_year_german - dict_french[year] = dict_year_french - dict_italian[year] = dict_year_italian - - print("Time to identify languages for discussions of year %s: %s minutes\n" % (year, (time.time() - start_time)/60)) - - # dump for that year - with open(path_output + 'AB/' + str_year + '/dict_languages.pickle', 'wb') as f: - pickle.dump(dict_year, f) - with open(path_output + 'AB/' + str_year + '/dict_discussions_german.pickle', 'wb') as f: - pickle.dump(dict_year_german, f) - with open(path_output + 'AB/' + str_year + '/dict_discussions_french.pickle', 'wb') as f: - pickle.dump(dict_year_french, f) - with open(path_output + 'AB/' + str_year + '/dict_discussions_italian.pickle', 'wb') as f: - pickle.dump(dict_year_italian, f) - -print("Time to identify languages for all discussions: %s minutes\n" % ((time.time() - start_time_languages)/60)) diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index 0f555b4887d557a001caabd9b90ab8b76e9b45a0..e41d934ac7914234ab6fae1f425123999971c413 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -9,6 +9,11 @@ import numpy as np import pandas as pd import string import re +import os, sys +sys.path.append('src/python/') +import utils_proc +import copy +import collections # function to check whether a file contains discussions @@ -44,191 +49,385 @@ def check_if_discussion(path_meta_xml_file, return True +# helper function to get text without font information +# example for font information: [font face="11.718" size="Times-Roman"] sometext [/font] +# input: +# - sometext: string +# output: +# - newtext: modified string +def get_text(sometext): + # initialize + newtext = '' + + # find text between font information + for text in re.findall('\].*?\[',sometext): + #print(text) + if text.startswith(']') and text.endswith('['): + newtext += text[1:-1] + #print(newtext) + return newtext + + +# function to annotate the corrected XML +def get_annotated_xml(XML_root, df_lastnames, bln_print=False): + + # list of votation terms + # TODO: make it work for é, etc.
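As a quick illustration (not code from the repository) of what the get_text helper defined above returns, the font-markup example from its comment can be pushed through the same regular expression; only the text sitting between ']' and '[' is kept:

import re

s = '[font face="11.718" size="Times-Roman"] sometext [/font]'
pieces = [m[1:-1] for m in re.findall(r'\].*?\[', s)]
print(''.join(pieces))  # ' sometext ' -> the plain text with the font tags stripped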
+ list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt', + 'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)', + 'Votation', 'Vote', 'votation', #'(Adoptés)', 'adoptés', 'adoptée', 'rejetée', + "D'accord", 'voix'] + + # list of stopwords + list_stopwords = stopwords.words('german') + list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr']) + list_stopwords.extend(stopwords.words('french')) + list_stopwords.extend(['ils', 'les', 'celle']) + + # create new XML as a copy of the corrected one + XML_new = copy.deepcopy(XML_root) + + # initialize flags to distinguish speeches from votes + this_is_speech = False + prev_is_speech = False + this_is_vote = False + + # for every page + for ind_p, page in enumerate(XML_root): + if bln_print: + print(page.tag, page.attrib) + # for every textbox on that page + for ind_t, textbox in enumerate(page): + if (textbox.tag == 'textbox'): + if 'type_textbox' in textbox.attrib.keys(): + if (textbox.attrib['type_textbox'] == 'text'): + if bln_print: + print(textbox.tag, textbox.attrib) + + # get complete text of that textbox + complete_text, ind_tl_colon = get_complete_text(textbox) + if bln_print: + print(complete_text[:100]) + + # identify and label language in XML + dict_lang = identify_language(complete_text) + XML_new = label_language(XML_new, ind_p, ind_t, dict_lang) + + # get texttype of that textbox by majority vote + # TODO add that type to XML + textbox_texttype = get_textbox_type(textbox) + if bln_print: + print(textbox_texttype) + + if textbox_texttype in ['text_col1', 'text_col2']: + + XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False) + if this_is_speech: + prev_is_speech = True +# print('stopped after finding speech start') + continue + XML_new, this_is_vote = label_votations(XML_new, ind_p, ind_t, complete_text, list_votationterms, bln_print=False) + if this_is_vote: + prev_is_speech = False +# print('stopped after finding vote') + continue + if prev_is_speech and (not this_is_vote): + XML_new = label_speechcont(XML_new, ind_p, ind_t) + + + return XML_new + + +# helper function to get type of textbox_type +# corresponds to majority vote of types of textlines +# input: +# - textbox +# output: +# - textbox_type: string +def get_textbox_type(textbox): + # initialize empty dictionary + dict_type = {} + # for every textline in that textbox + for ind_tl, textline in enumerate(textbox): + if textline.tag == 'textline': + # count types + if textline.attrib['type'] not in dict_type.keys(): + dict_type[textline.attrib['type']] = 1 + else: + dict_type[textline.attrib['type']] += 1 + # list of all types with maximum count + list_types = [type for type, count in dict_type.items() if count == max(dict_type.values())] + # if only one with maximum value + if len(list_types) == 1: + textbox_type = list_types[0] + # if several with same maximum value + else: + textbox_type = 'notdistinct' + return textbox_type - - - - - - - - - - - - - - - - - - - - - - -# functions from hf_extractdiscussions.property -# ============================================================================== - -# function to exclude overlapping textboxes between documents # input: -# - dict_text: dictionary of texts of one document -# - dict_overlaps_year: dictionary with overlaps # output: -# - dict_text: modified dict_text -def
exclude_overlaps(dict_text, dict_overlaps): - # initialize to impossible values - first_entry = -1 - last_entry = 1000 +# - complete_text: string +# - ind_tl_colon: index of textline with colon (needed for label speech start) +def get_complete_text(textbox): - # get index of textbox from first and last page - # the overlap dictionary only contains an entry, if an overlap was detected - for entry, array in dict_overlaps.items(): - if entry == 'first': - first_entry = int(array[0]) - if entry == 'last': - last_entry = int(array[0]) + # helper function to get text without font information + def get_text(sometext): + newtext = '' + for text in re.findall('\].*?\[',sometext): + #print(text) + if text.startswith(']') and text.endswith('['): + newtext += text[1:-1] + #print(newtext) + return newtext - # get list of keys for first and last page - list_first_page = [key for key in dict_text if key.split(',')[1] == '0'] - last_page = max([int(key.split(',')[1]) for key in dict_text]) - list_last_page = [key for key in dict_text if key.split(',')[1] == str(last_page)] + # initialize empty string + complete_text = '' - # modify dict_text on first page - for key in list_first_page: - if int(key.split(',')[2]) < first_entry: - dict_text[key] = '' + # initialize index of textline colon to impossible value + ind_tl_colon = -1 - # ... and on last page - for key in list_last_page: - if int(key.split(',')[2]) > last_entry: - dict_text[key] = '' + # for every textline in that textbox + for ind_tl, textline in enumerate(textbox): + if textline.tag == 'textline': + # get that text + thattext = get_text(textline.text) - return dict_text + # append that text to string + complete_text += thattext + # in first two textlines of textbox, check for colon + if ind_tl < 3: + if ':' in thattext: + ind_tl_colon = ind_tl -# tokenizer -tokenizer_canton = RegexpTokenizer(r'\w+') # only leaves words -#tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+|\S+') -# last part \S+ is needed to get colon, \S stands for white space -tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+') + return complete_text, ind_tl_colon -# function to extract discussion starts -# !!! 
maybe we only need a list of discussion starts + +# function to label speech starts # input: -# - dict_text: dictionary with text of one file -# - list_names: list of MPs +# - text: string to be analyzed +# - df_names: dataframe of politicians # - list_stopwords: list of german and french stopwords # - bln_print: whether to print during execution, default False # output: -# - dict_discussionstarts: dictionary with discussion starts -def get_discussion_starts(dict_text, df_names, list_stopwords, bln_print=False): - - # initialize empty dictionary - dict_discussionstarts = {} - - # add a few terms to list_stopwords that are easily mistaken as last names - list_stopwords.extend(['art', 'rath', 'alinea', 'stimmen', 'stimme', 'hans', 'walter', 'werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'gallen', 'stgallen', - 'kasse', 'fasse', 'sitten', 'herren', 'herr', 'alter']) - - # for every textbox - for key, text in dict_text.items(): - if ':' in text[:100]: - # extract the index of the colon in the text - colon_index_text = text.index(':') - - # look at first few terms of that textbox - text_start = re.sub(r'[\(\)]','',text[:colon_index_text]) - print('text_start', text_start) - list_oi = tokenizer.tokenize(text_start) - print('asdf', list_oi) - - # shorten to part before colon - list_oi2 = list_oi - - # remove stopwords - list_oi2 = [term for term in list_oi2 if term.lower() not in list_stopwords] - - # remove punctuation - list_oi2 = [''.join(c for c in s if c not in string.punctuation) for s in list_oi2] - list_oi2 = [s for s in list_oi2 if s] +# - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID +def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, bln_print=False): + + # initialize flag + this_is_speech = False + + # initialize strings and ID + str_name = '' + str_role = '' + list_uniqueID = [] + str_canton = '' + + # font text end + fontend = '[/font]' + + # very consistently, a speaker can be identified by looking for a colon + # at the beginning of a textbox and identifiying a name or a role in front + # of that colon + if ind_tl_colon >= 0: +# if ':' in text[:100]: + # extract the index of the colon in the text + colon_index_text = text.index(':') + + # look at first few terms of that textbox + text_start = re.sub(r'[\(\)]','',text[:colon_index_text]) + list_oi = tokenizer.tokenize(text_start) + if bln_print: + print('possible speech start: ', list_oi) + + # remove stopwords + list_oi = [term for term in list_oi if term.lower() not in list_stopwords] + + # remove punctuation + list_oi = [''.join(c for c in s if c not in string.punctuation) for s in list_oi] + list_oi = [s for s in list_oi if s] + + # remove lower case terms +# list_oi = [term for term in list_oi if not term.islower()] + + # remove numbers + list_oi = [term for term in list_oi if not term.isdigit()] + + # remove single characters + list_oi = [term for term in list_oi if len(term)>1] + + # for every term + for term in list_oi: + # if possible, find a name in a list + str_name, str_role, list_uniqueID, str_canton = find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False) + if bln_print: + print('name', str_name, 'role', str_role) - # remove lower case terms -# list_oi2 = [term for term in list_oi2 if not term.islower()] + # get rid of doubled double names + # TODO - # remove numbers - list_oi2 = [term for term in list_oi2 if not term.isdigit()] + # get rid of 'Präsident stimmt nicht Président ne vote pas' + if 
set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name: + if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi): + if bln_print: + print('get rid of Präsident stimmt nicht, Président ne vote pas', list_oi) + str_role = '' - # remove single characters - list_oi2 = [term for term in list_oi2 if len(term)>1] + # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":' + if str_name: + if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1: + if bln_print: + print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi) + str_name = '' + + # if a name has been found, add it to XML_new + if str_name or str_role: + # add attribute speech_start to textbox + XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start' + + # add speaker as attribute to first textline + XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, list_uniqueID, str_canton) + + # update text of XML (speaker is on first line, actual speech start on second line of speech_start textbox) + # if colon is on first line + if ind_tl_colon == 0: + # get text of that line and colon index + thattext = XML_new[ind_p][ind_t][0].text + colon_index = thattext.index(':') + + # get last font information of thattext + fontstart = re.findall('\[font.*?\]', thattext)[-1] + + try: + # write speaker to first line + XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend + + # get start of speech with correct font start + if thattext[colon_index+1:].startswith('[font'): + startspeech = thattext[colon_index+1:] + elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]): + startspeech = '' + else: + startspeech = fontstart + thattext[colon_index+1:] + + # write beginning of speech to second line + # (create new ET element if necessary) + if len(list(XML_new[ind_p][ind_t])) > 1: + XML_new[ind_p][ind_t][1].text = startspeech + ' ' + XML_new[ind_p][ind_t][1].text + else: + XML_new[ind_p][ind_t].append(copy.deepcopy(XML_new[ind_p][ind_t][0])) + XML_new[ind_p][ind_t][1].attrib.pop('speaker') + XML_new[ind_p][ind_t][1].text = startspeech + except: + print('error in self.input_file when splitting speaker') + #print(thattext) + #print(len(list(XML_new[ind_p][ind_t]))) + #print(list(XML_new[ind_p][ind_t])) + #print(XML_new[ind_p][ind_t]) + #print('gefundener Name:', str_name, str_role) + pass + + # if colon is on second line + if ind_tl_colon == 1: + # get text of that line and colon index + thattext = XML_new[ind_p][ind_t][1].text + colon_index = thattext.index(':') + + # get last font information of thattext + fontstart = re.findall('\[font.*?\]', thattext)[-1] + + # get start of speech with correct font start + if thattext[colon_index+1:].startswith('[font'): + startspeech = thattext[colon_index+1:] + elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]): + startspeech = '' + else: + startspeech = fontstart + thattext[colon_index+1:] + + # write speaker to first line + XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1] + fontend + # write beginning of speech to second line + XML_new[ind_p][ind_t][1].text = startspeech + + # set flag + this_is_speech = True + if bln_print: + print('found a name:', list_oi, str_name, str_role, '\n') - # initialize string for name and role - str_name = '' - str_role = '' - int_uniqueID = int(0) - str_canton = '' + return XML_new, this_is_speech - # for every term, reversed finds canton before it finds 
name - for term in reversed(list_oi2): - # if possible, find a name in a list - str_name, str_role, int_uniqueID, str_canton = find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=True) - print('name', str_name, 'role', str_role) - # get rid of 'Präsident stimmt nicht Président ne vote pas' - if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name: - print('++++++++++ Präsident', list_oi2, list_oi) - if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi): - str_role = '' - - # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":' - if str_name: - print('++++++++++ Name', list_oi2, list_oi) - if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1: - str_name = '' - - # if a name has been found - if str_name or str_role: - # add text to dictionary - dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text], - text[colon_index_text+1:]) - if bln_print: - print('found a name:', list_oi2, str_name, str_role, '\n') +# function to extract votation paragraphs +# !!! error prone, possible improvements see notebook extract_discussions +# input: +# - XML_new: +# - text: string +# - list_votationterms: list of votation terms +# - bln_print: whether to print during execution, default False +# output: +# - XML_new: updated +def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, bln_print=True): - return dict_discussionstarts + # get first terms of that text + list_oi = tokenizer.tokenize(text)[:15] +# if len(set(list_oi).intersection(set(list_votationterms))) > 1: + # if there is an overlap with typical votation terms: + if set(list_oi).intersection(set(list_votationterms)): + # add attribute vote to textbox + XML_new[ind_p][ind_t].attrib['text_type'] = 'vote' + + # set flag + this_is_vote = True + if bln_print: + print('found a vote:', list_oi) + else: + #pass + # set flag + this_is_vote = False + if bln_print: + print('not a vote', list_oi) + return XML_new, this_is_vote -# small function to get first item of tupels in a list -def get_first_item(list_tupels): - list_first_item = [tupel[0] for tupel in list_tupels] - return list_first_item +# function to label continuation of speech +# only adds label to corresponding textbox +def label_speechcont(XML_new, ind_p, ind_t): -# small function to get last two items of tupels in a list -def get_last_item(list_tupels): - list_last_item = [tupel[-2:] for tupel in list_tupels] - return list_last_item + XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_cont' + return XML_new +# helper function to flatten nested irregular list +def flatten(l): + for el in l: + if isinstance(el, collections.Iterable) and not isinstance(el, (str, bytes)): + yield from flatten(el) + else: + yield el # function to find names # input: # - term: term that might be name # - str_name: string to which name should be attached # - str_role: string to which role should be attached -# - int_uniqueID: integer for uniqueID -# !!! (if there are several possibilities, this becomes a tuple) +# - list_uniqueID: list with one or several uniqueIDs # - list_tupels: list of tupels containing all types of names -def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=False): +# TODO: correctly extract canton! 
don't do reversed, find name first that might have issue with canton, then look for possible canton +# TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer) +def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False): - def get_string(term, str_name, str_role, int_uniqueID, str_canton): + def get_string(term, str_name, str_role, list_uniqueID, str_canton): name_type = '' # if it is one of the simple names if term in list(df_names['name_short'].loc[df_names['type']=='simple']): @@ -239,12 +438,16 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln str_role = add_to_string(str_role, term) # if it is a double name elif term in list(df_names['name_short'].loc[df_names['type']=='double']): - print(20*'\n', 'DOUBLE NAME') + if bln_print: + print(20*'\n', 'DOUBLE NAME') # get correct name correct_name = df_names.loc[(df_names['type']=='double') & (df_names['name_short']== term)].iat[0, df_names.columns.get_loc('name_correct')] if bln_print: print('double name', correct_name) - str_name = add_to_string(str_name, correct_name) + # only add name if it is not there yet + # if a person is referenced by its complete double name, e.g. Meier-Müller, he or she gets two entries + if correct_name not in str_name.split(' '): + str_name = add_to_string(str_name, correct_name) name_type = 'double' # if it is a composite name elif term in list(df_names['name_short'].loc[df_names['type']=='comp']): @@ -276,23 +479,47 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln name_type = 'canton' - temp = '' + # extract uniqueID + list_temp = [] if name_type in ['simple', 'double', 'comp']: - temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')] + list_temp = [df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]] elif name_type in ['canton']: - temp = tuple(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) + list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) # if canton_missing: # temp = tuple(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) # else: # temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_correct']==str_correct)].iat[0, df_names.columns.get_loc('uniqueIndex')] - if temp: - if int_uniqueID == 0: - int_uniqueID = temp - else: - int_uniqueID = (int_uniqueID, temp) - - return str_name, str_role, int_uniqueID + if len(list_temp) > 0: + if bln_print: + print(list_temp, list_uniqueID) + print(type(list_temp), type(list_uniqueID)) + print(isinstance(list_uniqueID, list)) + # if no unique ID has been assigned so far + if len(list_uniqueID) == 0: + list_uniqueID = list_temp + # if there are already one or several people and have a new person, we update + elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0: + list_uniqueID.append(list_temp) + + ## if we already have several possible people, e.g. 
because of canton + #elif isinstance(int_uniqueID, tuple): + #print('I should be here') + ## and refound the uniqueID of one of those, don't update + #if temp in int_uniqueID: + #pass + ## and update if we don't have that uniqueID yet + #else: + #int_uniqueID = (int_uniqueID, temp) + ## if a person with that uniqueID exists already, don't update + #elif isinstance(int(int_uniqueID), int) and temp == int_uniqueID: + #print('but end up here.. not even.....') + #pass + ## if a different unique ID has been assigned already + #else: + #int_uniqueID = (int_uniqueID, temp) + + return str_name, str_role, list_uniqueID # small function to add term to str_name def add_to_string(string, term): @@ -308,8 +535,8 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln 'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral', 'Vizepräsident'] - list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'Gallen', 'StGallen', - 'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter'] + list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'Gallen', 'StGallen', + 'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter', 'Biffer', 'biffer', 'Rédiger', 'rédiger', 'Wer', 'Fällen'] list_places = get_list_cantons(df_names) @@ -325,7 +552,7 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln # if term is in the list of all names and roles if term in (list_all_names + list_roles): # get correct name and uniqueID, or role, for that term - str_name, str_role, int_uniqueID = get_string(term, str_name, str_role, int_uniqueID, str_canton) + str_name, str_role, list_uniqueID = get_string(term, str_name, str_role, list_uniqueID, str_canton) if bln_print: print('=== correct name', term) @@ -362,166 +589,185 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln if bln_print: print('we found several possible names', set_intersection, 'and choose', array_min) if term_approx: - str_name, str_role, int_uniqueID = get_string(term_approx, str_name, str_role, int_uniqueID, str_canton) - print('*******************', str_name, term_approx) + str_name, str_role, list_uniqueID = get_string(term_approx, str_name, str_role, list_uniqueID, str_canton) + if bln_print: + print('*******************', str_name, term_approx) + + + return str_name, str_role, list_uniqueID, str_canton - return str_name, str_role, int_uniqueID, find_names +# two functions for language identification +# Author: Luis Salamanca +# small modifications by Lili Gasser +# Using stopwords +# input: +# - text: string +# - valid_lang: tuple of valid languages +# output: +# - dict_language_counts: dictionary of stopword counts for each valid language +def identify_language(text, valid_lang = ('german', 'french', 'italian')): + # tokenize + tokens = text.split(' ') + # all lowercase + test_words = [word.lower() for word in tokens] + # make a set + test_words_set = set(test_words) + # initialize dictionary of language elements + dict_language_counts = {} -# function to get data frame from lists of names + # iterate through languages of stopwords + for language in stopwords.fileids(): + if language in valid_lang: + # get stopword set + stopwords_set = set(stopwords.words(language)) + # get intersection between text of interest and stopword set for this language + common_elements = test_words_set.intersection(stopwords_set) + # save number 
of common elements to dictionary + dict_language_counts[language] = len(common_elements) + + return dict_language_counts + + +# Simply, given the number of ocurrences of the stopwords, it assigns a label +# to a specific textbox, also considering the possibility of textboxes +# mixing languages. For this case, the value ratio_similar is intended # input: -# - lists_names: lists of names (simple, double, comp, canton) +# - XML_new: XML file to update +# - aux_dict_l: corresponds to dict_language_counts # output: -# - df: corresponding dataframe -def get_df_from_lists_names(lists_names): - list_types = ['simple', 'double', 'comp', 'canton'] - df = pd.DataFrame() - for i in range(4): - df_temp = pd.DataFrame(lists_names[i], - columns = ('name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName')) - df_temp['type'] = list_types[i] - df = pd.concat([df, df_temp], ignore_index = True) - return df +# - lang_max: string +def label_language(XML_new, ind_p, ind_t, aux_dict_l): + # specify a similarity ratio + ratio_similar = 0.8 + # if there are counts, determine language + if sum(aux_dict_l.values()): + aux_dict_l_norm = {k: v / total for total in (sum(aux_dict_l.values()),) for k, v in aux_dict_l.items()} + lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key])) + lang_max = '' + count_l = 0 + for lang in aux_dict_l_norm.keys(): + if (aux_dict_l_norm[lang] > aux_dict_l_norm[lang_max_aux] * ratio_similar): + if count_l > 0: + lang_max += '_' + lang_max += lang + count_l += 1 + if count_l > 1: + lang_max = 'mixed_' + lang_max + else: + lang_max = 'languageNotIdentified' + # add attribute to textbox + XML_new[ind_p][ind_t].attrib['language'] = lang_max + return XML_new -# function to extract votation paragraphs -# !!! maybe we only need a list of votation paragraphs -# !!! 
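# --- illustrative sketch: identify_language / label_language condensed ---
# Stopword-overlap counts per language, then a label that becomes e.g.
# 'mixed_german_french' when a second language scores within ratio_similar of the
# best one, and 'languageNotIdentified' when no stopwords are found at all.
# Requires the NLTK stopword corpora (nltk.download('stopwords')).
from nltk.corpus import stopwords

def count_stopword_overlap(text, valid_lang=('german', 'french', 'italian')):
    words = {w.lower() for w in text.split(' ')}
    return {lang: len(words & set(stopwords.words(lang)))
            for lang in stopwords.fileids() if lang in valid_lang}

def language_label(counts, ratio_similar=0.8):
    total = sum(counts.values())
    if not total:
        return 'languageNotIdentified'
    norm = {lang: n / total for lang, n in counts.items()}
    best = max(norm.values())
    winners = [lang for lang, share in norm.items() if share > best * ratio_similar]
    return ('mixed_' + '_'.join(winners)) if len(winners) > 1 else winners[0]

# language_label({'german': 12, 'french': 1, 'italian': 0})  -> 'german'
# language_label({'german': 6, 'french': 5, 'italian': 0})   -> 'mixed_german_french'
# ---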
error prone, possible improvements see notebook extract_discussions + + + + + + + + + + + + + + + + + + +# functions from hf_extractdiscussions +# ============================================================================== +# TODO: check whether they still are needed + +# function to exclude overlapping textboxes between documents # input: -# - dict_text: dictionary with text of one file -# - list_names: list of votation terms -# - bln_print: whether to print during execution, default False +# - dict_text: dictionary of texts of one document +# - dict_overlaps_year: dictionary with overlaps # output: -# - dict_votations: dictionary with votations -def get_votations(dict_text, list_votationterms, bln_print=True): - count = 0 - dict_votations = {} - for key, text in dict_text.items(): - list_oi = tokenizer.tokenize(text)[:15] -# if len(set(list_oi).intersection(set(list_votationterms))) > 1: - if set(list_oi).intersection(set(list_votationterms)): - count += 1 - dict_votations[key] = text - if bln_print: - print(count, 'MATCH', key, list_oi) - else: - #pass - if bln_print: - print('----- ', list_oi) +# - dict_text: modified dict_text +def exclude_overlaps(dict_text, dict_overlaps): + # initialize to impossible values + first_entry = -1 + last_entry = 1000 + + # get index of textbox from first and last page + # the overlap dictionary only contains an entry, if an overlap was detected + for entry, array in dict_overlaps.items(): + if entry == 'first': + first_entry = int(array[0]) + if entry == 'last': + last_entry = int(array[0]) + + # get list of keys for first and last page + list_first_page = [key for key in dict_text if key.split(',')[1] == '0'] + last_page = max([int(key.split(',')[1]) for key in dict_text]) + list_last_page = [key for key in dict_text if key.split(',')[1] == str(last_page)] + + # modify dict_text on first page + for key in list_first_page: + if int(key.split(',')[2]) < first_entry: + dict_text[key] = '' + + # ... and on last page + for key in list_last_page: + if int(key.split(',')[2]) > last_entry: + dict_text[key] = '' + + return dict_text + + +# tokenizer +tokenizer_canton = RegexpTokenizer(r'\w+') # only leaves words +#tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+|\S+') +# last part \S+ is needed to get colon, \S stands for white space +tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+') + + + + +# small function to get first item of tupels in a list +def get_first_item(list_tupels): + list_first_item = [tupel[0] for tupel in list_tupels] + return list_first_item + +# small function to get last two items of tupels in a list +def get_last_item(list_tupels): + list_last_item = [tupel[-2:] for tupel in list_tupels] + return list_last_item - if bln_print: - print(count) - return dict_votations -# function to put discussions together -# !!! 
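# --- illustrative notes on exclude_overlaps and the tokenizer above ---
# exclude_overlaps expects dict_text keys whose second comma-separated field is the
# page and whose third is the textbox index (e.g. '1,0,1'); it blanks textboxes
# before the detected overlap on the first page and after it on the last page.
# A toy call with made-up keys and values:
#
#   dict_text_demo = {'0,0,0': 'header', '1,0,1': 'text', '2,1,0': 'text', '3,1,1': 'footer'}
#   exclude_overlaps(dict_text_demo, {'first': [1], 'last': [0]})
#   # -> '0,0,0' and '3,1,1' are set to ''
#
# The active tokenizer pattern r'\w+(?:-/w+)*|\$[\d\.]+' contains '/w' where the
# commented-out variant uses '\w'; as written, the hyphen branch can only match a
# literal '-/w...' and hyphen-joined double names get split.  Assuming such names
# are meant to stay in one token, the two spellings behave differently:
from nltk.tokenize import RegexpTokenizer

tok_as_written = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+')
tok_backslash = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+')

# tok_as_written.tokenize('Meier-Müller stimmt')  -> ['Meier', 'Müller', 'stimmt']
# tok_backslash.tokenize('Meier-Müller stimmt')   -> ['Meier-Müller', 'stimmt']
# ---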
needs improvement when OCRed xml is corrected (details see notebook) +# function to get data frame from lists of names # input: -# - dict_discussionstarts -# - dict_votations +# - lists_names: lists of names (simple, double, comp, canton) # output: -# - dict_discussions: dictionary of discussion parts -# key: integer of discussion start -# value: text until next discussion start or votation paragraph -def get_discussions(dict_text, dict_discussionstarts, dict_votations): - - # helper function to add text to discussion dictionary - def add_to_dict(key, i): -# print(key, i) - if key not in dict_discussions: - dict_discussions[key] = dict_discussionstarts[key] - else: - if i in list_text_keys_integers: - actual_i = list(dict_text.keys())[list_text_keys_integers.index(i)] - only_text = dict_discussions[key][1] + dict_text[actual_i] - dict_discussions[key] = (dict_discussions[key][0], only_text) - list_keys.append(i) - - - # list of keys for discussion starts and votation paragraphs - list_discussionstarts = list(dict_discussionstarts.keys()) - list_discussionstarts_integers = [int(tpl[0].split(',')[0]) for tpl in dict_discussionstarts.keys()] - list_votations_strings = list(dict_votations.keys()) - list_votations_integers = [int(tpl.split(',')[0]) for tpl in list_votations_strings] - list_text_keys_integers = [int(tpl.split(',')[0]) for tpl in dict_text.keys()] - - # initialize empty dictionary for discussions and empty list for all added keys - dict_discussions = {} - list_keys = [] - - # if there are no discussion starts, return empty dictionary and list - if not list_discussionstarts: - return dict_discussions, list_keys - - # for every discussion start except last - for idx, key in enumerate(list_discussionstarts_integers[:-1]): - #print(idx, key) - # write discussion start to dictionary - add_to_dict(list_discussionstarts[idx], key) - - # for every textbox until next discussion start - for i in range(key + 1, list_discussionstarts_integers[idx + 1]): - # if it is not a votation paragraph, write it to dictionary, - if i not in list_votations_integers: - add_to_dict(list_discussionstarts[idx], i) - # else, stop execution of for loop - else: - break - - # for last discussion start - last_key = list_discussionstarts_integers[-1] - # write discussion start to dictionary - add_to_dict(list_discussionstarts[-1], last_key) - # for every textbox until the end of the document - for i in range(last_key + 1, max(list_text_keys_integers) + 1): - # if it is not a votation paragraph, write it to dictionary - if i not in list_votations_strings: - add_to_dict(list_discussionstarts[-1], i) - # else, stop execution of for loop - else: - break +# - df: corresponding dataframe +def get_df_from_lists_names(lists_names): + list_types = ['simple', 'double', 'comp', 'canton'] + df = pd.DataFrame() + for i in range(4): + df_temp = pd.DataFrame(lists_names[i], + columns = ('name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName')) + df_temp['type'] = list_types[i] + df = pd.concat([df, df_temp], ignore_index = True) + return df + - return dict_discussions, list_keys -# function to check whether a file containts discussions -# achieved by excluding title pages, table of content, etc. -# !!! function works well for 1891 - 1900, not checked after that !!! 
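# --- illustrative sketch: input shape for get_df_from_lists_names above ---
# lists_names is expected to hold four lists (simple, double, comp, canton names),
# each containing 7-tuples in the column order (name_short, name_correct,
# uniqueIndex, CantonName, CantonAbbreviation, Citizenship, FirstName).  The entry
# below is a made-up politician, purely to show the shape:
lists_names_demo = [
    [('Muster', 'Muster', 4711, 'Zürich', 'ZH', 'Zürich', 'Hans')],  # simple
    [],                                                              # double
    [],                                                              # comp
    [],                                                              # canton
]
df_demo = get_df_from_lists_names(lists_names_demo)
# df_demo[['name_short', 'uniqueIndex', 'type']]
#   -> one row: ('Muster', 4711, 'simple')
# ---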
-def check_if_discussion(path_meta_xml_file, - list_attributes = ['TITEL_NORMAL_DE', 'TITEL_NORMAL_FR', 'TITEL_ORIGINAL_DE', 'TITEL_ORIGINAL_FR'], - list_nondiscussion = ['inhaltsverzeichnis', 'inhaltsverzeichniss', 'jahresinhalt', 'rednerliste', - 'umschlag', 'sachregister', 'titelblatt', 'numerierung'], - list_nondiscussion2 = ['table', 'matières', 'répertoire', 'procès-verbaux']): - # parse, get root and then part of interest - XML_tree = ET.parse(path_meta_xml_file) - XML_root = XML_tree.getroot() - XML_poi = XML_root[0].find('ADS_TEXTEINHEIT') - # for each title attribute - for attribute in list_attributes: - # if xml contains this attribute - if attribute in XML_poi.attrib: - # get title and generate set with lower case terms - title = XML_poi.attrib[attribute] - set_title = set([term.lower() for term in title.split()]) - #print(set_title) - # if one of terms is in list_nondiscussion, return False - if set_title.intersection(set(list_nondiscussion)): - #print('NOOO', path_meta_xml_file) - return False - # if two terms are in list_nondiscussion2, also return False - if len(set_title.intersection(set(list_nondiscussion2))) > 1: - #print('NOOO', path_meta_xml_file) - return False - return True @@ -573,52 +819,6 @@ def dict_only_text(dictionary): -# two functions for language identification -# Author: Luis Salamanca -# small modifications by Lili Gasser -# Using stopwords -def identify_lang(dict_text, valid_lang = ('german', 'french', 'italian')): - - language_ratios_textbox = {} - - for i_k in dict_text.keys(): - tokens = dict_text[i_k][1] - test_words = [word.lower() for word in tokens] # lowercase all tokens - test_words_set = set(test_words) - language_ratios = {} - for language in stopwords.fileids(): - if language in valid_lang: - stopwords_set = set(stopwords.words(language)) # For some languages eg. Russian, it would be a wise idea to tokenize the stop words by punctuation too. - common_elements = test_words_set.intersection(stopwords_set) - language_ratios[language] = len(common_elements) # language "score" - language_ratios_textbox[i_k] = language_ratios - - return language_ratios_textbox - - -# Simply, given the number of ocurrences of the stopwords, it assigns a label -# to a specific textbox, also considering the possibility of textboxes -# mixing languages. 
For this case, the value ratio_similar is intended - -def label_language(aux_dict_l): - ratio_similar = 0.8 - if sum(aux_dict_l.values()): - aux_dict_l_norm = {k: v / total for total in (sum(aux_dict_l.values()),) for k, v in aux_dict_l.items()} - lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key])) - lang_max = '' - count_l = 0 - for lang in aux_dict_l_norm.keys(): - if (aux_dict_l_norm[lang] > aux_dict_l_norm[lang_max_aux] * ratio_similar): - if count_l > 0: - lang_max += '_' - lang_max += lang - count_l += 1 - if count_l > 1: - lang_max = 'mixed_' + lang_max - else: - lang_max = 'NotIdentified' - return lang_max - # function to get list of places def get_list_cantons(df_names): diff --git a/src/python/utils_proc.py b/src/python/utils_proc.py index 62e74547d3d1802fd357f84d812b9e9c28e1c630..55491cf7f6a10a7b58fb57ecfda18747796dac5d 100644 --- a/src/python/utils_proc.py +++ b/src/python/utils_proc.py @@ -205,4 +205,4 @@ def correct_metadata(year, id_doc, flag_end): tree = ET.ElementTree(XML_root_meta) tree.write(full_path + '_metacorr.xml', encoding = 'utf-8') - return full_path + '_metacorr.xml' \ No newline at end of file + return full_path + '_metacorr.xml' diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh new file mode 100755 index 0000000000000000000000000000000000000000..dbec0daf596094cf95f50dad868b250320eab8b9 --- /dev/null +++ b/src/sh/extract_discussions_yearly.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +year_start=1891 +year_end=1893 + +for year in $(seq $year_start $year_end) +do + echo $year + python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/AB/${year}/05_annotatedxml.tar.gz +done
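# --- illustrative sketch: the yearly loop above, driven from Python instead of bash ---
# A rough equivalent of src/sh/extract_discussions_yearly.sh using subprocess, with
# the same positional arguments; assumes it is run from the repository root.
import subprocess

for year in range(1891, 1894):
    print(year)
    subprocess.run(
        ['python', 'src/python/run_extract_discussions.py',
         f'data/politicians/lastnames/{year}_lastnames.pickle',
         f'data/AB/{year}/04_correctedxml.tar.gz',
         f'data/AB/{year}/03_correctedmeta.tar.gz',
         f'data/AB/{year}/05_annotatedxml.tar.gz'],
        check=True)
# ---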