diff --git a/src/python/def_classes.py b/src/python/def_classes.py index c888295e92ef91eeda4ba155b41b9ec446c490bc..8808fb4602673e53ee2b98b6f238c937df9630ac 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -1,3 +1,695 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b2a2b225fb4605aad5cfe69b441a9e08c1991e95008246ca4b1c25207653fe5 -size 36362 +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Sep 28 13:31:06 2018 + +@author: luissalamanca +""" + +import sys, os + +from colour import Color +import matplotlib.image as mpimg +from mpl_toolkits.mplot3d import Axes3D +import matplotlib.pyplot as plt +import numpy as np +import xml.etree.ElementTree as ET +import copy +import time +import tarfile +import pickle + +from pdf2image import convert_from_path, convert_from_bytes + +import utils_proc +import utils_annot +import plot_tools +import preproc_docs + + + + + +# Definition of classes and methods associated + +class Document: + + limit_year = 1950 + flag_end_run = 1 + name_inpdf = '00_rawpdfs' + name_inmeta = '01_rawmeta' + + def __init__(self, input_file, folder_database): + self.year = int(input_file.split('/')[-2]) + self.id_doc = input_file.split('/')[-1].split('.')[0] + self.input_file = input_file + _, self.name_file = os.path.split(input_file) + self.path_file = folder_database + str(self.year) + '/' + self.name_wo_ext = os.path.splitext(self.name_file)[0] + self.folder_database = folder_database + self._meta_ext() + + def _meta_ext(self): + # Both for the correction and the extraction of the metadata information + name_file = str(self.year) + '/' + self.id_doc + '.xml' + name_file_db = str(self.year) + '/' + self.id_doc + '.db' + name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz' + self.name_meta = [name_tar, name_file, name_file_db] + + def meta_correct(self, name_outmeta = '03_correctedmeta'): + utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = 
def _get_pages(self, pages = 'all'):
    """Store in ``self.n_pages`` the indices of the pages to process.

    The page images are created first, through ``self.pdf2imgobj()``, when
    ``self.imgobj`` does not exist yet.

    Args:
        pages: ``'all'`` to select every page of ``self.imgobj``, a string of
            comma-separated indices (e.g. ``'0,2,5'``), or a sequence, array
            or single number with the indices.
    """
    if not hasattr(self, 'imgobj'):
        self.pdf2imgobj()

    # Check the type before comparing with 'all': comparing a numpy array
    # against a string is elementwise and cannot be used as a condition.
    if isinstance(pages, str):
        if pages == 'all':
            self.n_pages = np.arange(len(self.imgobj))
        else:
            self.n_pages = np.array(pages.split(',')).astype(np.uint32)
    else:
        # atleast_1d keeps a single page number iterable, as in the other cases
        self.n_pages = np.atleast_1d(np.array(pages))
def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None,
                 ind_page = 0, textb_textl = 1):
    """Highlight the textboxes or the textlines of one page over its image.

    The page refers here to the page of the imgobj, which might not correspond
    to the one of the xml. For that reason ``self.n_pages`` is used to obtain
    the index for the xml when only ``XML_main`` is given.

    Args:
        imarray: Image of the page. When empty, ``self.imgobj[ind_page]`` is
            used instead.
        XML_root: Element whose first child is the page to draw.
        XML_main: Element holding all the pages; only used when ``XML_root``
            is not given.
        ind_page: Index of the page in ``self.imgobj``.
        textb_textl: 1 for textboxes, and 2 for textlines.

    Returns:
        ``None``, after printing a message, when no XML, no image or no
        matching page is available (and for any other ``textb_textl``).
        For ``textb_textl == 1``: ``(image, coord_textboxes)``, where
        ``coord_textboxes`` is a 4xN array with one bbox per column.
        For ``textb_textl == 2``: ``(image, coord_textline, all_font_sizes,
        info_font_sizes)``, where ``all_font_sizes`` holds the distinct font
        sizes and ``info_font_sizes`` is a 2xK array with those sizes in the
        first row and their counts in the second.
    """
    if XML_root is None and XML_main is None:
        print('Not possible! - You need to provide a valid XML\n')
        return None
    if np.sum(imarray.shape) == 0:
        # The image can only be taken from imgobj when imgobj exists
        if not hasattr(self, 'imgobj'):
            print('Not possible! - You need to convert first the pdf to image\n')
            return None
        imarray = np.array(self.imgobj[ind_page])

    if XML_root is None:
        ind_abs = np.flatnonzero(self.n_pages == ind_page)
        if ind_abs.size == 0:
            print('Not possible! - You need to provide a valid XML\n')
            return None
        XML_root = ET.Element('pages')
        # Elements must be indexed with a plain integer
        XML_root.append(XML_main[int(ind_abs[0])])

    page = XML_root[0]
    bbox_page = np.array(page.attrib['bbox'].split(',')).astype(np.float64)

    if textb_textl == 1:
        imarray_textb = np.copy(imarray)
        columns = [np.array([]).reshape((4,0))]
        for element in page:
            if element.tag == 'textbox':
                coord_textbox_aux = np.array(element.attrib['bbox'].split(',')).astype(np.float64)
                columns.append(coord_textbox_aux.reshape((4,1)))
                imarray_textb = plot_tools.highlight_text(imarray_textb, coord_textbox_aux,
                                                          bbox_page, color_vec = 'blue', alpha = True,
                                                          filled = False, thick_line = 6)
        return imarray_textb, np.concatenate(columns, axis = 1)

    if textb_textl == 2:
        imarray_textl = np.copy(imarray)
        columns = [np.array([]).reshape((4,0))]
        font_sizes = []
        for element in page:
            for line in element:
                if line.tag != 'textline':
                    continue
                coord_textline_aux = np.array(line.attrib['bbox'].split(',')).astype(np.float64)
                if len(line):
                    # The size of the first child is taken as the size of the line
                    font_sizes.append(np.float64(line[0].attrib['size']))
                columns.append(coord_textline_aux.reshape((4,1)))
                imarray_textl = plot_tools.highlight_text(imarray_textl, coord_textline_aux, bbox_page,
                                                          color_vec = 'red', alpha = True,
                                                          filled = False, thick_line = 6)

        coord_textline = np.concatenate(columns, axis = 1)
        all_font_sizes, counts_all_font_sizes = np.unique(np.array(font_sizes, dtype = np.float64),
                                                          return_counts = True)
        n_sizes = all_font_sizes.shape[0]
        info_font_sizes = np.concatenate((all_font_sizes.reshape((1, n_sizes)),
                                          counts_all_font_sizes.reshape((1, n_sizes)).astype(np.float64)))

        # Return the image with the textlines drawn, not the untouched copy
        return imarray_textl, coord_textline, all_font_sizes, info_font_sizes

    return None
flag_2col = 1 + + XML_new = ET.Element('pages') + + for ind_abs, ind_page in enumerate(self.n_pages): + + XML_root = ET.Element('pages') + #print(ind_abs,len(self.XML_main)) + XML_root.append(self.XML_main[ind_abs]) + imarray = np.array(self.imgobj[ind_page]) + + if XML_root[0][0].tag == 'textbox': + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) + dim_img = imarray.shape[:2] + _, rescale_factor = plot_tools.adapt_coordtoimg(imarray, bbox_page, bbox_page) + + # Image with textboxes highlighted + imarray_textblock, coord_textboxes = self._draw_textbl(imarray = imarray, XML_root = XML_root) + + # Image with textlines highlighted, BUT also, array with all textlines + # coordinates, and the fontsizes, required for later + _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root, + textb_textl = 2) + + ##### + # Central vertical line and horizontal lines, through Hough transform + coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, + flag_2col, flag_central) + + ##### + # Obtain lateral margins + margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + coord_horz.astype(np.uint32)) + + # Top and bottom line + ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + coord_horz.astype(np.uint32)) + #print(info_font_sizes) + ##### + # Label the textboxes based on a set of simple rules that make use of + # the margins and the fontsizes + label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \ + preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) # info_font_sizes_est + + ##### + # Order the textlines, taken all them together, in order to later merge + # in a single textbox textlines that so far form different textboxes + set_of_blocks, 
centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, + list_allcoords_textlines, margins) + + # Given the ordered textlines, group them in new textboxes, creating a + # XML, This uses some criteria of distance between paragraphs + XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, + rescale_factor, centrall_ord, ind_page, dim_img) + + # Append to the new XML + XML_new.append(XML_enrich[0]) + + + if flag_plots: + im_met2 = plot_tools.plot_horzvertlines(imarray_textblock, coord_horz, coord_vert_def) + im_met2 = plot_tools.plot_margins(im_met2, margins, ind_limits, gap_line = 1) + im_met3, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) + im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) + im_met5 = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page) + + # Create figure with 4 subplots, for showing all results + if flag_save_figs: + path_output_img = self.path_file + '/previews' + if flag_save_figs: + if not os.path.exists(path_output_img): + os.makedirs(path_output_img) + + if flag_parallel: + if flag_save_figs: + name_pickle = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.pkl' + with open(name_pickle, 'wb') as f: # Python 3: open(..., 'wb') + pickle.dump([im_met2, im_met3, im_met4, im_met5], f) + + else: + fig, axes = plt.subplots(1, 4, figsize=(30, 10)) + ax = axes.ravel() + ax[0].axis('off') + ax[0].imshow(im_met2) + ax[1].axis('off') + ax[1].imshow(im_met3) + ax[2].axis('off') + ax[2].imshow(im_met4) + ax[3].axis('off') + ax[3].imshow(im_met5) + + if flag_save_figs: + format_fig = 'png' + name_fig = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.' 
+ format_fig + fig.savefig(name_fig, format = format_fig, dpi = 200) + plt.close(fig) + + name_xml_prev = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corrprev.xml' + + tree = ET.ElementTree(XML_new) + self.XML_main_corr = XML_new + if not os.path.exists('./' + str(self.year)): + os.makedirs('./' + str(self.year)) + tree.write(name_xml_prev, encoding = 'utf-8') + XML_new = preproc_docs.get_text_onefile(self.XML_main_corr) + name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml' + tree = ET.ElementTree(XML_new) + tree.write(name_xml, encoding = 'utf-8') + + if flag_save: + name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outcorrxml) + else: + print('Not saving to tar') + name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' + + self.name_outcorrxml = name_outcorrxml + self.name_xml_corr = [name_tar, name_xml] + command = 'rm -rf ./' + str(self.year) + #print(command) + utils_proc.call_with_out(command) + + print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time))) + + #XML_tree = ET.parse(name_xml) + #self.XML_main = XML_tree.getroot() + + def _plot_generic_open(self, ind_page, suffix_xml, level_proc = 0, + name_outxml = '02_extractedxml'): + # ind_page has to be a scalar + + if 'imgobj' not in self.__dict__.keys(): + self.pdf2imgobj() + if 'XML_main' not in self.__dict__.keys(): + name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz' + if os.path.isfile(name_tar): + name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + '.xml' + if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outxml)[0]: + h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outxml) + XML_tree = ET.parse(h_xml) + self.XML_main = XML_tree.getroot() + else: + self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) + ind_abs = 
np.array([ind_page]).astype(int).reshape((-1,)) + else: + #print('Run this') + self._get_pages() + ind_abs = np.argwhere(self.n_pages == ind_page).reshape((-1,)) + + #print(ind_abs, type(ind_abs)) + #print(self.XML_main, len(self.imgobj)) + + if ind_page > (len(self.XML_main) - 1): + flag_error = 1 + return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error + + flag_central = 1 + if self.year > self.limit_year: + flag_central = 0 + flag_2col = 1 + + XML_root = ET.Element('pages') + XML_root.append(self.XML_main[ind_abs[0]]) + imarray = np.array(self.imgobj[ind_page]) + + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) + dim_img = imarray.shape[:2] + + _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root, + textb_textl = 2) + margins = [] + ind_limits = [] + label_textlines = [] + list_allcoords_textlines = [] + set_of_blocks = [] + XML_enrich = [] + + if level_proc > 0: + coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, + flag_2col, flag_central) + + if level_proc > 1: + _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page) + + if level_proc > 2: + ##### + # Obtain lateral margins + margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + coord_horz.astype(np.uint32)) + + if level_proc > 3: + # Top and bottom line + ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), + coord_horz.astype(np.uint32)) + + if level_proc > 4: + label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \ + preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) + + if level_proc > 5: + set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, + list_allcoords_textlines, margins) + + if level_proc 
> 6: + XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, + rescale_factor, centrall_ord, ind_page, dim_img) + + # The last value returned is only to say that there was not any error during the execution. Before, if there are too many pages, we + # send a 1 instead + flag_error = 0 + return imarray, margins, ind_limits, label_textlines, list_allcoords_textlines, \ + set_of_blocks, XML_enrich, bbox_page, XML_root, ind_abs, flag_error + + def _plot_obtainfromxml(self, ind_page, suffix_xml, name_outcorrxml = '04_correctedxml'): + + if 'imgobj' not in self.__dict__.keys(): + self.pdf2imgobj() + if 'XML_main_corr' not in self.__dict__.keys(): + name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' + if os.path.isfile(name_tar): + name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' + #print(name_xml) + if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: + #print('Run this') + h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) + XML_tree = ET.parse(h_xml) + self.XML_main_corr = XML_tree.getroot() + else: + print('You need to have the tar file to use flag_compute = 0!') + flag_error = 1 + return 0, 0, 0, 0, 0, 0, flag_error + #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) + else: + print('You need to have the tar file to use flag_compute = 0!') + flag_error = 1 + return 0, 0, 0, 0, 0, 0, flag_error + #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) + ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) + else: + #print('Run this') + self._get_pages() + ind_abs = np.argwhere(self.n_pages == ind_page).reshape((-1,)) + + #print(ind_abs, type(ind_abs)) + #print(self.XML_main, len(self.imgobj)) + + if ind_page > (len(self.XML_main_corr) - 1): + flag_error = 1 + return 0, 0, 0, 0, 0, 0, flag_error + + XML_root = 
ET.Element('pages') + XML_root.append(self.XML_main_corr[ind_abs[0]]) + imarray = np.array(self.imgobj[ind_page]) + + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) + dim_img = imarray.shape[:2] + + ###### + # For obtaining label_textlines, list_allcoords_textlines + coord_textline = np.array([]).reshape((4,0)) + label_textlines = dict() + count = 0 + count_l = 0 + vec_textline_lines = list() + for ind_el in range(0, len(XML_root[0])): + for ind_line in range(0, len(XML_root[0][ind_el])): + if XML_root[0][ind_el][ind_line].tag == 'textline': + if 'type' in XML_root[0][ind_el][ind_line].attrib: + coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) + coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1) + + type_textl = XML_root[0][ind_el][ind_line].attrib['type'] + #print(ind_el) + if XML_root[0][ind_el].attrib['type_textbox'] == 'line': + vec_textline_lines.append(-1) + else: + vec_textline_lines.append(count_l) + count_l += 1 + #print(type_textl) + if type_textl in label_textlines.keys(): + aux_type = label_textlines[type_textl] + aux_type = np.concatenate((aux_type, np.array([count]))).reshape((-1,)) + label_textlines[type_textl] = aux_type + else: + aux_type = np.array([count]) + label_textlines[type_textl] = aux_type + count += 1 + + coord_textline, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, coord_textline, bbox_page) + + ##### + # To obtain set_of_blocks. 
This variable simply contains the coordinates, and + # then a final row indicating the order (here are already ordered), and if it + # is a line, which is indicated with a -1 + set_of_blocks_aux = np.concatenate((coord_textline, np.array(vec_textline_lines).reshape((1,-1))), axis = 0) + set_of_blocks = dict() + set_of_blocks[0] = set_of_blocks_aux + #print(set_of_blocks.shape) + + # The last is the flag_error + #print(imarray.shape, len(label_textlines), coord_textline.shape, len(set_of_blocks), + # len(XML_root), bbox_page.shape) + flag_error = 0 + return imarray, label_textlines, coord_textline, set_of_blocks, XML_root, bbox_page, flag_error +# imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error +# imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error + + + def plot_orig_textb(self, range_pages = range(1), suffix_xml = '_data', + flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): + + if 'name_outxml' not in self.__dict__.keys(): + self.name_outxml = name_outxml + + for ind_page in range_pages: + imarray, margins, ind_limits, _, _, \ + _, _, _, XML_root, _, flag_error = self._plot_generic_open(ind_page, suffix_xml, level_proc = 0, + name_outxml = self.name_outxml) + + if flag_error: + print(str(ind_page) + ': non existing page!') + else: + imarray_textblock, _ = self._draw_textbl(imarray = imarray, XML_root = XML_root) + + self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, + flag_plot, flag_save_figs) + + def plot_margins_doc(self, range_pages = range(1), suffix_xml = '_data', + flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): + + if 'name_outxml' not in self.__dict__.keys(): + self.name_outxml = name_outxml + + for ind_page in range_pages: + imarray, margins, ind_limits, _, _, \ + _, _, _, _, _, flag_error= self._plot_generic_open(ind_page, suffix_xml, level_proc = 4, + name_outxml = self.name_outxml) + + if flag_error: + print(str(ind_page) 
def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data',
                      flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml',
                      name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_legend = 1):
    """Plot, for each requested page, the textlines coloured by their label.

    The labels are read from the corrected-XML archive when it exists and
    ``flag_compute`` is 0; otherwise they are recomputed through
    ``self._plot_generic_open`` with ``level_proc = 5``.

    Args:
        range_pages: Iterable with the indices of the pages to plot.
        suffix_xml: Suffix of the xml file names, passed to the loaders.
        flag_plot: Forwarded to ``self._plot_save_labels``.
        flag_save_figs: Forwarded to ``self._plot_save_labels``.
        name_outxml: Name stored in ``self.name_outxml`` if not yet set.
        name_outcorrxml: Name stored in ``self.name_outcorrxml`` if not yet set.
        flag_compute: If true, always recompute instead of reading the archive.
        flag_legend: Forwarded to ``self._plot_save_labels``.
    """
    if not hasattr(self, 'name_outxml'):
        self.name_outxml = name_outxml
    if not hasattr(self, 'name_outcorrxml'):
        self.name_outcorrxml = name_outcorrxml

    name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
    for ind_page in range_pages:
        if flag_compute or not os.path.isfile(name_tar):
            imarray, _, _, label_textlines, list_allcoords_textlines, _, _, _, _, _, flag_error = \
                self._plot_generic_open(ind_page, suffix_xml, level_proc = 5,
                                        name_outxml = self.name_outxml)
        else:
            # Read from the same archive whose existence was just checked
            imarray, label_textlines, list_allcoords_textlines, _, _, _, flag_error = \
                self._plot_obtainfromxml(ind_page, suffix_xml,
                                         name_outcorrxml = self.name_outcorrxml)

        if flag_error:
            print(str(ind_page) + ': non existing page!')
        else:
            im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines)
            self._plot_save_labels(im_met, 'Textlines labelled', 'TextlLabel', ind_page, groups, colors, self.path_file,
                                   flag_plot, flag_save_figs, flag_legend)
name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' + for ind_page in range_pages: + if flag_compute or not os.path.isfile(name_tar): + imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error = \ + self._plot_generic_open(ind_page, suffix_xml, level_proc = 6, + name_outxml = self.name_outxml) + else: + imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error \ + = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) + + #print(set_of_blocks) + if flag_error: + print(str(ind_page) + ': non existing page!') + else: + im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) + + self._plot_save(im_met, 'Textlines ordered', 'TextlOrder', ind_page, self.path_file, + flag_plot, flag_save_figs) + + def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data', + flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', + name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_lines_textl = 1): + # flag_lines_textl, if 1, plots lines and textboxes, if 2, only lines, if 3, only textboxes + if 'name_outxml' not in self.__dict__.keys(): + self.name_outxml = name_outxml + if 'name_outcorrxml' not in self.__dict__.keys(): + self.name_outcorrxml = name_outcorrxml + + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' + for ind_page in range_pages: + if flag_compute or not os.path.isfile(name_tar): + imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error = \ + self._plot_generic_open(ind_page, suffix_xml, level_proc = 7, + name_outxml = self.name_outxml) + else: + imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_enrich, bbox_page, flag_error \ + = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) + + if flag_error: + print(str(ind_page) + ': non existing page!') + else: + 
def _plot_save(self, im_met, str_title, str_name, ind_page, folder_save = '',
               flag_plot = 1, flag_save_figs = 0, dpi = 200):
    """Show an image in a figure and, optionally, save it as a png file.

    Args:
        im_met: Image to show.
        str_title: Title of the figure.
        str_name: Prefix of the name of the saved file.
        ind_page: Page index, included in the name of the saved file.
        folder_save: Folder where the figure is saved.
        flag_plot: If true, the figure is created for display.
        flag_save_figs: If true, the figure is written to
            ``<folder_save>/<str_name>_<id_doc>_page<ind_page>.png`` and then
            closed.
        dpi: Resolution used when saving.
    """
    if not (flag_plot or flag_save_figs):
        return

    # The figure is needed for saving as well, so it is built whenever
    # either plotting or saving is requested
    fig, axes = plt.subplots(1, 1, figsize=(8, 10))
    axes.axis('off')
    axes.imshow(im_met)
    plt.title(str_title)

    if flag_save_figs:
        format_fig = 'png'
        name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc)
                    + '_page' + str(ind_page) + '.' + format_fig)
        fig.savefig(name_fig, format = format_fig, dpi = dpi)
        plt.close(fig)
def call_with_out(full_comm):
    """Run a command through the shell and return what it wrote to stdout.

    Args:
        full_comm: Full command line, as a single string.

    Returns:
        The captured standard output, as ``bytes``. The exit status is not
        checked and stderr is not captured.
    """
    # NOTE(review): shell=True is kept because callers pass complete command
    # strings built by concatenation; never feed untrusted text into them.
    completed = subprocess.run(full_comm, stdout=subprocess.PIPE, shell=True)
    return completed.stdout
def get_list(year, folder_database, name_file):
    """List the regular files stored in one of the tar archives of a year.

    The archive read is ``<folder_database>/<year>/<name_file>.tar.gz``.

    Args:
        year: Year, used as the name of the subfolder.
        folder_database: Root folder of the database.
        name_file: Name of the archive, without the ``.tar.gz`` extension.

    Returns:
        A tuple ``(files, list_ids)``. ``files`` has the names of the members
        that are regular files, in archive order. ``list_ids`` has the sorted,
        unique base names of those files, cut at their first dot.
    """
    name_tar = folder_database + '/' + str(year) + '/' + name_file + '.tar.gz'
    with tarfile.open(name_tar) as tar:
        # Take the member name directly instead of parsing repr(TarInfo),
        # which fails for names with spaces or escaped characters
        files = [member.name for member in tar.getmembers() if member.isfile()]

    list_ids = sorted({name.split('/')[-1].split('.')[0] for name in files})
    return files, list_ids
def correct_metadata(year, id_doc, flag_end):
    """Add the metadata written in the ``.db`` file to the xml metadata.

    All files are addressed relative to the current directory. The file
    ``<year>/<id_doc>.db`` is copied to ``<year>/<id_doc>_db.pdf`` and
    converted to xml with ``pdf2xml``. For every keyword, the first textline
    whose text is exactly the keyword is located, and the text of the nearest
    indexed textline whose second bbox coordinate differs by less than
    ``max_sep`` is stored as an attribute of a new ``META_FROM_DB`` element
    (an empty string when the keyword or its value is not found). That
    element is appended to the first child of the root of
    ``<year>/<id_doc>.xml``.

    Args:
        year: Year of the document, used as folder name.
        id_doc: Identifier of the document (file name without extension).
        flag_end: Forwarded to ``pdf2xml`` as ``flag_end``.

    Returns:
        The path of the written file, ``<year>/<id_doc>_metacorr.xml``.
    """
    keywords = ('In','Jahr','Band','Session','Rat','Sitzung','Geschäftsnummer',
                'Datum','Seite','Ref. No')
    max_sep = 6 # Just a parameter to capture the textlines from the db file

    full_path = str(year) + '/' + str(id_doc)
    command = 'cp ' + full_path + '.db ' + full_path + '_db.pdf'
    call_with_out(command)
    name_xml = pdf2xml(full_path + '_db.pdf', page_n = np.array([1]), suffix_str = '', flag_end = flag_end)

    page = ET.parse(name_xml).getroot()[0]

    # Second bbox coordinate and full text of every textline of the page
    coord_second = list()
    all_text_list = list()
    for element in page:
        for line in element:
            if line.tag == 'textline':
                bbox = np.array(line.attrib['bbox'].split(',')).astype(np.float64)
                coord_second.append(bbox[1])
                # Children without text contribute nothing to the line
                all_text_list.append(''.join(child.text or '' for child in line))
    coord_second = np.array(coord_second, dtype = np.float64)

    xml_extrameta = ET.Element('META_FROM_DB')
    for keyw in keywords:
        value = ''
        matches = [ind for ind, text in enumerate(all_text_list) if text == keyw + '\n']
        if matches:
            ind_textl = matches[0]
            distance = np.abs(coord_second - coord_second[ind_textl])
            # Textlines close to the keyword in that coordinate, excluding the keyword itself
            ind_close = np.setdiff1d(np.flatnonzero(distance < max_sep), ind_textl)
            if ind_close.size:
                # Drop the last character (the newline that ends the line)
                value = all_text_list[ind_close[0]][:-1]
        xml_extrameta.attrib[keyw.replace('. ','').upper()] = value

    path_xml_meta = full_path + '.xml'
    XML_root_meta = ET.parse(path_xml_meta).getroot()
    XML_root_meta[0].append(xml_extrameta)

    path_out = full_path + '_metacorr.xml'
    ET.ElementTree(XML_root_meta).write(path_out, encoding = 'utf-8')
    return path_out