From 92eb8d7a299a643c75ca254b885e3749069820e4 Mon Sep 17 00:00:00 2001 From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch> Date: Mon, 10 Dec 2018 11:35:45 +0000 Subject: [PATCH] renku run python src/python/run_correctxml.py data/AB//1940/02_extractedxml.tar.gz data/AB//1940/04_correctedxml.tar.gz --- .gitattributes | 2 + ...d0fe29c9646473b946bb4e307a1a4fc_python.cwl | 67 ++ data/AB/1940/04_correctedxml.tar.gz | 3 + notebooks/RunningClasses.ipynb | 23 +- src/python/def_classes.py | 694 +----------------- 5 files changed, 91 insertions(+), 698 deletions(-) create mode 100644 .renku/workflow/fd0fe29c9646473b946bb4e307a1a4fc_python.cwl create mode 100644 data/AB/1940/04_correctedxml.tar.gz diff --git a/.gitattributes b/.gitattributes index 2bcc997f..1def3672 100644 --- a/.gitattributes +++ b/.gitattributes @@ -112,3 +112,5 @@ data/AB/1936/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1937/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1938/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1939/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text +src/python/def_classes.py filter=lfs diff=lfs merge=lfs -text +data/AB/1940/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text diff --git a/.renku/workflow/fd0fe29c9646473b946bb4e307a1a4fc_python.cwl b/.renku/workflow/fd0fe29c9646473b946bb4e307a1a4fc_python.cwl new file mode 100644 index 00000000..f3dc1418 --- /dev/null +++ b/.renku/workflow/fd0fe29c9646473b946bb4e307a1a4fc_python.cwl @@ -0,0 +1,67 @@ +arguments: [] +baseCommand: +- python +class: CommandLineTool +cwlVersion: v1.0 +hints: [] +inputs: + input_1: + default: + class: File + path: ../../src/python/run_correctxml.py + inputBinding: + position: 1 + separate: true + shellQuote: true + streamable: false + type: File + input_2: + default: + class: File + path: ../../data/AB/1940/02_extractedxml.tar.gz + inputBinding: + position: 2 + separate: true + shellQuote: true + streamable: false + type: File + input_3: + default: data/AB/1940/04_correctedxml.tar.gz + inputBinding: + position: 3 + separate: true + shellQuote: true + streamable: false + type: string +outputs: + output_0: + outputBinding: + glob: notebooks/RunningClasses.ipynb + streamable: false + type: File + output_1: + outputBinding: + glob: src/python/def_classes.py + streamable: false + type: File + output_2: + outputBinding: + glob: $(inputs.input_3) + streamable: false + type: File +permanentFailCodes: [] +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entry: '$({"listing": [], "class": "Directory"})' + entryname: notebooks + writable: true + - entry: '$({"listing": [], "class": "Directory"})' + entryname: src/python + writable: true + - entry: '$({"listing": [], "class": "Directory"})' + entryname: data/AB/1940 + writable: true +successCodes: [] +temporaryFailCodes: [] diff --git a/data/AB/1940/04_correctedxml.tar.gz b/data/AB/1940/04_correctedxml.tar.gz new file mode 100644 index 00000000..af6fd2ce --- /dev/null +++ b/data/AB/1940/04_correctedxml.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7c6a255842500864f285f8f87670a512448927ff35e98a141d6cf34b905e9ce +size 6645275 diff --git a/notebooks/RunningClasses.ipynb b/notebooks/RunningClasses.ipynb index 211a9f93..48955862 100755 --- a/notebooks/RunningClasses.ipynb +++ b/notebooks/RunningClasses.ipynb @@ -9,9 +9,18 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 101, "metadata": {}, - "outputs": [], 
+ "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", @@ -28,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -337,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -348,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 104, "metadata": {}, "outputs": [ { @@ -369,7 +378,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 105, "metadata": {}, "outputs": [ { @@ -378,7 +387,7 @@ "True" ] }, - "execution_count": 99, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } diff --git a/src/python/def_classes.py b/src/python/def_classes.py index 9a90013b..b8954add 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -1,691 +1,3 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Fri Sep 28 13:31:06 2018 - -@author: luissalamanca -""" - -import sys, os - -from colour import Color -import matplotlib.image as mpimg -from mpl_toolkits.mplot3d import Axes3D -import matplotlib.pyplot as plt -import numpy as np -import xml.etree.ElementTree as ET -import copy -import time -import tarfile -import pickle - -from pdf2image import convert_from_path, convert_from_bytes - -import utils_proc -import utils_annot -import plot_tools -import preproc_docs - - - - - -# Definition of classes and methods associated - -class Document: - - limit_year = 1950 - flag_end_run = 1 - name_inpdf = '00_rawpdfs' - name_inmeta = '01_rawmeta' - - def __init__(self, input_file, folder_database): - self.year = int(input_file.split('/')[-2]) - self.id_doc = input_file.split('/')[-1].split('.')[0] - self.input_file = input_file - _, self.name_file = os.path.split(input_file) - self.path_file = folder_database + str(self.year) + '/' - self.name_wo_ext = os.path.splitext(self.name_file)[0] - self.folder_database = folder_database - self._meta_ext() - - def _meta_ext(self): - # Both for the correction and the extraction of the metadata information - name_file = str(self.year) + '/' + self.id_doc + '.xml' - name_file_db = str(self.year) + '/' + self.id_doc + '.db' - name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz' - self.name_meta = [name_tar, name_file, name_file_db] - - def meta_correct(self, name_outmeta = '03_correctedmeta'): - utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) - utils_proc.tar_extractfile(self.name_meta[2], self.folder_database, name_file = self.name_inmeta) - name_meta_corr = utils_proc.correct_metadata(self.year, self.id_doc, self.flag_end_run) - name_tar = utils_proc.addto_tar(name_meta_corr, self.folder_database, name_file = name_outmeta) - self.name_outmeta = name_outmeta - command = 'rm -rf ./' + str(self.year) - #print(command) - utils_proc.call_with_out(command) - - def pdf2imgobj(self, resolution = 100): - - self.resolution = resolution - utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf) - self.imgobj = convert_from_path(self.input_file, dpi = resolution) - command = 'rm -rf ./' + str(self.year) - utils_proc.call_with_out(command) - - def _get_pages(self, pages = 'all'): - if 'imgobj' not in 
self.__dict__.keys(): - self.pdf2imgobj() - if pages == 'all': - self.n_pages = np.arange(len(self.imgobj)) - elif isinstance(pages,str): - self.n_pages = np.array(pages.split(',')).astype(np.uint32) - else: - self.n_pages = np.array(pages) - - def pdf2xml(self, pages = 'all', suffix_xml = '_data', flag_save = 1, - name_outxml = '02_extractedxml'): - # To extract the embedded text of the pdf into an xml file - if 'imgobj' not in self.__dict__.keys(): - self.pdf2imgobj() - self._get_pages(pages = pages) - - utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf) - name_xml = utils_proc.pdf2xml(self.input_file, page_n = self.n_pages + 1, suffix_str = suffix_xml, - flag_end = self.flag_end_run) - if flag_save: - name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outxml) - else: - print('Not saving to tar') - name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz' - - command = 'rm -rf ./' + str(self.year) - #print(command) - utils_proc.call_with_out(command) - self.name_xml = [name_tar, name_xml] - if flag_save: - h_xml = utils_proc.get_handlerfile(self.name_xml[1], self.folder_database, name_file = name_outxml) - else: - h_xml = name_xml - self.name_outxml = name_outxml - XML_tree = ET.parse(h_xml) - self.XML_main = XML_tree.getroot() - self.n_pages = np.arange(len(self.XML_main)) - - - def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None, - ind_page = 0, textb_textl = 1): - # The page refers here to the page of the imgobj, which might not correspond - # to the one of the xml. For that reason we use n_pages to obtain the index - # for the xml - # textb_textl = 1 for textboxes, and 2 for textlines - if (XML_root == None) and (XML_main == None): - return print('Not possible! - You need to provide a valid XML\n') - if np.sum(imarray.shape) == 0: - if 'imgobj' not in self.__dict__.keys(): - imarray = np.array(self.imgobj[ind_page]) - else: - return print('Not possible! 
- You need to convert first the pdf to image\n') - - if XML_root == None: - XML_root = ET.Element('pages') - ind_abs = np.argwhere(self.n_pages == ind_page) - XML_root.append(XML_main[ind_abs]) - - bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - - imarray_textb = np.copy(imarray) - - if textb_textl == 1: - coord_textboxes = np.array([]).reshape((4,0)) - for ind_el in range(0, len(XML_root[0])): - if XML_root[0][ind_el].tag == 'textbox': - coord_textbox_aux = np.array(XML_root[0][ind_el].attrib['bbox'].split(',')).astype(np.float64) - coord_textboxes = np.concatenate((coord_textboxes, np.array(coord_textbox_aux).reshape((4,1))), axis = 1) - imarray_textb = plot_tools.highlight_text(imarray_textb, coord_textbox_aux, - bbox_page, color_vec = 'blue', alpha = True, - filled = False, thick_line = 6) - return imarray_textb, coord_textboxes - elif textb_textl == 2: - imarray_textl = np.copy(imarray) - coord_textline = np.array([]).reshape((4,0)) - all_font_sizes = np.array([]) - for ind_el in range(0, len(XML_root[0])): - for ind_line in range(0, len(XML_root[0][ind_el])): - if XML_root[0][ind_el][ind_line].tag == 'textline': - coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) - if len(XML_root[0][ind_el][ind_line]): - all_font_sizes = np.concatenate((all_font_sizes, - np.array([XML_root[0][ind_el][ind_line][0].attrib['size']]).astype(np.float64))) - coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1) - imarray_textl = plot_tools.highlight_text(imarray_textl, coord_textline_aux, bbox_page, - color_vec = 'red', alpha = True, filled = False, thick_line = 6) - - all_font_sizes, counts_all_font_sizes = np.unique(all_font_sizes, return_counts=True) - info_font_sizes = np.concatenate((all_font_sizes.reshape((1,all_font_sizes.shape[0])), - counts_all_font_sizes.reshape((1,all_font_sizes.shape[0])).astype(np.float64))) - - return imarray_textb, coord_textline, all_font_sizes, info_font_sizes - - def correct_xml(self, flag_plots = 1, flag_parallel = 0, flag_save_figs = 1, - pages = 'all', suffix_xml = '_data', name_outxml = '02_extractedxml', - name_outcorrxml = '04_correctedxml', flag_save = 1): - - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - - start_time = time.time() - if 'imgobj' not in self.__dict__.keys(): - self.pdf2imgobj() - - if 'XML_main' not in self.__dict__.keys(): - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outxml + '.tar.gz' - if os.path.isfile(name_tar): - name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + '.xml' - if name_xml in utils_proc.get_list(self.year, self.folder_database, self.name_outxml)[0]: - h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, self.name_outxml) - XML_tree = ET.parse(h_xml) - self.XML_main = XML_tree.getroot() - else: - # TODO if already exists 02_extractedxml - self.pdf2xml(pages = pages, suffix_xml = suffix_xml) - - self._get_pages(pages = pages) - flag_central = 1 - if self.year > self.limit_year: - flag_central = 0 - flag_2col = 1 - - XML_new = ET.Element('pages') - - for ind_abs, ind_page in enumerate(self.n_pages): - - XML_root = ET.Element('pages') - #print(ind_abs,len(self.XML_main)) - XML_root.append(self.XML_main[ind_abs]) - imarray = np.array(self.imgobj[ind_page]) - - if XML_root[0][0].tag == 'textbox': - bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - dim_img = 
imarray.shape[:2] - _, rescale_factor = plot_tools.adapt_coordtoimg(imarray, bbox_page, bbox_page) - - # Image with textboxes highlighted - imarray_textblock, coord_textboxes = self._draw_textbl(imarray = imarray, XML_root = XML_root) - - # Image with textlines highlighted, BUT also, array with all textlines - # coordinates, and the fontsizes, required for later - _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root, - textb_textl = 2) - - ##### - # Central vertical line and horizontal lines, through Hough transform - coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, - flag_2col, flag_central) - - ##### - # Obtain lateral margins - margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), - coord_horz.astype(np.uint32)) - - # Top and bottom line - ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), - coord_horz.astype(np.uint32)) - #print(info_font_sizes) - ##### - # Label the textboxes based on a set of simple rules that make use of - # the margins and the fontsizes - label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \ - preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) # info_font_sizes_est - - ##### - # Order the textlines, taken all them together, in order to later merge - # in a single textbox textlines that so far form different textboxes - set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, - list_allcoords_textlines, margins) - - # Given the ordered textlines, group them in new textboxes, creating a - # XML, This uses some criteria of distance between paragraphs - XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, - rescale_factor, centrall_ord, ind_page, dim_img) - - # Append to the new XML - XML_new.append(XML_enrich[0]) - - - if flag_plots: - im_met2 = plot_tools.plot_horzvertlines(imarray_textblock, coord_horz, coord_vert_def) - im_met2 = plot_tools.plot_margins(im_met2, margins, ind_limits, gap_line = 1) - im_met3, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) - im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) - im_met5 = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page) - - # Create figure with 4 subplots, for showing all results - if flag_save_figs: - path_output_img = self.path_file + '/previews' - if flag_save_figs: - if not os.path.exists(path_output_img): - os.makedirs(path_output_img) - - if flag_parallel: - if flag_save_figs: - name_pickle = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.pkl' - with open(name_pickle, 'wb') as f: # Python 3: open(..., 'wb') - pickle.dump([im_met2, im_met3, im_met4, im_met5], f) - - else: - fig, axes = plt.subplots(1, 4, figsize=(30, 10)) - ax = axes.ravel() - ax[0].axis('off') - ax[0].imshow(im_met2) - ax[1].axis('off') - ax[1].imshow(im_met3) - ax[2].axis('off') - ax[2].imshow(im_met4) - ax[3].axis('off') - ax[3].imshow(im_met5) - - if flag_save_figs: - format_fig = 'png' - name_fig = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.' 
+ format_fig - fig.savefig(name_fig, format = format_fig, dpi = 200) - plt.close(fig) - - name_xml_prev = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corrprev.xml' - - tree = ET.ElementTree(XML_new) - self.XML_main_corr = XML_new - if not os.path.exists('./' + str(self.year)): - os.makedirs('./' + str(self.year)) - tree.write(name_xml_prev, encoding = 'utf-8') - XML_new = preproc_docs.get_text_onefile(self.XML_main_corr) - name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml' - tree = ET.ElementTree(XML_new) - tree.write(name_xml, encoding = 'utf-8') - - if flag_save: - name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outcorrxml) - else: - print('Not saving to tar') - name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' - - self.name_outcorrxml = name_outcorrxml - self.name_xml_corr = [name_tar, name_xml] - command = 'rm -rf ./' + str(self.year) - #print(command) - utils_proc.call_with_out(command) - - print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time))) - - #XML_tree = ET.parse(name_xml) - #self.XML_main = XML_tree.getroot() - - def _plot_generic_open(self, ind_page, suffix_xml, level_proc = 0, - name_outxml = '02_extractedxml'): - # ind_page has to be a scalar - - if 'imgobj' not in self.__dict__.keys(): - self.pdf2imgobj() - if 'XML_main' not in self.__dict__.keys(): - name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz' - if os.path.isfile(name_tar): - name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + '.xml' - if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outxml)[0]: - h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outxml) - XML_tree = ET.parse(h_xml) - self.XML_main = XML_tree.getroot() - else: - self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) - ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) - else: - #print('Run this') - self._get_pages() - ind_abs = np.argwhere(self.n_pages == ind_page).reshape((-1,)) - - #print(ind_abs, type(ind_abs)) - #print(self.XML_main, len(self.imgobj)) - - if ind_page > (len(self.XML_main) - 1): - flag_error = 1 - return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error - - XML_root = ET.Element('pages') - XML_root.append(self.XML_main[ind_abs[0]]) - imarray = np.array(self.imgobj[ind_page]) - - bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - dim_img = imarray.shape[:2] - - _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root, - textb_textl = 2) - margins = [] - ind_limits = [] - label_textlines = [] - list_allcoords_textlines = [] - set_of_blocks = [] - XML_enrich = [] - - if level_proc > 0: - coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, - flag_2col = 1) - - if level_proc > 1: - _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page) - - if level_proc > 2: - ##### - # Obtain lateral margins - margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), - coord_horz.astype(np.uint32)) - - if level_proc > 3: - # Top and bottom line - ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), - coord_horz.astype(np.uint32)) - - if level_proc > 4: - label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, 
vec_labels_textline = \ - preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) - - if level_proc > 5: - set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, - list_allcoords_textlines, margins) - - if level_proc > 6: - XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, - rescale_factor, centrall_ord, ind_page, dim_img) - - # The last value returned is only to say that there was not any error during the execution. Before, if there are too many pages, we - # send a 1 instead - flag_error = 0 - return imarray, margins, ind_limits, label_textlines, list_allcoords_textlines, \ - set_of_blocks, XML_enrich, bbox_page, XML_root, ind_abs, flag_error - - def _plot_obtainfromxml(self, ind_page, suffix_xml, name_outcorrxml = '04_correctedxml'): - - if 'imgobj' not in self.__dict__.keys(): - self.pdf2imgobj() - if 'XML_main_corr' not in self.__dict__.keys(): - name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' - if os.path.isfile(name_tar): - name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' - #print(name_xml) - if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: - #print('Run this') - h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) - XML_tree = ET.parse(h_xml) - self.XML_main_corr = XML_tree.getroot() - else: - print('You need to have the tar file to use flag_compute = 0!') - flag_error = 1 - return 0, 0, 0, 0, 0, 0, flag_error - #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) - else: - print('You need to have the tar file to use flag_compute = 0!') - flag_error = 1 - return 0, 0, 0, 0, 0, 0, flag_error - #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) - ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) - else: - #print('Run this') - self._get_pages() - ind_abs = np.argwhere(self.n_pages == ind_page).reshape((-1,)) - - #print(ind_abs, type(ind_abs)) - #print(self.XML_main, len(self.imgobj)) - - if ind_page > (len(self.XML_main_corr) - 1): - flag_error = 1 - return 0, 0, 0, 0, 0, 0, flag_error - - XML_root = ET.Element('pages') - XML_root.append(self.XML_main_corr[ind_abs[0]]) - imarray = np.array(self.imgobj[ind_page]) - - bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - dim_img = imarray.shape[:2] - - ###### - # For obtaining label_textlines, list_allcoords_textlines - coord_textline = np.array([]).reshape((4,0)) - label_textlines = dict() - count = 0 - count_l = 0 - vec_textline_lines = list() - for ind_el in range(0, len(XML_root[0])): - for ind_line in range(0, len(XML_root[0][ind_el])): - if XML_root[0][ind_el][ind_line].tag == 'textline': - if 'type' in XML_root[0][ind_el][ind_line].attrib: - coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) - coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1) - - type_textl = XML_root[0][ind_el][ind_line].attrib['type'] - #print(ind_el) - if XML_root[0][ind_el].attrib['type_textbox'] == 'line': - vec_textline_lines.append(-1) - else: - vec_textline_lines.append(count_l) - count_l += 1 - #print(type_textl) - if type_textl in label_textlines.keys(): - aux_type = label_textlines[type_textl] - aux_type = np.concatenate((aux_type, 
np.array([count]))).reshape((-1,)) - label_textlines[type_textl] = aux_type - else: - aux_type = np.array([count]) - label_textlines[type_textl] = aux_type - count += 1 - - coord_textline, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, coord_textline, bbox_page) - - ##### - # To obtain set_of_blocks. This variable simply contains the coordinates, and - # then a final row indicating the order (here are already ordered), and if it - # is a line, which is indicated with a -1 - set_of_blocks_aux = np.concatenate((coord_textline, np.array(vec_textline_lines).reshape((1,-1))), axis = 0) - set_of_blocks = dict() - set_of_blocks[0] = set_of_blocks_aux - #print(set_of_blocks.shape) - - # The last is the flag_error - #print(imarray.shape, len(label_textlines), coord_textline.shape, len(set_of_blocks), - # len(XML_root), bbox_page.shape) - flag_error = 0 - return imarray, label_textlines, coord_textline, set_of_blocks, XML_root, bbox_page, flag_error -# imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error -# imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error - - - def plot_orig_textb(self, range_pages = range(1), suffix_xml = '_data', - flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): - - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - - for ind_page in range_pages: - imarray, margins, ind_limits, _, _, \ - _, _, _, XML_root, _, flag_error = self._plot_generic_open(ind_page, suffix_xml, level_proc = 0, - name_outxml = self.name_outxml) - - if flag_error: - print(str(ind_page) + ': non existing page!') - else: - imarray_textblock, _ = self._draw_textbl(imarray = imarray, XML_root = XML_root) - - self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, - flag_plot, flag_save_figs) - - def plot_margins_doc(self, range_pages = range(1), suffix_xml = '_data', - flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): - - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - - for ind_page in range_pages: - imarray, margins, ind_limits, _, _, \ - _, _, _, _, _, flag_error= self._plot_generic_open(ind_page, suffix_xml, level_proc = 4, - name_outxml = self.name_outxml) - - if flag_error: - print(str(ind_page) + ': non existing page!') - else: - im_met = plot_tools.plot_margins(imarray, margins, ind_limits, gap_line = 1) - - self._plot_save(im_met, 'Page margins', 'Margins', ind_page, self.path_file, - flag_plot, flag_save_figs) - - def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data', - flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', - name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_legend = 1): - - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' - for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): - imarray, _, _, label_textlines, list_allcoords_textlines, _, _, _, _, _, flag_error = \ - self._plot_generic_open(ind_page, suffix_xml, level_proc = 5, - name_outxml = self.name_outxml) - #print(label_textlines,list_allcoords_textlines) - else: - imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) - 
#print(len(array_elements)) - - if flag_error: - print(str(ind_page) + ': non existing page!') - else: - im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) - self._plot_save_labels(im_met, 'Textlines labelled', 'TextlLabel', ind_page, groups, colors, self.path_file, - flag_plot, flag_save_figs, flag_legend) - - - def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data', - flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', - name_outcorrxml = '04_correctedxml', flag_compute = 0): - - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' - for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): - imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error = \ - self._plot_generic_open(ind_page, suffix_xml, level_proc = 6, - name_outxml = self.name_outxml) - else: - imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error \ - = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) - - #print(set_of_blocks) - if flag_error: - print(str(ind_page) + ': non existing page!') - else: - im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) - - self._plot_save(im_met, 'Textlines ordered', 'TextlOrder', ind_page, self.path_file, - flag_plot, flag_save_figs) - - def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data', - flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', - name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_lines_textl = 1): - # flag_lines_textl, if 1, plots lines and textboxes, if 2, only lines, if 3, only textboxes - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' - for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): - imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error = \ - self._plot_generic_open(ind_page, suffix_xml, level_proc = 7, - name_outxml = self.name_outxml) - else: - imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_enrich, bbox_page, flag_error \ - = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) - - if flag_error: - print(str(ind_page) + ': non existing page!') - else: - im_met = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page, flag_lines_textl) - - self._plot_save(im_met, 'XML corrected', 'XMLcorrect', ind_page, self.path_file, - flag_plot, flag_save_figs) - - def _plot_save(self, im_met, str_title, str_name, ind_page, folder_save = '', - flag_plot = 1, flag_save_figs = 0, dpi = 200): - if flag_plot: - fig, axes = plt.subplots(1, 1, figsize=(8, 10)) - axes.axis('off') - axes.imshow(im_met) - plt.title(str_title) - if flag_save_figs: - format_fig = 'png' - name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) - + '_page' + str(ind_page) + '.' 
+ format_fig) - fig.savefig(name_fig, format = format_fig, dpi = dpi) - plt.close(fig) - - def check_discussion(self): - utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) - flag_discussion = utils_annot.check_if_discussion(self.name_meta[1]) - command = 'rm -rf ./' + str(self.year) - #print(command) - utils_proc.call_with_out(command) - - return flag_discussion - - def _plot_save_labels(self, im_met, str_title, str_name, ind_page, groups, colors, folder_save = '', - flag_plot = 1, flag_save_figs = 0, flag_legend = 1, dpi = 200): - #print(groups) - if flag_legend: - in_coord = 0 - coords = in_coord + np.array([0, 0, 10, 10]) - inc_page = 20 - flag_notinto = 1 - for ind_g, i_g in enumerate(groups): - if ind_g >= int(len(groups)/2) and flag_notinto: - flag_notinto = 0 - coords[0] = in_coord - coords[1] += int(im_met.shape[1]/1.5) - coords[2] = in_coord + 10 - coords[3] += int(im_met.shape[1]/1.5) - im_met = plot_tools.lines_box(im_met, coords, colors[ind_g], thick_line = 6) - coords[0] += inc_page - coords[2] += inc_page - - if flag_plot: - fig, axes = plt.subplots(1, 1, figsize=(8, 10)) - axes.axis('off') - axes.imshow(im_met) - plt.title(str_title) - - if flag_legend: - coords = in_coord + np.array([0, 0, 10, 10]) - flag_notinto = 1 - for ind_g, i_g in enumerate(groups): - if ind_g >= int(len(groups)/2) and flag_notinto: - flag_notinto = 0 - coords[0] = in_coord - coords[1] += int(im_met.shape[1]/1.5) - coords[2] = in_coord + 10 - coords[3] += int(im_met.shape[1]/1.5) - plt.text(coords[1] + 10, coords[2], i_g, fontsize = 10, va = 'bottom', ha = 'left') - coords[0] += inc_page - coords[2] += inc_page - - if flag_save_figs: - format_fig = 'png' - name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) - + '_page' + str(ind_page) + '.' + format_fig) - fig.savefig(name_fig, format = format_fig, dpi = dpi) - plt.close(fig) - - - \ No newline at end of file +version https://git-lfs.github.com/spec/v1 +oid sha256:149424a5b7e006d37e7802e942fdfee48b36a171d0d24d9be203344a80f01cae +size 36219 -- GitLab
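
Note on the recorded workflow: the run captured above takes data/AB/1940/02_extractedxml.tar.gz as input and produces data/AB/1940/04_correctedxml.tar.gz via src/python/run_correctxml.py, which is not included in this patch. The following is a minimal, illustrative sketch of the per-document flow using only the Document class methods visible in the src/python/def_classes.py hunk above; the folder layout under data/AB/, the document id '110', and the direct import of def_classes are assumptions for illustration, not code taken from the repository.

    # Illustrative sketch only -- assumes data/AB/<year>/<id_doc>.pdf inside
    # 00_rawpdfs.tar.gz and the tar names used by the Document class above.
    import def_classes as defc

    folder_database = 'data/AB/'
    input_file = 'data/AB/1940/110.pdf'   # '110' is a hypothetical document id

    doc = defc.Document(input_file, folder_database)

    # Rasterise the PDF pages; the layout analysis in correct_xml needs them.
    doc.pdf2imgobj(resolution=100)

    # Reuse the already extracted per-page XML (02_extractedxml.tar.gz) and
    # write the relabelled, reordered result into 04_correctedxml.tar.gz,
    # the output file added by this commit.
    doc.correct_xml(flag_plots=0, flag_save_figs=0,
                    name_outxml='02_extractedxml',
                    name_outcorrxml='04_correctedxml')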