From 92eb8d7a299a643c75ca254b885e3749069820e4 Mon Sep 17 00:00:00 2001 From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch> Date: Mon, 10 Dec 2018 11:35:45 +0000 Subject: [PATCH] renku run python src/python/run_correctxml.py data/AB//1940/02_extractedxml.tar.gz data/AB//1940/04_correctedxml.tar.gz --- .gitattributes | 2 + ...d0fe29c9646473b946bb4e307a1a4fc_python.cwl | 67 ++ data/AB/1940/04_correctedxml.tar.gz | 3 + notebooks/RunningClasses.ipynb | 23 +- src/python/def_classes.py | 694 +----------------- 5 files changed, 91 insertions(+), 698 deletions(-) create mode 100644 .renku/workflow/fd0fe29c9646473b946bb4e307a1a4fc_python.cwl create mode 100644 data/AB/1940/04_correctedxml.tar.gz diff --git a/.gitattributes b/.gitattributes index 2bcc997f..1def3672 100644 --- a/.gitattributes +++ b/.gitattributes @@ -112,3 +112,5 @@ data/AB/1936/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1937/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1938/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1939/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text +src/python/def_classes.py filter=lfs diff=lfs merge=lfs -text +data/AB/1940/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text diff --git a/.renku/workflow/fd0fe29c9646473b946bb4e307a1a4fc_python.cwl b/.renku/workflow/fd0fe29c9646473b946bb4e307a1a4fc_python.cwl new file mode 100644 index 00000000..f3dc1418 --- /dev/null +++ b/.renku/workflow/fd0fe29c9646473b946bb4e307a1a4fc_python.cwl @@ -0,0 +1,67 @@ +arguments: [] +baseCommand: +- python +class: CommandLineTool +cwlVersion: v1.0 +hints: [] +inputs: + input_1: + default: + class: File + path: ../../src/python/run_correctxml.py + inputBinding: + position: 1 + separate: true + shellQuote: true + streamable: false + type: File + input_2: + default: + class: File + path: ../../data/AB/1940/02_extractedxml.tar.gz + inputBinding: + position: 2 + separate: true + shellQuote: true + streamable: false + type: File + input_3: + default: data/AB/1940/04_correctedxml.tar.gz + inputBinding: + position: 3 + separate: true + shellQuote: true + streamable: false + type: string +outputs: + output_0: + outputBinding: + glob: notebooks/RunningClasses.ipynb + streamable: false + type: File + output_1: + outputBinding: + glob: src/python/def_classes.py + streamable: false + type: File + output_2: + outputBinding: + glob: $(inputs.input_3) + streamable: false + type: File +permanentFailCodes: [] +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entry: '$({"listing": [], "class": "Directory"})' + entryname: notebooks + writable: true + - entry: '$({"listing": [], "class": "Directory"})' + entryname: src/python + writable: true + - entry: '$({"listing": [], "class": "Directory"})' + entryname: data/AB/1940 + writable: true +successCodes: [] +temporaryFailCodes: [] diff --git a/data/AB/1940/04_correctedxml.tar.gz b/data/AB/1940/04_correctedxml.tar.gz new file mode 100644 index 00000000..af6fd2ce --- /dev/null +++ b/data/AB/1940/04_correctedxml.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7c6a255842500864f285f8f87670a512448927ff35e98a141d6cf34b905e9ce +size 6645275 diff --git a/notebooks/RunningClasses.ipynb b/notebooks/RunningClasses.ipynb index 211a9f93..48955862 100755 --- a/notebooks/RunningClasses.ipynb +++ b/notebooks/RunningClasses.ipynb @@ -9,9 +9,18 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 101, "metadata": {}, - "outputs": [], 
+ "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", @@ -28,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -337,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -348,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 104, "metadata": {}, "outputs": [ { @@ -369,7 +378,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 105, "metadata": {}, "outputs": [ { @@ -378,7 +387,7 @@ "True" ] }, - "execution_count": 99, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } diff --git a/src/python/def_classes.py b/src/python/def_classes.py index 9a90013b..b8954add 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -1,691 +1,3 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Fri Sep 28 13:31:06 2018 - -@author: luissalamanca -""" - -import sys, os - -from colour import Color -import matplotlib.image as mpimg -from mpl_toolkits.mplot3d import Axes3D -import matplotlib.pyplot as plt -import numpy as np -import xml.etree.ElementTree as ET -import copy -import time -import tarfile -import pickle - -from pdf2image import convert_from_path, convert_from_bytes - -import utils_proc -import utils_annot -import plot_tools -import preproc_docs - - - - - -# Definition of classes and methods associated - -class Document: - - limit_year = 1950 - flag_end_run = 1 - name_inpdf = '00_rawpdfs' - name_inmeta = '01_rawmeta' - - def __init__(self, input_file, folder_database): - self.year = int(input_file.split('/')[-2]) - self.id_doc = input_file.split('/')[-1].split('.')[0] - self.input_file = input_file - _, self.name_file = os.path.split(input_file) - self.path_file = folder_database + str(self.year) + '/' - self.name_wo_ext = os.path.splitext(self.name_file)[0] - self.folder_database = folder_database - self._meta_ext() - - def _meta_ext(self): - # Both for the correction and the extraction of the metadata information - name_file = str(self.year) + '/' + self.id_doc + '.xml' - name_file_db = str(self.year) + '/' + self.id_doc + '.db' - name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz' - self.name_meta = [name_tar, name_file, name_file_db] - - def meta_correct(self, name_outmeta = '03_correctedmeta'): - utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) - utils_proc.tar_extractfile(self.name_meta[2], self.folder_database, name_file = self.name_inmeta) - name_meta_corr = utils_proc.correct_metadata(self.year, self.id_doc, self.flag_end_run) - name_tar = utils_proc.addto_tar(name_meta_corr, self.folder_database, name_file = name_outmeta) - self.name_outmeta = name_outmeta - command = 'rm -rf ./' + str(self.year) - #print(command) - utils_proc.call_with_out(command) - - def pdf2imgobj(self, resolution = 100): - - self.resolution = resolution - utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf) - self.imgobj = convert_from_path(self.input_file, dpi = resolution) - command = 'rm -rf ./' + str(self.year) - utils_proc.call_with_out(command) - - def _get_pages(self, pages = 'all'): - if 'imgobj' not in 
self.__dict__.keys(): - self.pdf2imgobj() - if pages == 'all': - self.n_pages = np.arange(len(self.imgobj)) - elif isinstance(pages,str): - self.n_pages = np.array(pages.split(',')).astype(np.uint32) - else: - self.n_pages = np.array(pages) - - def pdf2xml(self, pages = 'all', suffix_xml = '_data', flag_save = 1, - name_outxml = '02_extractedxml'): - # To extract the embedded text of the pdf into an xml file - if 'imgobj' not in self.__dict__.keys(): - self.pdf2imgobj() - self._get_pages(pages = pages) - - utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf) - name_xml = utils_proc.pdf2xml(self.input_file, page_n = self.n_pages + 1, suffix_str = suffix_xml, - flag_end = self.flag_end_run) - if flag_save: - name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outxml) - else: - print('Not saving to tar') - name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz' - - command = 'rm -rf ./' + str(self.year) - #print(command) - utils_proc.call_with_out(command) - self.name_xml = [name_tar, name_xml] - if flag_save: - h_xml = utils_proc.get_handlerfile(self.name_xml[1], self.folder_database, name_file = name_outxml) - else: - h_xml = name_xml - self.name_outxml = name_outxml - XML_tree = ET.parse(h_xml) - self.XML_main = XML_tree.getroot() - self.n_pages = np.arange(len(self.XML_main)) - - - def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None, - ind_page = 0, textb_textl = 1): - # The page refers here to the page of the imgobj, which might not correspond - # to the one of the xml. For that reason we use n_pages to obtain the index - # for the xml - # textb_textl = 1 for textboxes, and 2 for textlines - if (XML_root == None) and (XML_main == None): - return print('Not possible! - You need to provide a valid XML\n') - if np.sum(imarray.shape) == 0: - if 'imgobj' not in self.__dict__.keys(): - imarray = np.array(self.imgobj[ind_page]) - else: - return print('Not possible! 
- You need to convert first the pdf to image\n') - - if XML_root == None: - XML_root = ET.Element('pages') - ind_abs = np.argwhere(self.n_pages == ind_page) - XML_root.append(XML_main[ind_abs]) - - bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - - imarray_textb = np.copy(imarray) - - if textb_textl == 1: - coord_textboxes = np.array([]).reshape((4,0)) - for ind_el in range(0, len(XML_root[0])): - if XML_root[0][ind_el].tag == 'textbox': - coord_textbox_aux = np.array(XML_root[0][ind_el].attrib['bbox'].split(',')).astype(np.float64) - coord_textboxes = np.concatenate((coord_textboxes, np.array(coord_textbox_aux).reshape((4,1))), axis = 1) - imarray_textb = plot_tools.highlight_text(imarray_textb, coord_textbox_aux, - bbox_page, color_vec = 'blue', alpha = True, - filled = False, thick_line = 6) - return imarray_textb, coord_textboxes - elif textb_textl == 2: - imarray_textl = np.copy(imarray) - coord_textline = np.array([]).reshape((4,0)) - all_font_sizes = np.array([]) - for ind_el in range(0, len(XML_root[0])): - for ind_line in range(0, len(XML_root[0][ind_el])): - if XML_root[0][ind_el][ind_line].tag == 'textline': - coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) - if len(XML_root[0][ind_el][ind_line]): - all_font_sizes = np.concatenate((all_font_sizes, - np.array([XML_root[0][ind_el][ind_line][0].attrib['size']]).astype(np.float64))) - coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1) - imarray_textl = plot_tools.highlight_text(imarray_textl, coord_textline_aux, bbox_page, - color_vec = 'red', alpha = True, filled = False, thick_line = 6) - - all_font_sizes, counts_all_font_sizes = np.unique(all_font_sizes, return_counts=True) - info_font_sizes = np.concatenate((all_font_sizes.reshape((1,all_font_sizes.shape[0])), - counts_all_font_sizes.reshape((1,all_font_sizes.shape[0])).astype(np.float64))) - - return imarray_textb, coord_textline, all_font_sizes, info_font_sizes - - def correct_xml(self, flag_plots = 1, flag_parallel = 0, flag_save_figs = 1, - pages = 'all', suffix_xml = '_data', name_outxml = '02_extractedxml', - name_outcorrxml = '04_correctedxml', flag_save = 1): - - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - - start_time = time.time() - if 'imgobj' not in self.__dict__.keys(): - self.pdf2imgobj() - - if 'XML_main' not in self.__dict__.keys(): - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outxml + '.tar.gz' - if os.path.isfile(name_tar): - name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + '.xml' - if name_xml in utils_proc.get_list(self.year, self.folder_database, self.name_outxml)[0]: - h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, self.name_outxml) - XML_tree = ET.parse(h_xml) - self.XML_main = XML_tree.getroot() - else: - # TODO if already exists 02_extractedxml - self.pdf2xml(pages = pages, suffix_xml = suffix_xml) - - self._get_pages(pages = pages) - flag_central = 1 - if self.year > self.limit_year: - flag_central = 0 - flag_2col = 1 - - XML_new = ET.Element('pages') - - for ind_abs, ind_page in enumerate(self.n_pages): - - XML_root = ET.Element('pages') - #print(ind_abs,len(self.XML_main)) - XML_root.append(self.XML_main[ind_abs]) - imarray = np.array(self.imgobj[ind_page]) - - if XML_root[0][0].tag == 'textbox': - bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - dim_img = 
imarray.shape[:2] - _, rescale_factor = plot_tools.adapt_coordtoimg(imarray, bbox_page, bbox_page) - - # Image with textboxes highlighted - imarray_textblock, coord_textboxes = self._draw_textbl(imarray = imarray, XML_root = XML_root) - - # Image with textlines highlighted, BUT also, array with all textlines - # coordinates, and the fontsizes, required for later - _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root, - textb_textl = 2) - - ##### - # Central vertical line and horizontal lines, through Hough transform - coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, - flag_2col, flag_central) - - ##### - # Obtain lateral margins - margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), - coord_horz.astype(np.uint32)) - - # Top and bottom line - ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), - coord_horz.astype(np.uint32)) - #print(info_font_sizes) - ##### - # Label the textboxes based on a set of simple rules that make use of - # the margins and the fontsizes - label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \ - preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) # info_font_sizes_est - - ##### - # Order the textlines, taken all them together, in order to later merge - # in a single textbox textlines that so far form different textboxes - set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, - list_allcoords_textlines, margins) - - # Given the ordered textlines, group them in new textboxes, creating a - # XML, This uses some criteria of distance between paragraphs - XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, - rescale_factor, centrall_ord, ind_page, dim_img) - - # Append to the new XML - XML_new.append(XML_enrich[0]) - - - if flag_plots: - im_met2 = plot_tools.plot_horzvertlines(imarray_textblock, coord_horz, coord_vert_def) - im_met2 = plot_tools.plot_margins(im_met2, margins, ind_limits, gap_line = 1) - im_met3, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) - im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) - im_met5 = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page) - - # Create figure with 4 subplots, for showing all results - if flag_save_figs: - path_output_img = self.path_file + '/previews' - if flag_save_figs: - if not os.path.exists(path_output_img): - os.makedirs(path_output_img) - - if flag_parallel: - if flag_save_figs: - name_pickle = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.pkl' - with open(name_pickle, 'wb') as f: # Python 3: open(..., 'wb') - pickle.dump([im_met2, im_met3, im_met4, im_met5], f) - - else: - fig, axes = plt.subplots(1, 4, figsize=(30, 10)) - ax = axes.ravel() - ax[0].axis('off') - ax[0].imshow(im_met2) - ax[1].axis('off') - ax[1].imshow(im_met3) - ax[2].axis('off') - ax[2].imshow(im_met4) - ax[3].axis('off') - ax[3].imshow(im_met5) - - if flag_save_figs: - format_fig = 'png' - name_fig = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.' 
+ format_fig - fig.savefig(name_fig, format = format_fig, dpi = 200) - plt.close(fig) - - name_xml_prev = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corrprev.xml' - - tree = ET.ElementTree(XML_new) - self.XML_main_corr = XML_new - if not os.path.exists('./' + str(self.year)): - os.makedirs('./' + str(self.year)) - tree.write(name_xml_prev, encoding = 'utf-8') - XML_new = preproc_docs.get_text_onefile(self.XML_main_corr) - name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml' - tree = ET.ElementTree(XML_new) - tree.write(name_xml, encoding = 'utf-8') - - if flag_save: - name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outcorrxml) - else: - print('Not saving to tar') - name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' - - self.name_outcorrxml = name_outcorrxml - self.name_xml_corr = [name_tar, name_xml] - command = 'rm -rf ./' + str(self.year) - #print(command) - utils_proc.call_with_out(command) - - print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time))) - - #XML_tree = ET.parse(name_xml) - #self.XML_main = XML_tree.getroot() - - def _plot_generic_open(self, ind_page, suffix_xml, level_proc = 0, - name_outxml = '02_extractedxml'): - # ind_page has to be a scalar - - if 'imgobj' not in self.__dict__.keys(): - self.pdf2imgobj() - if 'XML_main' not in self.__dict__.keys(): - name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz' - if os.path.isfile(name_tar): - name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + '.xml' - if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outxml)[0]: - h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outxml) - XML_tree = ET.parse(h_xml) - self.XML_main = XML_tree.getroot() - else: - self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) - ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) - else: - #print('Run this') - self._get_pages() - ind_abs = np.argwhere(self.n_pages == ind_page).reshape((-1,)) - - #print(ind_abs, type(ind_abs)) - #print(self.XML_main, len(self.imgobj)) - - if ind_page > (len(self.XML_main) - 1): - flag_error = 1 - return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error - - XML_root = ET.Element('pages') - XML_root.append(self.XML_main[ind_abs[0]]) - imarray = np.array(self.imgobj[ind_page]) - - bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - dim_img = imarray.shape[:2] - - _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root, - textb_textl = 2) - margins = [] - ind_limits = [] - label_textlines = [] - list_allcoords_textlines = [] - set_of_blocks = [] - XML_enrich = [] - - if level_proc > 0: - coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, - flag_2col = 1) - - if level_proc > 1: - _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page) - - if level_proc > 2: - ##### - # Obtain lateral margins - margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), - coord_horz.astype(np.uint32)) - - if level_proc > 3: - # Top and bottom line - ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), - coord_horz.astype(np.uint32)) - - if level_proc > 4: - label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, 
vec_labels_textline = \ - preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) - - if level_proc > 5: - set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, - list_allcoords_textlines, margins) - - if level_proc > 6: - XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, - rescale_factor, centrall_ord, ind_page, dim_img) - - # The last value returned is only to say that there was not any error during the execution. Before, if there are too many pages, we - # send a 1 instead - flag_error = 0 - return imarray, margins, ind_limits, label_textlines, list_allcoords_textlines, \ - set_of_blocks, XML_enrich, bbox_page, XML_root, ind_abs, flag_error - - def _plot_obtainfromxml(self, ind_page, suffix_xml, name_outcorrxml = '04_correctedxml'): - - if 'imgobj' not in self.__dict__.keys(): - self.pdf2imgobj() - if 'XML_main_corr' not in self.__dict__.keys(): - name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz' - if os.path.isfile(name_tar): - name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' - #print(name_xml) - if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: - #print('Run this') - h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) - XML_tree = ET.parse(h_xml) - self.XML_main_corr = XML_tree.getroot() - else: - print('You need to have the tar file to use flag_compute = 0!') - flag_error = 1 - return 0, 0, 0, 0, 0, 0, flag_error - #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) - else: - print('You need to have the tar file to use flag_compute = 0!') - flag_error = 1 - return 0, 0, 0, 0, 0, 0, flag_error - #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0) - ind_abs = np.array([ind_page]).astype(int).reshape((-1,)) - else: - #print('Run this') - self._get_pages() - ind_abs = np.argwhere(self.n_pages == ind_page).reshape((-1,)) - - #print(ind_abs, type(ind_abs)) - #print(self.XML_main, len(self.imgobj)) - - if ind_page > (len(self.XML_main_corr) - 1): - flag_error = 1 - return 0, 0, 0, 0, 0, 0, flag_error - - XML_root = ET.Element('pages') - XML_root.append(self.XML_main_corr[ind_abs[0]]) - imarray = np.array(self.imgobj[ind_page]) - - bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - dim_img = imarray.shape[:2] - - ###### - # For obtaining label_textlines, list_allcoords_textlines - coord_textline = np.array([]).reshape((4,0)) - label_textlines = dict() - count = 0 - count_l = 0 - vec_textline_lines = list() - for ind_el in range(0, len(XML_root[0])): - for ind_line in range(0, len(XML_root[0][ind_el])): - if XML_root[0][ind_el][ind_line].tag == 'textline': - if 'type' in XML_root[0][ind_el][ind_line].attrib: - coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) - coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1) - - type_textl = XML_root[0][ind_el][ind_line].attrib['type'] - #print(ind_el) - if XML_root[0][ind_el].attrib['type_textbox'] == 'line': - vec_textline_lines.append(-1) - else: - vec_textline_lines.append(count_l) - count_l += 1 - #print(type_textl) - if type_textl in label_textlines.keys(): - aux_type = label_textlines[type_textl] - aux_type = np.concatenate((aux_type, 
np.array([count]))).reshape((-1,)) - label_textlines[type_textl] = aux_type - else: - aux_type = np.array([count]) - label_textlines[type_textl] = aux_type - count += 1 - - coord_textline, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, coord_textline, bbox_page) - - ##### - # To obtain set_of_blocks. This variable simply contains the coordinates, and - # then a final row indicating the order (here are already ordered), and if it - # is a line, which is indicated with a -1 - set_of_blocks_aux = np.concatenate((coord_textline, np.array(vec_textline_lines).reshape((1,-1))), axis = 0) - set_of_blocks = dict() - set_of_blocks[0] = set_of_blocks_aux - #print(set_of_blocks.shape) - - # The last is the flag_error - #print(imarray.shape, len(label_textlines), coord_textline.shape, len(set_of_blocks), - # len(XML_root), bbox_page.shape) - flag_error = 0 - return imarray, label_textlines, coord_textline, set_of_blocks, XML_root, bbox_page, flag_error -# imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error -# imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error - - - def plot_orig_textb(self, range_pages = range(1), suffix_xml = '_data', - flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): - - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - - for ind_page in range_pages: - imarray, margins, ind_limits, _, _, \ - _, _, _, XML_root, _, flag_error = self._plot_generic_open(ind_page, suffix_xml, level_proc = 0, - name_outxml = self.name_outxml) - - if flag_error: - print(str(ind_page) + ': non existing page!') - else: - imarray_textblock, _ = self._draw_textbl(imarray = imarray, XML_root = XML_root) - - self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, - flag_plot, flag_save_figs) - - def plot_margins_doc(self, range_pages = range(1), suffix_xml = '_data', - flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'): - - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - - for ind_page in range_pages: - imarray, margins, ind_limits, _, _, \ - _, _, _, _, _, flag_error= self._plot_generic_open(ind_page, suffix_xml, level_proc = 4, - name_outxml = self.name_outxml) - - if flag_error: - print(str(ind_page) + ': non existing page!') - else: - im_met = plot_tools.plot_margins(imarray, margins, ind_limits, gap_line = 1) - - self._plot_save(im_met, 'Page margins', 'Margins', ind_page, self.path_file, - flag_plot, flag_save_figs) - - def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data', - flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', - name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_legend = 1): - - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' - for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): - imarray, _, _, label_textlines, list_allcoords_textlines, _, _, _, _, _, flag_error = \ - self._plot_generic_open(ind_page, suffix_xml, level_proc = 5, - name_outxml = self.name_outxml) - #print(label_textlines,list_allcoords_textlines) - else: - imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) - 
#print(len(array_elements)) - - if flag_error: - print(str(ind_page) + ': non existing page!') - else: - im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines) - self._plot_save_labels(im_met, 'Textlines labelled', 'TextlLabel', ind_page, groups, colors, self.path_file, - flag_plot, flag_save_figs, flag_legend) - - - def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data', - flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', - name_outcorrxml = '04_correctedxml', flag_compute = 0): - - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' - for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): - imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error = \ - self._plot_generic_open(ind_page, suffix_xml, level_proc = 6, - name_outxml = self.name_outxml) - else: - imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error \ - = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) - - #print(set_of_blocks) - if flag_error: - print(str(ind_page) + ': non existing page!') - else: - im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) - - self._plot_save(im_met, 'Textlines ordered', 'TextlOrder', ind_page, self.path_file, - flag_plot, flag_save_figs) - - def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data', - flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml', - name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_lines_textl = 1): - # flag_lines_textl, if 1, plots lines and textboxes, if 2, only lines, if 3, only textboxes - if 'name_outxml' not in self.__dict__.keys(): - self.name_outxml = name_outxml - if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' - for ind_page in range_pages: - if flag_compute or not os.path.isfile(name_tar): - imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error = \ - self._plot_generic_open(ind_page, suffix_xml, level_proc = 7, - name_outxml = self.name_outxml) - else: - imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_enrich, bbox_page, flag_error \ - = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml) - - if flag_error: - print(str(ind_page) + ': non existing page!') - else: - im_met = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page, flag_lines_textl) - - self._plot_save(im_met, 'XML corrected', 'XMLcorrect', ind_page, self.path_file, - flag_plot, flag_save_figs) - - def _plot_save(self, im_met, str_title, str_name, ind_page, folder_save = '', - flag_plot = 1, flag_save_figs = 0, dpi = 200): - if flag_plot: - fig, axes = plt.subplots(1, 1, figsize=(8, 10)) - axes.axis('off') - axes.imshow(im_met) - plt.title(str_title) - if flag_save_figs: - format_fig = 'png' - name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) - + '_page' + str(ind_page) + '.' 
+ format_fig) - fig.savefig(name_fig, format = format_fig, dpi = dpi) - plt.close(fig) - - def check_discussion(self): - utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) - flag_discussion = utils_annot.check_if_discussion(self.name_meta[1]) - command = 'rm -rf ./' + str(self.year) - #print(command) - utils_proc.call_with_out(command) - - return flag_discussion - - def _plot_save_labels(self, im_met, str_title, str_name, ind_page, groups, colors, folder_save = '', - flag_plot = 1, flag_save_figs = 0, flag_legend = 1, dpi = 200): - #print(groups) - if flag_legend: - in_coord = 0 - coords = in_coord + np.array([0, 0, 10, 10]) - inc_page = 20 - flag_notinto = 1 - for ind_g, i_g in enumerate(groups): - if ind_g >= int(len(groups)/2) and flag_notinto: - flag_notinto = 0 - coords[0] = in_coord - coords[1] += int(im_met.shape[1]/1.5) - coords[2] = in_coord + 10 - coords[3] += int(im_met.shape[1]/1.5) - im_met = plot_tools.lines_box(im_met, coords, colors[ind_g], thick_line = 6) - coords[0] += inc_page - coords[2] += inc_page - - if flag_plot: - fig, axes = plt.subplots(1, 1, figsize=(8, 10)) - axes.axis('off') - axes.imshow(im_met) - plt.title(str_title) - - if flag_legend: - coords = in_coord + np.array([0, 0, 10, 10]) - flag_notinto = 1 - for ind_g, i_g in enumerate(groups): - if ind_g >= int(len(groups)/2) and flag_notinto: - flag_notinto = 0 - coords[0] = in_coord - coords[1] += int(im_met.shape[1]/1.5) - coords[2] = in_coord + 10 - coords[3] += int(im_met.shape[1]/1.5) - plt.text(coords[1] + 10, coords[2], i_g, fontsize = 10, va = 'bottom', ha = 'left') - coords[0] += inc_page - coords[2] += inc_page - - if flag_save_figs: - format_fig = 'png' - name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) - + '_page' + str(ind_page) + '.' + format_fig) - fig.savefig(name_fig, format = format_fig, dpi = dpi) - plt.close(fig) - - - \ No newline at end of file +version https://git-lfs.github.com/spec/v1 +oid sha256:149424a5b7e006d37e7802e942fdfee48b36a171d0d24d9be203344a80f01cae +size 36219 -- GitLab
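
Note on the recorded workflow: the run captured above takes data/AB/1940/02_extractedxml.tar.gz as input and produces data/AB/1940/04_correctedxml.tar.gz via src/python/run_correctxml.py, which is not included in this patch. The following is a minimal, illustrative sketch of the per-document flow using only the Document class methods visible in the src/python/def_classes.py hunk above; the folder layout under data/AB/, the document id '110', and the direct import of def_classes are assumptions for illustration, not code taken from the repository.

    # Illustrative sketch only -- assumes data/AB/<year>/<id_doc>.pdf inside
    # 00_rawpdfs.tar.gz and the tar names used by the Document class above.
    import def_classes as defc

    folder_database = 'data/AB/'
    input_file = 'data/AB/1940/110.pdf'   # '110' is a hypothetical document id

    doc = defc.Document(input_file, folder_database)

    # Rasterise the PDF pages; the layout analysis in correct_xml needs them.
    doc.pdf2imgobj(resolution=100)

    # Reuse the already extracted per-page XML (02_extractedxml.tar.gz) and
    # write the relabelled, reordered result into 04_correctedxml.tar.gz,
    # the output file added by this commit.
    doc.correct_xml(flag_plots=0, flag_save_figs=0,
                    name_outxml='02_extractedxml',
                    name_outcorrxml='04_correctedxml')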