diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index c888295e92ef91eeda4ba155b41b9ec446c490bc..8808fb4602673e53ee2b98b6f238c937df9630ac 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -1,3 +1,695 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1b2a2b225fb4605aad5cfe69b441a9e08c1991e95008246ca4b1c25207653fe5
-size 36362
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 28 13:31:06 2018
+
+@author: luissalamanca
+"""
+
+import sys, os
+
+from colour import Color
+import matplotlib.image as mpimg
+from mpl_toolkits.mplot3d import Axes3D
+import matplotlib.pyplot as plt
+import numpy as np
+import xml.etree.ElementTree as ET
+import copy
+import time 
+import tarfile
+import pickle
+
+from pdf2image import convert_from_path, convert_from_bytes
+
+import utils_proc
+import utils_annot
+import plot_tools
+import preproc_docs
+
+
+
+
+
+# Definition of classes and methods associated
+
class Document:
    """One document of the per-year pdf database: handles extraction of the
    raw pdf from its tar archive, rendering to page images, extraction of
    the embedded text to xml, and layout correction of that xml."""

    # Documents dated after this year are assumed NOT to have a central
    # vertical divider (see flag_central in correct_xml).
    limit_year = 1950
    # Passed through to utils_proc helpers; presumably marks the final
    # stage of a processing run - TODO confirm against utils_proc.
    flag_end_run = 1
    # Base names (without .tar.gz) of the input archives in each year folder.
    name_inpdf = '00_rawpdfs'
    name_inmeta = '01_rawmeta'
+    def __init__(self, input_file, folder_database):
+        self.year = int(input_file.split('/')[-2])
+        self.id_doc = input_file.split('/')[-1].split('.')[0]
+        self.input_file = input_file
+        _, self.name_file = os.path.split(input_file)
+        self.path_file = folder_database + str(self.year) + '/'
+        self.name_wo_ext = os.path.splitext(self.name_file)[0]
+        self.folder_database = folder_database
+        self._meta_ext()
+        
+    def _meta_ext(self):
+    # Both for the correction and the extraction of the metadata information
+        name_file = str(self.year) + '/' + self.id_doc + '.xml'
+        name_file_db = str(self.year) + '/' + self.id_doc + '.db'
+        name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz'
+        self.name_meta = [name_tar, name_file, name_file_db]        
+    
+    def meta_correct(self, name_outmeta = '03_correctedmeta'):
+        utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
+        utils_proc.tar_extractfile(self.name_meta[2], self.folder_database, name_file = self.name_inmeta)
+        name_meta_corr = utils_proc.correct_metadata(self.year, self.id_doc, self.flag_end_run)
+        name_tar = utils_proc.addto_tar(name_meta_corr, self.folder_database, name_file = name_outmeta)
+        self.name_outmeta = name_outmeta
+        command = 'rm -rf ./' + str(self.year)
+        #print(command)
+        utils_proc.call_with_out(command)
+    
+    def pdf2imgobj(self, resolution = 100):
+        
+        self.resolution = resolution
+        utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf)
+        self.imgobj = convert_from_path(self.input_file, dpi = resolution)
+        command = 'rm -rf ./' + str(self.year)     
+        utils_proc.call_with_out(command)
+
+    def _get_pages(self, pages = 'all'):
+        if 'imgobj' not in self.__dict__.keys():
+            self.pdf2imgobj()
+        if pages == 'all':
+            self.n_pages = np.arange(len(self.imgobj))
+        elif isinstance(pages,str):
+            self.n_pages = np.array(pages.split(',')).astype(np.uint32)
+        else:
+            self.n_pages = np.array(pages)  
+
    def pdf2xml(self, pages = 'all', suffix_xml = '_data', flag_save = 1,
                name_outxml = '02_extractedxml'):
        # To extract the embedded text of the pdf into an xml file.
        # Sets self.XML_main to the parsed root and, if flag_save, appends
        # the xml to the per-year tar archive name_outxml.tar.gz.
        if 'imgobj' not in self.__dict__.keys():
            self.pdf2imgobj()
        self._get_pages(pages = pages)
        
        # The raw pdf has to be extracted from its tar before conversion.
        utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf)
        # n_pages is 0-based; the converter expects 1-based page numbers.
        name_xml = utils_proc.pdf2xml(self.input_file, page_n = self.n_pages + 1, suffix_str = suffix_xml,
                                      flag_end = self.flag_end_run)
        if flag_save:
            name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outxml)
        else:
            print('Not saving to tar')
            name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz'
            
        self.name_xml = [name_tar, name_xml]
        if flag_save:
            # Read the xml back from inside the tar archive ...
            h_xml = utils_proc.get_handlerfile(self.name_xml[1], self.folder_database, name_file = name_outxml)
        else:
            # ... or straight from the loose file when it was not archived.
            h_xml = name_xml
        self.name_outxml = name_outxml
        XML_tree = ET.parse(h_xml)
        self.XML_main = XML_tree.getroot()
        # Reset n_pages to match the pages actually present in the xml.
        self.n_pages = np.arange(len(self.XML_main))
        command = 'rm -rf ./' + str(self.year)
        #print(command)
        utils_proc.call_with_out(command)        
+        
+    def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None, 
+                    ind_page = 0, textb_textl = 1):        
+        # The page refers here to the page of the imgobj, which might not correspond
+        # to the one of the xml. For that reason we use n_pages to obtain the index
+        # for the xml
+        # textb_textl =  1 for textboxes, and 2 for textlines
+        if (XML_root == None) and (XML_main == None):
+            return print('Not possible! - You need to provide a valid XML\n')
+        if np.sum(imarray.shape) == 0:
+            if 'imgobj' not in self.__dict__.keys():
+                imarray = np.array(self.imgobj[ind_page])
+            else:
+                return print('Not possible! - You need to convert first the pdf to image\n')
+        
+        if XML_root == None:
+            XML_root = ET.Element('pages')
+            ind_abs = np.argwhere(self.n_pages == ind_page)
+            XML_root.append(XML_main[ind_abs])
+        
+        bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64)
+        
+        imarray_textb = np.copy(imarray)
+        
+        if textb_textl == 1:
+            coord_textboxes = np.array([]).reshape((4,0))
+            for ind_el in range(0, len(XML_root[0])):
+                if XML_root[0][ind_el].tag == 'textbox':
+                    coord_textbox_aux = np.array(XML_root[0][ind_el].attrib['bbox'].split(',')).astype(np.float64)
+                    coord_textboxes = np.concatenate((coord_textboxes, np.array(coord_textbox_aux).reshape((4,1))), axis = 1)
+                    imarray_textb = plot_tools.highlight_text(imarray_textb, coord_textbox_aux, 
+                                                                      bbox_page, color_vec = 'blue', alpha = True, 
+                                                                      filled = False, thick_line = 6) 
+            return imarray_textb, coord_textboxes
+        elif textb_textl == 2:   
+            imarray_textl = np.copy(imarray)
+            coord_textline = np.array([]).reshape((4,0))
+            all_font_sizes = np.array([])  
+            for ind_el in range(0, len(XML_root[0])):
+                for ind_line in range(0, len(XML_root[0][ind_el])):
+                    if XML_root[0][ind_el][ind_line].tag == 'textline':
+                        coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64)
+                        if len(XML_root[0][ind_el][ind_line]):
+                            all_font_sizes = np.concatenate((all_font_sizes, 
+                                                             np.array([XML_root[0][ind_el][ind_line][0].attrib['size']]).astype(np.float64)))
+                        coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1)
+                        imarray_textl = plot_tools.highlight_text(imarray_textl, coord_textline_aux, bbox_page, 
+                                                       color_vec = 'red', alpha = True, filled = False, thick_line = 6)  
+                        
+            all_font_sizes, counts_all_font_sizes = np.unique(all_font_sizes, return_counts=True)        
+            info_font_sizes = np.concatenate((all_font_sizes.reshape((1,all_font_sizes.shape[0])),
+                                              counts_all_font_sizes.reshape((1,all_font_sizes.shape[0])).astype(np.float64)))                        
+            
+            return imarray_textb, coord_textline, all_font_sizes, info_font_sizes        
+    
    def correct_xml(self, flag_plots = 1, flag_parallel = 0, flag_save_figs = 1,
                    pages = 'all', suffix_xml = '_data', name_outxml = '02_extractedxml',
                    name_outcorrxml = '04_correctedxml', flag_save = 1):
        """Run the full layout-correction pipeline over ``pages``.

        Per page: detect the central/horizontal lines, lateral and
        top/bottom margins, label and reorder the textlines, and regroup
        them into new textboxes. Writes ``<year>/<name><suffix>corr.xml``
        (plus a ``corrprev`` intermediate) and, if ``flag_save``, appends it
        to the ``name_outcorrxml`` tar. Optionally saves preview figures
        (pickled instead of rendered when ``flag_parallel``).
        """
        
        if 'name_outxml' not in self.__dict__.keys():
            self.name_outxml = name_outxml
        
        start_time = time.time()
        if 'imgobj' not in self.__dict__.keys():
            self.pdf2imgobj()
            
        if 'XML_main' not in self.__dict__.keys():
            # Reuse the previously extracted xml from the tar if present,
            # otherwise extract it now from the pdf.
            name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outxml + '.tar.gz'
            if os.path.isfile(name_tar):
                name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + '.xml'
                if name_xml in utils_proc.get_list(self.year, self.folder_database, self.name_outxml)[0]:
                    h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, self.name_outxml)
                    XML_tree = ET.parse(h_xml)
                    self.XML_main = XML_tree.getroot()
                # NOTE(review): if the tar exists but lacks this xml,
                # XML_main stays unset and the loop below fails - confirm.
            else:
                # TODO if already exists 02_extractedxml
                self.pdf2xml(pages = pages, suffix_xml = suffix_xml)
        
        self._get_pages(pages = pages)
        # Documents up to limit_year are assumed to have a central vertical
        # divider between two columns.
        flag_central = 1
        if self.year > self.limit_year:
            flag_central = 0
        flag_2col = 1
        
        XML_new = ET.Element('pages')
                
        for ind_abs, ind_page in enumerate(self.n_pages): 
            
            # Wrap the single page in a fresh <pages> root, as the helper
            # functions expect.
            XML_root = ET.Element('pages')
            #print(ind_abs,len(self.XML_main))
            XML_root.append(self.XML_main[ind_abs])
            imarray = np.array(self.imgobj[ind_page])
        
            # Pages whose first element is not a textbox are skipped.
            if XML_root[0][0].tag == 'textbox':
                bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64)
                dim_img = imarray.shape[:2]
                _, rescale_factor = plot_tools.adapt_coordtoimg(imarray, bbox_page, bbox_page)
            
                # Image with textboxes highlighted
                imarray_textblock, coord_textboxes = self._draw_textbl(imarray = imarray, XML_root = XML_root)
    
                # Image with textlines highlighted, BUT also, array with all textlines 
                # coordinates, and the fontsizes, required for later
                _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root,
                                                                           textb_textl = 2)                
                
                #####
                # Central vertical line and horizontal lines, through Hough transform
                coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, 
                                                                          flag_2col, flag_central)                    
                                        
                #####
                # Obtain lateral margins
                margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), 
                                               coord_horz.astype(np.uint32))
                        
                # Top and bottom line
                ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), 
                                               coord_horz.astype(np.uint32))
                #print(info_font_sizes)                                      
                #####
                # Label the textboxes based on a set of simple rules that make use of 
                # the margins and the fontsizes
                label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \
                    preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) # info_font_sizes_est
                                            
                #####
                # Order the textlines, taken all them together, in order to later merge
                # in a single textbox textlines that so far form different textboxes
                set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, 
                                                          list_allcoords_textlines, margins)
                
                # Given the ordered textlines, group them in new textboxes, creating a 
                # XML, This uses some criteria of distance between paragraphs
                XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, 
                                                    rescale_factor, centrall_ord, ind_page, dim_img)
                
                # Append to the new XML
                XML_new.append(XML_enrich[0])
                
                
                if flag_plots:
                    im_met2 = plot_tools.plot_horzvertlines(imarray_textblock, coord_horz, coord_vert_def)
                    im_met2 = plot_tools.plot_margins(im_met2, margins, ind_limits, gap_line = 1)
                    im_met3, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines)
                    im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1])    
                    im_met5 = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page)
                    
                    # Create figure with 4 subplots, for showing all results
                    if flag_save_figs:
                        path_output_img = self.path_file + '/previews'
                        if flag_save_figs:
                            if not os.path.exists(path_output_img):
                                os.makedirs(path_output_img)
                
                    if flag_parallel:
                        # In parallel runs the figures are pickled rather than
                        # rendered, since matplotlib rendering is not safe here.
                        if flag_save_figs:
                            name_pickle = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.pkl'
                            with open(name_pickle, 'wb') as f:  # Python 3: open(..., 'wb')
                                pickle.dump([im_met2, im_met3, im_met4, im_met5], f)
                    
                    else:
                        fig, axes = plt.subplots(1, 4, figsize=(30, 10))
                        ax = axes.ravel()
                        ax[0].axis('off')
                        ax[0].imshow(im_met2) 
                        ax[1].axis('off')
                        ax[1].imshow(im_met3)
                        ax[2].axis('off')
                        ax[2].imshow(im_met4)        
                        ax[3].axis('off')
                        ax[3].imshow(im_met5)
                        
                        if flag_save_figs:
                            format_fig = 'png'
                            name_fig = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.' + format_fig
                            fig.savefig(name_fig, format = format_fig, dpi = 200)
                            plt.close(fig)
                
        # Write the intermediate (pre text-merge) xml ...
        name_xml_prev = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corrprev.xml'  
        
        tree = ET.ElementTree(XML_new)
        self.XML_main_corr = XML_new
        if not os.path.exists('./' + str(self.year)):
            os.makedirs('./' + str(self.year))
        tree.write(name_xml_prev, encoding = 'utf-8')
        # ... then merge the text per element and write the final corrected xml.
        XML_new = preproc_docs.get_text_onefile(self.XML_main_corr)
        name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml' 
        tree = ET.ElementTree(XML_new)
        tree.write(name_xml, encoding = 'utf-8')
        
        if flag_save:
            name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outcorrxml)
        else:
            print('Not saving to tar')
            name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz'        
        
        self.name_outcorrxml = name_outcorrxml
        self.name_xml_corr = [name_tar, name_xml]
        # Clean up the temporary per-year working directory.
        command = 'rm -rf ./' + str(self.year)
        #print(command)
        utils_proc.call_with_out(command)        
        
        print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time)))  

        #XML_tree = ET.parse(name_xml)
        #self.XML_main = XML_tree.getroot()
+    
    def _plot_generic_open(self, ind_page, suffix_xml, level_proc = 0,
                           name_outxml = '02_extractedxml'):
        """Load page ``ind_page`` plus its xml and run the layout pipeline
        up to ``level_proc``: 0 textlines only, >0 + Hough lines, >1 +
        rescale factor, >2 + lateral margins, >3 + top/bottom limits, >4 +
        textline labels, >5 + ordered blocks, >6 + enriched xml.

        Returns an 11-tuple (imarray, margins, ind_limits, label_textlines,
        list_allcoords_textlines, set_of_blocks, XML_enrich, bbox_page,
        XML_root, ind_abs, flag_error); ten zeros and flag_error == 1 when
        the page does not exist.
        """
        # ind_page has to be a scalar
        
        if 'imgobj' not in self.__dict__.keys():
            self.pdf2imgobj()
        if 'XML_main' not in self.__dict__.keys():
            # Load the extracted xml from the tar, or extract it on the fly
            # (without saving) when the tar is absent.
            name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz'
            if os.path.isfile(name_tar):
                name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + '.xml'
                if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outxml)[0]:
                    h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outxml)
                    XML_tree = ET.parse(h_xml)
                    self.XML_main = XML_tree.getroot()
            else:
                self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)   
            ind_abs = np.array([ind_page]).astype(int).reshape((-1,))  
        else:
            # xml already loaded: map the requested page to its xml index.
            #print('Run this')
            self._get_pages()
            ind_abs = np.argwhere(self.n_pages == ind_page).reshape((-1,))

        #print(ind_abs, type(ind_abs))
        #print(self.XML_main, len(self.imgobj))
        
        if ind_page > (len(self.XML_main) - 1):
            flag_error = 1
            return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error
        
        # Documents up to limit_year are assumed to have a central divider.
        flag_central = 1
        if self.year > self.limit_year:
            flag_central = 0
        flag_2col = 1        
        
        XML_root = ET.Element('pages')
        XML_root.append(self.XML_main[ind_abs[0]])
        imarray = np.array(self.imgobj[ind_page])
        
        bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64)
        dim_img = imarray.shape[:2]
        
        _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root,
                                                                               textb_textl = 2)        
        # Defaults returned for all stages that level_proc does not reach.
        margins = [] 
        ind_limits = [] 
        label_textlines = []
        list_allcoords_textlines = []
        set_of_blocks = []
        XML_enrich = []
        
        if level_proc > 0:
            coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, 
                                                                                   flag_2col, flag_central)  
            
        if level_proc > 1:            
            _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page)
           
        if level_proc > 2:            
            #####
            # Obtain lateral margins
            margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), 
                                           coord_horz.astype(np.uint32))                
            
        if level_proc > 3:            
            # Top and bottom line
            ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), 
                                           coord_horz.astype(np.uint32))
            
        if level_proc > 4:            
            label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \
                preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes)   
            
        if level_proc > 5:            
            set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, 
                                                                   list_allcoords_textlines, margins)            
            
        if level_proc > 6:            
            XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, 
                                                rescale_factor, centrall_ord, ind_page, dim_img)        
        
        # The last value returned is only to say that there was not any error during the execution. Before, if there are too many pages, we
        # send a 1 instead
        flag_error = 0
        return imarray, margins, ind_limits, label_textlines, list_allcoords_textlines, \
            set_of_blocks, XML_enrich, bbox_page, XML_root, ind_abs, flag_error
+            
    def _plot_obtainfromxml(self, ind_page, suffix_xml, name_outcorrxml = '04_correctedxml'):
        """Reconstruct the plotting inputs for page ``ind_page`` from the
        *corrected* xml stored in the ``name_outcorrxml`` tar, instead of
        recomputing them from scratch.

        Returns (imarray, label_textlines, coord_textline, set_of_blocks,
        XML_root, bbox_page, flag_error); six zeros and flag_error == 1
        when the tar/xml/page is missing.
        """
        
        if 'imgobj' not in self.__dict__.keys():
            self.pdf2imgobj()
        if 'XML_main_corr' not in self.__dict__.keys():            
            name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz'
            if os.path.isfile(name_tar):
                name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml'
                #print(name_xml)
                if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]:    
                    #print('Run this')
                    h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml)
                    XML_tree = ET.parse(h_xml)
                    self.XML_main_corr = XML_tree.getroot()
                else:
                    print('You need to have the tar file to use flag_compute = 0!')
                    flag_error = 1
                    return 0, 0, 0, 0, 0, 0, flag_error
                    #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)   
            else:
                print('You need to have the tar file to use flag_compute = 0!')
                flag_error = 1
                return 0, 0, 0, 0, 0, 0, flag_error
                #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)   
            ind_abs = np.array([ind_page]).astype(int).reshape((-1,))  
        else:
            # Corrected xml already loaded: map page index to xml index.
            #print('Run this')
            self._get_pages()
            ind_abs = np.argwhere(self.n_pages == ind_page).reshape((-1,))

        #print(ind_abs, type(ind_abs))
        #print(self.XML_main, len(self.imgobj))
        
        if ind_page > (len(self.XML_main_corr) - 1):
            flag_error = 1
            return 0, 0, 0, 0, 0, 0, flag_error
        
        XML_root = ET.Element('pages')
        XML_root.append(self.XML_main_corr[ind_abs[0]])
        imarray = np.array(self.imgobj[ind_page])
        
        bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64)
        dim_img = imarray.shape[:2]        
        
        ######        
        # For obtaining label_textlines, list_allcoords_textlines:
        # label_textlines maps each 'type' attribute to the indices of the
        # textlines carrying it; vec_textline_lines records reading order
        # (-1 marks textboxes of type 'line').
        coord_textline = np.array([]).reshape((4,0))
        label_textlines = dict()
        count = 0
        count_l = 0
        vec_textline_lines = list()
        for ind_el in range(0, len(XML_root[0])):
            for ind_line in range(0, len(XML_root[0][ind_el])):
                if XML_root[0][ind_el][ind_line].tag == 'textline':
                    if 'type' in XML_root[0][ind_el][ind_line].attrib:
                        coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64)
                        coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1)
                    
                        type_textl = XML_root[0][ind_el][ind_line].attrib['type']
                        #print(ind_el)
                        if XML_root[0][ind_el].attrib['type_textbox'] == 'line':
                            vec_textline_lines.append(-1)
                        else:
                            vec_textline_lines.append(count_l)
                            count_l += 1
                        #print(type_textl)
                        if type_textl in label_textlines.keys():
                            aux_type = label_textlines[type_textl]
                            aux_type = np.concatenate((aux_type, np.array([count]))).reshape((-1,))
                            label_textlines[type_textl] = aux_type
                        else:
                            aux_type = np.array([count])
                            label_textlines[type_textl] = aux_type
                        count += 1
        
        # Convert pdf coordinates to image coordinates.
        coord_textline, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, coord_textline, bbox_page)
        
        ##### 
        # To obtain set_of_blocks. This variable simply contains the coordinates, and
        # then a final row indicating the order (here are already ordered), and if it 
        # is a line, which is indicated with a -1        
        set_of_blocks_aux = np.concatenate((coord_textline, np.array(vec_textline_lines).reshape((1,-1))), axis = 0)
        set_of_blocks = dict()
        set_of_blocks[0] = set_of_blocks_aux
        #print(set_of_blocks.shape)
                        
        # The last is the flag_error
        #print(imarray.shape, len(label_textlines), coord_textline.shape, len(set_of_blocks),
        #     len(XML_root), bbox_page.shape)
        flag_error = 0
        return imarray, label_textlines, coord_textline, set_of_blocks, XML_root, bbox_page, flag_error
#                        imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error
#                        imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error
                        
+
+    def plot_orig_textb(self, range_pages = range(1), suffix_xml = '_data', 
+                         flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'):
+                
+        if 'name_outxml' not in self.__dict__.keys():
+            self.name_outxml = name_outxml
+            
+        for ind_page in range_pages:
+            imarray, margins, ind_limits, _, _, \
+                _, _, _, XML_root, _, flag_error = self._plot_generic_open(ind_page, suffix_xml, level_proc = 0,
+                                                               name_outxml = self.name_outxml)
+            
+            if flag_error:
+                print(str(ind_page) + ': non existing page!')
+            else:
+                imarray_textblock, _ = self._draw_textbl(imarray = imarray, XML_root = XML_root)
+
+                self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, 
+                                flag_plot, flag_save_figs)   
+
+    def plot_margins_doc(self, range_pages = range(1), suffix_xml = '_data', 
+                         flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'):
+        
+        if 'name_outxml' not in self.__dict__.keys():
+            self.name_outxml = name_outxml
+        
+        for ind_page in range_pages:
+            imarray, margins, ind_limits, _, _, \
+                _, _, _, _, _, flag_error= self._plot_generic_open(ind_page, suffix_xml, level_proc = 4,
+                                                        name_outxml = self.name_outxml)
+
+            if flag_error:
+                print(str(ind_page) + ': non existing page!')
+            else:            
+                im_met = plot_tools.plot_margins(imarray, margins, ind_limits, gap_line = 1)
+
+                self._plot_save(im_met, 'Page margins', 'Margins', ind_page, self.path_file,
+                       flag_plot, flag_save_figs)             
+                
+    def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data', 
+                         flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml',
+                         name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_legend = 1):
+        
+        if 'name_outxml' not in self.__dict__.keys():
+            self.name_outxml = name_outxml     
+        if 'name_outcorrxml' not in self.__dict__.keys():
+            self.name_outcorrxml = name_outcorrxml              
+        
+        name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
+        for ind_page in range_pages:
+            if flag_compute or not os.path.isfile(name_tar): 
+                imarray, _, _, label_textlines, list_allcoords_textlines, _, _, _, _, _, flag_error = \
+                    self._plot_generic_open(ind_page, suffix_xml, level_proc = 5,
+                                            name_outxml = self.name_outxml)
+                #print(label_textlines,list_allcoords_textlines)
+            else:
+                imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)
+                #print(len(array_elements))
+                
+            if flag_error:
+                print(str(ind_page) + ': non existing page!')
+            else:             
+                im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines)     
+                self._plot_save_labels(im_met, 'Textlines labelled', 'TextlLabel', ind_page, groups, colors, self.path_file,
+                                       flag_plot, flag_save_figs, flag_legend)
+                            
+                
+    def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data', 
+                           flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml',
+                           name_outcorrxml = '04_correctedxml', flag_compute = 0):
+        
+        if 'name_outxml' not in self.__dict__.keys():
+            self.name_outxml = name_outxml
+        if 'name_outcorrxml' not in self.__dict__.keys():
+            self.name_outcorrxml = name_outcorrxml            
+        
+        name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
+        for ind_page in range_pages:
+            if flag_compute or not os.path.isfile(name_tar):            
+                imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error = \
+                    self._plot_generic_open(ind_page, suffix_xml, level_proc = 6,
+                                            name_outxml = self.name_outxml)
+            else: 
+                imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error \
+                    = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)                   
+                                    
+            #print(set_of_blocks)
+            if flag_error:
+                print(str(ind_page) + ': non existing page!')
+            else:            
+                im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) 
+
+                self._plot_save(im_met, 'Textlines ordered', 'TextlOrder', ind_page, self.path_file,
+                       flag_plot, flag_save_figs)            
+                
+    def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data', 
+                        flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml',
+                        name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_lines_textl = 1):
+        # flag_lines_textl, if 1, plots lines and textboxes, if 2, only lines, if 3, only textboxes
+        if 'name_outxml' not in self.__dict__.keys():
+            self.name_outxml = name_outxml
+        if 'name_outcorrxml' not in self.__dict__.keys():
+            self.name_outcorrxml = name_outcorrxml              
+        
+        name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
+        for ind_page in range_pages:
+            if flag_compute or not os.path.isfile(name_tar):             
+                imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error = \
+                    self._plot_generic_open(ind_page, suffix_xml, level_proc = 7,
+                                            name_outxml = self.name_outxml)
+            else:                    
+                imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_enrich, bbox_page, flag_error \
+                    = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)                  
+            
+            if flag_error:
+                print(str(ind_page) + ': non existing page!')
+            else:            
+                im_met = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page, flag_lines_textl)
+
+                self._plot_save(im_met, 'XML corrected', 'XMLcorrect', ind_page, self.path_file,
+                       flag_plot, flag_save_figs)
+        
+    def _plot_save(self, im_met, str_title, str_name, ind_page, folder_save = '',
+                   flag_plot = 1, flag_save_figs = 0, dpi = 200):
+        if flag_plot:
+            fig, axes = plt.subplots(1, 1, figsize=(8, 10))
+            axes.axis('off')
+            axes.imshow(im_met) 
+            plt.title(str_title)
+        if flag_save_figs:
+            format_fig = 'png'
+            name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) 
+                        + '_page' + str(ind_page) + '.' + format_fig)
+            fig.savefig(name_fig, format = format_fig, dpi = dpi)
+            plt.close(fig)       
+    
+    def check_discussion(self):
+        utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
+        flag_discussion = utils_annot.check_if_discussion(self.name_meta[1])
+        command = 'rm -rf ./' + str(self.year)
+        #print(command)
+        utils_proc.call_with_out(command)
+
+        return flag_discussion
+
+    def _plot_save_labels(self, im_met, str_title, str_name, ind_page, groups, colors, folder_save = '',
+                           flag_plot = 1, flag_save_figs = 0, flag_legend = 1, dpi = 200):
+        #print(groups)
+        if flag_legend:
+            in_coord = 0
+            coords = in_coord + np.array([0, 0, 10, 10])
+            inc_page = 20
+            flag_notinto = 1
+            for ind_g, i_g in enumerate(groups):
+                if ind_g >= int(len(groups)/2) and flag_notinto:
+                    flag_notinto = 0
+                    coords[0] = in_coord
+                    coords[1] += int(im_met.shape[1]/1.5)
+                    coords[2] = in_coord + 10
+                    coords[3] += int(im_met.shape[1]/1.5)                    
+                im_met = plot_tools.lines_box(im_met, coords, colors[ind_g], thick_line = 6)        
+                coords[0] += inc_page
+                coords[2] += inc_page
+        
+        if flag_plot:
+            fig, axes = plt.subplots(1, 1, figsize=(8, 10))
+            axes.axis('off')
+            axes.imshow(im_met) 
+            plt.title(str_title)
+        
+        if flag_legend:
+            coords = in_coord + np.array([0, 0, 10, 10])
+            flag_notinto = 1
+            for ind_g, i_g in enumerate(groups):
+                if ind_g >= int(len(groups)/2) and flag_notinto:
+                    flag_notinto = 0
+                    coords[0] = in_coord
+                    coords[1] += int(im_met.shape[1]/1.5)
+                    coords[2] = in_coord + 10
+                    coords[3] += int(im_met.shape[1]/1.5)                    
+                plt.text(coords[1] + 10, coords[2], i_g, fontsize = 10, va = 'bottom', ha = 'left')      
+                coords[0] += inc_page
+                coords[2] += inc_page   
+        
+        if flag_save_figs:
+            format_fig = 'png'
+            name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) 
+                        + '_page' + str(ind_page) + '.' + format_fig)
+            fig.savefig(name_fig, format = format_fig, dpi = dpi)
+            plt.close(fig)             
+            
+        
+                           
\ No newline at end of file
diff --git a/src/python/utils_proc.py b/src/python/utils_proc.py
index b222bbe176471ff507d02453d37fa3083ebd9648..62e74547d3d1802fd357f84d812b9e9c28e1c630 100644
--- a/src/python/utils_proc.py
+++ b/src/python/utils_proc.py
@@ -1,3 +1,208 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d39e3bdb2d4572a0bb7f488d038795c975fe9db4d2873cf36d19f1bf7a87c6ae
-size 7891
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 28 13:44:58 2018
+
+@author: luissalamanca
+"""
+
+# Just some useful various functions
+
+import os
+import numpy as np
+import git
+import subprocess
+import tarfile
+from pdf2image import convert_from_path, convert_from_bytes
+import re
+import xml.etree.ElementTree as ET
+
+#git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
+#git_root = git_repo.git.rev_parse("--show-toplevel")
+
+PATH_PDF2TXT = 'pdf2txt.py'
+#print(PATH_PDF2TXT)
+#PATH_PDF2TXT = "/Users/luissalamanca/anaconda3/envs/py36/bin/pdf2txt.py"
+HEIGHT_CHAR = 12
+WIDTH_CHAR = 6
+
+
def call_with_out(full_comm):
    """Run *full_comm* through the shell and return its captured stdout (bytes).

    NOTE(review): shell=True with a string command is intentional — callers
    build complete shell command lines (tar, cp, rm, pdf2txt invocations).
    Never pass untrusted input here.
    """
    p = subprocess.Popen(full_comm, stdout=subprocess.PIPE, shell=True)
    # communicate() already waits for the process to terminate; the original
    # extra p.wait() was redundant and its status was discarded anyway.
    (output, err) = p.communicate()
    return output
+ 
+# This is a bit cumbersome, but wand.image is not importing the pdf from Spyder, 
+# but it is working from the terminal. Thus, I just run function 
+def pdf2png(input_file, res = 300, output_file = '', output_path = ''):    
+    if len(output_file) == 0:
+        output_file = os.path.split(os.path.splitext(input_file)[0])[1]
+    if len(output_path) == 0:
+        output_path = os.path.split(os.path.splitext(input_file)[0])[0]        
+    convert_from_path(input_file, dpi = res, output_folder = output_path, fmt = 'png')
+
def pdf2xml(input_file, output_file = '', output_path = '', fmt = 'xml', suffix_str = '_data', 
            page_n = 1, flag_end = 0):
    """Convert selected pages of a PDF to xml (or text/html) via pdf2txt.py.

    Parameters
    ----------
    input_file : path of the PDF to convert.
    output_file, output_path : destination name/dir; default to the input's own.
    fmt : output format ('xml', 'text', 'html'); 'text' maps to a .txt extension.
    page_n : array of page numbers to convert.
        NOTE(review): despite the scalar default, callers pass a numpy array
        (``page_n[0]`` would fail on a plain int) — confirm before changing.
    flag_end : selects the pdf2txt invocation style (direct vs. via python).

    Returns the path of the generated output file.
    """
    if len(output_file) == 0:
        output_file = os.path.split(os.path.splitext(input_file)[0])[1]
    if len(output_path) == 0:
        output_path = os.path.split(os.path.splitext(input_file)[0])[0] 
    ext = fmt
    if fmt == 'text':
        ext = 'txt'  

    # BUGFIX: the original loop variable was named `str`, shadowing the
    # builtin; build the comma-separated page list with a join instead.
    string_nums = ','.join(np.array2string(num) for num in page_n)
    name_out = output_path + "/" + output_file + suffix_str + "." + ext

    if flag_end:
        full_comm = PATH_PDF2TXT + " -o " \
            + name_out + " -t " + fmt + " -p " \
            + string_nums + " " + input_file   
    else:
        full_comm = "python " + PATH_PDF2TXT + " " + input_file + " -o " \
                    + name_out + " -t " + fmt + " -p " \
                    + string_nums

    call_with_out(full_comm)
    return name_out
+
def get_list(year, folder_database, name_file):
    """List file members and unique document ids in <year>/<name_file>.tar.gz.

    Returns
    -------
    files : list of str
        Member paths of every regular file in the archive.
    list_ids : list of str
        Sorted unique document ids (member basename without extension).
    """
    name_tar = folder_database + '/' + str(year) + '/' + name_file + '.tar.gz'
    with tarfile.open(name_tar) as tar:
        # BUGFIX/robustness: read TarInfo.name directly instead of parsing the
        # repr string with split(' ') (which broke on member names containing
        # spaces). The redundant tar.close() inside the `with` is gone too.
        files = [tarinfo.name for tarinfo in tar.getmembers() if tarinfo.isfile()]
    list_ids = [f.split('/')[-1].split('.')[0] for f in files]
    list_ids = np.unique(np.array(list_ids)).tolist()
    return files, list_ids
+   
def get_handlerfile(input_file, folder_database, name_file):
    """Return a file-like handle to *input_file* inside the year's tar archive.

    The year is taken from the first path component of *input_file*; a
    missing './' prefix is added before the archive lookup.
    """
    if input_file.startswith('./'):
        year = input_file.split('/')[1]
    else:
        year = input_file.split('/')[0]
        input_file = './' + input_file
    name_tar = folder_database + '/' + str(year) + '/' + name_file + '.tar.gz'
    # The archive handle is deliberately left open: closing it would
    # invalidate the member handle returned to the caller.
    return tarfile.open(name_tar).extractfile(input_file)
+    
def tar_extractfile(input_file, folder_database, name_file):
    """Extract *input_file* from the year's archive and return its path.

    NOTE(review): extraction happens relative to the current working
    directory (callers clean up with ``rm -rf <year>`` afterwards).
    """
    if input_file[:2] != './':
        year = input_file.split('/')[0]  
        input_file = './' + input_file 
    else:
        year = input_file.split('/')[1] 
    with tarfile.open(folder_database + '/' + str(year) +'/' + name_file + '.tar.gz') as tar:
        # The `with` block closes the archive; the original's explicit
        # tar.close() inside it was redundant.
        tar.extract(input_file)
    return input_file
+    
def addto_tar(input_file, folder_database, name_file):
    """Add *input_file* (path relative to the CWD) to the year's tar archive.

    If the member already exists, every other member is extracted to the CWD
    and the archive is rebuilt so the on-disk *input_file* replaces the old
    version. Returns the archive path.

    NOTE(review): modes "w"/"a" write an *uncompressed* tar despite the
    .gz name ("w:gz" may have been intended) — kept as-is to preserve the
    on-disk format the rest of the pipeline expects; confirm before changing.
    """
    if input_file[:2] != './':
        year = input_file.split('/')[0]  
        input_file = './' + input_file 
    else:
        year = input_file.split('/')[1] 
    name_tar = folder_database + '/' + str(year) +'/' + name_file + '.tar.gz'  
    if os.path.isfile(name_tar):
        list_files_intar = get_list(year, folder_database, name_file)[0]
        if input_file in list_files_intar:
            # Member exists: extract everything else, then rewrite the whole
            # archive (input_file itself must already be present in the CWD).
            for in_f in list_files_intar:
                if in_f != input_file:
                    tar_extractfile(in_f, folder_database, name_file)
            # Context managers close the handle even if add() raises; the
            # original left the archive open on error.
            with tarfile.open(name_tar, mode="w") as tf:
                for in_f in list_files_intar:
                    tf.add(in_f)
        else:
            with tarfile.open(name_tar, mode="a") as tf:
                tf.add(input_file)
    else:
        with tarfile.open(name_tar, mode="w") as tf:
            tf.add(input_file)
    return name_tar
+
def compress_tar(infile, outname = ''):
    """Re-compress *infile*: unpack into the CWD, repack as gzip, clean up.

    Equivalent shell sequence:
        tar -xf <infile>
        tar -czvf <outname> ./<year>/
        rm -rf <year>
    *outname* defaults to *infile* (in-place recompression).
    """
    if not outname:
        outname = infile

    # The year directory name is the parent folder of the archive path.
    year = infile.split('/')[-2]
    call_with_out('tar -xf ' + infile)
    call_with_out('tar -czvf ' + outname + ' ./' + str(year) + '/')
    call_with_out('rm -rf ' + str(year))
+    
+
def correct_metadata(year, id_doc, flag_end):
    """Enrich a document's metadata XML with fields scraped from its .db file.

    The .db companion file is aliased to a .pdf (pdf2txt only accepts PDF
    input), converted to XML, and for each keyword the textline vertically
    closest to the keyword's own line is taken as its value. The augmented
    XML is written to <year>/<id_doc>_metacorr.xml and its path returned.
    """
    keywords = ('In','Jahr','Band','Session','Rat','Sitzung','Geschäftsnummer',
                'Datum','Seite','Ref. No')
    max_sep = 6  # max vertical distance between a keyword line and its value line
    
    full_path = str(year) + '/' + str(id_doc)
    command = 'cp ' + full_path + '.db ' + full_path + '_db.pdf'
    call_with_out(command)
    name_xml = pdf2xml(full_path + '_db.pdf', page_n = np.array([1]), suffix_str = '', flag_end = flag_end)
    
    XML_tree = ET.parse(name_xml)
    XML_root = XML_tree.getroot()

    # Collect every textline: 4 bbox coords plus (element, line) indices
    # stacked column-wise, and the concatenated character text per line.
    coord_textline = np.array([]).reshape((6,0))
    all_text_list = list()
    for ind_el in range(len(XML_root[0])):
        for ind_line in range(len(XML_root[0][ind_el])):
            if XML_root[0][ind_el][ind_line].tag == 'textline':
                coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64)
                coord_textline_aux = np.concatenate((coord_textline_aux, np.array([ind_el,ind_line])))
                coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((6,1))), axis = 1)
                text_str = ''
                aux_str = text_str.join([XML_root[0][ind_el][ind_line][ind_t].text for ind_t in range(len(XML_root[0][ind_el][ind_line]))])
                all_text_list.append(aux_str)
                
    xml_extrameta = ET.Element('META_FROM_DB')
    for keyw in keywords:
        # First textline that is exactly the keyword (pdf2txt appends '\n').
        ind_textl = np.min(np.argwhere(np.array([all_text_list]) == keyw + '\n')[:,1])
        coord_r = coord_textline[1,ind_textl]
        distance = abs(coord_textline[1,:] - coord_r)
        # BUGFIX: test the candidate set for emptiness BEFORE indexing. The
        # original took [0] first (IndexError when no line is close enough)
        # and then checked a scalar's .size, which is always 1, leaving the
        # empty-value branch unreachable.
        candidates = np.setdiff1d(np.argwhere(distance < max_sep), ind_textl)
        if candidates.size:
            xml_extrameta.attrib[keyw.replace('. ','').upper()] = all_text_list[candidates[0]][:-1]
        else:
            xml_extrameta.attrib[keyw.replace('. ','').upper()] = ''
            
    path_xml_meta = full_path + '.xml'
    XML_tree_meta = ET.parse(path_xml_meta)
    XML_root_meta = XML_tree_meta.getroot()
    XML_root_meta[0].append(xml_extrameta)
    
    tree = ET.ElementTree(XML_root_meta)
    tree.write(full_path + '_metacorr.xml', encoding = 'utf-8')
    return full_path + '_metacorr.xml'
\ No newline at end of file