From b100be6d5813bb06f2bed1ba03f5fc3c30005ce0 Mon Sep 17 00:00:00 2001
From: Lilian Gasser <gasserli@ethz.ch>
Date: Thu, 6 Dec 2018 15:51:00 +0100
Subject: [PATCH] WIP again

---
 src/python/def_classes.py             |  50 ++++
 src/python/preproc_docs.py            | 390 +++++++++++++-------------
 src/python/run_extract_discussions.py |   8 +-
 src/python/utils_annot.py             |  52 ++++
 4 files changed, 302 insertions(+), 198 deletions(-)

diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index f90a20a6..30917e30 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -48,6 +48,7 @@ class Document:
         self.name_wo_ext = os.path.splitext(self.name_file)[0]
         self.folder_database = folder_database
         self._meta_ext()
+        self._xml_ext()
 
     def _meta_ext(self):
     # Both for the correction and the extraction of the metadata information
@@ -56,6 +57,13 @@ class Document:
         name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz'
         self.name_meta = [name_tar, name_file, name_file_db]
 
+    def _xml_ext(self, suffix_xml = '_data', name_outcorrxml = '04_correctedxml'):
+    # For the extraction, correction and annotation of the xmls
+    # TODO for extraction and annotation
+        name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml'
+        name_tar = self.folder_database + str(self.year) + '/' + name_outcorrxml + '.tar.gz'
+        self.name_xml_corr = [name_tar, name_xml]
+
     def meta_correct(self, name_outmeta = '03_correctedmeta'):
         utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
         utils_proc.tar_extractfile(self.name_meta[2], self.folder_database, name_file = self.name_inmeta)
@@ -309,6 +317,7 @@ class Document:
         
         self.name_outcorrxml = name_outcorrxml
         self.name_xml_corr = [name_tar, name_xml]
+        self._xml_ext(suffix_xml, self.name_outcorrxml)
         command = 'rm -rf ./' + str(self.year)
         #print(command)
         utils_proc.call_with_out(command)
@@ -684,6 +693,47 @@ class Document:
     def check_discussion(self):
         utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
         flag_discussion = utils_annot.check_if_discussion(self.name_meta[1])
+        command = 'rm -rf ./' + str(self.year)
+        #print(command)
+        utils_proc.call_with_out(command)
+
+        return flag_discussion
+
+
+
+    def annotate_speakers(self, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', name_outannotxml='05_annotatedxml'):
+
+        if 'name_outcorrxml' not in self.__dict__.keys():
+            self.name_outcorrxml = name_outcorrxml
+
+        if 'name_outxml' not in self.__dict__.keys():
+            self.name_outxml = name_outxml
+
+        if 'XML_main_corr' not in self.__dict__.keys():
+            name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
+            if os.path.isfile(name_tar):
+                name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml'
+                if name_xml in utils_proc.get_list(self.year, self.folder_database, self.name_outcorrxml)[0]:
+                    h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, self.name_outcorrxml)
+                    XML_tree = ET.parse(h_xml)
+                    self.XML_main_corr = XML_tree.getroot()
+                else:
+                    self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0,
+                        pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml,
+                        name_outcorrxml = self.name_outcorrxml)
+            else:
+                # TODO if already exists 02_extractedxml
+                self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0,
+                    pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml,
+                    name_outcorrxml = self.name_outcorrxml)
+
+        #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
+        XML_corr = utils_annot.get_text_corrected(self.XML_main_corr)
+
+
+
+
+
         command = 'rm -rf ./' + str(self.year)
         #print(command)
         utils_proc.call_with_out(command)
diff --git a/src/python/preproc_docs.py b/src/python/preproc_docs.py
index 8b4349f0..b9c9bbb0 100644
--- a/src/python/preproc_docs.py
+++ b/src/python/preproc_docs.py
@@ -6,7 +6,7 @@ Created on Fri Sep 28 13:39:10 2018
 @author: luissalamanca
 """
 
-# File for all the functions used for preprocessing. 
+# File for all the functions used for preprocessing.
 
 import numpy as np
 import os
@@ -46,42 +46,42 @@ import tables
 HEIGHT_CHAR = 12
 WIDTH_CHAR = 6
 
-def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, 
+def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
                                  flag_2col, flag_central = 1):
     # Using the coordinates of the boxes, we put the rest to 0, and then estimate
     # the central line
-    # Here, since we use the image, we have to rely again on a ref00 in topleft, and 
+    # Here, since we use the image, we have to rely again on a ref00 in topleft, and
     # the corners in topleftbottomright
-    # We also look for horizontal lines 
+    # We also look for horizontal lines
     # We assume that we will only have one vertical line, and then many horizontal
     # lines, either spanning the whole image, or at both sides of the central line
-            
+
     coord, rescale_factor = adapt_coordtoimg(img, coord, dim_bbox_page)
     img_aux = np.abs(255 - img[:,:,0])
     img_aux[img_aux < 20] = 0
     img_aux[img_aux >= 20] = 255
     img_aux_in = np.copy(img_aux)
-    
-    
+
+
     width_resc = WIDTH_CHAR * rescale_factor[0,1]
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     gap_central = int(4 * width_resc)
     top_bbox_red = 0 #int(height_resc/2)
-    
+
     for ind in range(coord.shape[1]):
         img_aux[(coord[0,ind] + top_bbox_red):coord[2,ind],coord[1,ind]:coord[3,ind]] = 0
-    
+
     # Also remove possible mark and artefacts in the edges
     img_aux[:,:int(img_aux.shape[1]/20)] = 0
     img_aux[:int(img_aux.shape[0]/20),:] = 0
     img_aux[int(19 * img_aux.shape[0]/20):,:] = 0
-    img_aux[:,int(19 * img_aux.shape[1]/20):] = 0    
-    
+    img_aux[:,int(19 * img_aux.shape[1]/20):] = 0
+
     img_prev = np.copy(img_aux)
-    
+
     img_aux_rem = remove_small_objects(label(img_aux), 2 * width_resc)
     #img_aux = dilation(img_aux_rem, selem = np.ones((11,11)))
-    img_aux = dilation(img_aux_rem, selem = np.ones((5,5))) 
+    img_aux = dilation(img_aux_rem, selem = np.ones((5,5)))
     max_val = np.max(img_aux)
     if max_val > 0:
         img_aux_norm = (255 * img_aux/max_val).astype(np.uint8)
@@ -90,13 +90,13 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
     else:
         img_aux[:] = 0
     #print(np.unique(img_aux))
-    
+
     # Remove big objects, like the shields and other logos
     #img_label = label(img_aux)
 
     edges = canny(img_aux, 2, 1, 25)
     #img_cent = np.copy(img_aux)
-    
+
     if flag_2col:
         if flag_central:
             img_cent = np.copy(img_prev)
@@ -108,37 +108,37 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
             #lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = 2 * width_resc,
             #                                 line_gap = width_resc)
             lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = int(2 * width_resc),
-                                             line_gap = int(width_resc))            
+                                             line_gap = int(width_resc))
         else:
             sum_img_aux_in = np.sum(img_aux_in, axis = 0)
             sum_img_aux_in = sum_img_aux_in[int(2*img_aux.shape[1]/5):int(3*img_aux.shape[1]/5)]
-            
+
             #plt.plot(sum_img_aux_in)
             #sum_img_aux_in[sum_img_aux_in < np.max(sum_img_aux_in)/10] = 0
             # We need to substract the baseline value, in order to account for
             # central headers and stuff like that
             sum_img_aux_in = sum_img_aux_in - np.min(sum_img_aux_in)
-            #not_end_vect = 1 
+            #not_end_vect = 1
             #while not_end_vect:
             ind_min_start = np.argwhere((sum_img_aux_in) < np.mean(sum_img_aux_in)/10)
-            ind_min_end = int(2*img_aux.shape[1]/5) + np.max(ind_min_start)   
+            ind_min_end = int(2*img_aux.shape[1]/5) + np.max(ind_min_start)
             ind_min_start = int(2*img_aux.shape[1]/5) + np.min(ind_min_start)
             ind_central = int((ind_min_start + ind_min_end)/2)
-            coord_vert_def = np.array([1, ind_central - int(width_resc/2), 
+            coord_vert_def = np.array([1, ind_central - int(width_resc/2),
                                    img_aux_in.shape[0], ind_central + int(width_resc/2)])
-            #print(lines_vert,img_aux.shape)  
-        
-    theta = np.linspace(-5*pi/8, -3* pi/8,num = 90)    
-    #theta = np.linspace(-9*pi/16, -7*pi/16,num = 90)    
+            #print(lines_vert,img_aux.shape)
+
+    theta = np.linspace(-5*pi/8, -3* pi/8,num = 90)
+    #theta = np.linspace(-9*pi/16, -7*pi/16,num = 90)
     #lines_horz = probabilistic_hough_line(edges, theta = theta, line_length = 2 * width_resc,
-    #                                 line_gap = width_resc)      
+    #                                 line_gap = width_resc)
     lines_horz = probabilistic_hough_line(edges, theta = theta, line_length = int(2 * width_resc),
-                                     line_gap = int(width_resc))  
-        
-    # These lines are given in a standard xy coordinate, with the corner in the 
-    # bottom left    
+                                     line_gap = int(width_resc))
+
+    # These lines are given in a standard xy coordinate, with the corner in the
+    # bottom left
     lines_horz = np.transpose(np.asarray(lines_horz).reshape((len(lines_horz),4)))
-    
+
 
     lines_horz = np.concatenate((np.minimum(lines_horz[1,:],lines_horz[3,:]).reshape((1,lines_horz.shape[1])),
                                  np.minimum(lines_horz[0,:],lines_horz[2,:]).reshape((1,lines_horz.shape[1])),
@@ -150,15 +150,15 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
                                      np.minimum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])),
                                      np.maximum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])),
                                      np.maximum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])))).astype(np.int32)
-        
-    
+
+
     #lines_horz = transform_coord(lines_horz, dim_page = img_aux.shape, invert_xy = True)
-    #lines_vert = transform_coord(lines_vert, dim_page = img_aux.shape, invert_xy = True)      
-      
+    #lines_vert = transform_coord(lines_vert, dim_page = img_aux.shape, invert_xy = True)
+
     # First clean the vertical from unexpected outliers
     if flag_central:
-        sum_rows = np.sum(img_cent, axis = 0)/255    
-        ind_central = int(2*img.shape[1]/5) + np.argmax(sum_rows[int(2*img.shape[1]/5):int(3*img.shape[1]/5)]) 
+        sum_rows = np.sum(img_cent, axis = 0)/255
+        ind_central = int(2*img.shape[1]/5) + np.argmax(sum_rows[int(2*img.shape[1]/5):int(3*img.shape[1]/5)])
         ind_valid = np.intersect1d(np.argwhere([(ind_central - gap_central) < aux_l1 < (ind_central + gap_central) for aux_l1 in lines_vert[1,:]]),
                                     np.argwhere([(ind_central - gap_central) < aux_l2 < (ind_central + gap_central) for aux_l2 in lines_vert[3,:]]))
         if len(ind_valid):
@@ -169,19 +169,19 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
             coord_vert_def = np.array([0, img_aux.shape[1]/2 - width_resc, height_resc, img_aux.shape[1]/2 + width_resc])
 
     #ind_central = np.mean(coord_vert_def[[1,3]])
-    
+
     # And now, just iterate over the horizontal lines, merging them if required.
     return clean_horz_vert_lines(lines_horz, coord_vert_def, width_resc, height_resc,
                                  ind_central, gap_central, img_aux.shape)
-    
-    
+
+
 def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_resc,
                           ind_central, gap_central, dim_page):
     # We just iterate over all the horizontal lines, merging them if required
     coord_horz = np.array([]).reshape((4,0)).astype(np.int32)
     min_length_line = 2 * width_resc
-    
-    while coord_horz_pre.size > 3:        
+
+    while coord_horz_pre.size > 3:
         if coord_horz_pre.shape[1] == 1:
             coord_horz = np.concatenate((coord_horz, coord_horz_pre[:,0].reshape((4,1))), axis = 1)
             coord_horz_pre = np.array([])
@@ -190,33 +190,33 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res
             #print(coord_horz_curr)
             coord_horz_check = coord_horz_pre[:,1:]
             flag_stay = 1
-            while flag_stay:                
+            while flag_stay:
                 # Boxes to the right
                 ind_val1 = np.intersect1d(np.argwhere((abs(coord_horz_check[1,:] - coord_horz_curr[3]) < (width_resc * 10))),
                                          np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc))))
                 # Boxes to the left
                 ind_val2 = np.intersect1d(np.argwhere((abs(coord_horz_check[3,:] - coord_horz_curr[1]) < (width_resc * 10))),
-                                         np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc))))  
-                                
+                                         np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc))))
+
                 ind_val = np.unique(np.concatenate((ind_val1,ind_val2)))
                 if len(ind_val) > 0:
                     for i_b in range(len(ind_val)):
-                        coord_horz_curr = np.array([np.min((coord_horz_curr[0],coord_horz_check[0,ind_val[i_b]])), 
+                        coord_horz_curr = np.array([np.min((coord_horz_curr[0],coord_horz_check[0,ind_val[i_b]])),
                                                     np.min((coord_horz_curr[1],coord_horz_check[1,ind_val[i_b]])),
-                                                    np.max((coord_horz_curr[2],coord_horz_check[2,ind_val[i_b]])), 
+                                                    np.max((coord_horz_curr[2],coord_horz_check[2,ind_val[i_b]])),
                                                     np.max((coord_horz_curr[3],coord_horz_check[3,ind_val[i_b]]))])
                     coord_horz_check = coord_horz_check[:,np.setdiff1d(np.arange(coord_horz_check.shape[1]),
                                                                        ind_val)]
-                    #coord_horz_check = np.delete(coord_horz_check, ind_val, 1)    
+                    #coord_horz_check = np.delete(coord_horz_check, ind_val, 1)
                     if coord_horz_check.shape[1] == 0:
                         flag_stay = 0
                         coord_horz = np.concatenate((coord_horz, coord_horz_curr.reshape((4,1))), axis = 1)
                         coord_horz_pre = np.array([])
-                else: 
+                else:
                     flag_stay = 0
                     coord_horz = np.concatenate((coord_horz, coord_horz_curr.reshape((4,1))), axis = 1)
                     coord_horz_pre = coord_horz_check[:,:]
-    
+
     # Remove overlapping boxes
     coord_horz_def = np.array([]).reshape((4,0))
     while coord_horz.size > 3:
@@ -226,15 +226,15 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res
                                               np.argwhere((width_resc/2 + coord_horz_curr[2]) > coord_horz[2,:]),
                                               np.argwhere((width_resc/2 + coord_horz_curr[3]) > coord_horz[3,:])))
         ind_overlap = np.setdiff1d(ind_overlap,0)
-        
+
         coord_horz_def = np.concatenate((coord_horz_def, coord_horz_curr.reshape((4,1))), axis = 1)
         coord_horz = coord_horz[:,np.setdiff1d(np.arange(1,coord_horz.shape[1]),ind_overlap)]
         #coord_horz = np.delete(coord_horz, ind_overlap, 1)
-        
+
         if coord_horz.size == 4:
             coord_horz_def = np.concatenate((coord_horz_def, coord_horz.reshape((4,1))), axis = 1)
             coord_horz = np.array([0])
-    
+
     ind_val_long = np.argwhere((coord_horz_def[3,:] - coord_horz_def[1,:]) > (3 * (coord_horz_def[2,:] - coord_horz_def[0,:])))
     coord_horz_def = coord_horz_def[:,ind_val_long].reshape((4,ind_val_long.shape[0]))
 
@@ -245,35 +245,35 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res
         coord_horz_def = coord_horz_def[:, ind_val].reshape((4,ind_val.shape[0]))
     else:
         coord_horz_def = np.array([]).reshape((4,0))
-    
-   
+
+
     # To identify the topline
     '''
     ind_topline = identify_topline(coord_horz_def, width_resc, dim_page)
-    if str_page == 'firsts':                               
+    if str_page == 'firsts':
         # We correct the top of the vertical line in case it is cutting some of the horizontal lines
         ind_val_horz = reduce(np.intersect1d, (np.argwhere(coord_horz_def[1,:] < (ind_central - gap_central)),
                                                np.argwhere(coord_horz_def[3,:] > (ind_central + gap_central)),
                                                np.argwhere(coord_horz_def[0,:] > coord_vert_def[0])))
         ind_val_horz = np.setdiff1d(ind_val_horz, ind_topline)
-        
+
         coord_vert_def = np.array([np.max(np.concatenate((np.array([coord_vert_def[0]]),coord_horz_def[2,ind_val_horz]))),coord_vert_def[1],
-                                   coord_vert_def[2],coord_vert_def[3]])  
+                                   coord_vert_def[2],coord_vert_def[3]])
     elif str_page == 'lasts':
-        # We correct the bottom of the vertical line in case it is cutting some of the horizontal lines        
+        # We correct the bottom of the vertical line in case it is cutting some of the horizontal lines
         ind_val_horz = reduce(np.intersect1d, (np.argwhere(coord_horz_def[1,:] < (ind_central - gap_central)),
                                                np.argwhere(coord_horz_def[3,:] > (ind_central + gap_central)),
-                                               np.argwhere(coord_horz_def[2,:] < coord_vert_def[2])))        
+                                               np.argwhere(coord_horz_def[2,:] < coord_vert_def[2])))
         ind_val_horz = np.setdiff1d(ind_val_horz, ind_topline)
-        
+
         coord_vert_def = np.array([coord_vert_def[0],coord_vert_def[1],
-                                   np.min(np.concatenate((np.array([coord_vert_def[2]]),coord_horz_def[0,ind_val_horz]))),coord_vert_def[3]])  
+                                   np.min(np.concatenate((np.array([coord_vert_def[2]]),coord_horz_def[0,ind_val_horz]))),coord_vert_def[3]])
     '''
-    
+
     coord_vert_def[1] = np.max((coord_vert_def[1], int(ind_central - width_resc)))
     coord_vert_def[3] = np.min((coord_vert_def[3], int(ind_central + width_resc)))
-    
-    # Finally, remove short central lines, likely artefacts of the calculation 
+
+    # Finally, remove short central lines, likely artefacts of the calculation
     # of the central vertical line
     length_lines = coord_horz_def[3,:] - coord_horz_def[1,:]
     ind_wrong = reduce(np.intersect1d, (np.argwhere(length_lines < 2* min_length_line),
@@ -283,98 +283,98 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res
     if len(ind_val):
         coord_horz_def = coord_horz_def[:, ind_val].reshape((4,ind_val.shape[0]))
     else:
-        coord_horz_def = np.array([]).reshape((4,0))    
-    
-    return coord_vert_def, coord_horz_def  
+        coord_horz_def = np.array([]).reshape((4,0))
+
+    return coord_vert_def, coord_horz_def
 
 def identify_topline(coord_horz, width_resc, dim_page):
     # Two rules for identifying the top line
     ind_topline = reduce(np.intersect1d, (np.argwhere(coord_horz[2,:] < dim_page[0]/8),
                                                np.argwhere((coord_horz[3,:] - coord_horz[1,:]) > width_resc * 60)))
-    
+
     return ind_topline
 
 def lateral_margins(img, dim_bbox_page, coord_vert, coord_horz):
-    
+
     coord, rescale_factor = adapt_coordtoimg(img, dim_bbox_page, dim_bbox_page)
     width_resc = WIDTH_CHAR * rescale_factor[0,1]
     gap_central = int(3 * width_resc)
     thres_margin = 0.1
-    
+
     img_aux = np.abs(255 - img[:,:,0])
     for ind in range(coord_horz.shape[1]):
         img_aux[coord_horz[0,ind]:coord_horz[2,ind],coord_horz[1,ind]:coord_horz[3,ind]] = 0
-    
+
     img_aux[coord_vert[0]:coord_vert[2],coord_vert[1]:coord_vert[3]] = 0
     central_line = (coord_vert[1] + coord_vert[3])/2
-    
+
     # Also remove possible mark and artefacts in the edges
     img_aux[:,:gap_central] = 0
     img_aux[:int(gap_central/2),:] = 0
     img_aux[(img_aux.shape[1] - gap_central):,:] = 0
-    img_aux[:,(img_aux.shape[1] - int(gap_central/2)):] = 0 
-    
+    img_aux[:,(img_aux.shape[1] - int(gap_central/2)):] = 0
+
     sum_imarray_aux = np.sum(img_aux, axis = 0)
     sum_imarray_aux = 1000*sum_imarray_aux.astype(np.float64)/np.max(sum_imarray_aux)
     mean_val_rows_left = np.mean(sum_imarray_aux[:int(central_line - gap_central)])
     mean_val_rows_right = np.mean(sum_imarray_aux[int(central_line + gap_central):])
-        
+
     left_margin = np.min(np.argwhere(sum_imarray_aux > thres_margin * mean_val_rows_left))
     right_margin = np.max(np.argwhere(sum_imarray_aux > thres_margin * mean_val_rows_right))
-    
+
     return left_margin, right_margin, left_margin/rescale_factor[0,1], right_margin/rescale_factor[0,1]
 
 def bottomtop_margins(img, dim_bbox_page, coord_vert, coord_horz):
 
     val_thres = 300 # In this case we don't use the mean of sum_cols because we have
-                    
+
     coord, rescale_factor = adapt_coordtoimg(img, dim_bbox_page, dim_bbox_page)
     img_aux = np.abs(255 - img[:,:,0])
-    
+
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     width_resc = WIDTH_CHAR * rescale_factor[0,1]
     gap_central = int(3 * width_resc)
-        
+
     for ind in range(coord_horz.shape[1]):
         img_aux[coord_horz[0,ind]:coord_horz[2,ind],coord_horz[1,ind]:coord_horz[3,ind]] = 0
-    
+
     img_aux[coord_vert[0]:coord_vert[2],coord_vert[1]:coord_vert[3]] = 0
-    
+
     sum_cols = np.sum(img_aux, axis = 1)/255
     sum_cols = 1000 * sum_cols/np.max(sum_cols)
-    
+
     # Now, limit by using the horizontal lines
     ind_topline = identify_topline(coord_horz, width_resc, img_aux.shape)
-    
+
     if len(ind_topline) > 0:
         ind_min_textbox = np.max(coord_horz[2,ind_topline])
         sum_cols[:ind_min_textbox] = 0
-        
+
     #plt.figure()
     #plt.plot(sum_cols)
-    ind_limits = np.array([np.min(np.argwhere(sum_cols > val_thres)), 
+    ind_limits = np.array([np.min(np.argwhere(sum_cols > val_thres)),
                            np.max(np.argwhere(sum_cols > val_thres))])
-    
-    return ind_limits  
+
+    return ind_limits
 
 def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_font_sizes):
-    
+
     # In xml_page the levels are: xml_page[i][j][k], i for blocks, j for textlines
     # and k for characters
-    
+
     coord, rescale_factor = adapt_coordtoimg(img, bbox_page, bbox_page)
     list_coords_blocks = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_page[:-2]]).astype(np.float64))
     list_coords_blocks, rescale_factor = adapt_coordtoimg(img, list_coords_blocks, bbox_page)
-    
+
     font_main_block = info_font_sizes[0, np.argmax(info_font_sizes[1,:])]
     thres_font = font_main_block/5 # To compensate for error in the fontsize between columns
     width_resc = WIDTH_CHAR * rescale_factor[0,1]
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     gap_central = int(2 * width_resc)
     indentation = int(4 * width_resc)
-    
+
     ind_central = (coord_vert_def[3] + coord_vert_def[1])/2
-    
+
     # First pass just to discover main blocks
     list_col1 = list()
     list_col2 = list()
@@ -383,25 +383,25 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
     all_mean_heights = np.array([]).reshape((1,0))
     list_allcoords_textlines = np.array([]).reshape((4,0))
     relative_ref_textline = np.array([], dtype = np.uint32).reshape((3,0))
-    
+
     count_text = 0
-    
+
     for ind_block in range(len(xml_page)-2):
         xml_block = xml_page[ind_block]
         list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:]
                                                       if 'bbox' in o.attrib]).astype(np.float64))
-        #list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:]]).astype(np.float64))    
+        #list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:]]).astype(np.float64))
         if len(list_coords_textline)>3:
             list_coords_textline
             list_coords_textline, rescale_factor = adapt_coordtoimg(img, list_coords_textline, bbox_page)
             list_allcoords_textlines = np.concatenate((list_allcoords_textlines, list_coords_textline), axis = 1)
             relative_ref_textline_aux = np.zeros((3,list_coords_textline.shape[1]))
-            
+
             relative_ref_textline_aux[0,:] = count_text + np.arange(list_coords_textline.shape[1])
             relative_ref_textline_aux[1,:] = ind_block
             relative_ref_textline_aux[2,:] = np.arange(list_coords_textline.shape[1])
             relative_ref_textline = np.concatenate((relative_ref_textline,relative_ref_textline_aux.astype(np.uint32)), axis = 1)
-            
+
             for ind_textl in range(list_coords_textline.shape[1]):
                 all_heights = np.array([])
                 xml_textline = xml_block[ind_textl]
@@ -412,10 +412,10 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
                             all_heights = np.append(all_heights, float(xml_text.attrib['size']))
                     #fontsize = fontsize_fromtextline(img[bbox_textline[0]:bbox_textline[2],
                     #                                        bbox_textline[1]:bbox_textline[3],0])
-        
+
                     fontsize = np.average(all_heights)
                     all_mean_heights = np.append(all_mean_heights, fontsize)
-                    
+
                     # Normal font
                     #if ((font_main_block - thres_font) < mean_height < (font_main_block + thres_font)):
                     if ((font_main_block - thres_font) < fontsize < (font_main_block + thres_font)):
@@ -426,18 +426,18 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
                             if len(xml_block[0]) < 12:
                                 list_pagen.append(count_text)
                             else:
-                                list_textinheader.append(count_text)                    
+                                list_textinheader.append(count_text)
                         elif ((bbox_textline[1] > (margins[0] - indentation)) and (bbox_textline[3] < (ind_central + gap_central))):
                             list_col1.append(count_text)
                         # Right side of the central line
-                        elif ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central))): 
+                        elif ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central))):
                             list_col2.append(count_text)
                     count_text += 1
-                
+
     discovered_blocks = np.concatenate((np.array(list_col1),np.array(list_col2),
                                         np.array(list_pagen),np.array(list_textinheader)))
     blocks_left = np.setdiff1d(np.arange(list_allcoords_textlines.shape[1]),discovered_blocks)
-    
+
     if len(list_col1):
         bbox_col1 = np.array([np.min(list_allcoords_textlines[0,list_col1]),
                               np.min(list_allcoords_textlines[1,list_col1]),
@@ -445,17 +445,17 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
                               np.max(list_allcoords_textlines[3,list_col1])])
     else:
         bbox_col1 = np.array([0,0,10,10]) # Dummy value
-        
+
     if len(list_col2):
         bbox_col2 = np.array([np.min(list_allcoords_textlines[0,list_col2]),
                               np.min(list_allcoords_textlines[1,list_col2]),
                               np.max(list_allcoords_textlines[2,list_col2]),
                               np.max(list_allcoords_textlines[3,list_col2])])
     else:
-        bbox_col2 = np.array([0,0,10,10]) # Dummy value    
+        bbox_col2 = np.array([0,0,10,10]) # Dummy value
 
     list_header = list()
-    list_header_singlecol = list()   
+    list_header_singlecol = list()
     list_footnote = list()
     list_notidentified = list()
     for ind_textline in blocks_left:
@@ -463,7 +463,7 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
         if xml_textline.tag == 'textline':
             bbox_textline = list_allcoords_textlines[:,ind_textline]
             # Small fontsize and below current bboxes of main blocks
-            if ((all_mean_heights[ind_textline] < (font_main_block - thres_font)) and 
+            if ((all_mean_heights[ind_textline] < (font_main_block - thres_font)) and
                 (bbox_textline[2] > bbox_col1[2]) and (bbox_textline[2] > bbox_col2[2])):
                 list_footnote.append(ind_textline)
             # Large fontsizes
@@ -473,7 +473,7 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
                     list_header.append(ind_textline)
                 # To the left or right of the central line
                 elif (((bbox_textline[1] > (margins[0] - indentation)) and (bbox_textline[3] < (ind_central + gap_central))) or
-                      ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central)))): 
+                      ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central)))):
                     list_header_singlecol.append(ind_textline)
             # Standard fontsize
             elif ((font_main_block - thres_font) < all_mean_heights[ind_textline] < (font_main_block + thres_font)):
@@ -483,13 +483,13 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
                     list_col1.append(ind_textline)
                 # Contained into the bbox of the right column
                 elif (((bbox_col2[0] - height_resc) < bbox_textline[0]) and ((bbox_col2[1] - width_resc) < bbox_textline[1])
-                        and ((bbox_col2[2] + height_resc) > bbox_textline[2]) and ((bbox_col2[3] + width_resc) > bbox_textline[3])):                    
+                        and ((bbox_col2[2] + height_resc) > bbox_textline[2]) and ((bbox_col2[3] + width_resc) > bbox_textline[3])):
                     list_col2.append(ind_textline)
             else:
                 list_notidentified.append(ind_textline)
-    
+
     label_textlines = dict()
-    label_textlines['text_col1'] = list_col1    
+    label_textlines['text_col1'] = list_col1
     label_textlines['text_col2'] = list_col2
     label_textlines['footnote'] = list_footnote
     label_textlines['pagen'] = list_pagen
@@ -497,8 +497,8 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
     label_textlines['header'] = list_header
     label_textlines['header_singlecol'] = list_header_singlecol
     label_textlines['notidentified'] = list_notidentified
-    
-    vec_labels_textline = np.zeros(list_allcoords_textlines.shape[1]).astype(np.str)             
+
+    vec_labels_textline = np.zeros(list_allcoords_textlines.shape[1]).astype(np.str)
     vec_labels_textline[list_col1] = 'text_col1'
     vec_labels_textline[list_col2] = 'text_col2'
     vec_labels_textline[list_footnote] = 'footnote'
@@ -507,56 +507,56 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
     vec_labels_textline[list_header] = 'header'
     vec_labels_textline[list_header_singlecol] = 'header_singlecol'
     vec_labels_textline[list_notidentified] = 'notidentified'
-    
-    # relative_ref_textline: three rows with the following, the aboslute reference 
+
+    # relative_ref_textline: three rows with the following, the absolute reference
     # for the textline, the number of the block, and the number of the textline inside
     # that block
     return label_textlines, list_allcoords_textlines, relative_ref_textline, all_mean_heights, vec_labels_textline
 
-          
+
 def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textlines, margins):
-    # Two steps, first ordering the textlines, grouping them in big blocks separated 
+    # Two steps, first ordering the textlines, grouping them in big blocks separated
     # by horizontal lines. Then, inside these groups, we group them in textboxes,
     # incorporating this to the XML
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     widht_resc = WIDTH_CHAR * rescale_factor[0,1]
-    
+
     gap_central = 3 * widht_resc
     gap_row = height_resc/2
-    
-    # This parameters is intended for removing artefacts such as small dots in the 
+
+    # This parameter is intended for removing artefacts such as small dots in the
     # text. But we have to be careful, as we can remove valuable characters.
     # I first set a value of 3 * width_resc/4
-    min_width_textl = 6 * widht_resc/4 
-        
+    min_width_textl = 6 * widht_resc/4
+
     central_line = (coord_vert_def[3] + coord_vert_def[1])/2
     array_coords_textl = np.concatenate((list_allcoords_textlines[:,:],
                                         np.arange(list_allcoords_textlines.shape[1]).reshape((1,list_allcoords_textlines.shape[1]))))
-    
+
     # Clean from to thin lines, thatn are just probably artefacts
     all_widths = array_coords_textl[3,:] - array_coords_textl[1,:]
     ind_valid = np.argwhere(all_widths > min_width_textl)
     array_coords_textl = array_coords_textl[:,ind_valid].reshape((5,len(ind_valid)))
-    
+
     ind_centralines = np.intersect1d(np.argwhere(coord_horz[1,:] < (central_line - gap_central)),
                                      np.argwhere(coord_horz[3,:] > (central_line + gap_central)))
     ind_sepfootnotes = np.intersect1d(np.argwhere(coord_horz[1,:] < (margins[0] + 2 * widht_resc)),
-                                      np.argwhere(coord_horz[3,:] < (central_line - gap_central))) 
+                                      np.argwhere(coord_horz[3,:] < (central_line - gap_central)))
     ind_centralines = np.union1d(ind_centralines,ind_sepfootnotes)
     ind_collines = np.setdiff1d(np.arange(coord_horz.shape[1]),ind_centralines)
-    
+
     array_coords_centrall = coord_horz[:,ind_centralines]
     array_coords_coll = coord_horz[:,ind_collines]
     array_coords_coll = np.concatenate((array_coords_coll,
                                         -1 * np.ones(array_coords_coll.shape[1]).reshape((1,array_coords_coll.shape[1]))))
-    
-    not_visited = 1    
+
+    not_visited = 1
     toprow = 0
     count_b = 0
     set_of_blocks = dict()
     array_coords_centrall_ord = np.array([]).reshape((4,0))
     while not_visited:
-        
+
         if array_coords_centrall.size > 3:
             bottomrow = np.min(array_coords_centrall[0,:])
             ind_bottomrow = np.argmin(array_coords_centrall[0,:])
@@ -573,47 +573,47 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl
         coord_cat = np.concatenate((array_coords_textl[:,ind_textl_proc].reshape(5,len(ind_textl_proc)),
                                     array_coords_coll[:,ind_lines_proc].reshape(5,len(ind_lines_proc))),
                                     axis = 1)
-        
+
         if coord_cat.size > 0:
             flag_col = 1
-            ind_currcord = topmost_left_box(coord_cat, gap_row, max_col = central_line)   
+            ind_currcord = topmost_left_box(coord_cat, gap_row, max_col = central_line)
             if ind_currcord == -1:
-                ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line)  
+                ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line)
                 flag_col = 2
-                
+
             order_coords = np.array([]).reshape(5,0)
             while coord_cat.size > 4:
                 order_coords = np.concatenate((order_coords,coord_cat[:,ind_currcord].reshape(5,1)), axis = 1)
                 curr_coord = coord_cat[:,ind_currcord]
                 coord_cat = np.delete(coord_cat,ind_currcord,1)
                 if coord_cat.size > 4:
-                    if flag_col == 1:                
+                    if flag_col == 1:
                         ind_currcord = next_textline_samerow(coord_cat, gap_row, curr_coord, max_col = central_line)
-                        
+
                         if ind_currcord == -1:
                             ind_currcord = next_textline_samecol(coord_cat, gap_row, max_col = central_line)
-                        
+
                         if ind_currcord == -1 :
-                            ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line)   
+                            ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line)
                             flag_col = 2
-                                            
+
                     elif flag_col == 2:
                         ind_currcord = next_textline_samerow(coord_cat, gap_row, curr_coord, min_col = central_line)
-                        
+
                         if ind_currcord == -1:
                             ind_currcord = next_textline_samecol(coord_cat, gap_row, min_col = central_line)
-                        
+
                         if ind_currcord == -1 :
                             flag_col = 1
                             ind_currcord = 0
-                    
+
         else:
-             order_coords = np.array([]).reshape(5,0) 
-             
+             order_coords = np.array([]).reshape(5,0)
+
         toprow = np.copy(bottomrow)
         set_of_blocks[count_b] = order_coords
         count_b += 1
-        
+
     return set_of_blocks, array_coords_centrall_ord
 
 def topmost_left_box(coords, gap_row, min_col = 0, max_col = 10000):
@@ -629,13 +629,13 @@ def topmost_left_box(coords, gap_row, min_col = 0, max_col = 10000):
         return curr_ind
     else:
         return -1
-    
+
 def next_textline_samerow(coords, gap_row, curr_coord, min_col = 0, max_col = 10000):
     curr_row = curr_coord[2]
     #ind_valid = np.intersect1d(np.argwhere(coords[1,:] < max_col),
-    #                          np.argwhere(coords[3,:] > min_col))    
+    #                          np.argwhere(coords[3,:] > min_col))
     ind_valid = np.intersect1d(np.argwhere(coords[3,:] < (max_col + gap_row)),
-                              np.argwhere(coords[1,:] > (min_col - gap_row)))      
+                              np.argwhere(coords[1,:] > (min_col - gap_row)))
     if len(ind_valid):
         min_row = np.intersect1d(np.argwhere(coords[2,ind_valid] > (curr_row - gap_row)),
                                  np.argwhere(coords[2,ind_valid] < (curr_row + gap_row)))
@@ -646,41 +646,41 @@ def next_textline_samerow(coords, gap_row, curr_coord, min_col = 0, max_col = 10
         else:
             return -1
     else:
-        return -1 
-    
+        return -1
+
 def next_textline_samecol(coords, gap_row, min_col = 0, max_col = 10000):
     #print(coords, max_col, min_col)
     #ind_valid = np.intersect1d(np.argwhere(coords[1,:] < max_col),
-    #                           np.argwhere(coords[3,:] > min_col))    
+    #                           np.argwhere(coords[3,:] > min_col))
     ind_valid = np.intersect1d(np.argwhere(coords[3,:] < (max_col + gap_row)),
-                              np.argwhere(coords[1,:] > (min_col - gap_row)))      
+                              np.argwhere(coords[1,:] > (min_col - gap_row)))
     if len(ind_valid):
         min_row = np.min(coords[2,ind_valid])
         min_row = np.intersect1d(np.argwhere(coords[2,ind_valid] > (min_row - gap_row)),
-                                 np.argwhere(coords[2,ind_valid] < (min_row + gap_row)))     
+                                 np.argwhere(coords[2,ind_valid] < (min_row + gap_row)))
         ind_valid_min = ind_valid[min_row]
         ind_next_textl = ind_valid_min[np.argmin(coords[1,ind_valid_min])]
         return ind_next_textl
     else:
-        return -1     
+        return -1
 
 
 def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescale_factor,
                            centrall_ord, ind_page, dim_img):
-    
+
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     widht_resc = WIDTH_CHAR * rescale_factor[0,1]
-        
+
     max_inrow_sep = 4 * widht_resc
-    max_incol_sep = 1 * height_resc    
+    max_incol_sep = 1 * height_resc
     gap_row = height_resc/2
     similarity_fonts = 0.95
     indentation = 2 * widht_resc
-    
-    centrall_ord_trans = transform_coord_toorig(centrall_ord, dim_page = dim_img, invert_xy = True, 
+
+    centrall_ord_trans = transform_coord_toorig(centrall_ord, dim_page = dim_img, invert_xy = True,
                                                 rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft',
                                                 refCorners = 'topleftbottomright')
-    
+
     # Start creating the xml
     xml_e = []
     xml_e = ET.Element('pages')
@@ -689,21 +689,21 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal
     page_el.attrib['bbox'] = xml_t[0].attrib['bbox']
     page_el.attrib['rotate'] = '0'
     xml_e.append(page_el)
-    
+
     val_type_col1 = {'text_col1', 'notidentified', 'header_singlecol', 'text_inheader'}
     val_type_col2 = {'text_col2', 'notidentified', 'header_singlecol', 'text_inheader'}
-    
-    
+
+
     count_b = 0
     text_b = ET.SubElement(page_el, 'textbox')
-    text_b.attrib['id'] = str(count_b)    
-    text_b.attrib['block'] = '0'    
+    text_b.attrib['id'] = str(count_b)
+    text_b.attrib['block'] = '0'
     for ind_b in range(len(set_of_blocks)):
         all_el = set_of_blocks[ind_b].astype(np.int64)
         all_bbox = np.array([]).reshape((4,0))
         for ind_c in range(all_el.shape[1]):
-            curr_el = all_el[:,ind_c]  
-            flag_copy_textb = 1   
+            curr_el = all_el[:,ind_c]
+            flag_copy_textb = 1
             # If it is a textline with text
             if curr_el[4] > -1:
                 all_bbox = np.concatenate((all_bbox, curr_el[:4].reshape((4,1))), axis = 1)
@@ -713,12 +713,12 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal
                 text_l.attrib['type'] = type_textl
                 text_b.append(text_l)
                 type_textbox = 'text'
-                
+
                 # To check if it satisfies the conditions for being a new textbox
                 if ind_c < (all_el.shape[1] - 1):
-                    next_el = all_el[:,ind_c + 1] 
-                    if next_el[4] > -1:                  
-                        if (((type_textl in val_type_col1) and (labels_textl[int(next_el[4])] in val_type_col1)) 
+                    next_el = all_el[:,ind_c + 1]
+                    if next_el[4] > -1:
+                        if (((type_textl in val_type_col1) and (labels_textl[int(next_el[4])] in val_type_col1))
                             or ((type_textl in val_type_col2) and (labels_textl[int(next_el[4])] in val_type_col2))
                             or (type_textl == labels_textl[int(next_el[4])])):
                             # Object to the right or beneath
@@ -728,33 +728,33 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal
                                 # Accounting for footnotes or other stuff
                                 curr_fontsize = curr_el[3] - curr_el[1]
                                 next_fontsize = next_el[3] - next_el[1]
-                                if ((curr_fontsize - next_fontsize * similarity_fonts) < curr_fontsize < 
+                                if ((curr_fontsize - next_fontsize * similarity_fonts) < curr_fontsize <
                                     (curr_fontsize + next_fontsize * similarity_fonts)):
                                     # Finally, account for indentation
                                     if ((np.min(all_bbox[1,:]) + indentation) > next_el[1]):
                                         flag_copy_textb = 0
-                                
-                # Attributes and stuff in case we need to store as textbox      
+
+                # Attributes and stuff in case we need to store as textbox
                 if flag_copy_textb:
                     bbox_text_b = np.array([np.min(all_bbox[0,:]),np.min(all_bbox[1,:]),
                                             np.max(all_bbox[2,:]),np.max(all_bbox[3,:])])
-                    bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, 
+                    bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True,
                             rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft',
                             refCorners = 'topleftbottomright')
                     all_bbox = np.array([]).reshape((4,0))
-            # Instead, if we have a line        
+            # Instead, if we have a line
             else:
                 bbox_text_b = curr_el[:4]
                 text_l = ET.SubElement(text_b, 'textline')
                 text_l.attrib['type'] = 'col_lines'
-                bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, 
+                bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True,
                                             rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft',
                                             refCorners = 'topleftbottomright')
                 text_l.attrib['bbox'] = np.array2string(bbox_text_b[:].reshape((1,4)), precision = 3, separator = ',')[2:-2]
                 type_textbox = 'line'
-            
+
             # Creating the new textbox
-            if flag_copy_textb:    
+            if flag_copy_textb:
                 text_b.attrib['bbox'] = np.array2string(bbox_text_b[:].reshape((1,4)), precision = 3, separator = ',')[2:-2]
                 text_b.attrib['type_textbox'] = type_textbox
                 count_b += 1
@@ -762,10 +762,10 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal
                     pass
                 else:
                     text_b = ET.SubElement(page_el, 'textbox')
-                    text_b.attrib['id'] = str(count_b)    
-                    text_b.attrib['block'] = str(ind_b) 
+                    text_b.attrib['id'] = str(count_b)
+                    text_b.attrib['block'] = str(ind_b)
                 all_bbox = np.array([]).reshape((4,0))
-                
+
         if (ind_b < (len(set_of_blocks) - 1)):
             text_l = ET.SubElement(text_b, 'textline')
             text_l.attrib['type'] = 'central_lines'
@@ -775,20 +775,20 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal
             text_b.attrib['type_textbox'] = 'line'
             count_b += 1
             text_b = ET.SubElement(page_el, 'textbox')
-            text_b.attrib['id'] = str(count_b)    
-            text_b.attrib['block'] = str(ind_b) 
+            text_b.attrib['id'] = str(count_b)
+            text_b.attrib['block'] = str(ind_b)
             all_bbox = np.array([]).reshape((4,0))
-    
-    
-    # Just add the two final elements from the original xml    
+
+
+    # Just add the two final elements from the original xml
     page_el.append(xml_t[0][-2]) # Figure
     page_el.append(xml_t[0][-2]) # Layout
-    
+
     return xml_e
 
- 
+
 def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_col2')):
-    
+
     # helper function to clean text
     # !!! so far only removing new lines and primitive dehyphenation
     def clean_text(text):
@@ -796,10 +796,11 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c
         text = text.replace('\n', ' ')
 
         # account for hyphenation (not completely correct...)
+        # TODO: needs to be improved
         text = text.replace('- ', '')
 
         return text
-    
+
     # initialize textbox count and empty dictionary
 
     XML_new = copy.deepcopy(XML_root)
@@ -814,7 +815,7 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c
             if (textbox.tag == 'textbox'):
                 if 'type_textbox' in textbox.attrib.keys():
                     if (textbox.attrib['type_textbox'] == 'text'):
-                        
+
                         # initialize string
 
                         #print(textbox.tag, textbox.attrib)
@@ -827,7 +828,7 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c
                             if textline.tag == 'textline':
                             #print(textline.tag, textline.attrib)
                             # for every text (actually just a letter)
-            
+
                                 for ind_ch, text in enumerate(textline):
                                     #print(ind_ch, text.text, len(textline), len(XML_new[ind_p][ind_t][ind_tl]))
                                     # extend string
@@ -847,7 +848,6 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c
                                 complete_text += '[/font]'
                                 complete_text = clean_text(complete_text)
                                 XML_new[ind_p][ind_t][ind_tl].text = complete_text
-            
-        
+
+
     return XML_new
-    
\ No newline at end of file
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 0dc23f86..d801bb06 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -45,8 +45,8 @@ suffix_correctedxml = '_datacorr'
 # print some output
 print(year)
 print(type(year))
-print(input_correctedxml)
 print(input_lastnames)
+print(input_correctedxml)
 print(input_correctedmeta)
 print(folder_database)
 #%%
@@ -94,11 +94,11 @@ print(files_to_process)
 with open(input_lastnames, 'rb') as f:
     df_lastnames = pickle.load(f)
 
-print(df_lastnames)
+#print(df_lastnames)
 df_lastnames.columns
 #%%
 
-file_tarpath = './1891/20026440_datacorr.xml'
+file_tarpath = './1891/20026447_datacorr.xml'
 
 file_number = file_tarpath.split('/')[-1][:8]
 metafile_tarpath = './{}/{}{}.xml'.format(year, file_number, suffix_correctedmeta)
@@ -112,6 +112,7 @@ file_doc
 if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']):
     print(file_number + '\n')
 
+    file_doc.annotate_speakers()
 
 
 
@@ -136,6 +137,7 @@ for file_tarpath in files_to_process:
     # if document is a discussion
     if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']):
         print(file_number + '\n')
+        file_doc.annotate_speakers()
 
 #%%
         ## get dictionary with text
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 2dffa7d0..f2b44c22 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -47,8 +47,60 @@ def check_if_discussion(path_meta_xml_file,
     return True
 
 
+# function to get text of corrected XML
 
 
+def get_text_corrected(XML_root):
+
+    # create new XML as a copy of the corrected one
+    XML_new = copy.deepcopy(XML_root)
+
+    # for every page
+    for ind_p, page in enumerate(XML_root):
+        print(page.tag, page.attrib)
+
+        # for every textbox on that page
+        for ind_t, textbox in enumerate(page):
+            if (textbox.tag == 'textbox'):
+                if 'type_textbox' in textbox.attrib.keys():
+                    if (textbox.attrib['type_textbox'] == 'text'):
+                        print(textbox.tag, textbox.attrib)
+
+                        # for every textline in that textbox
+                        for ind_tl, textline in enumerate(textbox):
+                            if textline.tag == 'textline':
+                                print(textline.tag, textline.attrib)
+
+                                print(textline.text)
+
+
+
+
+
+
+                            ## for every text (actually just a letter)
+                                #for ind_ch, text in enumerate(textline):
+                                    ##print(ind_ch, text.text, len(textline), len(XML_new[ind_p][ind_t][ind_tl]))
+                                    ## extend string
+                                    #if 'font' in text.attrib.keys():
+                                        #if (text.attrib['font'] != prev_fonttype) or (text.attrib['size'] != str(prev_fontsize)):
+                                            #if flag_in:
+                                                #complete_text += '[/font]'
+                                            #else:
+                                                #flag_in = 1
+                                            #complete_text += '[font face="' + text.attrib['size'] + '" size="' + text.attrib['font'] + '"]'
+                                            #prev_fontsize = text.attrib['size']
+                                            #prev_fonttype = text.attrib['font']
+                                    #complete_text = complete_text + text.text
+                                    #child_new = XML_new[ind_p][ind_t][ind_tl][0] # Because we are removing elements
+                                    #XML_new[ind_p][ind_t][ind_tl].remove(child_new)
+                                ## clean text
+                                #complete_text += '[/font]'
+                                #complete_text = clean_text(complete_text)
+                                #XML_new[ind_p][ind_t][ind_tl].text = complete_text
+
+
+    return XML_new
 
 
 
-- 
GitLab