From 0a8ac5d6cd0bf37fa15253475b4e8bee4b396e2d Mon Sep 17 00:00:00 2001
From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch>
Date: Tue, 25 Feb 2020 11:12:21 +0100
Subject: [PATCH] Changes for improving, hopefully, the xml correction

---
 src/python/def_classes.py  | 11 +++++---
 src/python/preproc_docs.py | 56 ++++++++++++++++++++++++++------------
 2 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 2439079d..210e2ab5 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -40,7 +40,7 @@ class Document:
 
     limit_year_f1 = 1950
     limit_id_f3 = 110001491
-    flag_end_run = 1
+    flag_end_run = 0 # 0: we run the command "python pdf2txt.py" to convert to xml; 1: we just run it as "pdf2txt.py"
     name_inpdf = '00_rawpdfs'
     name_inmeta = '01_rawmeta'
     name_inmeta_corr = '03_correctedmeta'
@@ -61,6 +61,8 @@ class Document:
         self.name_wo_ext = os.path.splitext(self.name_file)[0]
         self.folder_database = folder_database
         self.flag_tar = flag_tar
+        self.flag_forcecomp = False # Only update the corrected xml if it is too old
+        self.flag_order = 1 # Indicates that the order of the textlines will be re-evaluated in order to reorder them
                 
         self._meta_corr_ext()
         if flag_type == 1:
@@ -285,7 +287,7 @@ class Document:
                 #####
                 # Central vertical line and horizontal lines, through Hough transform
                 coord_vert_def, coord_horz, flag_2col, sections_page = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page,
-                                                                          flag_2col, flag_central)
+                                                                          self.flag_type, flag_central)
                 print('Page %d: flag_2col %d' % (ind_page, flag_2col))
 
                 #####
@@ -307,7 +309,7 @@ class Document:
                 #####
                 # Order the textlines, taken all them together, in order to later merge
                 # in a single textbox textlines that so far form different textboxes
-                flag_order = 1
+                flag_order = self.flag_order
                 if self.flag_type == 2:
                     flag_order = 0
                 set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz,
@@ -1244,7 +1246,8 @@ class Document:
     def xml(self):
         if not hasattr(self, "_XML_main_corr") or self._XML_main_corr is None:
             # the instance does not have a pointer to the xml
-            self._XML_main_corr = self.load_or_compute_xml()
+            force_compute = self.flag_forcecomp
+            self._XML_main_corr = self.load_or_compute_xml(force_compute=force_compute)
 
         return self._XML_main_corr
 
diff --git a/src/python/preproc_docs.py b/src/python/preproc_docs.py
index da1904e5..f4cb0c13 100644
--- a/src/python/preproc_docs.py
+++ b/src/python/preproc_docs.py
@@ -71,7 +71,7 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
 
     width_resc = WIDTH_CHAR * rescale_factor[0,1]
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
-    gap_central = int(4 * width_resc)
+    gap_central = int(2 * width_resc)
     top_bbox_red = 0 #int(height_resc/2)
 
     for ind in range(coord.shape[1]):
@@ -128,7 +128,7 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
     # regions of 1 or 2 columns. For documents of type 1, always to 1
     dim_page = img_aux.shape
     flag_2col, sections_page = n_cols_perblock(coord, coord_horz, dim_page, gap_central)
-    print(flag_2col, sections_page)
+    print(flag_2col)#, sections_page)
     if flag_type == 1:
         flag_2col = 1
         aux_sections_page = copy.deepcopy(sections_page)
@@ -155,19 +155,26 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
                 lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = int(2 * width_resc),
                                                  line_gap = int(width_resc))
                 if len(lines_vert):
-                    lines_vert = lines_vert.reshape((-1,4)) + np.array([0,in_dim,0,in_dim]).reshape((1,4))
+                    lines_vert = np.concatenate(lines_vert, axis = 0).reshape((-1,4)) + np.array([0,in_dim,0,in_dim]).reshape((1,4))
                     aux_lines_vert.append(lines_vert)
                     sections_page[ikd][2] = 2
 
             #print(lines_vert,img_aux.shape)
-
+        #print(flag_2col, sections_page)
         if flag_central:
-            lines_vert = np.transpose(np.concatenate(aux_lines_vert, axis = 0).reshape((-1,4)))
-            #lines_vert = np.transpose(np.asarray(lines_vert).reshape((len(lines_vert),4)))
-            lines_vert = np.concatenate((np.minimum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])),
-                                         np.minimum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])),
-                                         np.maximum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])),
-                                         np.maximum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])))).astype(np.int32)
+            if len(aux_lines_vert):
+                lines_vert = np.transpose(np.concatenate(aux_lines_vert, axis = 0).reshape((-1,4)))
+                #lines_vert = np.transpose(np.asarray(lines_vert).reshape((len(lines_vert),4)))
+                lines_vert = np.concatenate((np.minimum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])),
+                                             np.minimum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])),
+                                             np.maximum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])),
+                                             np.maximum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])))).astype(np.int32)
+            else:
+                flag_central = 0
+                flag_2col = 0
+                coord_vert_def = np.zeros((4,1))
+                
+                
         else:
             sum_img_aux_in = np.sum(img_aux_in, axis = 0)
             sum_img_aux_in = sum_img_aux_in[int(2*img_aux.shape[1]/5):int(3*img_aux.shape[1]/5)]
@@ -185,13 +192,14 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
             ind_central = int((ind_min_start + ind_min_end)/2)
             coord_vert_def = np.array([1, ind_central - int(width_resc/2),
                                    img_aux_in.shape[0], ind_central + int(width_resc/2)])
-    
+            coord_vert_def = clean_vert_line(coord_vert_def, width_resc, ind_central)
     
         #lines_horz = transform_coord(lines_horz, dim_page = img_aux.shape, invert_xy = True)
         #lines_vert = transform_coord(lines_vert, dim_page = img_aux.shape, invert_xy = True)
     
         # First clean the vertical from unexpected outliers
         if flag_central:
+            img_cent = np.copy(img_prev)
             sum_rows = np.sum(img_cent, axis = 0)/255
             ind_central = int(2*img.shape[1]/5) + np.argmax(sum_rows[int(2*img.shape[1]/5):int(3*img.shape[1]/5)])
             ind_valid = np.intersect1d(np.argwhere([(ind_central - gap_central) < aux_l1 < (ind_central + gap_central) for aux_l1 in lines_vert[1,:]]),
@@ -203,7 +211,7 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
             else:
                 coord_vert_def = np.array([0, img_aux.shape[1]/2 - width_resc, height_resc, img_aux.shape[1]/2 + width_resc])
     
-        coord_vert_def = clean_vert_line(coord_vert_def, width_resc, ind_central)
+            coord_vert_def = clean_vert_line(coord_vert_def, width_resc, ind_central)
         #ind_central = np.mean(coord_vert_def[[1,3]])
         
     else:
@@ -214,7 +222,6 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
     else:
         coord_horz_def = coord_horz
     
-    # And now, just iterate over the horizontal lines, merging them if required.
     return coord_vert_def.astype(int), coord_horz_def.astype(int), flag_2col, sections_page
 
 def n_cols_perblock(coord_textl, coord_horz_def, dim_page, gap_central):
@@ -685,7 +692,7 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     widht_resc = WIDTH_CHAR * rescale_factor[0,1]
 
-    gap_central = 3 * widht_resc
+    gap_central = int(2 * widht_resc)
     gap_row = height_resc/2
     
     sections_page_mat = np.array([]).reshape((3,0))
@@ -725,21 +732,31 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl
                                         -1 * np.ones(array_coords_coll.shape[1]).reshape((1,array_coords_coll.shape[1]))))
     
     not_visited = 1
-    toprow = 0
     count_b = 0
+    toprow = 0
     set_of_blocks = dict()
     array_coords_centrall_ord = np.array([]).reshape((4,0))
+    #for i_col in range(sections_page_mat.shape[1]):
     while not_visited:
-
+        #if array_coords_centrall.size > 3:
+        #toprow = sections_page_mat[0,i_col]
+            #bottomrow = np.min(array_coords_centrall[0,:])
+        #bottomrow = sections_page_mat[1,i_col]
+        #if i_col < (sections_page_mat.shape[1] - 1):
         if array_coords_centrall.size > 3:
             bottomrow = np.min(array_coords_centrall[0,:])
             ind_bottomrow = np.argmin(array_coords_centrall[0,:])
             array_coords_centrall_ord = np.concatenate((array_coords_centrall_ord,
                                                         array_coords_centrall[:,ind_bottomrow].reshape((4,1))), axis = 1)
+            aux_ch_coords = array_coords_centrall[:,ind_bottomrow]
+            ind_horz_cent = aux_ch_coords[2] - aux_ch_coords[0]
+            ind_val = np.intersect1d(np.argwhere(sections_page_mat[0,:] <= ind_horz_cent),
+                                     np.argwhere(sections_page_mat[1,:] > ind_horz_cent))
             array_coords_centrall = np.delete(array_coords_centrall,ind_bottomrow,1)
         else:
             bottomrow = 10000
             not_visited = 0
+            ind_val = sections_page_mat.shape[1] - 1
             
         ind_textl_proc = np.intersect1d(np.argwhere((array_coords_textl[2,:] - gap_row) >= toprow),
                                         np.argwhere((array_coords_textl[2,:] - gap_row) < bottomrow))
@@ -755,9 +772,11 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl
                                     array_coords_coll[:,ind_lines_proc].reshape(5,len(ind_lines_proc))),
                                     axis = 1)
 
-        n_cols = identify_n_cols(coord_cat[:4,:], central_line, gap_central)
-        flag_2col_aux = n_cols - 1
+        #n_cols = identify_n_cols(coord_cat[:4,:], central_line, gap_central)
+        #flag_2col_aux = n_cols - 1
 
+        flag_2col_aux = int(sections_page_mat[2,ind_val] - 1)
+        
         if coord_cat.size > 0:
             if flag_order:
                 flag_col = 1
@@ -822,6 +841,7 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl
         toprow = np.copy(bottomrow)
         set_of_blocks[count_b] = order_coords
         count_b += 1
+        
 
     return set_of_blocks, array_coords_centrall_ord
 
-- 
GitLab