From c32c808ee035b6bcc15be8feefdc7a3357cf7c90 Mon Sep 17 00:00:00 2001
From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch>
Date: Thu, 23 Apr 2020 11:40:08 +0200
Subject: [PATCH] Fix precedence in the correction filter; discard only short
 out-of-bounds lines

---
 src/python/preproc_docs.py | 17 ++++++++++++-----
 src/python/test_correct.py |  4 ++--
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/python/preproc_docs.py b/src/python/preproc_docs.py
index caacaa67..7b134802 100644
--- a/src/python/preproc_docs.py
+++ b/src/python/preproc_docs.py
@@ -337,7 +337,7 @@ def remove_black_sides(img_aux, coord):
         valid_side = (ind_left < len(sum_img)/2)
     else:
         valid_side = (ind_left > (len(sum_img) - ind_right))
-    if 1:
+    if 0:
         plt.figure()
         plt.plot(sum_img)
         plt.plot([0,len(sum_img)],[max_th,max_th])
@@ -358,8 +358,9 @@ def remove_black_sides(img_aux, coord):
         max_right_coord = int((ind_right + inc_start[ind_inc_start])/2)
         #print(ind_right,ind_inc_start,max_right_coord)
     # One of the sides is outside the valid, plus the textline is small
-    ind_notvalid_coord = np.ravel((np.argwhere((coord[1,:] < min_left_coord) | (coord[3,:] > max_right_coord) & 
+    ind_notvalid_coord = np.ravel((np.argwhere(((coord[1,:] < min_left_coord) | (coord[3,:] > max_right_coord)) & 
                                                ((coord[3,:] - coord[1,:]) < len(sum_img)/10))))
+    print(ind_notvalid_coord)
     ind_valid_coord = np.setdiff1d(np.arange(coord.shape[1]),ind_notvalid_coord)
     #img_aux_clean = copy.copy(img_aux)
     #img_aux_clean[:,0:min_left_coord] = 0
@@ -639,13 +640,18 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
 
     # In xml_page the levels are: xml_page[i][j][k], i for blocks, j for textlines
     # and k for characters
-
+    def val_el(list_el):  # Return the column indices of list_el that are NOT flagged invalid below; min_left_coord/max_right_coord are not parameters, they come from the surrounding scope.
+        ind_notvalid_coord = np.ravel((np.argwhere(((list_el[1,:] < min_left_coord) | (list_el[3,:] > max_right_coord)) &  # flagged when row 1 < min_left_coord or row 3 > max_right_coord ...
+                                       ((list_el[3,:] - list_el[1,:]) < (max_right_coord-min_left_coord)/10))))  # ... AND (row 3 - row 1) is under a tenth of (max_right_coord - min_left_coord); presumably rows 1/3 are the left/right box edges -- TODO confirm
+        ind_valid_coord = np.setdiff1d(np.arange(list_el.shape[1]),ind_notvalid_coord)  # all column indices minus the flagged ones
+        return ind_valid_coord
+    
     coord, rescale_factor = adapt_coordtoimg(img, bbox_page, bbox_page)
     list_coords_blocks = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_page[:-2]]).astype(np.float64))
     list_coords_blocks, rescale_factor = adapt_coordtoimg(img, list_coords_blocks, bbox_page)
     
     # Again, to remove the textlines outside areas of interest
-    ind_valid_coord = np.ravel(np.argwhere((list_coords_blocks[1,:] > min_left_coord) & (list_coords_blocks[3,:] < max_right_coord)))
+    ind_valid_coord = val_el(list_coords_blocks)
     list_coords_blocks = list_coords_blocks[:,ind_valid_coord]
     
     #sections_page_mat = np.zeros((3,len(sections_page)))
@@ -692,7 +698,8 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
             #list_coords_textline
             list_coords_textline, rescale_factor = adapt_coordtoimg(img, list_coords_textline, bbox_page)
             # Again, to remove the textlines outside areas of interest
-            ind_valid_coord = np.ravel(np.argwhere((list_coords_textline[1,:] > min_left_coord) & (list_coords_textline[3,:] < max_right_coord)))
+            #ind_valid_coord = np.ravel(np.argwhere((list_coords_textline[1,:] > min_left_coord) & (list_coords_textline[3,:] < max_right_coord)))
+            ind_valid_coord = val_el(list_coords_textline)
             list_coords_textline = list_coords_textline[:,ind_valid_coord]
             
             list_allcoords_textlines = np.concatenate((list_allcoords_textlines, list_coords_textline), axis = 1)
diff --git a/src/python/test_correct.py b/src/python/test_correct.py
index aeab818c..b8081bc2 100644
--- a/src/python/test_correct.py
+++ b/src/python/test_correct.py
@@ -5,11 +5,11 @@ os.environ['DEMOCRASCI_DATA'] = "/Users/luissalamanca/My_stuff/05_SDSCresearch/0
 import def_classes as defc
 
 year = 1982 
-year = 1905
+year = 1911
 folder_database = '../../data/AB_other/SessionOverviews_tar/'
 folder_database = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data/AB_other/SessionOverviews_tar'
 iddoc = '110001467'
-iddoc = '110000179'
+iddoc = '110000271'
 input_file = "./{}/{}.pdf".format(year, iddoc)
 
 doc = defc.Document(input_file, folder_database, flag_type = 3)
-- 
GitLab