From 0a8ac5d6cd0bf37fa15253475b4e8bee4b396e2d Mon Sep 17 00:00:00 2001 From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch> Date: Tue, 25 Feb 2020 11:12:21 +0100 Subject: [PATCH] Changes for improving, hopefully, the xml correction --- src/python/def_classes.py | 11 +++++--- src/python/preproc_docs.py | 56 ++++++++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/src/python/def_classes.py b/src/python/def_classes.py index 2439079d..210e2ab5 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -40,7 +40,7 @@ class Document: limit_year_f1 = 1950 limit_id_f3 = 110001491 - flag_end_run = 1 + flag_end_run = 0 # 0, we run the command python pdf2txt.py to convert to xml, 1, we just run it like pdf2txt.py name_inpdf = '00_rawpdfs' name_inmeta = '01_rawmeta' name_inmeta_corr = '03_correctedmeta' @@ -61,6 +61,8 @@ class Document: self.name_wo_ext = os.path.splitext(self.name_file)[0] self.folder_database = folder_database self.flag_tar = flag_tar + self.flag_forcecomp = False # Only update the xml corrected if it is too old + self.flag_order = 1 # To indicate that we will reevaluate the order of textlines to reorder them self._meta_corr_ext() if flag_type == 1: @@ -285,7 +287,7 @@ class Document: ##### # Central vertical line and horizontal lines, through Hough transform coord_vert_def, coord_horz, flag_2col, sections_page = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, - flag_2col, flag_central) + self.flag_type, flag_central) print('Page %d: flag_2col %d' % (ind_page, flag_2col)) ##### @@ -307,7 +309,7 @@ class Document: ##### # Order the textlines, taken all them together, in order to later merge # in a single textbox textlines that so far form different textboxes - flag_order = 1 + flag_order = self.flag_order if self.flag_type == 2: flag_order = 0 set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, @@ -1244,7 +1246,8 @@ 
class Document: def xml(self): if not hasattr(self, "_XML_main_corr") or self._XML_main_corr is None: # the instance does not have a pointer to the xml - self._XML_main_corr = self.load_or_compute_xml() + force_compute = self.flag_forcecomp + self._XML_main_corr = self.load_or_compute_xml(force_compute=force_compute) return self._XML_main_corr diff --git a/src/python/preproc_docs.py b/src/python/preproc_docs.py index da1904e5..f4cb0c13 100644 --- a/src/python/preproc_docs.py +++ b/src/python/preproc_docs.py @@ -71,7 +71,7 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, width_resc = WIDTH_CHAR * rescale_factor[0,1] height_resc = HEIGHT_CHAR * rescale_factor[0,1] - gap_central = int(4 * width_resc) + gap_central = int(2 * width_resc) top_bbox_red = 0 #int(height_resc/2) for ind in range(coord.shape[1]): @@ -128,7 +128,7 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, # regions of 1 or 2 columns. For documents of type 1, always to 1 dim_page = img_aux.shape flag_2col, sections_page = n_cols_perblock(coord, coord_horz, dim_page, gap_central) - print(flag_2col, sections_page) + print(flag_2col)#, sections_page) if flag_type == 1: flag_2col = 1 aux_sections_page = copy.deepcopy(sections_page) @@ -155,19 +155,26 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = int(2 * width_resc), line_gap = int(width_resc)) if len(lines_vert): - lines_vert = lines_vert.reshape((-1,4)) + np.array([0,in_dim,0,in_dim]).reshape((1,4)) + lines_vert = np.concatenate(lines_vert, axis = 0).reshape((-1,4)) + np.array([0,in_dim,0,in_dim]).reshape((1,4)) aux_lines_vert.append(lines_vert) sections_page[ikd][2] = 2 #print(lines_vert,img_aux.shape) - + #print(flag_2col, sections_page) if flag_central: - lines_vert = np.transpose(np.concatenate(aux_lines_vert, axis = 0).reshape((-1,4))) - #lines_vert = np.transpose(np.asarray(lines_vert).reshape((len(lines_vert),4))) - lines_vert = 
np.concatenate((np.minimum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])), - np.minimum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])), - np.maximum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])), - np.maximum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])))).astype(np.int32) + if len(aux_lines_vert): + lines_vert = np.transpose(np.concatenate(aux_lines_vert, axis = 0).reshape((-1,4))) + #lines_vert = np.transpose(np.asarray(lines_vert).reshape((len(lines_vert),4))) + lines_vert = np.concatenate((np.minimum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])), + np.minimum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])), + np.maximum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])), + np.maximum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])))).astype(np.int32) + else: + flag_central = 0 + flag_2col = 0 + coord_vert_def = np.zeros((4,1)) + + else: sum_img_aux_in = np.sum(img_aux_in, axis = 0) sum_img_aux_in = sum_img_aux_in[int(2*img_aux.shape[1]/5):int(3*img_aux.shape[1]/5)] @@ -185,13 +192,14 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, ind_central = int((ind_min_start + ind_min_end)/2) coord_vert_def = np.array([1, ind_central - int(width_resc/2), img_aux_in.shape[0], ind_central + int(width_resc/2)]) - + coord_vert_def = clean_vert_line(coord_vert_def, width_resc, ind_central) #lines_horz = transform_coord(lines_horz, dim_page = img_aux.shape, invert_xy = True) #lines_vert = transform_coord(lines_vert, dim_page = img_aux.shape, invert_xy = True) # First clean the vertical from unexpected outliers if flag_central: + img_cent = np.copy(img_prev) sum_rows = np.sum(img_cent, axis = 0)/255 ind_central = int(2*img.shape[1]/5) + np.argmax(sum_rows[int(2*img.shape[1]/5):int(3*img.shape[1]/5)]) ind_valid = np.intersect1d(np.argwhere([(ind_central - gap_central) < aux_l1 < (ind_central + gap_central) for aux_l1 in 
lines_vert[1,:]]), @@ -203,7 +211,7 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, else: coord_vert_def = np.array([0, img_aux.shape[1]/2 - width_resc, height_resc, img_aux.shape[1]/2 + width_resc]) - coord_vert_def = clean_vert_line(coord_vert_def, width_resc, ind_central) + coord_vert_def = clean_vert_line(coord_vert_def, width_resc, ind_central) #ind_central = np.mean(coord_vert_def[[1,3]]) else: @@ -214,7 +222,6 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, else: coord_horz_def = coord_horz - # And now, just iterate over the horizontal lines, merging them if required. return coord_vert_def.astype(int), coord_horz_def.astype(int), flag_2col, sections_page def n_cols_perblock(coord_textl, coord_horz_def, dim_page, gap_central): @@ -685,7 +692,7 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl height_resc = HEIGHT_CHAR * rescale_factor[0,1] widht_resc = WIDTH_CHAR * rescale_factor[0,1] - gap_central = 3 * widht_resc + gap_central = int(2 * widht_resc) gap_row = height_resc/2 sections_page_mat = np.array([]).reshape((3,0)) @@ -725,21 +732,31 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl -1 * np.ones(array_coords_coll.shape[1]).reshape((1,array_coords_coll.shape[1])))) not_visited = 1 - toprow = 0 count_b = 0 + toprow = 0 set_of_blocks = dict() array_coords_centrall_ord = np.array([]).reshape((4,0)) + #for i_col in range(sections_page_mat.shape[1]): while not_visited: - + #if array_coords_centrall.size > 3: + #toprow = sections_page_mat[0,i_col] + #bottomrow = np.min(array_coords_centrall[0,:]) + #bottomrow = sections_page_mat[1,i_col] + #if i_col < (sections_page_mat.shape[1] - 1): if array_coords_centrall.size > 3: bottomrow = np.min(array_coords_centrall[0,:]) ind_bottomrow = np.argmin(array_coords_centrall[0,:]) array_coords_centrall_ord = np.concatenate((array_coords_centrall_ord, array_coords_centrall[:,ind_bottomrow].reshape((4,1))), axis = 1) + 
aux_ch_coords = array_coords_centrall[:,ind_bottomrow] + ind_horz_cent = aux_ch_coords[2] - aux_ch_coords[0] + ind_val = np.intersect1d(np.argwhere(sections_page_mat[0,:] <= ind_horz_cent), + np.argwhere(sections_page_mat[1,:] > ind_horz_cent)) array_coords_centrall = np.delete(array_coords_centrall,ind_bottomrow,1) else: bottomrow = 10000 not_visited = 0 + ind_val = sections_page_mat.shape[1] - 1 ind_textl_proc = np.intersect1d(np.argwhere((array_coords_textl[2,:] - gap_row) >= toprow), np.argwhere((array_coords_textl[2,:] - gap_row) < bottomrow)) @@ -755,9 +772,11 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl array_coords_coll[:,ind_lines_proc].reshape(5,len(ind_lines_proc))), axis = 1) - n_cols = identify_n_cols(coord_cat[:4,:], central_line, gap_central) - flag_2col_aux = n_cols - 1 + #n_cols = identify_n_cols(coord_cat[:4,:], central_line, gap_central) + #flag_2col_aux = n_cols - 1 + flag_2col_aux = int(sections_page_mat[2,ind_val] - 1) + if coord_cat.size > 0: if flag_order: flag_col = 1 @@ -822,6 +841,7 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl toprow = np.copy(bottomrow) set_of_blocks[count_b] = order_coords count_b += 1 + return set_of_blocks, array_coords_centrall_ord -- GitLab