diff --git a/src/python/preproc_docs.py b/src/python/preproc_docs.py index caacaa6750a94b3c5b353458d7e2dd119fe5179c..7b134802640b57c3045f69c246d6805a3b6907e1 100644 --- a/src/python/preproc_docs.py +++ b/src/python/preproc_docs.py @@ -337,7 +337,7 @@ def remove_black_sides(img_aux, coord): valid_side = (ind_left < len(sum_img)/2) else: valid_side = (ind_left > (len(sum_img) - ind_right)) - if 1: + if 0: plt.figure() plt.plot(sum_img) plt.plot([0,len(sum_img)],[max_th,max_th]) @@ -358,8 +358,9 @@ def remove_black_sides(img_aux, coord): max_right_coord = int((ind_right + inc_start[ind_inc_start])/2) #print(ind_right,ind_inc_start,max_right_coord) # One of the sides is outside the valid, plus the textline is small - ind_notvalid_coord = np.ravel((np.argwhere((coord[1,:] < min_left_coord) | (coord[3,:] > max_right_coord) & + ind_notvalid_coord = np.ravel((np.argwhere(((coord[1,:] < min_left_coord) | (coord[3,:] > max_right_coord)) & ((coord[3,:] - coord[1,:]) < len(sum_img)/10)))) + print(ind_notvalid_coord) ind_valid_coord = np.setdiff1d(np.arange(coord.shape[1]),ind_notvalid_coord) #img_aux_clean = copy.copy(img_aux) #img_aux_clean[:,0:min_left_coord] = 0 @@ -639,13 +640,18 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon # In xml_page the levels are: xml_page[i][j][k], i for blocks, j for textlines # and k for characters - + def val_el(list_el): + ind_notvalid_coord = np.ravel((np.argwhere(((list_el[1,:] < min_left_coord) | (list_el[3,:] > max_right_coord)) & + ((list_el[3,:] - list_el[1,:]) < (max_right_coord-min_left_coord)/10)))) + ind_valid_coord = np.setdiff1d(np.arange(list_el.shape[1]),ind_notvalid_coord) + return ind_valid_coord + coord, rescale_factor = adapt_coordtoimg(img, bbox_page, bbox_page) list_coords_blocks = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_page[:-2]]).astype(np.float64)) list_coords_blocks, rescale_factor = adapt_coordtoimg(img, list_coords_blocks, bbox_page) # Again, to remove the textlines outside areas of interest - ind_valid_coord = np.ravel(np.argwhere((list_coords_blocks[1,:] > min_left_coord) & (list_coords_blocks[3,:] < max_right_coord))) + ind_valid_coord = val_el(list_coords_blocks) list_coords_blocks = list_coords_blocks[:,ind_valid_coord] #sections_page_mat = np.zeros((3,len(sections_page))) @@ -692,7 +698,8 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon #list_coords_textline list_coords_textline, rescale_factor = adapt_coordtoimg(img, list_coords_textline, bbox_page) # Again, to remove the textlines outside areas of interest - ind_valid_coord = np.ravel(np.argwhere((list_coords_textline[1,:] > min_left_coord) & (list_coords_textline[3,:] < max_right_coord))) + #ind_valid_coord = np.ravel(np.argwhere((list_coords_textline[1,:] > min_left_coord) & (list_coords_textline[3,:] < max_right_coord))) + ind_valid_coord = val_el(list_coords_textline) list_coords_textline = list_coords_textline[:,ind_valid_coord] list_allcoords_textlines = np.concatenate((list_allcoords_textlines, list_coords_textline), axis = 1) diff --git a/src/python/test_correct.py b/src/python/test_correct.py index aeab818c6f2b56f3264af5f053ee989150b8f04d..b8081bc23aa2af3f78b0c7e45e619b0de918f5a9 100644 --- a/src/python/test_correct.py +++ b/src/python/test_correct.py @@ -5,11 +5,11 @@ os.environ['DEMOCRASCI_DATA'] = "/Users/luissalamanca/My_stuff/05_SDSCresearch/0 import def_classes as defc year = 1982 -year = 1905 +year = 1911 folder_database = '../../data/AB_other/SessionOverviews_tar/' folder_database = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data/AB_other/SessionOverviews_tar' iddoc = '110001467' -iddoc = '110000179' +iddoc = '110000271' input_file = "./{}/{}.pdf".format(year, iddoc) doc = defc.Document(input_file, folder_database, flag_type = 3)