Many functions for correcting other files and parsing

1d882b27 · Luis Salamanca · 888c2507 · 1d882b27 · 1d882b27 · 1d882b27
Commit 1d882b27 authored 5 years ago by Luis Salamanca
--- a/.gitignore
+++ b/.gitignore
 # Created by https://www.gitignore.io/api/macos,python,R,linux,vim,emacs

+src/python/test_debug.py
+
 ### Emacs ###
 # -*- mode: gitignore; -*-
 *~

--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
--- a/src/python/ipython.html
+++ b/src/python/ipython.html
--- a/src/python/plot_tools.py
+++ b/src/python/plot_tools.py
@@ -16,6 +16,7 @@ import re
 from PIL import Image, ImageDraw, ImageFont

 import matplotlib.pyplot as plt
+import matplotlib.colors as colors

 HEIGHT_CHAR = 12
 WIDTH_CHAR = 6
@@ -25,11 +26,17 @@ def lines_box(img, coord, color_vec, thick_line):
    # corners in top-left-bottom-right order, given as rows and columns
    if isinstance(color_vec,str):
        color_vec = np.uint8(np.array(Color(color_vec).rgb)*255)
-    #print(type(color_vec[0]))    
+    #print(type(color_vec[0]))  
+    '''
    img[(coord[0]):(coord[0]+thick_line),coord[1]:coord[3],:] = color_vec
    img[(coord[2]-thick_line):(coord[2]),coord[1]:coord[3],:] = color_vec
    img[coord[0]:coord[2],(coord[1]):(coord[1]+thick_line),:] = color_vec
    img[coord[0]:coord[2],(coord[3]-thick_line):coord[3],:] = color_vec
+    '''
+    img[(coord[0]-thick_line):(coord[0]),coord[1]:coord[3],:] = color_vec
+    img[(coord[2]):(coord[2]+thick_line),coord[1]:coord[3],:] = color_vec
+    img[coord[0]:coord[2],(coord[1]-thick_line):(coord[1]),:] = color_vec
+    img[coord[0]:coord[2],(coord[3]):(coord[3]+thick_line),:] = color_vec
    return img

 def transform_coord(coord, dim_page = np.array([3000,1800]), invert_xy = False, 
@@ -220,9 +227,10 @@ def plot_horzvertlines(img, coord_horz, coord_vert_def):
    for ind_h in range(coord_horz.shape[1]):
        img_lines = lines_box(img_lines, coord_horz[:,ind_h].astype(np.uint32), 
                        color_vec = 'red', thick_line = 6)
-       
-    img_lines = lines_box(img_lines, coord_vert_def.astype(np.uint32), 
-                        color_vec = 'green', thick_line = 6)
+    
+    if np.sum(coord_vert_def):
+        img_lines = lines_box(img_lines, coord_vert_def.astype(np.uint32), 
+                            color_vec = 'green', thick_line = 6)
    
    return img_lines

@@ -235,14 +243,56 @@ def plot_correctedXML(img, XML_enrich, bbox_page, flag_lines_textl = 1):
        if XML_enrich[0][ind_el].tag == 'textbox':
            if 'bbox' in XML_enrich[0][ind_el].attrib:
                coord_textbox = np.array(XML_enrich[0][ind_el].attrib['bbox'].split(',')).astype(np.float64)
-                if flag_lines_textl < 3:
+                if flag_lines_textl < 3 and ('type_textbox' in XML_enrich[0][ind_el].attrib):
                    if XML_enrich[0][ind_el].attrib['type_textbox'] == 'line':
                        img_xml = highlight_text(img_xml, coord_textbox, bbox_page, color_vec = 'blue', alpha = True, filled = False, thick_line = 6)
-                if (flag_lines_textl == 1) or (flag_lines_textl == 3):
+                if ((flag_lines_textl == 1) or (flag_lines_textl == 3)) and ('type_textbox' in XML_enrich[0][ind_el].attrib):
                    if XML_enrich[0][ind_el].attrib['type_textbox'] == 'text':
                        img_xml = highlight_text(img_xml, coord_textbox, bbox_page, color_vec = 'red', alpha = True, filled = False, thick_line = 6) 
+                if not 'type_textbox' in XML_enrich[0][ind_el].attrib:
+                    img_xml = highlight_text(img_xml, coord_textbox, bbox_page, color_vec = 'cyan', alpha = True, filled = False, thick_line = 6) 
+                
    return img_xml

+def plot_scatter_colorc(xy_cols, vec_to_use, title = ''):
+    """Scatter-plot 2-D points, coloured by the label attached to each point.
+
+    A new 40x20 figure is opened. Every distinct value in ``vec_to_use``
+    gets its own colour, sampled at evenly spaced positions from the
+    ``gist_ncar`` colormap, and the colorbar is annotated with those values
+    through ``colorbar_text``. Axis ticks are removed.
+
+    Parameters
+    ----------
+    xy_cols : array indexed as ``xy_cols[:,0]`` (x) and ``xy_cols[:,1]`` (y).
+    vec_to_use : array with one label per point; compared element-wise
+        against each of its unique values.
+    title : text used as the plot title.
+    """
+    labels = np.unique(vec_to_use)
+    n_labels = len(labels)
+
+    # Replace every label by its position within the unique labels, so the
+    # scatter call receives plain numbers to colour by
+    vec_col = np.zeros(len(vec_to_use))
+    for code, label in enumerate(labels):
+        vec_col[np.argwhere(vec_to_use == label)] = code
+
+    plt.figure(figsize=(40, 20))
+
+    # Build a discrete colormap holding one gist_ncar colour per label;
+    # the samples are taken between entries 0 and 250 of the base map
+    base_cmap = plt.cm.gist_ncar
+    all_colors = np.array([base_cmap(i) for i in range(base_cmap.N)])
+    picked = np.round(np.linspace(0, 250, num = n_labels)).astype(int)
+    custom_cmap = base_cmap.from_list('Custom cmap', all_colors[picked,:], n_labels)
+
+    sca = plt.scatter(xy_cols[:,0], xy_cols[:,1], c = vec_col, cmap = custom_cmap)
+
+    colorbar_text(sca, labels)
+    plt.legend(labels)
+    plt.xticks([])
+    plt.yticks([])
+    plt.title(title)
+    plt.tight_layout()
+    
+
+    
+def colorbar_text(mappable, textlabels):
+    """Add a colorbar for ``mappable`` and annotate it with text labels.
+
+    The numeric ticks of the colorbar are removed. One label per entry of
+    ``textlabels`` is written to the right of the bar, from bottom to top,
+    each centred on one of ``len(textlabels)`` equally sized bands.
+
+    Parameters
+    ----------
+    mappable : object passed to ``plt.colorbar`` (e.g. the return value of
+        ``plt.scatter``).
+    textlabels : sequence of labels to write next to the colorbar. When it
+        is empty the colorbar is still drawn, just without labels.
+    """
+    cbar = plt.colorbar(mappable)
+
+    cbar.ax.get_yaxis().set_ticks([])
+    cbar.ax.get_yaxis().labelpad = 15
+
+    n_labels = len(textlabels)
+    if n_labels == 0:
+        # Nothing to write; also avoids the division by zero below
+        return
+
+    int_sep = 1.0 / n_labels
+    y_loc = np.linspace(int_sep/2, 1 - int_sep/2, n_labels)
+    for y_pos, lab in zip(y_loc, textlabels):
+        # Positions are fractions of the colorbar axes, so they stay aligned
+        # with the bar whatever data range its long axis covers
+        cbar.ax.text(1.5, y_pos, lab, ha='left', va='center', fontsize = 14,
+                     transform = cbar.ax.transAxes)
+    
 def plot_save_parallel(folder_pickles):
    # Using files from pickle. Provides directly the folder with the pickles to 
    # convert. This was implemented for the sake of running things in parallel, 
@@ -264,3 +314,97 @@ def plot_save_parallel(folder_pickles):
            name_fig = full_filename[:-3] + format_fig
            fig.savefig(name_fig, format = format_fig, dpi = 200)
            plt.close(fig)
+
+def remove_margscan(imarray, coord_textboxes):
+    """Whiten marks found near the four borders of a page image.
+
+    Channel 0 of ``imarray`` is inverted and binarised, the area spanned by
+    the text boxes is blanked, and the remaining dark pixels are summed per
+    image row and per image column. Starting from each border, consecutive
+    windows of 1/25 of the image size are scanned for rows/columns whose sum
+    passed the threshold; everything between the border and the last mark
+    reached is set to 255.
+
+    Parameters
+    ----------
+    imarray : 3-D image array indexed as [row, column, channel]. It is
+        modified in place.
+        # NOTE(review): presumably uint8 with white == 255 - confirm
+    coord_textboxes : 2-D array whose rows 0..3 are used as the top, left,
+        bottom and right limits of the text boxes. The values are used
+        directly as slice bounds, so they must be integers.
+
+    Returns
+    -------
+    The same ``imarray`` object, after the in-place whitening.
+    """
+    
+    thres = 0.3 * 255 # Thres over total number of pixels for x and y, to remove certain
+                # columns and rows
+                
+    # Inverted first channel, binarised: values below 20 become 0, the rest 255
+    img_aux = np.abs(255 - imarray[:,:,0])
+    img_aux[img_aux < 20] = 0
+    img_aux[img_aux >= 20] = 255
+    img_aux_in = np.copy(img_aux)                    
+                
+    # Width of the scanning windows: 1/25 of the number of columns / rows
+    next_to_check_c = int(img_aux_in.shape[1]/25)
+    next_to_check_r = int(img_aux_in.shape[0]/25)
+    
+    # Blank the bounding region of all text boxes so that text is not counted
+    img_aux_in[np.min(coord_textboxes[0,:]):np.max(coord_textboxes[2,:]),
+               np.min(coord_textboxes[1,:]):np.max(coord_textboxes[3,:])] = 0   
+    # sum_cols holds one value per image row (sum along axis 1), and
+    # sum_rows one value per image column (sum along axis 0)
+    sum_cols = np.sum(img_aux_in, axis = 1)
+    sum_rows = np.sum(img_aux_in, axis = 0)
+    
+    # Ignore the middle third of both profiles: only the borders are searched
+    sum_cols[int(np.round(len(sum_cols)/3)):2*int(np.round(len(sum_cols)/3))] = 0
+    sum_rows[int(np.round(len(sum_rows)/3)):2*int(np.round(len(sum_rows)/3))] = 0    
+    
+    # Drop the entries that do not reach the threshold
+    # NOTE(review): each entry of sum_cols adds up shape[1] pixels but is
+    # compared against thres*shape[0] (and the reverse for sum_rows); the two
+    # only agree for square images - confirm this is intended
+    sum_cols[np.argwhere(sum_cols < thres*img_aux_in.shape[0])] = 0
+    sum_rows[np.argwhere(sum_rows < thres*img_aux_in.shape[1])] = 0
+    
+    #plt.plot(sum_cols)
+    #plt.figure()
+    #plt.plot(sum_rows)
+    
+    # Vertical marks
+    ## Left side
+    # Move max_ind to the last non-zero entry of each window that starts at
+    # max_ind; stop when the window is empty or no progress is made
+    not_end = 1
+    max_ind = 0
+    while not_end:
+       # print(max_ind,(max_ind + next_to_check_c))
+        aux = np.argwhere(sum_rows[max_ind:(max_ind + next_to_check_c)])
+        if len(aux):
+            max_ind_aux = max_ind + np.max(aux)
+            if max_ind_aux > max_ind:
+                max_ind = max_ind_aux
+            else:
+                not_end = 0
+        else:
+            not_end = 0
+    # Columns before max_ind are whitened; column max_ind itself is kept
+    imarray[:,:max_ind,:] = 255 
+    
+    ## Right side
+    # Same search with negative indices, moving towards the left. The window
+    # ends at min_ind (exclusive), so the very last column is never inspected
+    not_end = 1
+    min_ind = -1
+    while not_end:
+        aux = np.argwhere(sum_rows[(min_ind - next_to_check_c):min_ind])
+        if len(aux):
+            min_ind_aux = min_ind - next_to_check_c + np.min(aux)
+            if min_ind_aux < min_ind:
+                min_ind = min_ind_aux
+            else:
+                not_end = 0
+        else:
+            not_end = 0
+    # As min_ind starts at -1, at least the last column is always whitened
+    imarray[:,min_ind:,:] = 255    
+    
+    # Horizontal marks
+    ## Top side
+    # Same procedure as for the left side, applied to the per-row profile
+    not_end = 1
+    max_ind = 0
+    while not_end:
+        aux = np.argwhere(sum_cols[max_ind:(max_ind + next_to_check_r)])
+        if len(aux):
+            max_ind_aux = max_ind + np.max(aux)
+            if max_ind_aux > max_ind:
+                max_ind = max_ind_aux
+            else:
+                not_end = 0
+        else:
+            not_end = 0
+    imarray[:max_ind,:,:] = 255 
+    
+    ## Bottom side
+    # Same procedure as for the right side, applied to the per-row profile
+    not_end = 1
+    min_ind = -1
+    while not_end:
+        aux = np.argwhere(sum_cols[(min_ind - next_to_check_r):min_ind])
+        if len(aux):
+            min_ind_aux = min_ind - next_to_check_r + np.min(aux)
+            if min_ind_aux < min_ind:
+                min_ind = min_ind_aux
+            else:
+                not_end = 0
+        else:
+            not_end = 0
+        # NOTE(review): unlike the other three sides, this assignment sits
+        # inside the loop and so runs on every iteration; as min_ind only
+        # decreases, the final image looks the same as with a single
+        # assignment after the loop - confirm and dedent
+        imarray[min_ind:,:,:] = 255       
+    
+
+    return imarray
\ No newline at end of file
--- a/src/python/preproc_docs.py
+++ b/src/python/preproc_docs.py
--- a/src/python/utils_proc.py
+++ b/src/python/utils_proc.py
@@ -33,8 +33,8 @@ def call_with_out(full_comm):
    p = subprocess.Popen(full_comm, stdout=subprocess.PIPE, shell=True) 
    (output, err) = p.communicate()
    p_status = p.wait() ## Wait for function to terminate.
-    print("Command output : ", output)
-    print("Command exit status/return code : ", p_status)
+    #print("Command output : ", output)
+    #print("Command exit status/return code : ", p_status)
    return output
 
 # This is a bit cumbersome, but wand.image is not importing the pdf from Spyder, 
@@ -206,3 +206,17 @@ def correct_metadata(year, id_doc, flag_end):
    tree = ET.ElementTree(XML_root_meta)
    tree.write(full_path + '_metacorr.xml', encoding = 'utf-8')
    return full_path + '_metacorr.xml'
+
+def find_all(str_s, text_s):
+    """Return the start index of every occurrence of ``text_s`` in ``str_s``.
+
+    Occurrences are searched from left to right and do not overlap: after a
+    match, the search resumes right after its last character.
+
+    Parameters
+    ----------
+    str_s : string to search in.
+    text_s : substring to look for.
+
+    Returns
+    -------
+    numpy array with the start indices, in increasing order. It is empty
+    when there is no match or when ``text_s`` is empty.
+    """
+    ind_l = list()
+    if not text_s:
+        # An empty pattern matches at every position without ever advancing
+        # the search, which would loop forever; report no occurrences
+        return np.array(ind_l)
+
+    # Search on the original string with a moving start offset, rather than
+    # on a fresh slice per match
+    ind_f = str_s.find(text_s)
+    while ind_f > -1:
+        ind_l.append(ind_f)
+        ind_f = str_s.find(text_s, ind_f + len(text_s))
+    return np.array(ind_l)
\ No newline at end of file