Skip to content
Snippets Groups Projects
Commit 1d882b27 authored by Luis Salamanca's avatar Luis Salamanca
Browse files

Many functions for correcting other files and parsing

parent 888c2507
No related branches found
No related tags found
No related merge requests found
Pipeline #3783 passed
# Created by https://www.gitignore.io/api/macos,python,R,linux,vim,emacs
src/python/test_debug.py
### Emacs ###
# -*- mode: gitignore; -*-
*~
......
This diff is collapsed.
This diff is collapsed.
......@@ -16,6 +16,7 @@ import re
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
import matplotlib.colors as colors
HEIGHT_CHAR = 12
WIDTH_CHAR = 6
......@@ -25,11 +26,17 @@ def lines_box(img, coord, color_vec, thick_line):
# corners in top-left-bottom-right order, and rows and columns
if isinstance(color_vec,str):
color_vec = np.uint8(np.array(Color(color_vec).rgb)*255)
#print(type(color_vec[0]))
#print(type(color_vec[0]))
'''
img[(coord[0]):(coord[0]+thick_line),coord[1]:coord[3],:] = color_vec
img[(coord[2]-thick_line):(coord[2]),coord[1]:coord[3],:] = color_vec
img[coord[0]:coord[2],(coord[1]):(coord[1]+thick_line),:] = color_vec
img[coord[0]:coord[2],(coord[3]-thick_line):coord[3],:] = color_vec
'''
img[(coord[0]-thick_line):(coord[0]),coord[1]:coord[3],:] = color_vec
img[(coord[2]):(coord[2]+thick_line),coord[1]:coord[3],:] = color_vec
img[coord[0]:coord[2],(coord[1]-thick_line):(coord[1]),:] = color_vec
img[coord[0]:coord[2],(coord[3]):(coord[3]+thick_line),:] = color_vec
return img
def transform_coord(coord, dim_page = np.array([3000,1800]), invert_xy = False,
......@@ -220,9 +227,10 @@ def plot_horzvertlines(img, coord_horz, coord_vert_def):
for ind_h in range(coord_horz.shape[1]):
img_lines = lines_box(img_lines, coord_horz[:,ind_h].astype(np.uint32),
color_vec = 'red', thick_line = 6)
img_lines = lines_box(img_lines, coord_vert_def.astype(np.uint32),
color_vec = 'green', thick_line = 6)
if np.sum(coord_vert_def):
img_lines = lines_box(img_lines, coord_vert_def.astype(np.uint32),
color_vec = 'green', thick_line = 6)
return img_lines
......@@ -235,14 +243,56 @@ def plot_correctedXML(img, XML_enrich, bbox_page, flag_lines_textl = 1):
if XML_enrich[0][ind_el].tag == 'textbox':
if 'bbox' in XML_enrich[0][ind_el].attrib:
coord_textbox = np.array(XML_enrich[0][ind_el].attrib['bbox'].split(',')).astype(np.float64)
if flag_lines_textl < 3:
if flag_lines_textl < 3 and ('type_textbox' in XML_enrich[0][ind_el].attrib):
if XML_enrich[0][ind_el].attrib['type_textbox'] == 'line':
img_xml = highlight_text(img_xml, coord_textbox, bbox_page, color_vec = 'blue', alpha = True, filled = False, thick_line = 6)
if (flag_lines_textl == 1) or (flag_lines_textl == 3):
if ((flag_lines_textl == 1) or (flag_lines_textl == 3)) and ('type_textbox' in XML_enrich[0][ind_el].attrib):
if XML_enrich[0][ind_el].attrib['type_textbox'] == 'text':
img_xml = highlight_text(img_xml, coord_textbox, bbox_page, color_vec = 'red', alpha = True, filled = False, thick_line = 6)
if not 'type_textbox' in XML_enrich[0][ind_el].attrib:
img_xml = highlight_text(img_xml, coord_textbox, bbox_page, color_vec = 'cyan', alpha = True, filled = False, thick_line = 6)
return img_xml
def plot_scatter_colorc(xy_cols, vec_to_use, title = ''):
    """Scatter-plot 2-D points, coloring every point by its categorical label.

    A new 40x20 inch pyplot figure is created and drawn into. The function
    returns nothing and neither shows nor saves the figure.

    Parameters
    ----------
    xy_cols : array indexed as ``xy_cols[:, 0]`` (x) and ``xy_cols[:, 1]`` (y)
        Coordinates of the points, one row per point.
    vec_to_use : 1-D array-like
        Label of every point; points that share a label share a color.
    title : str, optional
        Title written above the plot.
    """
    # Sorted distinct labels and, for every point, the position of its label
    # in that sorted list. That position is the value that gets color-mapped.
    labels, label_idx = np.unique(vec_to_use, return_inverse=True)
    vec_col = label_idx.astype(float)
    n_labels = len(labels)

    plt.figure(figsize=(40, 20))

    # Start from the gist_ncar colormap and keep one color per label, picked
    # at evenly spaced positions between entries 0 and 250 of its color table.
    base_cmap = plt.cm.gist_ncar
    all_colors = np.array([base_cmap(i) for i in range(base_cmap.N)])
    picked = np.round(np.linspace(0, 250, num=n_labels)).astype(int)
    # Build the discrete map holding exactly n_labels colors.
    cmap = base_cmap.from_list('Custom cmap', all_colors[picked, :], n_labels)

    sca = plt.scatter(xy_cols[:, 0], xy_cols[:, 1], c=vec_col, cmap=cmap)
    colorbar_text(sca, labels)
    plt.legend(labels)
    plt.xticks([])
    plt.yticks([])
    plt.title(title)
    plt.tight_layout()
def colorbar_text(mappable, textlabels):
    """Add a colorbar for ``mappable`` and annotate it with text labels.

    The numeric ticks of the colorbar are removed and the entries of
    ``textlabels`` are written to the right of it, evenly spaced along its
    length (each label centered in one of ``len(textlabels)`` equal slots).

    Parameters
    ----------
    mappable : object accepted by ``plt.colorbar``
        The artist the colorbar describes (e.g. the result of ``plt.scatter``).
    textlabels : sequence
        Labels to write, in slot order. An empty sequence leaves the colorbar
        without any text.
    """
    cbar = plt.colorbar(mappable)
    cbar.ax.get_yaxis().set_ticks([])
    cbar.ax.get_yaxis().labelpad = 15

    n_labels = len(textlabels)
    if n_labels == 0:
        # Nothing to write; also avoids dividing by zero below.
        return

    # Centers of n_labels equal slots covering the 0..1 range.
    int_sep = 1.0 / n_labels
    slot_centers = np.linspace(int_sep / 2, 1 - int_sep / 2, n_labels)
    for center, lab in zip(slot_centers, textlabels):
        # The positions are fractions of the colorbar axes, so they are given
        # in axes coordinates explicitly instead of relying on the data range
        # of the colorbar axis being 0..1.
        cbar.ax.text(1.5, center, lab, ha='left', va='center', fontsize=14,
                     transform=cbar.ax.transAxes)
def plot_save_parallel(folder_pickles):
# Using files from pickle. Provides directly the folder with the pickles to
# convert. This was implemented for the sake of running things in parallel,
......@@ -264,3 +314,97 @@ def plot_save_parallel(folder_pickles):
name_fig = full_filename[:-3] + format_fig
fig.savefig(name_fig, format = format_fig, dpi = 200)
plt.close(fig)
def _leading_mark_extent(profile, window):
    """Walk forward from index 0 of ``profile`` over nonzero entries.

    Repeatedly inspects the ``window`` entries starting at the current index;
    if any is nonzero the index jumps to the last nonzero one. Returns the
    index at which no further progress is made (0 if it never moves).
    """
    edge = 0
    while True:
        hits = np.flatnonzero(profile[edge:edge + window])
        if hits.size == 0:
            break
        candidate = edge + hits[-1]
        if candidate <= edge:
            break
        edge = candidate
    return edge


def _trailing_mark_extent(profile, window):
    """Walk backward from the end of ``profile`` over nonzero entries.

    Works with negative indices, starting at -1. Repeatedly inspects the
    ``window`` entries just before the current index (the entry at the
    current index itself is not inspected); if any is nonzero the index jumps
    to the first nonzero one. Returns the negative index at which no further
    progress is made (-1 if it never moves).
    """
    edge = -1
    while True:
        hits = np.flatnonzero(profile[edge - window:edge])
        if hits.size == 0:
            break
        candidate = edge - window + hits[0]
        if candidate >= edge:
            break
        edge = candidate
    return edge


def remove_margscan(imarray, coord_textboxes):
    """Paint the dark marks found along the four borders of an image white.

    Channel 0 of the image is inverted and binarized (inverted value >= 20
    counts as ink). Ink inside the bounding region of all text boxes and in
    the middle third of each axis is ignored. Rows and columns whose remaining
    ink reaches the threshold are treated as marks, and starting from each
    border the image is set to 255 up to the innermost mark reachable in
    steps of 1/25 of the image size.

    Parameters
    ----------
    imarray : 3-D array indexed as ``[row, column, channel]``
        Image to clean; presumably uint8 with 255 meaning white (the code
        computes ``255 - value``). It is modified in place.
    coord_textboxes : array indexed as ``[0..3, box]``
        Rows 0 and 2 are used as row limits and rows 1 and 3 as column limits
        of the text boxes; they are used as slice bounds, so they must be
        integers. If it is empty, no text region is masked out.

    Returns
    -------
    The same ``imarray`` object, after modification.
    """
    thres = 0.3 * 255  # 30 % of a full-ink line, per pixel of image size

    # Binary ink map: 255 where the pixel is dark, 0 elsewhere. The
    # subtraction creates a new array, so imarray is not touched here.
    ink = np.abs(255 - imarray[:, :, 0])
    ink[ink < 20] = 0
    ink[ink >= 20] = 255

    # Size of the look-ahead window used when walking in from each border.
    next_to_check_c = int(ink.shape[1] / 25)
    next_to_check_r = int(ink.shape[0] / 25)

    # Ignore everything inside the overall bounding region of the text.
    if np.size(coord_textboxes):
        ink[np.min(coord_textboxes[0, :]):np.max(coord_textboxes[2, :]),
            np.min(coord_textboxes[1, :]):np.max(coord_textboxes[3, :])] = 0

    ink_per_row = np.sum(ink, axis=1)  # one value per image row
    ink_per_col = np.sum(ink, axis=0)  # one value per image column

    # Marks are only searched for in the outer thirds of the image.
    third_r = int(np.round(len(ink_per_row) / 3))
    third_c = int(np.round(len(ink_per_col) / 3))
    ink_per_row[third_r:2 * third_r] = 0
    ink_per_col[third_c:2 * third_c] = 0

    # Discard rows/columns with too little ink to be a mark.
    # NOTE(review): the per-row sums are compared against the number of rows
    # and the per-column sums against the number of columns; the length of
    # the summed line would be the other dimension. Kept as is - confirm.
    ink_per_row[ink_per_row < thres * ink.shape[0]] = 0
    ink_per_col[ink_per_col < thres * ink.shape[1]] = 0

    # NOTE(review): the left/top slices stop just before the innermost mark
    # found, while the right/bottom slices include it and always cover at
    # least the last column/row. This asymmetry is kept unchanged - confirm
    # whether it is intended.
    # Vertical marks: left side, then right side.
    left = _leading_mark_extent(ink_per_col, next_to_check_c)
    imarray[:, :left, :] = 255
    right = _trailing_mark_extent(ink_per_col, next_to_check_c)
    imarray[:, right:, :] = 255

    # Horizontal marks: top side, then bottom side.
    top = _leading_mark_extent(ink_per_row, next_to_check_r)
    imarray[:top, :, :] = 255
    bottom = _trailing_mark_extent(ink_per_row, next_to_check_r)
    imarray[bottom:, :, :] = 255

    return imarray
\ No newline at end of file
This diff is collapsed.
......@@ -33,8 +33,8 @@ def call_with_out(full_comm):
p = subprocess.Popen(full_comm, stdout=subprocess.PIPE, shell=True)
(output, err) = p.communicate()
p_status = p.wait() ## Wait for function to terminate.
print("Command output : ", output)
print("Command exit status/return code : ", p_status)
#print("Command output : ", output)
#print("Command exit status/return code : ", p_status)
return output
# This is a bit cumbersome, but wand.image is not importing the pdf from Spyder,
......@@ -206,3 +206,17 @@ def correct_metadata(year, id_doc, flag_end):
tree = ET.ElementTree(XML_root_meta)
tree.write(full_path + '_metacorr.xml', encoding = 'utf-8')
return full_path + '_metacorr.xml'
def find_all(str_s, text_s):
    """Return the start indices of all occurrences of ``text_s`` in ``str_s``.

    Occurrences are searched left to right and do not overlap: after a match
    the search resumes right behind it.

    Parameters
    ----------
    str_s : str
        Text to search in.
    text_s : str
        Text to search for. An empty string yields no matches.

    Returns
    -------
    numpy.ndarray
        Integer array with the start index of every match, in increasing
        order; empty if there is none.
    """
    ind_l = []
    # An empty needle matches everywhere without advancing the search
    # position, so it is treated as "no matches" instead of looping forever.
    if text_s:
        step = len(text_s)
        # str.find with a start offset avoids re-slicing the text per match.
        pos = str_s.find(text_s)
        while pos != -1:
            ind_l.append(pos)
            pos = str_s.find(text_s, pos + step)
    # Explicit dtype so an empty result is still an integer index array.
    return np.array(ind_l, dtype=int)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment