From b100be6d5813bb06f2bed1ba03f5fc3c30005ce0 Mon Sep 17 00:00:00 2001 From: Lilian Gasser <gasserli@ethz.ch> Date: Thu, 6 Dec 2018 15:51:00 +0100 Subject: [PATCH] WIP again --- src/python/def_classes.py | 50 ++++ src/python/preproc_docs.py | 390 +++++++++++++------------- src/python/run_extract_discussions.py | 8 +- src/python/utils_annot.py | 52 ++++ 4 files changed, 302 insertions(+), 198 deletions(-) diff --git a/src/python/def_classes.py b/src/python/def_classes.py index f90a20a6..30917e30 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -48,6 +48,7 @@ class Document: self.name_wo_ext = os.path.splitext(self.name_file)[0] self.folder_database = folder_database self._meta_ext() + self._xml_ext() def _meta_ext(self): # Both for the correction and the extraction of the metadata information @@ -56,6 +57,13 @@ class Document: name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz' self.name_meta = [name_tar, name_file, name_file_db] + def _xml_ext(self, suffix_xml = '_data', name_outcorrxml = '04_correctedxml'): + # For the extraction, correction and annotation of the xmls + # TODO for extraction and annotation + name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml' + name_tar = self.folder_database + str(self.year) + '/' + name_outcorrxml + '.tar.gz' + self.name_xml_corr = [name_tar, name_xml] + def meta_correct(self, name_outmeta = '03_correctedmeta'): utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) utils_proc.tar_extractfile(self.name_meta[2], self.folder_database, name_file = self.name_inmeta) @@ -309,6 +317,7 @@ class Document: self.name_outcorrxml = name_outcorrxml self.name_xml_corr = [name_tar, name_xml] + self._xml_ext(suffix_xml, self.name_outcorrxml) command = 'rm -rf ./' + str(self.year) #print(command) utils_proc.call_with_out(command) @@ -684,6 +693,47 @@ class Document: def check_discussion(self): utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) flag_discussion = utils_annot.check_if_discussion(self.name_meta[1]) + command = 'rm -rf ./' + str(self.year) + #print(command) + utils_proc.call_with_out(command) + + return flag_discussion + + + + def annotate_speakers(self, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', name_outannotxml='05_annotatedxml'): + + if 'name_outcorrxml' not in self.__dict__.keys(): + self.name_outcorrxml = name_outcorrxml + + if 'name_outxml' not in self.__dict__.keys(): + self.name_outxml = name_outxml + + if 'XML_main_corr' not in self.__dict__.keys(): + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' + if os.path.isfile(name_tar): + name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' + if name_xml in utils_proc.get_list(self.year, self.folder_database, self.name_outcorrxml)[0]: + h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, self.name_outcorrxml) + XML_tree = ET.parse(h_xml) + self.XML_main_corr = XML_tree.getroot() + else: + self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, + pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml, + name_outcorrxml = self.name_outcorrxml) + else: + # TODO if already exists 02_extractedxml + self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0, + pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml, + name_outcorrxml = self.name_outcorrxml) + + #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml) + XML_corr = utils_annot.get_text_corrected(self.XML_main_corr) + + + + + command = 'rm -rf ./' + str(self.year) #print(command) utils_proc.call_with_out(command) diff --git a/src/python/preproc_docs.py b/src/python/preproc_docs.py index 8b4349f0..b9c9bbb0 100644 --- a/src/python/preproc_docs.py +++ b/src/python/preproc_docs.py @@ -6,7 +6,7 @@ Created on Fri Sep 28 13:39:10 2018 @author: luissalamanca """ -# File for all the functions used for preprocessing. +# File for all the functions used for preprocessing. import numpy as np import os @@ -46,42 +46,42 @@ import tables HEIGHT_CHAR = 12 WIDTH_CHAR = 6 -def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, +def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, flag_2col, flag_central = 1): # Using the coordinates of the boxes, we put the rest to 0, and then estimate # the central line - # Here, since we use the image, we have to rely again on a ref00 in topleft, and + # Here, since we use the image, we have to rely again on a ref00 in topleft, and # the corners in topleftbottomright - # We also look for horizontal lines + # We also look for horizontal lines # We assume that we will only have one vertical line, and then many horizontal # lines, either spanning the whole image, or at both sides of the central line - + coord, rescale_factor = adapt_coordtoimg(img, coord, dim_bbox_page) img_aux = np.abs(255 - img[:,:,0]) img_aux[img_aux < 20] = 0 img_aux[img_aux >= 20] = 255 img_aux_in = np.copy(img_aux) - - + + width_resc = WIDTH_CHAR * rescale_factor[0,1] height_resc = HEIGHT_CHAR * rescale_factor[0,1] gap_central = int(4 * width_resc) top_bbox_red = 0 #int(height_resc/2) - + for ind in range(coord.shape[1]): img_aux[(coord[0,ind] + top_bbox_red):coord[2,ind],coord[1,ind]:coord[3,ind]] = 0 - + # Also remove possible mark and artefacts in the edges img_aux[:,:int(img_aux.shape[1]/20)] = 0 img_aux[:int(img_aux.shape[0]/20),:] = 0 img_aux[int(19 * img_aux.shape[0]/20):,:] = 0 - img_aux[:,int(19 * img_aux.shape[1]/20):] = 0 - + img_aux[:,int(19 * img_aux.shape[1]/20):] = 0 + img_prev = np.copy(img_aux) - + img_aux_rem = remove_small_objects(label(img_aux), 2 * width_resc) #img_aux = dilation(img_aux_rem, selem = np.ones((11,11))) - img_aux = dilation(img_aux_rem, selem = np.ones((5,5))) + img_aux = dilation(img_aux_rem, selem = np.ones((5,5))) max_val = np.max(img_aux) if max_val > 0: img_aux_norm = (255 * img_aux/max_val).astype(np.uint8) @@ -90,13 +90,13 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, else: img_aux[:] = 0 #print(np.unique(img_aux)) - + # Remove big objects, like the shields and other logos #img_label = label(img_aux) edges = canny(img_aux, 2, 1, 25) #img_cent = np.copy(img_aux) - + if flag_2col: if flag_central: img_cent = np.copy(img_prev) @@ -108,37 +108,37 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, #lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = 2 * width_resc, # line_gap = width_resc) lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = int(2 * width_resc), - line_gap = int(width_resc)) + line_gap = int(width_resc)) else: sum_img_aux_in = np.sum(img_aux_in, axis = 0) sum_img_aux_in = sum_img_aux_in[int(2*img_aux.shape[1]/5):int(3*img_aux.shape[1]/5)] - + #plt.plot(sum_img_aux_in) #sum_img_aux_in[sum_img_aux_in < np.max(sum_img_aux_in)/10] = 0 # We need to substract the baseline value, in order to account for # central headers and stuff like that sum_img_aux_in = sum_img_aux_in - np.min(sum_img_aux_in) - #not_end_vect = 1 + #not_end_vect = 1 #while not_end_vect: ind_min_start = np.argwhere((sum_img_aux_in) < np.mean(sum_img_aux_in)/10) - ind_min_end = int(2*img_aux.shape[1]/5) + np.max(ind_min_start) + ind_min_end = int(2*img_aux.shape[1]/5) + np.max(ind_min_start) ind_min_start = int(2*img_aux.shape[1]/5) + np.min(ind_min_start) ind_central = int((ind_min_start + ind_min_end)/2) - coord_vert_def = np.array([1, ind_central - int(width_resc/2), + coord_vert_def = np.array([1, ind_central - int(width_resc/2), img_aux_in.shape[0], ind_central + int(width_resc/2)]) - #print(lines_vert,img_aux.shape) - - theta = np.linspace(-5*pi/8, -3* pi/8,num = 90) - #theta = np.linspace(-9*pi/16, -7*pi/16,num = 90) + #print(lines_vert,img_aux.shape) + + theta = np.linspace(-5*pi/8, -3* pi/8,num = 90) + #theta = np.linspace(-9*pi/16, -7*pi/16,num = 90) #lines_horz = probabilistic_hough_line(edges, theta = theta, line_length = 2 * width_resc, - # line_gap = width_resc) + # line_gap = width_resc) lines_horz = probabilistic_hough_line(edges, theta = theta, line_length = int(2 * width_resc), - line_gap = int(width_resc)) - - # These lines are given in a standard xy coordinate, with the corner in the - # bottom left + line_gap = int(width_resc)) + + # These lines are given in a standard xy coordinate, with the corner in the + # bottom left lines_horz = np.transpose(np.asarray(lines_horz).reshape((len(lines_horz),4))) - + lines_horz = np.concatenate((np.minimum(lines_horz[1,:],lines_horz[3,:]).reshape((1,lines_horz.shape[1])), np.minimum(lines_horz[0,:],lines_horz[2,:]).reshape((1,lines_horz.shape[1])), @@ -150,15 +150,15 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, np.minimum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])), np.maximum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])), np.maximum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])))).astype(np.int32) - - + + #lines_horz = transform_coord(lines_horz, dim_page = img_aux.shape, invert_xy = True) - #lines_vert = transform_coord(lines_vert, dim_page = img_aux.shape, invert_xy = True) - + #lines_vert = transform_coord(lines_vert, dim_page = img_aux.shape, invert_xy = True) + # First clean the vertical from unexpected outliers if flag_central: - sum_rows = np.sum(img_cent, axis = 0)/255 - ind_central = int(2*img.shape[1]/5) + np.argmax(sum_rows[int(2*img.shape[1]/5):int(3*img.shape[1]/5)]) + sum_rows = np.sum(img_cent, axis = 0)/255 + ind_central = int(2*img.shape[1]/5) + np.argmax(sum_rows[int(2*img.shape[1]/5):int(3*img.shape[1]/5)]) ind_valid = np.intersect1d(np.argwhere([(ind_central - gap_central) < aux_l1 < (ind_central + gap_central) for aux_l1 in lines_vert[1,:]]), np.argwhere([(ind_central - gap_central) < aux_l2 < (ind_central + gap_central) for aux_l2 in lines_vert[3,:]])) if len(ind_valid): @@ -169,19 +169,19 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, coord_vert_def = np.array([0, img_aux.shape[1]/2 - width_resc, height_resc, img_aux.shape[1]/2 + width_resc]) #ind_central = np.mean(coord_vert_def[[1,3]]) - + # And now, just iterate over the horizontal lines, merging them if required. return clean_horz_vert_lines(lines_horz, coord_vert_def, width_resc, height_resc, ind_central, gap_central, img_aux.shape) - - + + def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_resc, ind_central, gap_central, dim_page): # We just iterate over all the horizontal lines, merging them if required coord_horz = np.array([]).reshape((4,0)).astype(np.int32) min_length_line = 2 * width_resc - - while coord_horz_pre.size > 3: + + while coord_horz_pre.size > 3: if coord_horz_pre.shape[1] == 1: coord_horz = np.concatenate((coord_horz, coord_horz_pre[:,0].reshape((4,1))), axis = 1) coord_horz_pre = np.array([]) @@ -190,33 +190,33 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res #print(coord_horz_curr) coord_horz_check = coord_horz_pre[:,1:] flag_stay = 1 - while flag_stay: + while flag_stay: # Boxes to the right ind_val1 = np.intersect1d(np.argwhere((abs(coord_horz_check[1,:] - coord_horz_curr[3]) < (width_resc * 10))), np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc)))) # Boxes to the left ind_val2 = np.intersect1d(np.argwhere((abs(coord_horz_check[3,:] - coord_horz_curr[1]) < (width_resc * 10))), - np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc)))) - + np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc)))) + ind_val = np.unique(np.concatenate((ind_val1,ind_val2))) if len(ind_val) > 0: for i_b in range(len(ind_val)): - coord_horz_curr = np.array([np.min((coord_horz_curr[0],coord_horz_check[0,ind_val[i_b]])), + coord_horz_curr = np.array([np.min((coord_horz_curr[0],coord_horz_check[0,ind_val[i_b]])), np.min((coord_horz_curr[1],coord_horz_check[1,ind_val[i_b]])), - np.max((coord_horz_curr[2],coord_horz_check[2,ind_val[i_b]])), + np.max((coord_horz_curr[2],coord_horz_check[2,ind_val[i_b]])), np.max((coord_horz_curr[3],coord_horz_check[3,ind_val[i_b]]))]) coord_horz_check = coord_horz_check[:,np.setdiff1d(np.arange(coord_horz_check.shape[1]), ind_val)] - #coord_horz_check = np.delete(coord_horz_check, ind_val, 1) + #coord_horz_check = np.delete(coord_horz_check, ind_val, 1) if coord_horz_check.shape[1] == 0: flag_stay = 0 coord_horz = np.concatenate((coord_horz, coord_horz_curr.reshape((4,1))), axis = 1) coord_horz_pre = np.array([]) - else: + else: flag_stay = 0 coord_horz = np.concatenate((coord_horz, coord_horz_curr.reshape((4,1))), axis = 1) coord_horz_pre = coord_horz_check[:,:] - + # Remove overlapping boxes coord_horz_def = np.array([]).reshape((4,0)) while coord_horz.size > 3: @@ -226,15 +226,15 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res np.argwhere((width_resc/2 + coord_horz_curr[2]) > coord_horz[2,:]), np.argwhere((width_resc/2 + coord_horz_curr[3]) > coord_horz[3,:]))) ind_overlap = np.setdiff1d(ind_overlap,0) - + coord_horz_def = np.concatenate((coord_horz_def, coord_horz_curr.reshape((4,1))), axis = 1) coord_horz = coord_horz[:,np.setdiff1d(np.arange(1,coord_horz.shape[1]),ind_overlap)] #coord_horz = np.delete(coord_horz, ind_overlap, 1) - + if coord_horz.size == 4: coord_horz_def = np.concatenate((coord_horz_def, coord_horz.reshape((4,1))), axis = 1) coord_horz = np.array([0]) - + ind_val_long = np.argwhere((coord_horz_def[3,:] - coord_horz_def[1,:]) > (3 * (coord_horz_def[2,:] - coord_horz_def[0,:]))) coord_horz_def = coord_horz_def[:,ind_val_long].reshape((4,ind_val_long.shape[0])) @@ -245,35 +245,35 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res coord_horz_def = coord_horz_def[:, ind_val].reshape((4,ind_val.shape[0])) else: coord_horz_def = np.array([]).reshape((4,0)) - - + + # To identify the topline ''' ind_topline = identify_topline(coord_horz_def, width_resc, dim_page) - if str_page == 'firsts': + if str_page == 'firsts': # We correct the top of the vertical line in case it is cutting some of the horizontal lines ind_val_horz = reduce(np.intersect1d, (np.argwhere(coord_horz_def[1,:] < (ind_central - gap_central)), np.argwhere(coord_horz_def[3,:] > (ind_central + gap_central)), np.argwhere(coord_horz_def[0,:] > coord_vert_def[0]))) ind_val_horz = np.setdiff1d(ind_val_horz, ind_topline) - + coord_vert_def = np.array([np.max(np.concatenate((np.array([coord_vert_def[0]]),coord_horz_def[2,ind_val_horz]))),coord_vert_def[1], - coord_vert_def[2],coord_vert_def[3]]) + coord_vert_def[2],coord_vert_def[3]]) elif str_page == 'lasts': - # We correct the bottom of the vertical line in case it is cutting some of the horizontal lines + # We correct the bottom of the vertical line in case it is cutting some of the horizontal lines ind_val_horz = reduce(np.intersect1d, (np.argwhere(coord_horz_def[1,:] < (ind_central - gap_central)), np.argwhere(coord_horz_def[3,:] > (ind_central + gap_central)), - np.argwhere(coord_horz_def[2,:] < coord_vert_def[2]))) + np.argwhere(coord_horz_def[2,:] < coord_vert_def[2]))) ind_val_horz = np.setdiff1d(ind_val_horz, ind_topline) - + coord_vert_def = np.array([coord_vert_def[0],coord_vert_def[1], - np.min(np.concatenate((np.array([coord_vert_def[2]]),coord_horz_def[0,ind_val_horz]))),coord_vert_def[3]]) + np.min(np.concatenate((np.array([coord_vert_def[2]]),coord_horz_def[0,ind_val_horz]))),coord_vert_def[3]]) ''' - + coord_vert_def[1] = np.max((coord_vert_def[1], int(ind_central - width_resc))) coord_vert_def[3] = np.min((coord_vert_def[3], int(ind_central + width_resc))) - - # Finally, remove short central lines, likely artefacts of the calculation + + # Finally, remove short central lines, likely artefacts of the calculation # of the central vertical line length_lines = coord_horz_def[3,:] - coord_horz_def[1,:] ind_wrong = reduce(np.intersect1d, (np.argwhere(length_lines < 2* min_length_line), @@ -283,98 +283,98 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res if len(ind_val): coord_horz_def = coord_horz_def[:, ind_val].reshape((4,ind_val.shape[0])) else: - coord_horz_def = np.array([]).reshape((4,0)) - - return coord_vert_def, coord_horz_def + coord_horz_def = np.array([]).reshape((4,0)) + + return coord_vert_def, coord_horz_def def identify_topline(coord_horz, width_resc, dim_page): # Two rules for identifying the top line ind_topline = reduce(np.intersect1d, (np.argwhere(coord_horz[2,:] < dim_page[0]/8), np.argwhere((coord_horz[3,:] - coord_horz[1,:]) > width_resc * 60))) - + return ind_topline def lateral_margins(img, dim_bbox_page, coord_vert, coord_horz): - + coord, rescale_factor = adapt_coordtoimg(img, dim_bbox_page, dim_bbox_page) width_resc = WIDTH_CHAR * rescale_factor[0,1] gap_central = int(3 * width_resc) thres_margin = 0.1 - + img_aux = np.abs(255 - img[:,:,0]) for ind in range(coord_horz.shape[1]): img_aux[coord_horz[0,ind]:coord_horz[2,ind],coord_horz[1,ind]:coord_horz[3,ind]] = 0 - + img_aux[coord_vert[0]:coord_vert[2],coord_vert[1]:coord_vert[3]] = 0 central_line = (coord_vert[1] + coord_vert[3])/2 - + # Also remove possible mark and artefacts in the edges img_aux[:,:gap_central] = 0 img_aux[:int(gap_central/2),:] = 0 img_aux[(img_aux.shape[1] - gap_central):,:] = 0 - img_aux[:,(img_aux.shape[1] - int(gap_central/2)):] = 0 - + img_aux[:,(img_aux.shape[1] - int(gap_central/2)):] = 0 + sum_imarray_aux = np.sum(img_aux, axis = 0) sum_imarray_aux = 1000*sum_imarray_aux.astype(np.float64)/np.max(sum_imarray_aux) mean_val_rows_left = np.mean(sum_imarray_aux[:int(central_line - gap_central)]) mean_val_rows_right = np.mean(sum_imarray_aux[int(central_line + gap_central):]) - + left_margin = np.min(np.argwhere(sum_imarray_aux > thres_margin * mean_val_rows_left)) right_margin = np.max(np.argwhere(sum_imarray_aux > thres_margin * mean_val_rows_right)) - + return left_margin, right_margin, left_margin/rescale_factor[0,1], right_margin/rescale_factor[0,1] def bottomtop_margins(img, dim_bbox_page, coord_vert, coord_horz): val_thres = 300 # In this case we don't use the mean of sum_cols because we have - + coord, rescale_factor = adapt_coordtoimg(img, dim_bbox_page, dim_bbox_page) img_aux = np.abs(255 - img[:,:,0]) - + height_resc = HEIGHT_CHAR * rescale_factor[0,1] width_resc = WIDTH_CHAR * rescale_factor[0,1] gap_central = int(3 * width_resc) - + for ind in range(coord_horz.shape[1]): img_aux[coord_horz[0,ind]:coord_horz[2,ind],coord_horz[1,ind]:coord_horz[3,ind]] = 0 - + img_aux[coord_vert[0]:coord_vert[2],coord_vert[1]:coord_vert[3]] = 0 - + sum_cols = np.sum(img_aux, axis = 1)/255 sum_cols = 1000 * sum_cols/np.max(sum_cols) - + # Now, limit by using the horizontal lines ind_topline = identify_topline(coord_horz, width_resc, img_aux.shape) - + if len(ind_topline) > 0: ind_min_textbox = np.max(coord_horz[2,ind_topline]) sum_cols[:ind_min_textbox] = 0 - + #plt.figure() #plt.plot(sum_cols) - ind_limits = np.array([np.min(np.argwhere(sum_cols > val_thres)), + ind_limits = np.array([np.min(np.argwhere(sum_cols > val_thres)), np.max(np.argwhere(sum_cols > val_thres))]) - - return ind_limits + + return ind_limits def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_font_sizes): - + # In xml_page the levels are: xml_page[i][j][k], i for blocks, j for textlines # and k for characters - + coord, rescale_factor = adapt_coordtoimg(img, bbox_page, bbox_page) list_coords_blocks = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_page[:-2]]).astype(np.float64)) list_coords_blocks, rescale_factor = adapt_coordtoimg(img, list_coords_blocks, bbox_page) - + font_main_block = info_font_sizes[0, np.argmax(info_font_sizes[1,:])] thres_font = font_main_block/5 # To compensate for error in the fontsize between columns width_resc = WIDTH_CHAR * rescale_factor[0,1] height_resc = HEIGHT_CHAR * rescale_factor[0,1] gap_central = int(2 * width_resc) indentation = int(4 * width_resc) - + ind_central = (coord_vert_def[3] + coord_vert_def[1])/2 - + # First pass just to discover main blocks list_col1 = list() list_col2 = list() @@ -383,25 +383,25 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon all_mean_heights = np.array([]).reshape((1,0)) list_allcoords_textlines = np.array([]).reshape((4,0)) relative_ref_textline = np.array([], dtype = np.uint32).reshape((3,0)) - + count_text = 0 - + for ind_block in range(len(xml_page)-2): xml_block = xml_page[ind_block] list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:] if 'bbox' in o.attrib]).astype(np.float64)) - #list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:]]).astype(np.float64)) + #list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:]]).astype(np.float64)) if len(list_coords_textline)>3: list_coords_textline list_coords_textline, rescale_factor = adapt_coordtoimg(img, list_coords_textline, bbox_page) list_allcoords_textlines = np.concatenate((list_allcoords_textlines, list_coords_textline), axis = 1) relative_ref_textline_aux = np.zeros((3,list_coords_textline.shape[1])) - + relative_ref_textline_aux[0,:] = count_text + np.arange(list_coords_textline.shape[1]) relative_ref_textline_aux[1,:] = ind_block relative_ref_textline_aux[2,:] = np.arange(list_coords_textline.shape[1]) relative_ref_textline = np.concatenate((relative_ref_textline,relative_ref_textline_aux.astype(np.uint32)), axis = 1) - + for ind_textl in range(list_coords_textline.shape[1]): all_heights = np.array([]) xml_textline = xml_block[ind_textl] @@ -412,10 +412,10 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon all_heights = np.append(all_heights, float(xml_text.attrib['size'])) #fontsize = fontsize_fromtextline(img[bbox_textline[0]:bbox_textline[2], # bbox_textline[1]:bbox_textline[3],0]) - + fontsize = np.average(all_heights) all_mean_heights = np.append(all_mean_heights, fontsize) - + # Normal font #if ((font_main_block - thres_font) < mean_height < (font_main_block + thres_font)): if ((font_main_block - thres_font) < fontsize < (font_main_block + thres_font)): @@ -426,18 +426,18 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon if len(xml_block[0]) < 12: list_pagen.append(count_text) else: - list_textinheader.append(count_text) + list_textinheader.append(count_text) elif ((bbox_textline[1] > (margins[0] - indentation)) and (bbox_textline[3] < (ind_central + gap_central))): list_col1.append(count_text) # Right side of the central line - elif ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central))): + elif ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central))): list_col2.append(count_text) count_text += 1 - + discovered_blocks = np.concatenate((np.array(list_col1),np.array(list_col2), np.array(list_pagen),np.array(list_textinheader))) blocks_left = np.setdiff1d(np.arange(list_allcoords_textlines.shape[1]),discovered_blocks) - + if len(list_col1): bbox_col1 = np.array([np.min(list_allcoords_textlines[0,list_col1]), np.min(list_allcoords_textlines[1,list_col1]), @@ -445,17 +445,17 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon np.max(list_allcoords_textlines[3,list_col1])]) else: bbox_col1 = np.array([0,0,10,10]) # Dummy value - + if len(list_col2): bbox_col2 = np.array([np.min(list_allcoords_textlines[0,list_col2]), np.min(list_allcoords_textlines[1,list_col2]), np.max(list_allcoords_textlines[2,list_col2]), np.max(list_allcoords_textlines[3,list_col2])]) else: - bbox_col2 = np.array([0,0,10,10]) # Dummy value + bbox_col2 = np.array([0,0,10,10]) # Dummy value list_header = list() - list_header_singlecol = list() + list_header_singlecol = list() list_footnote = list() list_notidentified = list() for ind_textline in blocks_left: @@ -463,7 +463,7 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon if xml_textline.tag == 'textline': bbox_textline = list_allcoords_textlines[:,ind_textline] # Small fontsize and below current bboxes of main blocks - if ((all_mean_heights[ind_textline] < (font_main_block - thres_font)) and + if ((all_mean_heights[ind_textline] < (font_main_block - thres_font)) and (bbox_textline[2] > bbox_col1[2]) and (bbox_textline[2] > bbox_col2[2])): list_footnote.append(ind_textline) # Large fontsizes @@ -473,7 +473,7 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon list_header.append(ind_textline) # To the left or right of the central line elif (((bbox_textline[1] > (margins[0] - indentation)) and (bbox_textline[3] < (ind_central + gap_central))) or - ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central)))): + ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central)))): list_header_singlecol.append(ind_textline) # Standard fontsize elif ((font_main_block - thres_font) < all_mean_heights[ind_textline] < (font_main_block + thres_font)): @@ -483,13 +483,13 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon list_col1.append(ind_textline) # Contained into the bbox of the right column elif (((bbox_col2[0] - height_resc) < bbox_textline[0]) and ((bbox_col2[1] - width_resc) < bbox_textline[1]) - and ((bbox_col2[2] + height_resc) > bbox_textline[2]) and ((bbox_col2[3] + width_resc) > bbox_textline[3])): + and ((bbox_col2[2] + height_resc) > bbox_textline[2]) and ((bbox_col2[3] + width_resc) > bbox_textline[3])): list_col2.append(ind_textline) else: list_notidentified.append(ind_textline) - + label_textlines = dict() - label_textlines['text_col1'] = list_col1 + label_textlines['text_col1'] = list_col1 label_textlines['text_col2'] = list_col2 label_textlines['footnote'] = list_footnote label_textlines['pagen'] = list_pagen @@ -497,8 +497,8 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon label_textlines['header'] = list_header label_textlines['header_singlecol'] = list_header_singlecol label_textlines['notidentified'] = list_notidentified - - vec_labels_textline = np.zeros(list_allcoords_textlines.shape[1]).astype(np.str) + + vec_labels_textline = np.zeros(list_allcoords_textlines.shape[1]).astype(np.str) vec_labels_textline[list_col1] = 'text_col1' vec_labels_textline[list_col2] = 'text_col2' vec_labels_textline[list_footnote] = 'footnote' @@ -507,56 +507,56 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon vec_labels_textline[list_header] = 'header' vec_labels_textline[list_header_singlecol] = 'header_singlecol' vec_labels_textline[list_notidentified] = 'notidentified' - - # relative_ref_textline: three rows with the following, the aboslute reference + + # relative_ref_textline: three rows with the following, the aboslute reference # for the textline, the number of the block, and the number of the textline inside # that block return label_textlines, list_allcoords_textlines, relative_ref_textline, all_mean_heights, vec_labels_textline - + def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textlines, margins): - # Two steps, first ordering the textlines, grouping them in big blocks separated + # Two steps, first ordering the textlines, grouping them in big blocks separated # by horizontal lines. Then, inside these groups, we group them in textboxes, # incorporating this to the XML height_resc = HEIGHT_CHAR * rescale_factor[0,1] widht_resc = WIDTH_CHAR * rescale_factor[0,1] - + gap_central = 3 * widht_resc gap_row = height_resc/2 - - # This parameters is intended for removing artefacts such as small dots in the + + # This parameters is intended for removing artefacts such as small dots in the # text. But we have to be careful, as we can remove valuable characters. # I first set a value of 3 * width_resc/4 - min_width_textl = 6 * widht_resc/4 - + min_width_textl = 6 * widht_resc/4 + central_line = (coord_vert_def[3] + coord_vert_def[1])/2 array_coords_textl = np.concatenate((list_allcoords_textlines[:,:], np.arange(list_allcoords_textlines.shape[1]).reshape((1,list_allcoords_textlines.shape[1])))) - + # Clean from to thin lines, thatn are just probably artefacts all_widths = array_coords_textl[3,:] - array_coords_textl[1,:] ind_valid = np.argwhere(all_widths > min_width_textl) array_coords_textl = array_coords_textl[:,ind_valid].reshape((5,len(ind_valid))) - + ind_centralines = np.intersect1d(np.argwhere(coord_horz[1,:] < (central_line - gap_central)), np.argwhere(coord_horz[3,:] > (central_line + gap_central))) ind_sepfootnotes = np.intersect1d(np.argwhere(coord_horz[1,:] < (margins[0] + 2 * widht_resc)), - np.argwhere(coord_horz[3,:] < (central_line - gap_central))) + np.argwhere(coord_horz[3,:] < (central_line - gap_central))) ind_centralines = np.union1d(ind_centralines,ind_sepfootnotes) ind_collines = np.setdiff1d(np.arange(coord_horz.shape[1]),ind_centralines) - + array_coords_centrall = coord_horz[:,ind_centralines] array_coords_coll = coord_horz[:,ind_collines] array_coords_coll = np.concatenate((array_coords_coll, -1 * np.ones(array_coords_coll.shape[1]).reshape((1,array_coords_coll.shape[1])))) - - not_visited = 1 + + not_visited = 1 toprow = 0 count_b = 0 set_of_blocks = dict() array_coords_centrall_ord = np.array([]).reshape((4,0)) while not_visited: - + if array_coords_centrall.size > 3: bottomrow = np.min(array_coords_centrall[0,:]) ind_bottomrow = np.argmin(array_coords_centrall[0,:]) @@ -573,47 +573,47 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl coord_cat = np.concatenate((array_coords_textl[:,ind_textl_proc].reshape(5,len(ind_textl_proc)), array_coords_coll[:,ind_lines_proc].reshape(5,len(ind_lines_proc))), axis = 1) - + if coord_cat.size > 0: flag_col = 1 - ind_currcord = topmost_left_box(coord_cat, gap_row, max_col = central_line) + ind_currcord = topmost_left_box(coord_cat, gap_row, max_col = central_line) if ind_currcord == -1: - ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line) + ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line) flag_col = 2 - + order_coords = np.array([]).reshape(5,0) while coord_cat.size > 4: order_coords = np.concatenate((order_coords,coord_cat[:,ind_currcord].reshape(5,1)), axis = 1) curr_coord = coord_cat[:,ind_currcord] coord_cat = np.delete(coord_cat,ind_currcord,1) if coord_cat.size > 4: - if flag_col == 1: + if flag_col == 1: ind_currcord = next_textline_samerow(coord_cat, gap_row, curr_coord, max_col = central_line) - + if ind_currcord == -1: ind_currcord = next_textline_samecol(coord_cat, gap_row, max_col = central_line) - + if ind_currcord == -1 : - ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line) + ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line) flag_col = 2 - + elif flag_col == 2: ind_currcord = next_textline_samerow(coord_cat, gap_row, curr_coord, min_col = central_line) - + if ind_currcord == -1: ind_currcord = next_textline_samecol(coord_cat, gap_row, min_col = central_line) - + if ind_currcord == -1 : flag_col = 1 ind_currcord = 0 - + else: - order_coords = np.array([]).reshape(5,0) - + order_coords = np.array([]).reshape(5,0) + toprow = np.copy(bottomrow) set_of_blocks[count_b] = order_coords count_b += 1 - + return set_of_blocks, array_coords_centrall_ord def topmost_left_box(coords, gap_row, min_col = 0, max_col = 10000): @@ -629,13 +629,13 @@ def topmost_left_box(coords, gap_row, min_col = 0, max_col = 10000): return curr_ind else: return -1 - + def next_textline_samerow(coords, gap_row, curr_coord, min_col = 0, max_col = 10000): curr_row = curr_coord[2] #ind_valid = np.intersect1d(np.argwhere(coords[1,:] < max_col), - # np.argwhere(coords[3,:] > min_col)) + # np.argwhere(coords[3,:] > min_col)) ind_valid = np.intersect1d(np.argwhere(coords[3,:] < (max_col + gap_row)), - np.argwhere(coords[1,:] > (min_col - gap_row))) + np.argwhere(coords[1,:] > (min_col - gap_row))) if len(ind_valid): min_row = np.intersect1d(np.argwhere(coords[2,ind_valid] > (curr_row - gap_row)), np.argwhere(coords[2,ind_valid] < (curr_row + gap_row))) @@ -646,41 +646,41 @@ def next_textline_samerow(coords, gap_row, curr_coord, min_col = 0, max_col = 10 else: return -1 else: - return -1 - + return -1 + def next_textline_samecol(coords, gap_row, min_col = 0, max_col = 10000): #print(coords, max_col, min_col) #ind_valid = np.intersect1d(np.argwhere(coords[1,:] < max_col), - # np.argwhere(coords[3,:] > min_col)) + # np.argwhere(coords[3,:] > min_col)) ind_valid = np.intersect1d(np.argwhere(coords[3,:] < (max_col + gap_row)), - np.argwhere(coords[1,:] > (min_col - gap_row))) + np.argwhere(coords[1,:] > (min_col - gap_row))) if len(ind_valid): min_row = np.min(coords[2,ind_valid]) min_row = np.intersect1d(np.argwhere(coords[2,ind_valid] > (min_row - gap_row)), - np.argwhere(coords[2,ind_valid] < (min_row + gap_row))) + np.argwhere(coords[2,ind_valid] < (min_row + gap_row))) ind_valid_min = ind_valid[min_row] ind_next_textl = ind_valid_min[np.argmin(coords[1,ind_valid_min])] return ind_next_textl else: - return -1 + return -1 def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescale_factor, centrall_ord, ind_page, dim_img): - + height_resc = HEIGHT_CHAR * rescale_factor[0,1] widht_resc = WIDTH_CHAR * rescale_factor[0,1] - + max_inrow_sep = 4 * widht_resc - max_incol_sep = 1 * height_resc + max_incol_sep = 1 * height_resc gap_row = height_resc/2 similarity_fonts = 0.95 indentation = 2 * widht_resc - - centrall_ord_trans = transform_coord_toorig(centrall_ord, dim_page = dim_img, invert_xy = True, + + centrall_ord_trans = transform_coord_toorig(centrall_ord, dim_page = dim_img, invert_xy = True, rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft', refCorners = 'topleftbottomright') - + # Start creating the xml xml_e = [] xml_e = ET.Element('pages') @@ -689,21 +689,21 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal page_el.attrib['bbox'] = xml_t[0].attrib['bbox'] page_el.attrib['rotate'] = '0' xml_e.append(page_el) - + val_type_col1 = {'text_col1', 'notidentified', 'header_singlecol', 'text_inheader'} val_type_col2 = {'text_col2', 'notidentified', 'header_singlecol', 'text_inheader'} - - + + count_b = 0 text_b = ET.SubElement(page_el, 'textbox') - text_b.attrib['id'] = str(count_b) - text_b.attrib['block'] = '0' + text_b.attrib['id'] = str(count_b) + text_b.attrib['block'] = '0' for ind_b in range(len(set_of_blocks)): all_el = set_of_blocks[ind_b].astype(np.int64) all_bbox = np.array([]).reshape((4,0)) for ind_c in range(all_el.shape[1]): - curr_el = all_el[:,ind_c] - flag_copy_textb = 1 + curr_el = all_el[:,ind_c] + flag_copy_textb = 1 # If it is a textline with text if curr_el[4] > -1: all_bbox = np.concatenate((all_bbox, curr_el[:4].reshape((4,1))), axis = 1) @@ -713,12 +713,12 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal text_l.attrib['type'] = type_textl text_b.append(text_l) type_textbox = 'text' - + # To check if it satisfies the conditions for being a new textbox if ind_c < (all_el.shape[1] - 1): - next_el = all_el[:,ind_c + 1] - if next_el[4] > -1: - if (((type_textl in val_type_col1) and (labels_textl[int(next_el[4])] in val_type_col1)) + next_el = all_el[:,ind_c + 1] + if next_el[4] > -1: + if (((type_textl in val_type_col1) and (labels_textl[int(next_el[4])] in val_type_col1)) or ((type_textl in val_type_col2) and (labels_textl[int(next_el[4])] in val_type_col2)) or (type_textl == labels_textl[int(next_el[4])])): # Object to the right or beneath @@ -728,33 +728,33 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal # Accounting for footnotes or other stuff curr_fontsize = curr_el[3] - curr_el[1] next_fontsize = next_el[3] - next_el[1] - if ((curr_fontsize - next_fontsize * similarity_fonts) < curr_fontsize < + if ((curr_fontsize - next_fontsize * similarity_fonts) < curr_fontsize < (curr_fontsize + next_fontsize * similarity_fonts)): # Finally, account for indentation if ((np.min(all_bbox[1,:]) + indentation) > next_el[1]): flag_copy_textb = 0 - - # Attributes and stuff in case we need to store as textbox + + # Attributes and stuff in case we need to store as textbox if flag_copy_textb: bbox_text_b = np.array([np.min(all_bbox[0,:]),np.min(all_bbox[1,:]), np.max(all_bbox[2,:]),np.max(all_bbox[3,:])]) - bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, + bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft', refCorners = 'topleftbottomright') all_bbox = np.array([]).reshape((4,0)) - # Instead, if we have a line + # Instead, if we have a line else: bbox_text_b = curr_el[:4] text_l = ET.SubElement(text_b, 'textline') text_l.attrib['type'] = 'col_lines' - bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, + bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft', refCorners = 'topleftbottomright') text_l.attrib['bbox'] = np.array2string(bbox_text_b[:].reshape((1,4)), precision = 3, separator = ',')[2:-2] type_textbox = 'line' - + # Creating the new textbox - if flag_copy_textb: + if flag_copy_textb: text_b.attrib['bbox'] = np.array2string(bbox_text_b[:].reshape((1,4)), precision = 3, separator = ',')[2:-2] text_b.attrib['type_textbox'] = type_textbox count_b += 1 @@ -762,10 +762,10 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal pass else: text_b = ET.SubElement(page_el, 'textbox') - text_b.attrib['id'] = str(count_b) - text_b.attrib['block'] = str(ind_b) + text_b.attrib['id'] = str(count_b) + text_b.attrib['block'] = str(ind_b) all_bbox = np.array([]).reshape((4,0)) - + if (ind_b < (len(set_of_blocks) - 1)): text_l = ET.SubElement(text_b, 'textline') text_l.attrib['type'] = 'central_lines' @@ -775,20 +775,20 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal text_b.attrib['type_textbox'] = 'line' count_b += 1 text_b = ET.SubElement(page_el, 'textbox') - text_b.attrib['id'] = str(count_b) - text_b.attrib['block'] = str(ind_b) + text_b.attrib['id'] = str(count_b) + text_b.attrib['block'] = str(ind_b) all_bbox = np.array([]).reshape((4,0)) - - - # Just add the two final elements from the original xml + + + # Just add the two final elements from the original xml page_el.append(xml_t[0][-2]) # Figure page_el.append(xml_t[0][-2]) # Layout - + return xml_e - + def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_col2')): - + # helper function to clean text # !!! so far only removing new lines and primitive dehyphenation def clean_text(text): @@ -796,10 +796,11 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c text = text.replace('\n', ' ') # account for hyphenation (not completely correct...) + # TODO: needs to be improved text = text.replace('- ', '') return text - + # initialize textbox count and empty dictionary XML_new = copy.deepcopy(XML_root) @@ -814,7 +815,7 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c if (textbox.tag == 'textbox'): if 'type_textbox' in textbox.attrib.keys(): if (textbox.attrib['type_textbox'] == 'text'): - + # initialize string #print(textbox.tag, textbox.attrib) @@ -827,7 +828,7 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c if textline.tag == 'textline': #print(textline.tag, textline.attrib) # for every text (actually just a letter) - + for ind_ch, text in enumerate(textline): #print(ind_ch, text.text, len(textline), len(XML_new[ind_p][ind_t][ind_tl])) # extend string @@ -847,7 +848,6 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c complete_text += '[/font]' complete_text = clean_text(complete_text) XML_new[ind_p][ind_t][ind_tl].text = complete_text - - + + return XML_new - \ No newline at end of file diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 0dc23f86..d801bb06 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -45,8 +45,8 @@ suffix_correctedxml = '_datacorr' # print some output print(year) print(type(year)) -print(input_correctedxml) print(input_lastnames) +print(input_correctedxml) print(input_correctedmeta) print(folder_database) #%% @@ -94,11 +94,11 @@ print(files_to_process) with open(input_lastnames, 'rb') as f: df_lastnames = pickle.load(f) -print(df_lastnames) +#print(df_lastnames) df_lastnames.columns #%% -file_tarpath = './1891/20026440_datacorr.xml' +file_tarpath = './1891/20026447_datacorr.xml' file_number = file_tarpath.split('/')[-1][:8] metafile_tarpath = './{}/{}{}.xml'.format(year, file_number, suffix_correctedmeta) @@ -112,6 +112,7 @@ file_doc if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']): print(file_number + '\n') + file_doc.annotate_speakers() @@ -136,6 +137,7 @@ for file_tarpath in files_to_process: # if document is a discussion if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']): print(file_number + '\n') + file_doc.annotate_speakers() #%% ## get dictionary with text diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index 2dffa7d0..f2b44c22 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -47,8 +47,60 @@ def check_if_discussion(path_meta_xml_file, return True +# function to get text of corrected XML +def get_text_corrected(XML_root): + + # create new XML as a copy of the corrected one + XML_new = copy.deepcopy(XML_root) + + # for every page + for ind_p, page in enumerate(XML_root): + print(page.tag, page.attrib) + + # for every textbox on that page + for ind_t, textbox in enumerate(page): + if (textbox.tag == 'textbox'): + if 'type_textbox' in textbox.attrib.keys(): + if (textbox.attrib['type_textbox'] == 'text'): + print(textbox.tag, textbox.attrib) + + # for every textline in that textbox + for ind_tl, textline in enumerate(textbox): + if textline.tag == 'textline': + print(textline.tag, textline.attrib) + + print(textline.text) + + + + + + + ## for every text (actually just a letter) + #for ind_ch, text in enumerate(textline): + ##print(ind_ch, text.text, len(textline), len(XML_new[ind_p][ind_t][ind_tl])) + ## extend string + #if 'font' in text.attrib.keys(): + #if (text.attrib['font'] != prev_fonttype) or (text.attrib['size'] != str(prev_fontsize)): + #if flag_in: + #complete_text += '[/font]' + #else: + #flag_in = 1 + #complete_text += '[font face="' + text.attrib['size'] + '" size="' + text.attrib['font'] + '"]' + #prev_fontsize = text.attrib['size'] + #prev_fonttype = text.attrib['font'] + #complete_text = complete_text + text.text + #child_new = XML_new[ind_p][ind_t][ind_tl][0] # Because we are removing elements + #XML_new[ind_p][ind_t][ind_tl].remove(child_new) + ## clean text + #complete_text += '[/font]' + #complete_text = clean_text(complete_text) + #XML_new[ind_p][ind_t][ind_tl].text = complete_text + + + return XML_new -- GitLab