diff --git a/src/python/def_classes.py b/src/python/def_classes.py index 98c12cc0509acacee0a2a89eb2904962ae8dd73b..7c5282efffc2a28bb64ed4a3efc801154394b8d3 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -772,7 +772,7 @@ class Document: if (flag_op) == 1 and (type(vec_in) == str): t_aux = vec_in - text_search = [t_aux] + vec_in = [t_aux] name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' if os.path.isfile(name_tar): @@ -787,7 +787,7 @@ class Document: print('You need first to obtained the corrected XML file from the original pdf') dict_text_found = dict() - dict_all_info = dict() + #dict_all_info = dict() for ind_abs, ind_page in enumerate(self.n_pages): XML_root = ET.Element('pages') #print(ind_abs,len(self.XML_main)) @@ -796,7 +796,7 @@ class Document: bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) dim_img = imarray.shape[:2] - dict_text_found_aux = self._extract_textl_if(XML_root, text_search) + dict_text_found_aux = self._extract_textl_if(XML_root, vec_in, flag_op) dict_text_found[ind_abs] = dict_text_found_aux if flag_plot: @@ -805,79 +805,26 @@ class Document: aux_c = dict_text_found_aux[key][0] coord_textline = np.concatenate((coord_textline, aux_c.reshape((4,1))), axis = 1) - ind_c = np.argwhere([dict_text_found_aux[key][1].find(i) > -1 for i in text_search]) - colors = ['cyan','green','red','blue','yellow'] - imarray = plot_tools.highlight_text(imarray, aux_c, bbox_page, - color_vec = colors[np.ravel(ind_c)[0]], alpha = True, filled = False, thick_line = 4) - - if len(text_search) > 1: - aux_str = '_'.join(text_search) - else: - aux_str = text_search[0] - self._plot_save(imarray, 'Results of search:' + aux_str, 'TextFound-' + aux_str, - ind_page, self.path_file, flag_plot, flag_save_figs) - - if flag_parseall: - all_info_fonts = self._extract_all_fonts(XML_root) - dict_all_info[ind_abs] = all_info_fonts - - if flag_parseall: - return dict_text_found, dict_all_info - else: - return dict_text_found - - def parse_text(self, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', - text_search = '', flag_plot = 1, flag_save_figs = 0, flag_parseall = 0): - - if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - - if type(text_search) == str: - t_aux = text_search - text_search = [t_aux] - - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' - if os.path.isfile(name_tar): - name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' - if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: - h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) - XML_tree = ET.parse(h_xml) - XML_main_corr = XML_tree.getroot() - else: - print('You need first to obtained the corrected XML file from the original pdf') - else: - print('You need first to obtained the corrected XML file from the original pdf') - - dict_text_found = dict() - dict_all_info = dict() - for ind_abs, ind_page in enumerate(self.n_pages): - XML_root = ET.Element('pages') - #print(ind_abs,len(self.XML_main)) - XML_root.append(XML_main_corr[ind_abs]) - imarray = np.array(self.imgobj[ind_page]) - bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - dim_img = imarray.shape[:2] - dict_text_found_aux = self._extract_textl_if(XML_root, text_search) - dict_text_found[ind_abs] = dict_text_found_aux - - if flag_plot: - coord_textline = np.array([]).reshape((4,0)) - for key in dict_text_found_aux.keys(): - aux_c = dict_text_found_aux[key][0] - coord_textline = np.concatenate((coord_textline, aux_c.reshape((4,1))), axis = 1) + if flag_op == 1: + ind_c = np.argwhere([dict_text_found_aux[key][1].find(i) > -1 for i in vec_in]) + if len(vec_in) > 1: + aux_str = 'string: ' + '_'.join(vec_in) + else: + aux_str = 'string: ' + vec_in[0] + elif flag_op == 2: + ind_c = dict_text_found_aux[key][2] + aux_str = 'size: ' + '_'.join(np.array(vec_in).astype(str)) - ind_c = np.argwhere([dict_text_found_aux[key][1].find(i) > -1 for i in text_search]) colors = ['cyan','green','red','blue','yellow'] imarray = plot_tools.highlight_text(imarray, aux_c, bbox_page, color_vec = colors[np.ravel(ind_c)[0]], alpha = True, filled = False, thick_line = 4) - if len(text_search) > 1: - aux_str = '_'.join(text_search) - else: - aux_str = text_search[0] - self._plot_save(imarray, 'Results of search:' + aux_str, 'TextFound-' + aux_str, + self._plot_save(imarray, 'Results of search ' + aux_str, 'TextFound-' + aux_str, ind_page, self.path_file, flag_plot, flag_save_figs) - + + return dict_text_found + + ''' if flag_parseall: all_info_fonts = self._extract_all_fonts(XML_root) dict_all_info[ind_abs] = all_info_fonts @@ -886,86 +833,53 @@ class Document: return dict_text_found, dict_all_info else: return dict_text_found - - def parse_fontsize(self, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', - font_max_min = [8,10], flag_plot = 1, flag_save_figs = 0): - - if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml + ''' - - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' - if os.path.isfile(name_tar): - name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' - if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: - h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) - XML_tree = ET.parse(h_xml) - XML_main_corr = XML_tree.getroot() - else: - print('You need first to obtained the corrected XML file from the original pdf') - else: - print('You need first to obtained the corrected XML file from the original pdf') - - dict_text_found = dict() - for ind_abs, ind_page in enumerate(self.n_pages): - XML_root = ET.Element('pages') - #print(ind_abs,len(self.XML_main)) - XML_root.append(XML_main_corr[ind_abs]) - imarray = np.array(self.imgobj[ind_page]) - bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) - dim_img = imarray.shape[:2] - dict_text_found_aux = self._extract_textl_ifsize(XML_root, font_max_min) - dict_text_found[ind_abs] = dict_text_found_aux - - if flag_plot: - coord_textline = np.array([]).reshape((4,0)) - for key in dict_text_found_aux.keys(): - aux_c = dict_text_found_aux[key][0] - coord_textline = np.concatenate((coord_textline, aux_c.reshape((4,1))), axis = 1) - - ind_c = dict_text_found_aux[key][2] - colors = ['cyan','green','red','blue','yellow'] - imarray = plot_tools.highlight_text(imarray, aux_c, bbox_page, - color_vec = colors[np.ravel(ind_c)[0]], alpha = True, filled = False, thick_line = 4) - - aux_str = '_'.join(np.array(font_max_min).astype(str)) - self._plot_save(imarray, 'Results of search size:' + aux_str, 'TextFoundSize-' + aux_str, - ind_page, self.path_file, flag_plot, flag_save_figs) - - return dict_text_found - - def _extract_textl_ifsize(self, XML_root, font_max_min = [8,10]): + def _extract_textl_if(self, XML_root, vec_in, flag_op): # The key is as follows: <block_id>_<textbox_id>_<y1>_<x1> dict_text_found = dict() - font_max_min_mat = np.zeros((2,len(font_max_min))) - for i in range(len(font_max_min) - 1): - font_max_min_mat[0,i] = font_max_min[i] - font_max_min_mat[1,i] = font_max_min[i + 1] + + if flag_op == 2: + font_max_min_mat = np.zeros((2,len(vec_in))) + for i in range(len(vec_in) - 1): + font_max_min_mat[0,i] = vec_in[i] + font_max_min_mat[1,i] = vec_in[i + 1] for ind_el in range(0, len(XML_root[0])): for ind_line in range(0, len(XML_root[0][ind_el])): if XML_root[0][ind_el][ind_line].tag == 'textline': #print(XML_root[0][ind_el][ind_line].text) str_text = str(XML_root[0][ind_el][ind_line].text) - ind_f2 = utils_proc.find_all(str_text, 'size="') - ind_val = list() - for i2 in range(len(ind_f2)): - f_size = np.array(str_text[ind_f2[i2] + len('size'):].split('"')[1]).astype(np.float64) - ind_int = np.intersect1d(np.argwhere(font_max_min_mat[0,:] < f_size), np.argwhere(font_max_min_mat[1,:] > f_size)) - if len(ind_int): - ind_val.append(ind_int) + if flag_op == 1: + if np.sum([str_text.find(i) > -1 for i in vec_in]): + #if (str_text.find(text_search)) > -1: + key_d = '_'.join([XML_root[0][ind_el].attrib['block'],XML_root[0][ind_el].attrib['id'], + str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[0]), + str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[1])]) + coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) + dict_text_found[key_d] = [coord_textline_aux,str_text] - #print(ind_val, ind_val.shape) - if np.sum(np.array(ind_val) > -1): - #if (str_text.find(text_search)) > -1: - key_d = '_'.join([XML_root[0][ind_el].attrib['block'],XML_root[0][ind_el].attrib['id'], - str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[0]), - str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[1])]) - coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) - dict_text_found[key_d] = [coord_textline_aux, str_text, np.max(ind_val)] - - return dict_text_found + elif flag_op == 2: + ind_f2 = utils_proc.find_all(str_text, 'size="') + ind_val = list() + for i2 in range(len(ind_f2)): + f_size = np.array(str_text[ind_f2[i2] + len('size'):].split('"')[1]).astype(np.float64) + ind_int = np.intersect1d(np.argwhere(font_max_min_mat[0,:] < f_size), np.argwhere(font_max_min_mat[1,:] > f_size)) + if len(ind_int): + ind_val.append(ind_int) + + if np.sum(np.array(ind_val) > -1): + #if (str_text.find(text_search)) > -1: + key_d = '_'.join([XML_root[0][ind_el].attrib['block'],XML_root[0][ind_el].attrib['id'], + str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[0]), + str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[1])]) + coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) + dict_text_found[key_d] = [coord_textline_aux, str_text, np.max(ind_val)] + + return dict_text_found + + def parse_all(self, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml'): @@ -1007,26 +921,8 @@ class Document: return dict_all_info, mat_all_f - def _extract_textl_if(self, XML_root, text_search = ''): - # The key is as follows: <block_id>_<textbox_id>_<y1>_<x1> - dict_text_found = dict() - - for ind_el in range(0, len(XML_root[0])): - for ind_line in range(0, len(XML_root[0][ind_el])): - if XML_root[0][ind_el][ind_line].tag == 'textline': - #print(XML_root[0][ind_el][ind_line].text) - str_text = str(XML_root[0][ind_el][ind_line].text) - if np.sum([str_text.find(i) > -1 for i in text_search]): - #if (str_text.find(text_search)) > -1: - key_d = '_'.join([XML_root[0][ind_el].attrib['block'],XML_root[0][ind_el].attrib['id'], - str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[0]), - str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[1])]) - coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) - dict_text_found[key_d] = [coord_textline_aux,str_text] - - return dict_text_found - def _extract_all_fonts(self, XML_root): + all_info_fonts = dict() for ind_el in range(0, len(XML_root[0])): for ind_line in range(0, len(XML_root[0][ind_el])): @@ -1262,3 +1158,146 @@ class Document: utils_proc.call_with_out(command) print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time))) + +''' + def _extract_textl_ifsize(self, XML_root, font_max_min = [8,10]): + # The key is as follows: <block_id>_<textbox_id>_<y1>_<x1> + dict_text_found = dict() + font_max_min_mat = np.zeros((2,len(font_max_min))) + for i in range(len(font_max_min) - 1): + font_max_min_mat[0,i] = font_max_min[i] + font_max_min_mat[1,i] = font_max_min[i + 1] + + for ind_el in range(0, len(XML_root[0])): + for ind_line in range(0, len(XML_root[0][ind_el])): + if XML_root[0][ind_el][ind_line].tag == 'textline': + #print(XML_root[0][ind_el][ind_line].text) + str_text = str(XML_root[0][ind_el][ind_line].text) + + ind_f2 = utils_proc.find_all(str_text, 'size="') + ind_val = list() + for i2 in range(len(ind_f2)): + f_size = np.array(str_text[ind_f2[i2] + len('size'):].split('"')[1]).astype(np.float64) + ind_int = np.intersect1d(np.argwhere(font_max_min_mat[0,:] < f_size), np.argwhere(font_max_min_mat[1,:] > f_size)) + if len(ind_int): + ind_val.append(ind_int) + + #print(ind_val, ind_val.shape) + if np.sum(np.array(ind_val) > -1): + #if (str_text.find(text_search)) > -1: + key_d = '_'.join([XML_root[0][ind_el].attrib['block'],XML_root[0][ind_el].attrib['id'], + str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[0]), + str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[1])]) + coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) + dict_text_found[key_d] = [coord_textline_aux, str_text, np.max(ind_val)] + + return dict_text_found + + def parse_text(self, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', + text_search = '', flag_plot = 1, flag_save_figs = 0, flag_parseall = 0): + + if 'name_outcorrxml' not in self.__dict__.keys(): + self.name_outcorrxml = name_outcorrxml + + if type(text_search) == str: + t_aux = text_search + text_search = [t_aux] + + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' + if os.path.isfile(name_tar): + name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' + if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: + h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) + XML_tree = ET.parse(h_xml) + XML_main_corr = XML_tree.getroot() + else: + print('You need first to obtained the corrected XML file from the original pdf') + else: + print('You need first to obtained the corrected XML file from the original pdf') + + dict_text_found = dict() + dict_all_info = dict() + for ind_abs, ind_page in enumerate(self.n_pages): + XML_root = ET.Element('pages') + #print(ind_abs,len(self.XML_main)) + XML_root.append(XML_main_corr[ind_abs]) + imarray = np.array(self.imgobj[ind_page]) + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) + dim_img = imarray.shape[:2] + dict_text_found_aux = self._extract_textl_if(XML_root, text_search) + dict_text_found[ind_abs] = dict_text_found_aux + + if flag_plot: + coord_textline = np.array([]).reshape((4,0)) + for key in dict_text_found_aux.keys(): + aux_c = dict_text_found_aux[key][0] + coord_textline = np.concatenate((coord_textline, aux_c.reshape((4,1))), axis = 1) + + ind_c = np.argwhere([dict_text_found_aux[key][1].find(i) > -1 for i in text_search]) + colors = ['cyan','green','red','blue','yellow'] + imarray = plot_tools.highlight_text(imarray, aux_c, bbox_page, + color_vec = colors[np.ravel(ind_c)[0]], alpha = True, filled = False, thick_line = 4) + + if len(text_search) > 1: + aux_str = '_'.join(text_search) + else: + aux_str = text_search[0] + self._plot_save(imarray, 'Results of search:' + aux_str, 'TextFound-' + aux_str, + ind_page, self.path_file, flag_plot, flag_save_figs) + + if flag_parseall: + all_info_fonts = self._extract_all_fonts(XML_root) + dict_all_info[ind_abs] = all_info_fonts + + if flag_parseall: + return dict_text_found, dict_all_info + else: + return dict_text_found + + def parse_fontsize(self, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', + font_max_min = [8,10], flag_plot = 1, flag_save_figs = 0): + + if 'name_outcorrxml' not in self.__dict__.keys(): + self.name_outcorrxml = name_outcorrxml + + + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' + if os.path.isfile(name_tar): + name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' + if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: + h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) + XML_tree = ET.parse(h_xml) + XML_main_corr = XML_tree.getroot() + else: + print('You need first to obtained the corrected XML file from the original pdf') + else: + print('You need first to obtained the corrected XML file from the original pdf') + + dict_text_found = dict() + for ind_abs, ind_page in enumerate(self.n_pages): + XML_root = ET.Element('pages') + #print(ind_abs,len(self.XML_main)) + XML_root.append(XML_main_corr[ind_abs]) + imarray = np.array(self.imgobj[ind_page]) + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) + dim_img = imarray.shape[:2] + dict_text_found_aux = self._extract_textl_ifsize(XML_root, font_max_min) + dict_text_found[ind_abs] = dict_text_found_aux + + if flag_plot: + coord_textline = np.array([]).reshape((4,0)) + for key in dict_text_found_aux.keys(): + aux_c = dict_text_found_aux[key][0] + coord_textline = np.concatenate((coord_textline, aux_c.reshape((4,1))), axis = 1) + + ind_c = dict_text_found_aux[key][2] + colors = ['cyan','green','red','blue','yellow'] + imarray = plot_tools.highlight_text(imarray, aux_c, bbox_page, + color_vec = colors[np.ravel(ind_c)[0]], alpha = True, filled = False, thick_line = 4) + + aux_str = '_'.join(np.array(font_max_min).astype(str)) + self._plot_save(imarray, 'Results of search size:' + aux_str, 'TextFoundSize-' + aux_str, + ind_page, self.path_file, flag_plot, flag_save_figs) + + return dict_text_found +''' \ No newline at end of file