From 0f3920531b1e20ef1dbe31e37a7abcdd46a81c9b Mon Sep 17 00:00:00 2001 From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch> Date: Mon, 8 Jul 2019 10:01:11 +0200 Subject: [PATCH] Functions for parsing information --- src/python/def_classes.py | 259 +++++++++++++++++++++++++++++++------ src/python/preproc_docs.py | 7 +- 2 files changed, 224 insertions(+), 42 deletions(-) diff --git a/src/python/def_classes.py b/src/python/def_classes.py index 7c5282ef..e7254cc1 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -759,32 +759,21 @@ class Document: def parse_plot(self, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', - flag_op = 1, vec_in = '', flag_plot = 1, flag_save_figs = 0, flag_parseall = 0): + flag_op = 1, vec_in = '', flag_plot = 1, flag_save_figs = 0): #, flag_parseall = 0): # flag_op: # 1, search for str, which can be also font type, format vec_in = ['Times-Bold', 'Helvetica']/ # 2, consider a fontsize, and extract all the textlines that are within those margins, # format vec_in = [8, 10 , 12], look in these margins / # 3, look for one or several regular expressions, format vec_in = ['RegExp1', 'RegExp2'] / + # 4, the presence of a section of text with a certain font type and size, bith satisfied + # the input is ['FontType','SizeMin','SizeMax'] - if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - if (flag_op) == 1 and (type(vec_in) == str): t_aux = vec_in vec_in = [t_aux] - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' - if os.path.isfile(name_tar): - name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' - if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: - h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) - XML_tree = ET.parse(h_xml) - XML_main_corr = XML_tree.getroot() - else: - print('You need first to obtained the corrected XML file from the original pdf') - else: - print('You need first to obtained the corrected XML file from the original pdf') + XML_main_corr = self._open_xml(suffix_xml, name_outcorrxml) dict_text_found = dict() #dict_all_info = dict() @@ -814,11 +803,19 @@ class Document: elif flag_op == 2: ind_c = dict_text_found_aux[key][2] aux_str = 'size: ' + '_'.join(np.array(vec_in).astype(str)) + elif flag_op == 3: + ind_c = dict_text_found_aux[key][2] + aux_str = 'RegExp: ' + '_'.join(np.array(vec_in).astype(str)) + elif flag_op == 4: + ind_c = dict_text_found_aux[key][2] + aux_str = 'FontSizeandType: ' + '_'.join(np.array(vec_in).astype(str)) colors = ['cyan','green','red','blue','yellow'] imarray = plot_tools.highlight_text(imarray, aux_c, bbox_page, color_vec = colors[np.ravel(ind_c)[0]], alpha = True, filled = False, thick_line = 4) - + + if not len(dict_text_found_aux.keys()): + aux_str = 'empty' self._plot_save(imarray, 'Results of search ' + aux_str, 'TextFound-' + aux_str, ind_page, self.path_file, flag_plot, flag_save_figs) @@ -835,6 +832,161 @@ class Document: return dict_text_found ''' + def combineDicts_plot(self, list_dict_pre, list_op_pre, suffix_xml='_data', name_outcorrxml='04_correctedxml', + flag_plot = 1, flag_save_figs = 0): + + XML_main_corr = self._open_xml(suffix_xml, name_outcorrxml) + + dict_def = dict() + all_keys = np.array([]) + for d in list_dict_pre: + all_keys = np.union1d(all_keys, list(d.keys())) + + for k_p in all_keys.astype(int): + list_dict = copy.copy(list_dict_pre) + list_op = copy.copy(list_op_pre) + dict_aux = dict() + while len(list_op): + aux_d = dict() + dict_and = dict() + + if 'or' == list_op[-1]: + #ind_or = np.argwhere(np.array(list_op) == 'or').reshape((-1,))[0] + ind_or = len(list_op) - 1 + if (k_p in list_dict[ind_or].keys()) and (k_p in list_dict[ind_or + 1].keys()): + vec_aux_d = np.union1d(list(list_dict[ind_or][k_p].keys()), list(list_dict[ind_or + 1][k_p].keys())) + elif (k_p in list_dict[ind_or].keys()): + vec_aux_d = list(list_dict[ind_or][k_p].keys()) + elif (k_p in list_dict[ind_or + 1].keys()): + vec_aux_d = list(list_dict[ind_or + 1][k_p].keys()) + + list_op.pop(ind_or) + + dict_aux_aux = dict() + for i_k in vec_aux_d: + if i_k in list_dict[ind_or][k_p].keys(): + dict_aux_aux[i_k] = list_dict[ind_or][k_p][i_k] + else: + dict_aux_aux[i_k] = list_dict[ind_or + 1][k_p][i_k] + list_dict.pop(ind_or + 1) + aux_d[k_p] = dict_aux_aux + list_dict[ind_or] = aux_d + elif 'and' == list_op[-1]: + #ind_or = np.argwhere(np.array(list_op) == 'and').reshape((-1,))[0] + ind_or = len(list_op) - 1 + if (k_p in list_dict[ind_or].keys()) and (k_p in list_dict[ind_or + 1].keys()): + vec_aux_d = np.intersect1d(list(list_dict[ind_or][k_p].keys()), list(list_dict[ind_or + 1][k_p].keys())) + #dict_and = list_dict[ind_or][k_p] + elif (k_p in list_dict[ind_or].keys()): + vec_aux_d = list() + elif (k_p in list_dict[ind_or + 1].keys()): + vec_aux_d = list() + + list_op.pop(ind_or) + dict_aux_aux = dict() + for i_k in vec_aux_d: + dict_aux_aux[i_k] = list_dict[ind_or][k_p][i_k] + list_dict.pop(ind_or + 1) + aux_d[k_p] = dict_aux_aux + list_dict[ind_or] = aux_d + elif 'not' == list_op[-1]: + #ind_or = np.argwhere(np.array(list_op) == 'not').reshape((-1,))[0] + ind_or = len(list_op) - 1 + if (k_p in list_dict[ind_or].keys()) and (k_p in list_dict[ind_or + 1].keys()): + vec_aux_d = np.setdiff1d(list(list_dict[ind_or][k_p].keys()), list(list_dict[ind_or + 1][k_p].keys())) + #dict_and = list_dict[ind_or][k_p] + elif (k_p in list_dict[ind_or].keys()): + vec_aux_d = list(list_dict[ind_or][k_p].keys()) + elif (k_p in list_dict[ind_or + 1].keys()): + vec_aux_d = list(list_dict[ind_or + 1][k_p].keys()) + + list_op.pop(ind_or) + dict_aux_aux = dict() + for i_k in vec_aux_d: + dict_aux_aux[i_k] = list_dict[ind_or][k_p][i_k] + list_dict.pop(ind_or + 1) + aux_d[k_p] = dict_aux_aux + list_dict[ind_or] = aux_d + + #for k_op in aux_d[k_p].keys(): + # dict_aux[k_op] = dict_and[k_op] + + dict_def[k_p] = aux_d[k_p] + + if flag_plot: + aux_str = 'Definitive: ' + ' '.join(list_op_pre) + self._plot_dict(XML_main_corr, dict_def, k_p, aux_str, 'blue', + 'Results of combination ', 'CombinationDicts-', flag_save_figs) + + ''' + XML_root = ET.Element('pages') + #print(ind_abs,len(self.XML_main)) + XML_root.append(XML_main_corr[k_p]) + imarray = np.array(self.imgobj[k_p]) + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) + for key in dict_def[k_p].keys(): + aux_c = dict_def[k_p][key][0] + + + imarray = plot_tools.highlight_text(imarray, aux_c, bbox_page, + color_vec = 'blue', alpha = True, filled = False, thick_line = 4) + + if not len(dict_def[k_p].keys()): + aux_str = 'empty' + self._plot_save(imarray, 'Results of combination ' + aux_str, 'CombinationDicts-' + aux_str, + k_p, self.path_file, flag_plot, flag_save_figs) + ''' + return dict_def + + def _plot_dict(self, XML_main_corr, dict_def, k_p, aux_str, color, + title_str, name_file, flag_save_figs = 0): + XML_root = ET.Element('pages') + #print(ind_abs,len(self.XML_main)) + XML_root.append(XML_main_corr[k_p]) + imarray = np.array(self.imgobj[k_p]) + bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64) + for key in dict_def[k_p].keys(): + aux_c = dict_def[k_p][key][0] + + imarray = plot_tools.highlight_text(imarray, aux_c, bbox_page, + color_vec = color, alpha = True, filled = False, thick_line = 4) + + if not len(dict_def[k_p].keys()): + aux_str = 'empty' + + self._plot_save(imarray, 'Results of combination ' + aux_str, 'CombinationDicts-' + aux_str, + k_p, self.path_file, 1, flag_save_figs) + + def blocksfromtextline_plot(self, dict_in, suffix_xml='_data', name_outcorrxml='04_correctedxml', + flag_plot = 1, flag_save_figs = 0): + # Returns dictionary with keys for pages, and then keys for blocks + + XML_m = self._open_xml(suffix_xml, name_outcorrxml) + + dict_array = dict() + + for k_p in dict_in.keys(): + dict_aux = dict() + list_id = np.array([int(o.get('id')) if 'id' in o.attrib else -1 for o in XML_m.findall('page[' + str(k_p + 1) + ']/textbox')]) + list_block = np.array([int(o.get('block')) if 'block' in o.attrib else -1 for o in XML_m.findall('page[' + str(k_p + 1) + ']/textbox')]) + for k in dict_in[k_p].keys(): + ind_textb = np.intersect1d(np.argwhere(list_block == int(k.split('_')[0])), np.argwhere(list_id == int(k.split('_')[1]))) + all_text_b = [o.text for o in XML_m[k_p][int(ind_textb)]] + all_text_b = '. '.join(all_text_b) + print(all_text_b) + dict_aux[k] = [np.array(XML_m[k_p][int(ind_textb)].attrib['bbox'].split(',')).astype(np.float64),all_text_b] + + dict_array[k_p] = dict_aux + + if flag_plot: + aux_str = 'Blocks' + self._plot_dict(XML_m, dict_array, k_p, aux_str, 'red', + 'Plotting blocks', 'BlocksDicts-', flag_save_figs) + + + return dict_array + + def _extract_textl_if(self, XML_root, vec_in, flag_op): # The key is as follows: <block_id>_<textbox_id>_<y1>_<x1> dict_text_found = dict() @@ -847,17 +999,17 @@ class Document: for ind_el in range(0, len(XML_root[0])): for ind_line in range(0, len(XML_root[0][ind_el])): - if XML_root[0][ind_el][ind_line].tag == 'textline': + if XML_root[0][ind_el][ind_line].tag == 'textline' and ('block' in XML_root[0][ind_el].attrib): #print(XML_root[0][ind_el][ind_line].text) str_text = str(XML_root[0][ind_el][ind_line].text) + key_d = '_'.join([XML_root[0][ind_el].attrib['block'],XML_root[0][ind_el].attrib['id'], + str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[0]), + str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[1])]) + coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) if flag_op == 1: if np.sum([str_text.find(i) > -1 for i in vec_in]): #if (str_text.find(text_search)) > -1: - key_d = '_'.join([XML_root[0][ind_el].attrib['block'],XML_root[0][ind_el].attrib['id'], - str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[0]), - str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[1])]) - coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) dict_text_found[key_d] = [coord_textline_aux,str_text] elif flag_op == 2: @@ -871,11 +1023,35 @@ class Document: if np.sum(np.array(ind_val) > -1): #if (str_text.find(text_search)) > -1: - key_d = '_'.join([XML_root[0][ind_el].attrib['block'],XML_root[0][ind_el].attrib['id'], - str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[0]), - str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[1])]) - coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64) dict_text_found[key_d] = [coord_textline_aux, str_text, np.max(ind_val)] + + elif flag_op == 3: + str_text_red = str_text.replace(' ',' ') + + str_text_red = utils_annot.get_text(str_text_red) + # print(str_text) + aux_regexp = list() + [aux_regexp.append(len(re.findall(i, str_text_red))) for i in vec_in] + aux_regexp = np.array(aux_regexp) + if np.max(aux_regexp): + # For regular expressions + dict_text_found[key_d] = [coord_textline_aux, str_text, np.argmax(aux_regexp)] + + elif flag_op == 4: + ind_f2 = utils_proc.find_all(str_text, 'size="') + ind_f3 = utils_proc.find_all(str_text, 'face="') + ind_val = list() + for i2 in range(len(ind_f2)): + f_size = np.array(str_text[ind_f2[i2] + len('size'):].split('"')[1]).astype(np.float64) + f_type = np.array(str_text[ind_f3[i2] + len('face'):].split('=')[1].split(' ')[0]) + #print(f_type) + ind_f_size = np.intersect1d(np.argwhere(vec_in[1] < f_size), np.argwhere(vec_in[2] > f_size)) + if len(ind_f_size) and (f_type == vec_in[0]): + ind_val.append(ind_f_size) + + if np.sum(np.array(ind_val) > -1): + #if (str_text.find(text_search)) > -1: + dict_text_found[key_d] = [coord_textline_aux, str_text, 2] return dict_text_found @@ -883,20 +1059,7 @@ class Document: def parse_all(self, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml'): - if 'name_outcorrxml' not in self.__dict__.keys(): - self.name_outcorrxml = name_outcorrxml - - name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' - if os.path.isfile(name_tar): - name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' - if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: - h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) - XML_tree = ET.parse(h_xml) - XML_main_corr = XML_tree.getroot() - else: - print('You need first to obtained the corrected XML file from the original pdf') - else: - print('You need first to obtained the corrected XML file from the original pdf') + XML_main_corr = self._open_xml(suffix_xml, name_outcorrxml) dict_all_info = dict() for ind_abs, ind_page in enumerate(self.n_pages): @@ -920,6 +1083,24 @@ class Document: mat_all_f = np.concatenate((mat_all_f, aux_c.reshape((1,7))), axis = 0) return dict_all_info, mat_all_f + def _open_xml(self, suffix_xml='_data', name_outcorrxml='04_correctedxml'): + + if 'name_outcorrxml' not in self.__dict__.keys(): + self.name_outcorrxml = name_outcorrxml + + name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz' + if os.path.isfile(name_tar): + name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml' + if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]: + h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml) + XML_tree = ET.parse(h_xml) + XML_main_corr = XML_tree.getroot() + return XML_main_corr + else: + print('You need first to obtained the corrected XML file from the original pdf') + else: + print('You need first to obtained the corrected XML file from the original pdf') + def _extract_all_fonts(self, XML_root): diff --git a/src/python/preproc_docs.py b/src/python/preproc_docs.py index e5084adb..70946ac2 100644 --- a/src/python/preproc_docs.py +++ b/src/python/preproc_docs.py @@ -841,9 +841,10 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal height_resc = HEIGHT_CHAR * rescale_factor[0,1] widht_resc = WIDTH_CHAR * rescale_factor[0,1] - max_inrow_sep = 4 * widht_resc - max_incol_sep = 1 * height_resc - gap_row = height_resc/2 + max_inrow_sep = 8 * widht_resc # Distance in the row, i.e., in x. Initially, 6 + max_incol_sep = 1 * height_resc # Maximum separation allowed between two rows + # to be considered still part of the same block + gap_row = height_resc/2 # Small gap allow in the same row, in y similarity_fonts = 0.95 indentation = 2 * widht_resc -- GitLab