From 0f3920531b1e20ef1dbe31e37a7abcdd46a81c9b Mon Sep 17 00:00:00 2001
From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch>
Date: Mon, 8 Jul 2019 10:01:11 +0200
Subject: [PATCH] Functions for parsing information

---
 src/python/def_classes.py  | 259 +++++++++++++++++++++++++++++++------
 src/python/preproc_docs.py |   7 +-
 2 files changed, 224 insertions(+), 42 deletions(-)

diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 7c5282ef..e7254cc1 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -759,32 +759,21 @@ class Document:
 
 
     def parse_plot(self, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml',
-                   flag_op = 1, vec_in = '', flag_plot = 1, flag_save_figs = 0, flag_parseall = 0):
+                   flag_op = 1, vec_in = '', flag_plot = 1, flag_save_figs = 0): #, flag_parseall = 0):
         
         # flag_op: 
         # 1, search for str, which can be also font type, format vec_in = ['Times-Bold', 'Helvetica']/ 
         # 2, consider a fontsize, and extract all the textlines that are within those margins,
         #    format vec_in = [8, 10 , 12], look in these margins / 
         # 3, look for one or several regular expressions, format vec_in = ['RegExp1', 'RegExp2'] / 
+        # 4, look for text with a certain font type and a font size within a range, both satisfied,
+        #    format vec_in = ['FontType','SizeMin','SizeMax']
         
-        if 'name_outcorrxml' not in self.__dict__.keys():
-            self.name_outcorrxml = name_outcorrxml
-
         if (flag_op) == 1 and (type(vec_in) == str):
             t_aux = vec_in
             vec_in = [t_aux]
 
-        name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'        
-        if os.path.isfile(name_tar):
-            name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml'
-            if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]:
-                h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml)
-                XML_tree = ET.parse(h_xml)
-                XML_main_corr = XML_tree.getroot()     
-            else:
-                print('You need first to obtained the corrected XML file from the original pdf')                
-        else:
-            print('You need first to obtained the corrected XML file from the original pdf')
+        XML_main_corr = self._open_xml(suffix_xml, name_outcorrxml)    
             
         dict_text_found = dict()
         #dict_all_info = dict()
@@ -814,11 +803,19 @@ class Document:
                     elif flag_op == 2:
                         ind_c = dict_text_found_aux[key][2]
                         aux_str = 'size: ' + '_'.join(np.array(vec_in).astype(str))
+                    elif flag_op == 3:
+                        ind_c = dict_text_found_aux[key][2]
+                        aux_str = 'RegExp: ' + '_'.join(np.array(vec_in).astype(str))      
+                    elif flag_op == 4:
+                        ind_c = dict_text_found_aux[key][2]
+                        aux_str = 'FontSizeandType: ' + '_'.join(np.array(vec_in).astype(str))                            
                     
                     colors = ['cyan','green','red','blue','yellow']
                     imarray = plot_tools.highlight_text(imarray, aux_c, bbox_page,
                                color_vec = colors[np.ravel(ind_c)[0]], alpha = True, filled = False, thick_line = 4)
-                        
+                
+                if not len(dict_text_found_aux.keys()):
+                    aux_str = 'empty'
                 self._plot_save(imarray, 'Results of search ' + aux_str, 'TextFound-' + aux_str, 
                                 ind_page, self.path_file, flag_plot, flag_save_figs)
         
@@ -835,6 +832,161 @@ class Document:
             return dict_text_found
         '''
 
+    def combineDicts_plot(self, list_dict_pre, list_op_pre, suffix_xml='_data', name_outcorrxml='04_correctedxml', 
+                          flag_plot = 1, flag_save_figs = 0):
+        
+        XML_main_corr = self._open_xml(suffix_xml, name_outcorrxml)      
+        
+        dict_def = dict()
+        all_keys = np.array([])
+        for d in list_dict_pre:
+            all_keys = np.union1d(all_keys, list(d.keys()))
+    
+        for k_p in all_keys.astype(int):
+            list_dict = copy.copy(list_dict_pre)
+            list_op = copy.copy(list_op_pre)
+            dict_aux = dict()
+            while len(list_op):
+                aux_d = dict()
+                dict_and = dict()
+                
+                if 'or' == list_op[-1]:
+                    #ind_or = np.argwhere(np.array(list_op) == 'or').reshape((-1,))[0]
+                    ind_or = len(list_op) - 1
+                    if (k_p in list_dict[ind_or].keys()) and (k_p in list_dict[ind_or + 1].keys()):
+                        vec_aux_d = np.union1d(list(list_dict[ind_or][k_p].keys()), list(list_dict[ind_or + 1][k_p].keys()))
+                    elif (k_p in list_dict[ind_or].keys()):
+                        vec_aux_d = list(list_dict[ind_or][k_p].keys())
+                    elif (k_p in list_dict[ind_or + 1].keys()):  
+                        vec_aux_d = list(list_dict[ind_or + 1][k_p].keys())
+                   
+                    list_op.pop(ind_or)
+                    
+                    dict_aux_aux = dict()
+                    for i_k in vec_aux_d:
+                        if i_k in list_dict[ind_or][k_p].keys():
+                            dict_aux_aux[i_k] = list_dict[ind_or][k_p][i_k]
+                        else:
+                            dict_aux_aux[i_k] = list_dict[ind_or + 1][k_p][i_k]
+                    list_dict.pop(ind_or + 1)
+                    aux_d[k_p] = dict_aux_aux
+                    list_dict[ind_or] = aux_d
+                elif 'and' == list_op[-1]:
+                    #ind_or = np.argwhere(np.array(list_op) == 'and').reshape((-1,))[0]
+                    ind_or = len(list_op) - 1
+                    if (k_p in list_dict[ind_or].keys()) and (k_p in list_dict[ind_or + 1].keys()):
+                        vec_aux_d = np.intersect1d(list(list_dict[ind_or][k_p].keys()), list(list_dict[ind_or + 1][k_p].keys()))
+                        #dict_and = list_dict[ind_or][k_p]
+                    elif (k_p in list_dict[ind_or].keys()):
+                        vec_aux_d = list()
+                    elif (k_p in list_dict[ind_or + 1].keys()):  
+                        vec_aux_d = list()
+                          
+                    list_op.pop(ind_or)                    
+                    dict_aux_aux = dict()
+                    for i_k in vec_aux_d:
+                        dict_aux_aux[i_k] = list_dict[ind_or][k_p][i_k]
+                    list_dict.pop(ind_or + 1)
+                    aux_d[k_p] = dict_aux_aux                  
+                    list_dict[ind_or] = aux_d                
+                elif 'not' == list_op[-1]:
+                    #ind_or = np.argwhere(np.array(list_op) == 'not').reshape((-1,))[0]
+                    ind_or = len(list_op) - 1
+                    if (k_p in list_dict[ind_or].keys()) and (k_p in list_dict[ind_or + 1].keys()):
+                        vec_aux_d = np.setdiff1d(list(list_dict[ind_or][k_p].keys()), list(list_dict[ind_or + 1][k_p].keys()))
+                        #dict_and = list_dict[ind_or][k_p]
+                    elif (k_p in list_dict[ind_or].keys()):
+                        vec_aux_d = list(list_dict[ind_or][k_p].keys())
+                    elif (k_p in list_dict[ind_or + 1].keys()):  
+                        vec_aux_d = list(list_dict[ind_or + 1][k_p].keys())
+                          
+                    list_op.pop(ind_or)                    
+                    dict_aux_aux = dict()
+                    for i_k in vec_aux_d:
+                        dict_aux_aux[i_k] = list_dict[ind_or][k_p][i_k]
+                    list_dict.pop(ind_or + 1)
+                    aux_d[k_p] = dict_aux_aux                  
+                    list_dict[ind_or] = aux_d                
+            
+            #for k_op in aux_d[k_p].keys():
+            #    dict_aux[k_op] = dict_and[k_op]
+   
+            dict_def[k_p] = aux_d[k_p]
+   
+            if flag_plot:
+                aux_str = 'Definitive: ' + ' '.join(list_op_pre)
+                self._plot_dict(XML_main_corr, dict_def, k_p, aux_str, 'blue',
+                       'Results of combination ', 'CombinationDicts-', flag_save_figs)
+    
+            '''
+                XML_root = ET.Element('pages')
+                #print(ind_abs,len(self.XML_main))
+                XML_root.append(XML_main_corr[k_p])                
+                imarray = np.array(self.imgobj[k_p])
+                bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64)                
+                for key in dict_def[k_p].keys():
+                    aux_c = dict_def[k_p][key][0]
+                    
+                    
+                    imarray = plot_tools.highlight_text(imarray, aux_c, bbox_page,
+                               color_vec = 'blue', alpha = True, filled = False, thick_line = 4)
+                
+                if not len(dict_def[k_p].keys()):
+                    aux_str = 'empty'
+                self._plot_save(imarray, 'Results of combination ' + aux_str, 'CombinationDicts-' + aux_str, 
+                                k_p, self.path_file, flag_plot, flag_save_figs)
+                '''
+        return dict_def    
+
+    def _plot_dict(self, XML_main_corr, dict_def, k_p, aux_str, color,
+                   title_str, name_file, flag_save_figs = 0):
+        XML_root = ET.Element('pages')
+        #print(ind_abs,len(self.XML_main))
+        XML_root.append(XML_main_corr[k_p])                
+        imarray = np.array(self.imgobj[k_p])
+        bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64)                
+        for key in dict_def[k_p].keys():
+            aux_c = dict_def[k_p][key][0]
+            
+            imarray = plot_tools.highlight_text(imarray, aux_c, bbox_page,
+                       color_vec = color, alpha = True, filled = False, thick_line = 4)
+        
+        if not len(dict_def[k_p].keys()):
+            aux_str = 'empty'
+            
+        self._plot_save(imarray, 'Results of combination ' + aux_str, 'CombinationDicts-' + aux_str, 
+                        k_p, self.path_file, 1, flag_save_figs)
+
+    def blocksfromtextline_plot(self, dict_in, suffix_xml='_data', name_outcorrxml='04_correctedxml', 
+                          flag_plot = 1, flag_save_figs = 0):
+        # Returns dictionary with keys for pages, and then keys for blocks
+        
+        XML_m = self._open_xml(suffix_xml, name_outcorrxml)  
+        
+        dict_array = dict()
+            
+        for k_p in dict_in.keys():
+            dict_aux = dict()
+            list_id = np.array([int(o.get('id')) if 'id' in o.attrib else -1 for o in XML_m.findall('page[' + str(k_p + 1) + ']/textbox')])
+            list_block = np.array([int(o.get('block')) if 'block' in o.attrib else -1 for o in XML_m.findall('page[' + str(k_p + 1) + ']/textbox')])              
+            for k in dict_in[k_p].keys():
+                ind_textb = np.intersect1d(np.argwhere(list_block == int(k.split('_')[0])), np.argwhere(list_id == int(k.split('_')[1])))
+                all_text_b = [o.text for o in XML_m[k_p][int(ind_textb)]]
+                all_text_b = '. '.join(all_text_b)
+                print(all_text_b)    
+                dict_aux[k] = [np.array(XML_m[k_p][int(ind_textb)].attrib['bbox'].split(',')).astype(np.float64),all_text_b]
+            
+            dict_array[k_p] = dict_aux
+            
+            if flag_plot:
+                aux_str = 'Blocks'
+                self._plot_dict(XML_m, dict_array, k_p, aux_str, 'red',
+                       'Plotting blocks', 'BlocksDicts-', flag_save_figs)
+            
+            
+        return dict_array
+
+
     def _extract_textl_if(self, XML_root, vec_in, flag_op):
         # The key is as follows: <block_id>_<textbox_id>_<y1>_<x1>
         dict_text_found = dict()
@@ -847,17 +999,17 @@ class Document:
         
         for ind_el in range(0, len(XML_root[0])):
             for ind_line in range(0, len(XML_root[0][ind_el])):
-                if XML_root[0][ind_el][ind_line].tag == 'textline':  
+                if XML_root[0][ind_el][ind_line].tag == 'textline' and ('block' in XML_root[0][ind_el].attrib):  
                     #print(XML_root[0][ind_el][ind_line].text)                      
                     str_text = str(XML_root[0][ind_el][ind_line].text)
                     
+                    key_d = '_'.join([XML_root[0][ind_el].attrib['block'],XML_root[0][ind_el].attrib['id'],
+                                     str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[0]),
+                                     str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[1])])
+                    coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64)                    
                     if flag_op == 1:
                         if np.sum([str_text.find(i) > -1 for i in vec_in]):
                         #if (str_text.find(text_search)) > -1:
-                            key_d = '_'.join([XML_root[0][ind_el].attrib['block'],XML_root[0][ind_el].attrib['id'],
-                                             str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[0]),
-                                             str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[1])])
-                            coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64)
                             dict_text_found[key_d] = [coord_textline_aux,str_text]                    
                     
                     elif flag_op == 2:
@@ -871,11 +1023,35 @@ class Document:
                                 
                         if np.sum(np.array(ind_val) > -1):
                         #if (str_text.find(text_search)) > -1:
-                            key_d = '_'.join([XML_root[0][ind_el].attrib['block'],XML_root[0][ind_el].attrib['id'],
-                                             str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[0]),
-                                             str(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')[1])])
-                            coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64)
                             dict_text_found[key_d] = [coord_textline_aux, str_text, np.max(ind_val)]
+                            
+                    elif flag_op == 3:
+                        str_text_red = str_text.replace('  ',' ')
+                        
+                        str_text_red = utils_annot.get_text(str_text_red)
+                        # print(str_text)
+                        aux_regexp = list()
+                        [aux_regexp.append(len(re.findall(i, str_text_red))) for i in vec_in]
+                        aux_regexp = np.array(aux_regexp)
+                        if np.max(aux_regexp):
+                        # For regular expressions
+                            dict_text_found[key_d] = [coord_textline_aux, str_text, np.argmax(aux_regexp)]
+                            
+                    elif flag_op == 4:
+                        ind_f2 = utils_proc.find_all(str_text, 'size="')   
+                        ind_f3 = utils_proc.find_all(str_text, 'face="') 
+                        ind_val = list()
+                        for i2 in range(len(ind_f2)):
+                            f_size = np.array(str_text[ind_f2[i2] + len('size'):].split('"')[1]).astype(np.float64)
+                            f_type = np.array(str_text[ind_f3[i2] + len('face'):].split('=')[1].split(' ')[0])
+                            #print(f_type)
+                            ind_f_size = np.intersect1d(np.argwhere(vec_in[1] < f_size), np.argwhere(vec_in[2] > f_size))
+                            if len(ind_f_size) and (f_type == vec_in[0]):
+                                ind_val.append(ind_f_size)      
+                                
+                        if np.sum(np.array(ind_val) > -1):
+                        #if (str_text.find(text_search)) > -1:
+                            dict_text_found[key_d] = [coord_textline_aux, str_text, 2]                            
                     
         return dict_text_found
 
@@ -883,20 +1059,7 @@ class Document:
         
     def parse_all(self, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml'):
         
-        if 'name_outcorrxml' not in self.__dict__.keys():
-            self.name_outcorrxml = name_outcorrxml
-
-        name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'        
-        if os.path.isfile(name_tar):
-            name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml'
-            if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]:
-                h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml)
-                XML_tree = ET.parse(h_xml)
-                XML_main_corr = XML_tree.getroot()     
-            else:
-                print('You need first to obtained the corrected XML file from the original pdf')                
-        else:
-            print('You need first to obtained the corrected XML file from the original pdf')
+        XML_main_corr = self._open_xml(suffix_xml, name_outcorrxml)
             
         dict_all_info = dict()
         for ind_abs, ind_page in enumerate(self.n_pages):
@@ -920,6 +1083,24 @@ class Document:
                     mat_all_f = np.concatenate((mat_all_f, aux_c.reshape((1,7))), axis = 0)
         return dict_all_info, mat_all_f
      
+    def _open_xml(self, suffix_xml='_data', name_outcorrxml='04_correctedxml'):
+        
+        if 'name_outcorrxml' not in self.__dict__.keys():
+            self.name_outcorrxml = name_outcorrxml
+
+        name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'        
+        if os.path.isfile(name_tar):
+            name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml'
+            if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]:
+                h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml)
+                XML_tree = ET.parse(h_xml)
+                XML_main_corr = XML_tree.getroot()   
+                return XML_main_corr
+            else:
+                print('You need first to obtained the corrected XML file from the original pdf')                
+        else:
+            print('You need first to obtained the corrected XML file from the original pdf')
+        
     
     def _extract_all_fonts(self, XML_root):   
         
diff --git a/src/python/preproc_docs.py b/src/python/preproc_docs.py
index e5084adb..70946ac2 100644
--- a/src/python/preproc_docs.py
+++ b/src/python/preproc_docs.py
@@ -841,9 +841,10 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     widht_resc = WIDTH_CHAR * rescale_factor[0,1]
 
-    max_inrow_sep = 4 * widht_resc
-    max_incol_sep = 1 * height_resc
-    gap_row = height_resc/2
+    max_inrow_sep = 8 * widht_resc # Distance in the row, i.e., in x. Initially, 6
+    max_incol_sep = 1 * height_resc # Maximum separation allowed between two rows
+        # to be considered still part of the same block
+    gap_row = height_resc/2 # Small gap allowed in the same row, in y
     similarity_fonts = 0.95
     indentation = 2 * widht_resc
 
-- 
GitLab