From a3ff06af9bb2a5a6fe33be6eb795109924e19317 Mon Sep 17 00:00:00 2001
From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch>
Date: Fri, 12 Jun 2020 15:57:23 +0200
Subject: [PATCH] Last improvements on the title completion code

---
 src/python/run_titlesComp_2steps.py | 109 +++++++++++++++++++++++++---
 1 file changed, 100 insertions(+), 9 deletions(-)

diff --git a/src/python/run_titlesComp_2steps.py b/src/python/run_titlesComp_2steps.py
index 9554bd50..a93b73ec 100644
--- a/src/python/run_titlesComp_2steps.py
+++ b/src/python/run_titlesComp_2steps.py
@@ -46,6 +46,69 @@ def remove_space_char(text):
     for match in all_matches:
         text = regex.sub(match,match.replace(' ',''),text)
     return text
+
+def correct_textline_df(df_feat):
+    # Correct for textblocks that were wrongly split into different textlines.
+    # We simply compare the y coordinates of the bounding boxes.
+    flag_notend = 1
+    ind_row = 0
+    df_feat_corr = df_feat.copy()
+    while flag_notend:
+        curr_coords = pt.coord_string_to_array(df_feat['bbox'].iloc[ind_row])[[1,3]]
+        offset = 1
+        flag_ne = 1
+        while flag_ne:
+            next_coords = pt.coord_string_to_array(df_feat['bbox'].iloc[ind_row + offset])[[1,3]]
+            # If the next text block's bbox lies at the same y coordinates,
+            # but it was assigned a different line number
+            if ((np.sum(np.abs(next_coords - curr_coords)) < 2) and 
+                (df_feat['num_line'].iloc[ind_row] != df_feat['num_line'].iloc[ind_row + offset])):
+                offset += 1
+                if (ind_row + offset) == len(df_feat):
+                     flag_ne = 0
+                     flag_notend = 0
+            else:
+                flag_ne = 0
+                
+        # If we can merge 2 text blocks, then we have to modify the following columns:
+        # textl, textb, in_char, end_char, num_line, length
+        if offset > 1:
+            vec_lengths = []
+            text_length = []
+            print('Correcting from Row {} to {}'.format(ind_row, ind_row + offset))
+            for i_off in range(offset):
+                if not i_off:
+                    val_textl = df_feat['textl'].iloc[ind_row]
+                    val_textb = df_feat['textb'].iloc[ind_row]
+                    val_numl = df_feat['num_line'].iloc[ind_row]
+                    val_inchar = df_feat['in_char'].iloc[ind_row]
+                vec_lengths.append(pt.coord_string_to_array(df_feat['bbox'].iloc[ind_row + i_off])[[0,2]])
+                text_length.append(len(df_feat['text'].iloc[ind_row + i_off]))
+            
+            tot_len = float(np.sum(text_length))/(1 - val_inchar)
+            for i_off in range(offset):
+                df_feat_corr['textl'].iloc[ind_row + i_off] = val_textl
+                df_feat_corr['textb'].iloc[ind_row + i_off] = val_textb + i_off
+                df_feat_corr['num_line'].iloc[ind_row + i_off] = val_numl
+                if i_off:
+                    df_feat_corr['in_char'].iloc[ind_row + i_off] = df_feat_corr['end_char'].iloc[ind_row + i_off - 1]
+    
+                #df_feat_corr['end_char'].iloc[ind_row + i_off] = np.min([1, val_inchar + float(vec_lengths[i_off][1] - np.min(vec_lengths))/tot_len])
+                df_feat_corr['end_char'].iloc[ind_row + i_off] = np.min([1, df_feat_corr['in_char'].iloc[ind_row + i_off] + 
+                                                                         float(text_length[i_off])/tot_len])
+                df_feat_corr['length'].iloc[ind_row + i_off] = df_feat_corr['end_char'].iloc[ind_row + i_off] - df_feat_corr['in_char'].iloc[ind_row + i_off]
+    
+            # Now, decrement the subsequent num_line values by 1, up to the
+            # next row where num_line = 0
+            ind_next_numline0 = np.argwhere(np.asarray(df_feat['num_line'].iloc[(ind_row + offset):]) == 0)
+            if len(ind_next_numline0):
+                ind_next_numline0 = ind_next_numline0.min()
+                df_feat_corr['num_line'].iloc[(ind_row + offset):(ind_row + offset + ind_next_numline0)] -= 1
+                    
+        ind_row += offset
+        if ind_row == len(df_feat) - 1:
+            flag_notend = 0
+    return df_feat_corr
     
 # Open a csv 
 
@@ -53,7 +116,7 @@ data_folder = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data
 folder_databaseAB = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data/AB_other/SessionOverviews_tar/'
 col2_f3_year = 1917
 
-year = 1893 # 1893
+year = 1945 # 1893, 1920
 flag_type_for_doc = 3
 
 flag_save_fig = 1
@@ -67,7 +130,7 @@ names_col_df_feat = ['year','file_id','page_id','index','textl','textb','text','
 
 #%%
 
-'''
+
 
 unique_ids = np.unique(df['file_id'])
 
@@ -84,7 +147,8 @@ n_next_lines = 2
 if year >= col2_f3_year:
     n_next_lines = 3
     
-    
+# This for-loop builds a feature matrix that will later be
+# used for the title completion
 for ids in unique_ids:
     # Check all ids and pages, and create a parallel csv with full titles.
     unique_pages = np.unique(df[df['file_id'] == ids]['page_id'])
@@ -155,8 +219,10 @@ for ids in unique_ids:
 
 df_feat = pd.DataFrame(vec_df_feat, columns = names_col_df_feat)
 
+df_feat_corr = correct_textline_df(df_feat)
+
 # Now one hot encoding for font type
-df_feat = pd.concat([df_feat,pd.get_dummies(df_feat['font_type'])], axis = 1)
+df_feat_corr = pd.concat([df_feat_corr, pd.get_dummies(df_feat['font_type'])], axis = 1)
 
 # Let's build the vocabulary only from the first X words
 
@@ -172,12 +238,14 @@ vocab, ocurr = utils_feats.get_vocab(' '.join(all_text_for_vocab), min_ocurr = 5
 # Some set of rules:
 # 1. If we find any of the following words among the first num_words, then it
 # ends the title
+
 words_ends = ['prüfung', 'prufung', 'botschaft', 'bötschaft', 'betr', 'betr.', 'berichte', 'bericht',
               'petitionen', 'schreiben', 'bot-', 'bot -', 'mitunterzeichnern', 'be-', 'be -']
 vec = CountVectorizer(vocabulary = words_ends, tokenizer = word_tokenize)
 mat_ocurr = vec.fit_transform(all_text_for_vocab).todense()
 
-df_feat_words_ends = pd.concat([df_feat, pd.DataFrame(mat_ocurr, columns = words_ends)], axis = 1)
+df_feat_words_ends = pd.concat([df_feat_corr, pd.DataFrame(mat_ocurr, columns = words_ends)], axis = 1)
+
 # 2. If not, we iterate through the textlines, taking the last textblock 
 # of the current line, and compare against the previous one (same or different line).
 # If it is different in font type, then we mark it as the start
@@ -190,7 +258,7 @@ df_feat_words_ends = pd.concat([df_feat, pd.DataFrame(mat_ocurr, columns = words
 # * For later year, specially when we have two columns, we should consider more 
 #   textlines. We can just use the delimiters set in the doc class
 
-'''
+
 
 #%%
 all_indexes = np.array(df_feat_words_ends[df_feat_words_ends['num_line'] == 0][df_feat_words_ends['textb'] == 0].index)
@@ -202,7 +270,7 @@ names_col_titles = ['year','file_id','page_id','index','title_text','in_textl','
 
 # Conditions!
 def_terms = ['botschaft', 'bötschaft', 'prüfung', 'prufung', 'berichte', 'bericht', 'schreiben','mitunterzeichnern']
-def_terms_regex = ['[v,y][o,ò][m,n][i]?[ .\xad-]{0,10}[0-9]', '[" "]be[\xad,-][ ]{0,10}richt', '[" "]bot[-,\xad][ ]{0,10}schaft[ ]',
+def_terms_regex = ['[v,y][o,ò,d][m,n][i]?[ .\xad-]{0,10}[0-9]', '[" "]be[\xad,-][ ]{0,10}richt', '[" "]bot[-,\xad][ ]{0,10}schaft[ ]',
                    '[ ]botschaft[ ]']
 font_notitle = 'Times-Roman'
 min_length = 10 # for textblocks to count
@@ -239,9 +307,27 @@ for count_ind, index_t in enumerate(all_indexes):
     in_textb = df_feat_words_ends['textb'].iloc[index_t]    
     in_index = df_feat_words_ends['index'].iloc[index_t] 
 
-    if (df_aux['file_id'].iloc[0] == 110000009) and (df_aux['page_id'].iloc[0] == 1):
+    if (df_aux['file_id'].iloc[0] == 110000427) and (df_aux['page_id'].iloc[0] == 2):
         stop = 1
         
+    # Check whether the textline ends well before the rightmost coordinate,
+    # e.g. to capture titles. We only check the first 2 num_lines.
+    
+    ind_min1 = 100
+    all_bbox = np.asarray([pt.coord_string_to_array(df_aux['bbox'].iloc[o]) for o in range(len(df_aux))])
+    min_max_ind = [all_bbox[:,0].min(), all_bbox[:,2].max()]
+    max_len = min_max_ind[1] - min_max_ind[0]
+    ind_un_numline = np.unique(df_aux['num_line'])[:2]
+    for ind_nl in ind_un_numline:
+        ind_nl_check = np.argwhere(np.asarray(df_aux['num_line']) == ind_nl).flatten()[-1]
+        coord_l = pt.coord_string_to_array(df_aux['bbox'].iloc[ind_nl_check])
+        if coord_l[2] < (min_max_ind[1] - max_len/10):
+            ind_min1 = ind_nl_check + 1
+            flag_notend = 0
+            print('Abrupt end detected')
+            break
+    
+    # Check for keywords in the text
     # Merge all the text and check regular expressions
     all_text_check_norm = ' '.join([o.text for ind_r, o in df_aux.iterrows()])
     all_text_check = ' '.join([o.text.lower() for ind_r, o in df_aux.iterrows()])
@@ -271,6 +357,7 @@ for count_ind, index_t in enumerate(all_indexes):
     # Now, locate the textblock where we found the term
     ind_pot_ind = np.array(ind_pot_ind)
     term_match = np.array(term_match)
+    ind_min = 1000
     if len(ind_pot_ind):
         flag_notend = 0
         term_match = term_match[np.argmin(ind_pot_ind)]
@@ -295,6 +382,10 @@ for count_ind, index_t in enumerate(all_indexes):
             #        (df_aux['end_char'].iloc[ind_min_aux] - df_aux['in_char'].iloc[ind_min_aux]))
             text_title = all_text_check_norm[:np.min(ind_pot_ind)]
     
+    if not flag_notend and (ind_min1 < ind_min):
+        ind_min = ind_min1
+        flag_fulltextblock = 1
+        
     # Now condition for abrupt jump in first line
     # At the level of the textblock
     if flag_notend:
@@ -401,7 +492,7 @@ df_titles_def = pd.DataFrame(list_vec_titles, columns = names_col_titles)
 #%%
 # Saving figures
 flag_text = 1
-resolution = 200
+resolution = 100
 if flag_save_fig:
     path_output_img = os.path.join(data_folder, '{}_previews'.format(year))
     if not os.path.exists(path_output_img): 
-- 
GitLab