From a3ff06af9bb2a5a6fe33be6eb795109924e19317 Mon Sep 17 00:00:00 2001 From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch> Date: Fri, 12 Jun 2020 15:57:23 +0200 Subject: [PATCH] Last improvements on the title completion code --- src/python/run_titlesComp_2steps.py | 109 +++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 9 deletions(-) diff --git a/src/python/run_titlesComp_2steps.py b/src/python/run_titlesComp_2steps.py index 9554bd50..a93b73ec 100644 --- a/src/python/run_titlesComp_2steps.py +++ b/src/python/run_titlesComp_2steps.py @@ -46,6 +46,69 @@ def remove_space_char(text): for match in all_matches: text = regex.sub(match,match.replace(' ',''),text) return text + +def correct_textline_df(df_feat): + # Function to correct for textblocks wrongly splitted in different textlines + # We simply check the y coordinates of the bbox + flag_notend = 1 + ind_row = 0 + df_feat_corr = df_feat.copy() + while flag_notend: + curr_coords = pt.coord_string_to_array(df_feat['bbox'].iloc[ind_row])[[1,3]] + offset = 1 + flag_ne = 1 + while flag_ne: + next_coords = pt.coord_string_to_array(df_feat['bbox'].iloc[ind_row + offset])[[1,3]] + # If the bbox of the next text block is placed at the same y coordinates, + # but it is considered a different line + if ((np.sum(np.abs(next_coords - curr_coords)) < 2) and + (df_feat['num_line'].iloc[ind_row] != df_feat['num_line'].iloc[ind_row + offset])): + offset += 1 + if (ind_row + offset) == len(df_feat): + flag_ne = 0 + flag_notend = 0 + else: + flag_ne = 0 + + # If we can merge 2 text blocks, then we have to modify the following columns: + # textl, textb, in_char, end_char, num_line, length + if offset > 1: + vec_lengths = [] + text_length = [] + print('Correcting from Row {} to {}'.format(ind_row, ind_row + offset)) + for i_off in range(offset): + if not i_off: + val_textl = df_feat['textl'].iloc[ind_row] + val_textb = df_feat['textb'].iloc[ind_row] + val_numl = df_feat['num_line'].iloc[ind_row] + val_inchar = df_feat['in_char'].iloc[ind_row] + vec_lengths.append(pt.coord_string_to_array(df_feat['bbox'].iloc[ind_row + i_off])[[0,2]]) + text_length.append(len(df_feat['text'].iloc[ind_row + i_off])) + + tot_len = float(np.sum(text_length))/(1 - val_inchar) + for i_off in range(offset): + df_feat_corr['textl'].iloc[ind_row + i_off] = val_textl + df_feat_corr['textb'].iloc[ind_row + i_off] = val_textb + i_off + df_feat_corr['num_line'].iloc[ind_row + i_off] = val_numl + if i_off: + df_feat_corr['in_char'].iloc[ind_row + i_off] = df_feat_corr['end_char'].iloc[ind_row + i_off - 1] + + #df_feat_corr['end_char'].iloc[ind_row + i_off] = np.min([1, val_inchar + float(vec_lengths[i_off][1] - np.min(vec_lengths))/tot_len]) + df_feat_corr['end_char'].iloc[ind_row + i_off] = np.min([1, df_feat_corr['in_char'].iloc[ind_row + i_off] + + float(text_length[i_off])/tot_len]) + df_feat_corr['length'].iloc[ind_row + i_off] = df_feat_corr['end_char'].iloc[ind_row + i_off] - df_feat_corr['in_char'].iloc[ind_row + i_off] + + # Now, correct next values of num_line, as they have to be -1. Until next + # num_line = 0 + ind_next_numline0 = np.argwhere(np.asarray(df_feat['num_line'].iloc[(ind_row + offset):]) == 0) + if len(ind_next_numline0): + ind_next_numline0 = ind_next_numline0.min() + df_feat_corr['num_line'].iloc[(ind_row + offset):(ind_row + offset + ind_next_numline0)] -= 1 + + ind_row += offset + if ind_row == len(df_feat) - 1: + flag_notend = 0 + return df_feat_corr # Open a csv @@ -53,7 +116,7 @@ data_folder = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data folder_databaseAB = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data/AB_other/SessionOverviews_tar/' col2_f3_year = 1917 -year = 1893 # 1893 +year = 1945 # 1893, 1920 flag_type_for_doc = 3 flag_save_fig = 1 @@ -67,7 +130,7 @@ names_col_df_feat = ['year','file_id','page_id','index','textl','textb','text',' #%% -''' + unique_ids = np.unique(df['file_id']) @@ -84,7 +147,8 @@ n_next_lines = 2 if year >= col2_f3_year: n_next_lines = 3 - +# This for builds a feature matrix that will be later +# used for the title completion for ids in unique_ids: # Check all ids and pages, and create a parallel csv with full titles. unique_pages = np.unique(df[df['file_id'] == ids]['page_id']) @@ -155,8 +219,10 @@ for ids in unique_ids: df_feat = pd.DataFrame(vec_df_feat, columns = names_col_df_feat) +df_feat_corr = correct_textline_df(df_feat) + # Now one hot encoding for font type -df_feat = pd.concat([df_feat,pd.get_dummies(df_feat['font_type'])], axis = 1) +df_feat_corr = pd.concat([df_feat_corr, pd.get_dummies(df_feat['font_type'])], axis = 1) # Let's build the vocabulary only from the first X words @@ -172,12 +238,14 @@ vocab, ocurr = utils_feats.get_vocab(' '.join(all_text_for_vocab), min_ocurr = 5 # Some set of rules: # 1. If we find any of the following words among the first num_words, then it # ends the title + words_ends = ['prüfung', 'prufung', 'botschaft', 'bötschaft', 'betr', 'betr.', 'berichte', 'bericht', 'petitionen', 'schreiben', 'bot-', 'bot -', 'mitunterzeichnern', 'be-', 'be -'] vec = CountVectorizer(vocabulary = words_ends, tokenizer = word_tokenize) mat_ocurr = vec.fit_transform(all_text_for_vocab).todense() -df_feat_words_ends = pd.concat([df_feat, pd.DataFrame(mat_ocurr, columns = words_ends)], axis = 1) +df_feat_words_ends = pd.concat([df_feat_corr, pd.DataFrame(mat_ocurr, columns = words_ends)], axis = 1) + # 2. If not, we iterate through the textlines, taking the last textblock # of the current line, and compare against the previous one (same or different line). # If it is different in font type, then we mark it as the start @@ -190,7 +258,7 @@ df_feat_words_ends = pd.concat([df_feat, pd.DataFrame(mat_ocurr, columns = words # * For later year, specially when we have two columns, we should consider more # textlines. We can just use the delimiters set in the doc class -''' + #%% all_indexes = np.array(df_feat_words_ends[df_feat_words_ends['num_line'] == 0][df_feat_words_ends['textb'] == 0].index) @@ -202,7 +270,7 @@ names_col_titles = ['year','file_id','page_id','index','title_text','in_textl',' # Conditions! def_terms = ['botschaft', 'bötschaft', 'prüfung', 'prufung', 'berichte', 'bericht', 'schreiben','mitunterzeichnern'] -def_terms_regex = ['[v,y][o,ò][m,n][i]?[ .\xad-]{0,10}[0-9]', '[" "]be[\xad,-][ ]{0,10}richt', '[" "]bot[-,\xad][ ]{0,10}schaft[ ]', +def_terms_regex = ['[v,y][o,ò,d][m,n][i]?[ .\xad-]{0,10}[0-9]', '[" "]be[\xad,-][ ]{0,10}richt', '[" "]bot[-,\xad][ ]{0,10}schaft[ ]', '[ ]botschaft[ ]'] font_notitle = 'Times-Roman' min_length = 10 # for textblocks to count @@ -239,9 +307,27 @@ for count_ind, index_t in enumerate(all_indexes): in_textb = df_feat_words_ends['textb'].iloc[index_t] in_index = df_feat_words_ends['index'].iloc[index_t] - if (df_aux['file_id'].iloc[0] == 110000009) and (df_aux['page_id'].iloc[0] == 1): + if (df_aux['file_id'].iloc[0] == 110000427) and (df_aux['page_id'].iloc[0] == 2): stop = 1 + # To check if the textline ends much before the rightmost coordinate. + # To capture titles for example. We check only first 2 num_lines + + ind_min1 = 100 + all_bbox = np.asarray([pt.coord_string_to_array(df_aux['bbox'].iloc[o]) for o in range(len(df_aux))]) + min_max_ind = [all_bbox[:,0].min(), all_bbox[:,2].max()] + max_len = min_max_ind[1] - min_max_ind[0] + ind_un_numline = np.unique(df_aux['num_line'])[:2] + for ind_nl in ind_un_numline: + ind_nl_check = np.argwhere(np.asarray(df_aux['num_line']) == ind_nl).flatten()[-1] + coord_l = pt.coord_string_to_array(df_aux['bbox'].iloc[ind_nl_check]) + if coord_l[2] < (min_max_ind[1] - max_len/10): + ind_min1 = ind_nl_check + 1 + flag_notend = 0 + print('Abrupt end detected') + break + + # Check for keywords in the text # Merge all the text and check regular expressions all_text_check_norm = ' '.join([o.text for ind_r, o in df_aux.iterrows()]) all_text_check = ' '.join([o.text.lower() for ind_r, o in df_aux.iterrows()]) @@ -271,6 +357,7 @@ for count_ind, index_t in enumerate(all_indexes): # Now, locate the textblock where we found the term ind_pot_ind = np.array(ind_pot_ind) term_match = np.array(term_match) + ind_min = 1000 if len(ind_pot_ind): flag_notend = 0 term_match = term_match[np.argmin(ind_pot_ind)] @@ -295,6 +382,10 @@ for count_ind, index_t in enumerate(all_indexes): # (df_aux['end_char'].iloc[ind_min_aux] - df_aux['in_char'].iloc[ind_min_aux])) text_title = all_text_check_norm[:np.min(ind_pot_ind)] + if not flag_notend and (ind_min1 < ind_min): + ind_min = ind_min1 + flag_fulltextblock = 1 + # Now condition for abrupt jump in first line # At the level of the textblock if flag_notend: @@ -401,7 +492,7 @@ df_titles_def = pd.DataFrame(list_vec_titles, columns = names_col_titles) #%% # Saving figures flag_text = 1 -resolution = 200 +resolution = 100 if flag_save_fig: path_output_img = os.path.join(data_folder, '{}_previews'.format(year)) if not os.path.exists(path_output_img): -- GitLab