diff --git a/src/python/run_titlesComp_2steps.py b/src/python/run_titlesComp_2steps.py new file mode 100644 index 0000000000000000000000000000000000000000..9554bd50cd629eaaa275319584710dc9ae0d828c --- /dev/null +++ b/src/python/run_titlesComp_2steps.py @@ -0,0 +1,446 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Apr 20 12:20:59 2020 + +@author: luissalamanca +""" + +''' +The first test of command to extend the titles +''' +#%% + +import os, sys + +os.environ['DEMOCRASCI_DATA'] = "/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/" +import LOCAL_CONSTANTS +import CONSTANTS + +import pandas as pd +import numpy as np +import def_classes as defc +import utils_proc +import utils_feats +import plot_tools as pt +import matplotlib.pyplot as plt +from nltk.tokenize import word_tokenize +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.manifold import TSNE +import regex + +def size_matching_bbox(bbox, list_bbox): + # Compare a single bbox with a list of bboxes, to locate the one + # that matches better the dimensions. This is is necessary to compensate + # for slightly differences in the bboxes though to slightly different + # processings + # Input: both have to be strings + bbox = np.array(bbox.split(',')).astype(float) + list_bbox = np.array([o.split(',') for o in list_bbox]).astype(float) + dist_vec = np.sum(np.abs(list_bbox - bbox.reshape((1,4))), axis = 1) + return np.argmin(dist_vec), np.min(dist_vec) + +def remove_space_char(text): + all_matches = regex.findall(r'(?=([a-z] [a-z]))',text) + #all_matches = regex.findall(r'(?=(\w \w))',text) + for match in all_matches: + text = regex.sub(match,match.replace(' ',''),text) + return text + +# Open a csv + +data_folder = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data/trained_annotation_data/title/exhaustive_label' +folder_databaseAB = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data/AB_other/SessionOverviews_tar/' +col2_f3_year = 1917 + +year = 1893 # 1893 +flag_type_for_doc = 3 + +flag_save_fig = 1 + +name_file = '{}/{}.csv'.format(data_folder, year) +df = pd.read_csv(name_file) + +names_col_df_feat = ['year','file_id','page_id','index','textl','textb','text','bbox', + 'font_type','font_size', 'in_char', 'end_char','num_line','length'] +# Plus the extras, such as vocabulary, font_type as one-hot, and vocabulary + +#%% + +''' + +unique_ids = np.unique(df['file_id']) + +#unique_ids = [unique_ids[0]] + +#f = open('test_titles.txt','w+') + +vec_df_feat = list() +all_text_for_vocab = list() +num_words= 1 + +# More lines when we have 2 columns as the titles can span longer +n_next_lines = 2 +if year >= col2_f3_year: + n_next_lines = 3 + + +for ids in unique_ids: + # Check all ids and pages, and create a parallel csv with full titles. + unique_pages = np.unique(df[df['file_id'] == ids]['page_id']) + input_file = './{}/{}.pdf'.format(year, ids) + doc = defc.Document(input_file, folder_database=folder_databaseAB, flag_type=flag_type_for_doc) + XML_main_corr = doc.load_or_compute_xml() + XML_main_corr = utils_proc.convert_textlines_in_xml_tree(XML_main_corr) + + for page in unique_pages: + + # Reducing df to only those necessary, and iterate through titles + df_red = df[df['file_id'] == ids][df['page_id'] == page] + + xml_page = XML_main_corr[page] + all_textlines = xml_page.findall(".//textline") + all_bboxes = [o.attrib['bbox'] for o in all_textlines] + # NOTE: perhaps, not all the textlines have a bbox. In that case, there will + # a missmatch in the assignment between index all_bboxes and all_textlines + + df_titles = df_red[df_red['confidence'] == 1][df_red['category'] == 'title'] + #ind_val = np.ravel(np.argwhere(ind_indexes <= np.max(df_red.index))) + for c_index_t, index_t in enumerate(df_titles.index): + + if (index_t == 553): + stop = 1 + + for n_next in range(n_next_lines + 1): + index_t_s = index_t + n_next + if c_index_t < (len(df_titles.index) - 1): + + if index_t_s == df_titles.index[c_index_t + 1]: + #print(index_t_s) + break + + if index_t_s <= np.max(df_red.index): + + ind_textl, _ = size_matching_bbox(df_red['bbox'][index_t_s], all_bboxes) + + tot_len = 0 + for textb in range(len(all_textlines[ind_textl])): + tot_len += len(all_textlines[ind_textl][textb].text) + #tot_len = 1 + vec_len = list() + count_len = 0 + + for textb in range(len(all_textlines[ind_textl])): + + text_textb = all_textlines[ind_textl][textb].text + vec_feat = [year, ids, page, index_t_s, ind_textl, textb, + text_textb, + df_red['bbox'][index_t_s], + all_textlines[ind_textl][textb].attrib['font'], + all_textlines[ind_textl][textb].attrib['size'], + float(count_len)/tot_len, float(count_len + len(text_textb))/tot_len, + n_next, float(len(text_textb))/tot_len] + count_len += len(text_textb) + vec_df_feat.append(vec_feat) + all_text_for_vocab.append(' '.join(word_tokenize(text_textb)[:num_words])) + + + + + + #text_line_comp = all_textlines[ind_textl].text + #f.write(text_line_comp + '\n') + +#%% + +df_feat = pd.DataFrame(vec_df_feat, columns = names_col_df_feat) + +# Now one hot encoding for font type +df_feat = pd.concat([df_feat,pd.get_dummies(df_feat['font_type'])], axis = 1) + +# Let's build the vocabulary only from the first X words + +vocab, ocurr = utils_feats.get_vocab(' '.join(all_text_for_vocab), min_ocurr = 5, flag_lower = 1, flag_stopw = 1, n_words = 10, + lang_stopw = ['german','french','italian']) + +#f.close() + + + +#%% +# Finally, function to find the first textblock that is not part of the title +# Some set of rules: +# 1. If we find any of the following words among the first num_words, then it +# ends the title +words_ends = ['prüfung', 'prufung', 'botschaft', 'bötschaft', 'betr', 'betr.', 'berichte', 'bericht', + 'petitionen', 'schreiben', 'bot-', 'bot -', 'mitunterzeichnern', 'be-', 'be -'] +vec = CountVectorizer(vocabulary = words_ends, tokenizer = word_tokenize) +mat_ocurr = vec.fit_transform(all_text_for_vocab).todense() + +df_feat_words_ends = pd.concat([df_feat, pd.DataFrame(mat_ocurr, columns = words_ends)], axis = 1) +# 2. If not, we iterate through the textlines, taking the last textblock +# of the current line, and compare against the previous one (same or different line). +# If it is different in font type, then we mark it as the start +# +# IMPORTANT: for the moment, in order to make things simpler and more straightforward +# I am just going to assume that the first line out of the titles has to be +# times-roman. Will see if this holds in the future +###### +# Things to consider +# * For later year, specially when we have two columns, we should consider more +# textlines. We can just use the delimiters set in the doc class + +''' + +#%% +all_indexes = np.array(df_feat_words_ends[df_feat_words_ends['num_line'] == 0][df_feat_words_ends['textb'] == 0].index) +names_col_titles = ['year','file_id','page_id','index','title_text','in_textl','in_textb','in_index', + 'end_textl','end_textb','end_index','bbox_vec','textl_coverage','bbox_vec_conv'] +# bbox_vec: is a list with all the bboxes that form the title +# textl_coverage is a list with the percentage of the bounding box covered +# by the title + +# Conditions! +def_terms = ['botschaft', 'bötschaft', 'prüfung', 'prufung', 'berichte', 'bericht', 'schreiben','mitunterzeichnern'] +def_terms_regex = ['[v,y][o,ò][m,n][i]?[ .\xad-]{0,10}[0-9]', '[" "]be[\xad,-][ ]{0,10}richt', '[" "]bot[-,\xad][ ]{0,10}schaft[ ]', + '[ ]botschaft[ ]'] +font_notitle = 'Times-Roman' +min_length = 10 # for textblocks to count +min_length_bef = 6 + +# For later years, it should be smaller as the sizes are almost the same +# hence, 1 before 1917 and 0 after +diff_fontsize = 0 # Max font size for the text outside the title (has to be smaller) +if year >= col2_f3_year: + diff_fontsize = 1 + +slack_cond_def_terms = 1 # Simply to allow that, if the condition of ind_min3, i.e. +# a def_terms if fulfilled, and it is not too far from the condition with the min +# index, we overwrite it +#ind_1words = np.ravel(np.argwhere(df_feat_words_ends.columns == words_ends[0])) + +list_vec_titles = list() + +## Checking first def_terms and regex, exclusively. +for count_ind, index_t in enumerate(all_indexes): + + flag_notend = 1 + flag_fulltextblock = 1 + # Corresponding indexes + if count_ind == (len(all_indexes) - 1): + ind_all_el = np.arange(index_t,np.max(df_feat_words_ends.index) + 1) + else: + ind_all_el = np.arange(index_t,all_indexes[count_ind + 1]) + + df_aux = df_feat_words_ends.iloc[ind_all_el] + + # Just the first textline and first block + in_textl = df_feat_words_ends['textl'].iloc[index_t] + in_textb = df_feat_words_ends['textb'].iloc[index_t] + in_index = df_feat_words_ends['index'].iloc[index_t] + + if (df_aux['file_id'].iloc[0] == 110000009) and (df_aux['page_id'].iloc[0] == 1): + stop = 1 + + # Merge all the text and check regular expressions + all_text_check_norm = ' '.join([o.text for ind_r, o in df_aux.iterrows()]) + all_text_check = ' '.join([o.text.lower() for ind_r, o in df_aux.iterrows()]) + all_lengths = np.array([len(o.text) for ind_r, o in df_aux.iterrows()]) + 1 + all_lengths_cum = np.cumsum(all_lengths) + + ind_pot_ind = list() + term_match = list() + for term in def_terms: + ind_f = all_text_check.find(' ' + term) + if ind_f > -1: + text_bef_regex = ''.join(regex.findall(r'[a-zA-Z]',all_text_check[:ind_f])) + if len(text_bef_regex) >= min_length_bef: + ind_pot_ind.append(ind_f + 1) # +1 for the space before the word + term_match.append(term) + for term_re in def_terms_regex: + res_regex = regex.findall(term_re, all_text_check) + if len(res_regex): + ind_f = all_text_check.find(res_regex[0]) + text_bef_regex = ''.join(regex.findall(r'[a-zA-Z0-9()]',all_text_check[:ind_f])) + if len(text_bef_regex) >= min_length_bef: + if res_regex[0][0] == ' ': + ind_f += 1 + ind_pot_ind.append(ind_f) + term_match.append(res_regex[0]) + + # Now, locate the textblock where we found the term + ind_pot_ind = np.array(ind_pot_ind) + term_match = np.array(term_match) + if len(ind_pot_ind): + flag_notend = 0 + term_match = term_match[np.argmin(ind_pot_ind)] + ind_min_aux = np.min(np.ravel(np.argwhere(all_lengths_cum > np.min(ind_pot_ind)))) + + if ind_min_aux == 0: + percent_textb = float(np.min(ind_pot_ind))/all_lengths[ind_min_aux] + else: + percent_textb = float(np.min(ind_pot_ind) - all_lengths_cum[ind_min_aux - 1])/all_lengths[ind_min_aux] + + if percent_textb < 0.1: + # We take the full line + flag_fulltextblock = 1 + ind_min = ind_min_aux + else: + # We split the line + ind_min = ind_min_aux + 1 + flag_fulltextblock = 0 + val_coverage = df_aux['in_char'].iloc[ind_min_aux] + (percent_textb * + (df_aux['end_char'].iloc[ind_min_aux] - df_aux['in_char'].iloc[ind_min_aux])) + #val_coverage = df_aux['in_char'].iloc[ind_min_aux] + (float(ind_find_textb)/len(df_aux['text'].iloc[ind_min_aux])* + # (df_aux['end_char'].iloc[ind_min_aux] - df_aux['in_char'].iloc[ind_min_aux])) + text_title = all_text_check_norm[:np.min(ind_pot_ind)] + + # Now condition for abrupt jump in first line + # At the level of the textblock + if flag_notend: + ind_next_end = np.ravel(np.argwhere(df_aux['num_line'] == 0))[-1] # Only checking 1st line + if df_aux.shape[0] > (ind_next_end + 1): + coord = pt.coord_string_to_array(df_aux['bbox'].iloc[ind_next_end]) + coord_next = pt.coord_string_to_array(df_aux['bbox'].iloc[ind_next_end + 1]) + last_coord = coord[-2] + last_coord_next = coord_next[-2] + height_line = float(np.abs(coord[3] - coord[1])) + + if ((last_coord_next > (last_coord + np.max([last_coord, last_coord_next])/10)) and + (coord_next[3] < (coord[3] - height_line/2))): + ind_min = ind_next_end + 1 + flag_notend = 0 + + # Finally, condition for change of font size, font type, etc. + # At the level of the textblock + if flag_notend: + + # Checking second condition + # The font has to be different and also the textblock as to covered until + # the end of the line. These are the conditions + ind_min = -1 + # Assuming the first line outside the title has to be times-roman + ind_timesR = np.ravel(np.argwhere(np.array(df_aux[font_notitle]) == 1)) + ind_ending_textb = np.setdiff1d(ind_timesR,0) + # And here we check the parts in times roman, comparing to previous, and + # next one + all_fontsizes = np.array(df_aux['font_size']).astype(float) + for ind_e, ind_e_textb in enumerate(ind_ending_textb): + if ((df_aux['font_type'].iloc[ind_e_textb] != df_aux['font_type'].iloc[ind_e_textb - 1]) + and (df_aux['font_type'].iloc[ind_e_textb].find('Bold') == -1) + and (len(df_aux['text'].iloc[ind_e_textb].replace(' ','')) > min_length) + and ((np.max(all_fontsizes) - float(df_aux['font_size'].iloc[ind_e_textb])) >= diff_fontsize)): + + if ind_e_textb == np.max(ind_ending_textb): + ind_min = ind_e_textb + break + else: + # Also similar to next one, that should be already times-roman + + # This condition is not avoid short things such as N, III, or sometimes numbers + # that are recognized in bold, and to ensure that after this piece of text + # we have a long times-roman text + len_rest = np.array([len(o) for o in np.array(df_aux['text'].iloc[ind_ending_textb])])[(ind_e + 1):] + # And to avoid short texts before + text_bef = ''.join(df_aux['text'].iloc[:ind_e_textb]) + text_bef_regex = ''.join(regex.findall(r'[a-zA-Z]',text_bef)) + if len(np.argwhere(len_rest > min_length)) and (len(text_bef_regex) > min_length_bef): + ind_min = ind_e_textb + + if ind_min >= 0: + break + + # We are forcing to take times roman + if ind_min < 0 and len(ind_ending_textb): + ind_min = np.max(ind_ending_textb) + + if (ind_min > df_aux.shape[0]) and (ind_min != 1000): + ind_min = df_aux.shape[0] - 1 + + # To finally merge with some small part in the beginning of the sentence + if (ind_min != 1000): + if (df_aux['length'].iloc[ind_min - 1] < 0.1) and (df_aux['num_line'].iloc[ind_min - 1] > 0): # Less than 10% of the textline length + ind_min -= 1 + + if (ind_min == 1000) or (ind_min == -1): + ind_min = df_aux.shape[0] + + end_textl = df_aux['textl'].iloc[ind_min - 1] + end_textb =df_aux['textb'].iloc[ind_min - 1] + end_index = df_aux['index'].iloc[ind_min - 1] + + if flag_fulltextblock: + text_title = ' '.join(df_aux['text'].iloc[:ind_min]) + val_coverage = df_aux['end_char'].iloc[ind_min - 1] + + # common for all cases + bbox_vec = list() + textl_coverage = list() + bbox_vec_conv = list() + for index_bb in np.arange(in_index, end_index + 1): + if index_bb < end_index: + textl_coverage.append(1) + bbox_vec_conv.append(df_aux[df_aux['index'] == index_bb]['bbox'].iloc[0]) + bbox_vec.append(df_aux[df_aux['index'] == index_bb]['bbox'].iloc[0]) + + textl_coverage.append(val_coverage) + arr_coord = pt.coord_string_to_array(bbox_vec[-1]) + arr_coord[2] = arr_coord[0] + (arr_coord[2] - arr_coord[0]) * val_coverage + bbox_vec_conv.append(','.join(arr_coord.astype(str))) + # Stack all features + vec_new_el = [df_aux['year'].iloc[0], df_aux['file_id'].iloc[0], df_aux['page_id'].iloc[0], + df_aux['index'].iloc[0], text_title, in_textl, in_textb, in_index, end_textl, end_textb, + end_index, bbox_vec, textl_coverage, bbox_vec_conv] + + list_vec_titles.append(vec_new_el) + + +df_titles_prev = df[df['confidence'] == 1][df['category'] == 'title'] +df_titles_def = pd.DataFrame(list_vec_titles, columns = names_col_titles) + +#%% +# Saving figures +flag_text = 1 +resolution = 200 +if flag_save_fig: + path_output_img = os.path.join(data_folder, '{}_previews'.format(year)) + if not os.path.exists(path_output_img): + os.makedirs(path_output_img) + + unique_ids = np.unique(df_titles_def['file_id']) + for ids in unique_ids: + # Check all ids and pages, and create a parallel csv with full titles. + unique_pages = np.unique(df_titles_def[df_titles_def['file_id'] == ids]['page_id']) + input_file = './{}/{}.pdf'.format(year, ids) + doc = defc.Document(input_file, folder_database=folder_databaseAB, flag_type=flag_type_for_doc) + XML_main = doc.xml + doc.pdf2imgobj(resolution = resolution) + for page in unique_pages: + # Reducing df to only those necessary, and iterate through titles + df_titles_def_red = df_titles_def[(df_titles_def['file_id'] == ids) & (df_titles_def['page_id'] == page)] + + # Obtain vector of bboxes + list_bbox = np.concatenate([row['bbox_vec_conv'] for o, row in df_titles_def_red.iterrows()]) + list_bbox = np.array([o.split(',') for o in list_bbox]).astype(float) + img = np.array(doc.imgobj[page]) + dim_bbox_page = np.array(XML_main[page].attrib['bbox'].split(',')).astype(np.float64) + + # Getting the image plotted + #img_arr = pt.highlight_text(img, list_bbox, dim_bbox_page, color_vec = 'red', filled = False, thick_line = 3) + list_bbox_t, _ = pt.adapt_coordtoimg(img, np.transpose(list_bbox), dim_bbox_page) + img_arr = pt.plot_bboxes_rainbow(img, list_bbox_t, n_colors = list_bbox_t.shape[1]) + fig, axes = plt.subplots(1, 1, figsize=(16, 16)) + axes.axis('off') + axes.imshow(img_arr) + + if flag_text: + for ind, row in df_titles_def_red.iterrows(): + text_t = row['title_text'] + bbox_v = pt.coord_string_to_array(row['bbox_vec'][0]) + bbox_t = np.ravel(pt.adapt_coordtoimg(img, bbox_v, dim_bbox_page)[0]) + axes.text(bbox_t[1] + 10, bbox_t[0] - 10, text_t, fontsize = 10, color = 'red') + + format_fig = 'png' + name_fig = '{}/{}_page{}.{}'.format(path_output_img,doc.name_wo_ext,page,format_fig) + fig.savefig(name_fig, format = format_fig, dpi = resolution) + plt.close(fig) \ No newline at end of file diff --git a/src/python/run_titles_completion.py b/src/python/run_titles_completion.py index 34c46f9b26a80666664197be54f5097ae92e812a..fa9037b8b8c0c4e1df54a2471cca42be4ee5020f 100644 --- a/src/python/run_titles_completion.py +++ b/src/python/run_titles_completion.py @@ -41,7 +41,8 @@ def size_matching_bbox(bbox, list_bbox): return np.argmin(dist_vec), np.min(dist_vec) def remove_space_char(text): - all_matches = regex.findall(r'(?=(\w \w))',text_regex) + all_matches = regex.findall(r'(?=([a-z] [a-z]))',text) + #all_matches = regex.findall(r'(?=(\w \w))',text) for match in all_matches: text = regex.sub(match,match.replace(' ',''),text) return text @@ -52,7 +53,7 @@ data_folder = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data folder_databaseAB = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data/AB_other/SessionOverviews_tar/' col2_f3_year = 1917 -year = 1922 # 1893 +year = 1940 # 1893 flag_type_for_doc = 3 flag_save_fig = 1 @@ -67,6 +68,7 @@ names_col_df_feat = ['year','file_id','page_id','index','textl','textb','text',' #%% ''' + unique_ids = np.unique(df['file_id']) #unique_ids = [unique_ids[0]] @@ -164,7 +166,7 @@ vocab, ocurr = utils_feats.get_vocab(' '.join(all_text_for_vocab), min_ocurr = 5 #f.close() -''' + #%% # Finally, function to find the first textblock that is not part of the title # Some set of rules: @@ -188,6 +190,8 @@ df_feat_words_ends = pd.concat([df_feat, pd.DataFrame(mat_ocurr, columns = words # * For later year, specially when we have two columns, we should consider more # textlines. We can just use the delimiters set in the doc class +''' + #%% all_indexes = np.array(df_feat_words_ends[df_feat_words_ends['num_line'] == 0][df_feat_words_ends['textb'] == 0].index) names_col_titles = ['year','file_id','page_id','index','title_text','in_textl','in_textb','in_index', @@ -198,10 +202,10 @@ names_col_titles = ['year','file_id','page_id','index','title_text','in_textl',' # Conditions! def_terms = ['botschaft', 'bötschaft', 'prüfung', 'prufung', 'berichte', 'bericht', 'schreiben','mitunterzeichnern'] -def_terms_regex = ['vom [1-9]', '[" "]be[" ",-,\xad]', '[" "]bot[" ",-,\xad]'] +def_terms_regex = ['vom *[0-9]', '[" "]be[" ",-,\xad]', '[" "]?^?bot[$," ",-,\xad]?$?','bot'] font_notitle = 'Times-Roman' min_length = 10 # for textblocks to count -min_length_bef = 14 +min_length_bef = 6 diff_fontsize = 1 # Max font size for the text outside the title (has to be smaller) slack_cond_def_terms = 1 # Simply to allow that, if the condition of ind_min3, i.e. # a def_terms if fulfilled, and it is not too far from the condition with the min @@ -220,7 +224,7 @@ for count_ind, index_t in enumerate(all_indexes): df_aux = df_feat_words_ends.iloc[ind_all_el] - if (df_aux['file_id'].iloc[0] == 110000459) and (df_aux['page_id'].iloc[0] == 20): + if (df_aux['file_id'].iloc[0] == 110000771) and (df_aux['page_id'].iloc[0] == 9): stop = 1 # Just the first textline and first block @@ -241,7 +245,9 @@ for count_ind, index_t in enumerate(all_indexes): if (n_line == 0) and len(ind_sum_words_def): for ind_s_def in ind_sum_words_def: text_bef = ''.join(df_aux['text'].iloc[:int(ind_s_def)]) - if len(text_bef) > min_length_bef: + # Regex to consider only character + text_bef_regex = ''.join(regex.findall(r'[a-zA-Z]',text_bef)) + if len(text_bef_regex) >= min_length_bef: flag_valid = 1 break if flag_valid: @@ -264,7 +270,8 @@ for count_ind, index_t in enumerate(all_indexes): # Checking that we are not taking a line too early if (n_line == 0) and (ind_regex > -1): text_bef = ''.join(df_aux['text'].iloc[:ind_regex]).replace(' ','') - if len(text_bef) < min_length_bef: + text_bef_regex = ''.join(regex.findall(r'[a-zA-Z]',text_bef)) + if len(text_bef_regex) < min_length_bef: ind_regex = -1 # 2nd hard: line much shorter than next. This is to indicate and abrupt jump. @@ -314,26 +321,30 @@ for count_ind, index_t in enumerate(all_indexes): ind_min3 = 1000 ind_find = -1 for ind_r in ind_num_line: - text_block = remove_space_char(df_aux['text'].iloc[ind_r].lower()) + #text_block = remove_space_char(df_aux['text'].iloc[ind_r].lower()) + text_block = df_aux['text'].iloc[ind_r].lower() for term_def in def_terms: if term_def in word_tokenize(text_block)[num_words:]: ind_find = df_aux['text'].iloc[ind_r].lower().find(term_def) text_title = text_title3 + ' ' + df_aux['text'].iloc[ind_r][:ind_find] # To avoid too short titles - if len(text_title.replace(' ','')) > min_length_bef: + if len(text_title.replace(' ','')) > min_length: flag_fulltextblock = 0 break if not ind_find > -1: for ireg in def_terms_regex: - text_regex = ' '.join(word_tokenize(text_block)[num_words:]) - res_regex = regex.findall(ireg, text_regex) - if res_regex: - ind_find = df_aux['text'].iloc[ind_r].lower().find(res_regex[0]) - text_title = text_title3 + ' ' + df_aux['text'].iloc[ind_r][:ind_find] - # To avoid too short titles - if len(text_title.replace(' ','')) > min_length_bef: - flag_fulltextblock = 0 - break + tokenize_tb = word_tokenize(text_block) + if len(tokenize_tb) > num_words: + index_start = text_block.find(tokenize_tb[num_words]) + #text_regex = ' '.join(word_tokenize(text_block)[num_words]) + res_regex = regex.findall(ireg, text_block[index_start:]) + if res_regex: + ind_find = df_aux['text'].iloc[ind_r].lower().find(res_regex[0]) + text_title = text_title3 + ' ' + df_aux['text'].iloc[ind_r][:ind_find] + # To avoid too short titles + if len(text_title.replace(' ','')) > min_length: + flag_fulltextblock = 0 + break if not flag_fulltextblock: val_coverage = df_aux['in_char'].iloc[ind_r] + (float(ind_find)/len(df_aux['text'].iloc[ind_r])* @@ -396,7 +407,7 @@ for count_ind, index_t in enumerate(all_indexes): else: ind_imin = np.argmin([ind_min1, ind_min2, ind_min3]) if ind_imin == 2: - ind_imin = ind_min3 + ind_min = ind_min3 else: ind_min = np.min([ind_min1, ind_min2]) if (ind_min + slack_cond_def_terms) >= ind_min3: @@ -411,12 +422,13 @@ for count_ind, index_t in enumerate(all_indexes): if (ind_min != 1000): if (df_aux['length'].iloc[ind_min - 1] < 0.1) and (df_aux['num_line'].iloc[ind_min - 1] > 0): # Less than 10% of the textline length ind_min -= 1 - + # Checking that we are not taking a line too early. Therefore, we just # require for some minimum length of text for the case of the first line if (n_line == 0) and (ind_min > -1): text_bef = ''.join(df_aux['text'].iloc[:ind_min]) - if len(text_bef) < min_length_bef: + text_bef_regex = ''.join(regex.findall(r'[a-zA-Z]',text_bef)) + if len(text_bef_regex) < min_length_bef: ind_min = 1000 @@ -459,7 +471,7 @@ df_titles_def = pd.DataFrame(list_vec_titles, columns = names_col_titles) #%% # Saving figures flag_text = 1 -resolution = 100 +resolution = 200 if flag_save_fig: path_output_img = os.path.join(data_folder, '{}_previews'.format(year)) if not os.path.exists(path_output_img): diff --git a/src/python/test_correct.py b/src/python/test_correct.py index 1da770adda994d958719c88de9396e265ecbe454..23d0aa22fae351c4585896ac484eb3d6a7a59eda 100644 --- a/src/python/test_correct.py +++ b/src/python/test_correct.py @@ -1,19 +1,22 @@ import os os.environ['DEMOCRASCI_DATA'] = "/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/" +import numpy as np +import matplotlib.pyplot as plt +import plot_tools as pt import def_classes as defc import utils_proc year = 1982 -year = 1922 +year = 1950 folder_database = '../../data/AB_other/SessionOverviews_tar/' folder_database = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data/AB_other/SessionOverviews_tar' #%% iddoc = '110001467' -iddoc = '110000455' +iddoc = '110000959' input_file = "./{}/{}.pdf".format(year, iddoc) doc = defc.Document(input_file, folder_database, flag_type = 3) @@ -42,3 +45,45 @@ for l_d in list_docs: doc.correct_xml(flag_plots = 1, flag_parallel = 0, flag_save_figs = 1, pages = 'all', suffix_xml = '_data', name_outxml = '02_extractedxml', name_outcorrxml = '04_correctedxml', flag_save = 1) + +#%% +# Visualize how the page look directly extracted from the original xml, +# without any ordering + +import xml.etree.ElementTree as ET + +year = 1940 +iddoc = '110000779' + +input_file = "./{}/{}.pdf".format(year, iddoc) + +doc = defc.Document(input_file, folder_database, flag_type = 3) +doc.pdf2imgobj() + +name_outxml = '02_extractedxml' + +name_xml = './' + str(year) + '/' + str(iddoc) + '_data.xml' +if name_xml in utils_proc.get_contained_file_names(year, folder_database, name_outxml): + h_xml = utils_proc.get_handlerfile(name_xml, doc.folder_database, name_outxml) + XML_tree = ET.parse(h_xml) + XML_main = XML_tree.getroot() + +#%% + +for page in range(len(doc.imgobj)): +#for page in range(2): + + # Obtain vector of bboxes + list_bbox = XML_main[page].findall(".//textline") + list_bbox = np.array([o.attrib['bbox'].split(',') for o in list_bbox]).astype(float) + img = np.array(doc.imgobj[page]) + dim_bbox_page = np.array(XML_main[page].attrib['bbox'].split(',')).astype(np.float64) + + # Getting the image plotted + #img_arr = pt.highlight_text(img, list_bbox, dim_bbox_page, color_vec = 'red', filled = False, thick_line = 3) + list_bbox_t, _ = pt.adapt_coordtoimg(img, np.transpose(list_bbox), dim_bbox_page) + img_arr = pt.plot_bboxes_rainbow(img, list_bbox_t, n_colors = list_bbox_t.shape[1]) + fig, axes = plt.subplots(1, 1, figsize=(16, 16)) + axes.axis('off') + axes.imshow(img_arr) + diff --git a/src/python/test_titles.txt b/src/python/test_titles.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4c824d13cb5d2821a8fbda05197af9d9eec725b --- /dev/null +++ b/src/python/test_titles.txt @@ -0,0 +1,72 @@ +[font face="Times-Bold" size="10.500"]1. Wahlaktenpriifung. [/font][font face="Times-Roman" size="8.500"]Prüfung der Wahlakten neuer Mitglieder. [/font] +[font face="Times-Bold" size="10.500"]2 Bureaux-Neubestellung. [/font][font face="Times-Roman" size="8.500"]Wahl des Bureau des Nationalrates und desjenigen des Ständerates. [/font] +[font face="Times-Bold" size="10.500"]3. Wahl der Büdget-Commissionen [/font][font face="Times-Roman" size="8.500"]des Nationalrates und des Ständerates für das Budget von 1894. [/font] +[font face="Times-Bold" size="10.500"]4. Geschäftsbericht und Staatsrechnung für das Jahr 1892. [/font] +[font face="Times-Roman" size="9.500"]5. [/font][font face="Times-Bold" size="10.500"]^Vertretung der Schweiz im Auslande. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 19. Mai 1893 (Bundesblatt III. 69), betr [/font] +[font face="Times-Bold" size="10.500"]7 [/font][font face="Times-Roman" size="9.500"](65). [/font][font face="Times-Italic" size="8.000"]8 [/font][font face="Times-Bold" size="10.500"]Schulwandkarte der Schweiz. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 20. März 1893 (Bundesblatt I. 1019) betr Er [/font] +[font face="Times-Bold" size="10.500"]9 [/font][font face="Times-Roman" size="9.500"](7). [/font][font face="Times-Bold" size="10.500"]S Polytechnikum. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 27. Januar 1893 (Bundesblatt I. 353), betr. Erhöhung des Jahres[/font] +[font face="Times-Bold" size="10.500"].0(8). [/font][font face="Times-BoldItalic" size="8.500"]S [/font][font face="Times-Bold" size="10.500"]Nationalbibliothek. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom [/font][font face="Times-Bold" size="10.500"]8. [/font][font face="Times-Roman" size="9.500"]März [/font][font face="Times-Bold" size="10.500"]1893 [/font][font face="Times-Roman" size="9.500"](Bundesblatt I. [/font][font face="Times-Bold" size="10.500"]1000), [/font][font face="Times-Roman" size="9.500"]betr. die Gründuno einer [/font] +[font face="Times-Bold" size="10.500"]11. S Cholera-TJebereinkunft. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 16. Mai 1893 (Bundesblatt III. 159), betr. Ratification der [/font] +[font face="Times-Bold" size="10.500"]12. s Gesundheitsamt beim Departement des Innern. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 19. Mai 1893 (Bundesblatt [/font] +[font face="Times-Roman" size="9.500"]IB [/font][font face="Times-Bold" size="10.500"](9). [/font][font face="Times-Italic" size="8.000"]fi [/font][font face="Times-Bold" size="10.500"]Parlamentsgebäude. [/font][font face="Times-Roman" size="9.500"]Botschaft vom [/font][font face="Times-Bold" size="10.500"]2. [/font][font face="Times-Roman" size="9.500"]Juni [/font][font face="Times-Bold" size="10.500"]1892 [/font][font face="Times-Roman" size="9.500"](Bundesblatt III. [/font][font face="Times-Bold" size="10.500"]572), [/font][font face="Times-Roman" size="9.500"]betr. Erwerbung der Casinoliegenschaft in Bern [/font] +[font face="Times-Bold" size="10.500"]14(11). S Archivgebäude. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom [/font][font face="Times-Bold" size="10.500"]8. [/font][font face="Times-Roman" size="9.500"]Dezember [/font][font face="Times-Bold" size="10.500"]1892 [/font][font face="Times-Roman" size="9.500"](Bandesblatt V. [/font][font face="Times-Bold" size="10.500"]764), [/font][font face="Times-Roman" size="9.500"]betr. Bewilligung des [/font] +[font face="Times-Bold" size="10.500"]15(12). [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.500"]Schallenberg strasse. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom [/font][font face="Times-Bold" size="10.500"]9. [/font][font face="Times-Roman" size="9.500"]September [/font][font face="Times-Bold" size="10.500"]1892 [/font][font face="Times-Roman" size="9.500"](Bundesblatt IV. [/font][font face="Times-Bold" size="10.500"]441), [/font][font face="Times-Roman" size="9.500"]betr. Bewilligung [/font] +[font face="Times-Bold" size="10.000"]16. [/font][font face="Times-Italic" size="8.000"]n [/font][font face="Times-Bold" size="10.500"]Schangnau—Wiggen-Strasse. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom [/font][font face="Times-Bold" size="10.500"]19. [/font][font face="Times-Roman" size="9.500"]Mai [/font][font face="Times-Bold" size="10.500"]1893 [/font][font face="Times-Roman" size="9.500"](Bundesblatt III. [/font][font face="Times-Bold" size="10.500"]11), [/font][font face="Times-Roman" size="9.500"]betr. Bewilligung [/font] +[font face="Times-Bold" size="10.000"]17. [/font][font face="Times-BoldItalic" size="8.500"]S [/font][font face="Times-Bold" size="10.500"]Sisselnbach-Correction. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom [/font][font face="Times-Bold" size="10.500"]18, [/font][font face="Times-Roman" size="9.500"]April [/font][font face="Times-Bold" size="10.500"]1893 [/font][font face="Times-Roman" size="9.500"](Bundesblatt II. [/font][font face="Times-Bold" size="10.500"]739), [/font][font face="Times-Roman" size="9.500"]betr. Zusicherung einer [/font] +[font face="Times-Bold" size="10.000"]18. [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.500"]Rheinregulierung. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom [/font][font face="Times-Bold" size="10.500"]26. [/font][font face="Times-Roman" size="9.500"]Mai [/font][font face="Times-Bold" size="10.500"]1893 [/font][font face="Times-Roman" size="9.500"](Bandesblatt III. [/font][font face="Times-Bold" size="10.500"]101), [/font][font face="Times-Roman" size="9.500"]betr. Ratification des Staats [/font] +[font face="Times-Bold" size="10.000"]19[/font][font face="Times-Bold" size="10.500"](5). [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.500"]Initiativbegehren betr. das Verbot des Schlachtens ohne vorherige Betäubung. [/font][font face="Times-Roman" size="9.500"]Wortlaut des Begehrens: [/font] +[font face="Times-Bold" size="10.500"]762). [/font][font face="Times-Roman" size="9.500"]— Zu vergi, auch Bundesblatt IV. [/font][font face="Times-Bold" size="10.500"]339 [/font][font face="Times-Roman" size="9.500"]und [/font][font face="Times-Bold" size="10.500"]477. [/font] +[font face="Times-Bold" size="10.000"]20 [/font][font face="Times-Bold" size="10.500"](14). [/font][font face="Times-Italic" size="8.000"]n [/font][font face="Times-Bold" size="10.000"]Politische Rechte. [/font][font face="Times-Roman" size="9.500"]Botschaft und Gesetzes-Entwurf vom [/font][font face="Times-Bold" size="10.500"]2. [/font][font face="Times-Roman" size="9.500"]Juni [/font][font face="Times-Bold" size="10.500"]1882 [/font][font face="Times-Roman" size="9.500"](Bundesblatt III. [/font][font face="Times-Bold" size="10.500"]1), [/font][font face="Times-Roman" size="9.500"]betretend die politischen Rechte [/font] +[font face="Times-Bold" size="10.000"]21 [/font][font face="Times-Roman" size="9.500"](15). [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Schweizerische Eisenbahnrente. [/font][font face="Times-Roman" size="9.500"]Bericht des Bundesrates vom 23. October 1891 (Bundesblatt IV. 620) über die Frage der [/font] +[font face="Times-Bold" size="10.000"]22[/font][font face="Times-Roman" size="9.500"](16). [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.000"]Tessiner Interventionskosten. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 15. Dezember 1891 (Bundesblatt V. 785), betr. [/font] +[font face="Times-Roman" size="9.500"]23(18). [/font][font face="Times-Bold" size="10.000"]s Recurs Käslin. [/font][font face="Times-Roman" size="9.500"]Recurs der Familie Käslin «zum Freienhof» in Stansstad gegen den Bundesratsbeschluss vom 6. August 1891 [/font] +[font face="Times-Bold" size="10.000"]24 a [/font][font face="Times-Roman" size="9.500"](20). [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.000"]Maifeier 1892. [/font][font face="Times-Roman" size="9.500"]Petitionen der [/font][font face="Times-Roman" size="8.500"]Schweiz. [/font][font face="Times-Roman" size="9.500"]Volksversammlungen vom 1. Mai 1892, sowie einer Volksversammlung in Siebnen [/font] +[font face="Times-Bold" size="10.000"]24 b. n Maifeier 1893. [/font][font face="Times-Roman" size="9.500"]Petitionen der schweizerischen Volksversammlungen vom 1. Mai 1893 betreffend Verkürzung der Arbeitszeit, [/font] +[font face="Times-Bold" size="10.000"]25 [/font][font face="Times-Roman" size="9.500"](59). [/font][font face="Times-BoldItalic" size="8.500"]S [/font][font face="Times-Bold" size="10.000"]Recurs Krummenacher. [/font][font face="Times-Roman" size="9.500"]Recurs des Ludwig Krummenacher «zum Hirschen» in Sarnen gegen' den Bundesratsbeschluss [/font] +[font face="Times-Bold" size="10.000"]26 [/font][font face="Times-Roman" size="9.500"](66). [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.000"]Beschwerde Lœw. [/font][font face="Times-Roman" size="9.500"]Bericht des Bundesrates yom 19. Mai 1893 (Bundesblatt III. 17), betr. die Beschwerde des Soldaten [/font] +[font face="Times-Bold" size="10.000"]27 [/font][font face="Times-Roman" size="9.500"](67). [/font][font face="Times-BoldItalic" size="8.500"]S [/font][font face="Times-Bold" size="10.000"]Recurs Bérard. [/font][font face="Times-Roman" size="9.500"]Recurs des Pacifique Bérard in Genf gegen zwei Bundesratsbeschlüsse vom 2. September und 22. November [/font] +[font face="Times-Bold" size="10.000"]28 [/font][font face="Times-Roman" size="9.500"](30). [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Besoldungsgesetz für die Beamten des Militärdepartements. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 21. No [/font] +[font face="Times-Bold" size="10.000"]29 a [/font][font face="Times-Roman" size="9.500"](31). [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Eingabe von Pferdezüchtern der romanischen Schweiz [/font][font face="Times-Roman" size="9.500"]betr. Ankauf von Artillerie und Cavalleriepferden im [/font] +[font face="Times-Bold" size="10.500"]b [/font][font face="Times-Roman" size="9.500"](32). [/font][font face="Times-Italic" size="8.000"]8 [/font][font face="Times-Bold" size="10.500"]Einmietung von Artillerie-Zugpferden. [/font][font face="Times-Roman" size="9.500"]Eingabe des Vereins ostschweizerischer Pferdezüchter und Pferdeliebhaber [/font] +[font face="Times-Bold" size="10.000"]80 [/font][font face="Times-Roman" size="9.500"](33). [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Landsturm. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 12. Mai 1893 (Bundesblatt II. 1014), betr. die Bewaffnung, die Aus [/font] +[font face="Times-Bold" size="10.000"]31 [/font][font face="Times-Roman" size="9.500"](68). [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Militärische Telegraphen und Telephonlinien. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 27. März 1893 (Bundes [/font] +[font face="Times-Bold" size="10.000"]32. [/font][font face="Times-Italic" size="8.000"]tl [/font][font face="Times-Bold" size="10.500"]Kriegsmaterialanschaffungen, [/font][font face="Times-Roman" size="9.500"]Büdget für 1894. Botschaft und Beschlusses-Entwurf vom 29. Mai 1893 (B.-Bl. [/font][font face="Times-Bold" size="10.000"]III. [/font][font face="Times-Roman" size="9.500"]191), [/font] +[font face="Times-Bold" size="10.000"]33. [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Entschädigung für Recru tenausrüstung pro 1894. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 30. Mai 1893 (Bundes [/font] +[font face="Times-Bold" size="10.000"]34. [/font][font face="Times-BoldItalic" size="8.500"]S [/font][font face="Times-Bold" size="10.000"]Militärgeleiseanlage bei Göschenen. [/font][font face="Times-Roman" size="9.500"]Botschaft nnd Beschlusses-Entwurf vom 23. Mai 1893 (Bundesblatt III. 21), betr. [/font] +[font face="Times-Bold" size="10.000"]35. S Notportionen und Notrationen. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 29. Mai 1893 (Bundesblatt III. 105), betr. Verab [/font] +[font face="Times-Bold" size="10.000"]36. [/font][font face="Times-BoldItalic" size="8.500"]S [/font][font face="Times-Bold" size="10.000"]Verpflegungs und Magazinbureau. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 30. Mai 1893 (Bundesblatt III. 115), betr. [/font] +[font face="Times-Bold" size="10.000"]37. [/font][font face="Times-BoldItalic" size="8.500"]S [/font][font face="Times-Bold" size="10.000"]Recurs Römer und Carrard. [/font][font face="Times-Roman" size="9.500"]Bericht des Bundesrates vom 30. Mai 1893 (Bundesblatt III. 126) über den Recurs der HH. Römer [/font] +[font face="Times-Bold" size="10.000"]38. [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Artillerie-Versuchsstation in Thun. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 30. Mai 1893 (Bundesblatt III. 119), betr. [/font] +[font face="Times-Bold" size="10.000"]39. [/font][font face="Times-Roman" size="9.500"]S [/font][font face="Times-Bold" size="10.000"]Nachtragscredite (I. [/font][font face="Times-Roman" size="9.500"]Serie). Botschaft und Beschlusses-Entwurf vom 2. Juni 1893 (Bundesblatt [/font][font face="Times-Bold" size="10.000"]III. [/font][font face="Times-Roman" size="9.500"]212), betr. Bewilligung [/font] +[font face="Times-Bold" size="10.000"]40 [/font][font face="Times-Roman" size="9.500"](37). [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.000"]Zollgesetz, Revision. [/font][font face="Times-Roman" size="9.500"]Botschaft und Gesetzes-Entwurf vom 30. Mai 1892 (Bundesblatt III. 410), betr. Revision des Bundes [/font] +[font face="Times-Bold" size="10.000"]41. [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.000"]Zuckerrückzoll. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 26. Mai 1893 (Bundesblatt III. 87), betreffend die fernere Gewährung [/font] +[font face="Times-Bold" size="10.000"]42 [/font][font face="Times-Roman" size="9.500"](38). [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.000"]Arbeit in den Fabriken. [/font][font face="Times-Roman" size="9.500"]Bericht des Bundesrates vom 3. Juni 1891 (Bundesblatt III. 194), betr. vier Beschlüsse der [/font] +[font face="Times-Bold" size="10.000"]43 [/font][font face="Times-Roman" size="9.500"](39). [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Zündhölzchenmonopol. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 20. November 1891 (Bundesblatt V. 413) , betr. Ein [/font] +[font face="Times-Bold" size="10.000"]44[/font][font face="Times-Roman" size="9.500"](40). [/font][font face="Times-BoldItalic" size="8.500"]S [/font][font face="Times-Bold" size="10.000"]Gewerbe-Gesetzgebung. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 25. November 1892 (Bundesblatt V. 366), betr. Ein [/font] +[font face="Times-Bold" size="10.000"]45 [/font][font face="Times-Roman" size="9.500"](41). [/font][font face="Times-BoldItalic" size="8.500"]n [/font][font face="Times-Bold" size="10.000"]Förderung der Landwirtschaft durch den Bund. [/font][font face="Times-Roman" size="9.500"]Botschaft und Gesetzes-Entwurf vom 28. November 1892 [/font] +[font face="Times-Bold" size="10.000"]46 [/font][font face="Times-Roman" size="9.500"](42). [/font][font face="Times-Bold" size="10.000"]Eisenbahngeschäfte. [/font][font face="Times-Roman" size="9.500"](Priorität beim Ständerat.) [/font] +[font face="Times-Bold" size="10.000"]47 [/font][font face="Times-Roman" size="9.500"](43). [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Stundenzonenzeit. [/font][font face="Times-Roman" size="9.500"]Bericht des Bundesrates vom [/font][font face="Times-Bold" size="10.000"]17. [/font][font face="Times-Roman" size="9.500"]Juni 1892 (Bundesblatt III. 1034), betr. die Einführung der Stunden [/font] +[font face="Times-Bold" size="10.000"]48 [/font][font face="Times-Roman" size="9.500"](45). [/font][font face="Times-Italic" size="8.000"]n [/font][font face="Times-Bold" size="10.000"]Motionen Comtesse und Curti. [/font][font face="Times-Roman" size="9.500"]Bericht des Bundesrates vom 7. Juni 1892 (Bundesblatt III. 473) zu den Motionen [/font] +[font face="Times-Bold" size="10.000"]49[/font][font face="Times-Roman" size="9.500"](48). [/font][font face="Times-Italic" size="8.000"]n [/font][font face="Times-Bold" size="10.000"]Postregalgesetz. [/font][font face="Times-Roman" size="9.500"]Botschaft und Gesetzes-Entwurf vom 14. Januar 1893 (Bundesblatt I. 77) zu einem neuen Bundesgesetze [/font] +[font face="Times-Bold" size="10.000"]50 [/font][font face="Times-Roman" size="9.500"](49). [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Telephonverbindung zwischen Tessin und der innerschweiz. [/font][font face="Times-Roman" size="9.500"]Bericht des Bundesrates vom [/font][font face="Times-Bold" size="10.000"]24. [/font][font face="Times-Roman" size="9.500"]Januar 1893 [/font] +[font face="Times-Bold" size="10.000"]51. [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.000"]Telephongebühren. [/font][font face="Times-Roman" size="9.500"]Botschaft und Gesetzes-Entwurf vom 15. November 1892 (Bundesblatt V. 313), betr. Ermässigung der [/font] +[font face="Times-Bold" size="10.000"]52 [/font][font face="Times-Roman" size="9.500"](50) [/font][font face="Times-Bold" size="10.000"]Motion von Hrn. Nationalrat Brunner [/font][font face="Times-Roman" size="9.500"]und Mitunterzeichnern, vom 15. Juni 1892. [/font] +[font face="Times-Bold" size="10.500"]2, ÄT ffiÄ WsStìSju ». .«[/font][font face="Times-Bold" size="6.820"]s[/font][font face="Times-Bold" size="10.500"].» [/font] +[font face="Times-Bold" size="10.000"]56. [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.000"]Zollrückvergütung auf Mais. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 2. Juni 1893 (Bundesblatt [/font][font face="Times-Bold" size="10.000"]III. [/font][font face="Times-Roman" size="9.500"]258), betreffend die [/font] +[font face="Times-Bold" size="10.000"]55 [/font][font face="Times-Roman" size="9.500"](63) [/font][font face="Times-Bold" size="10.000"]Motion von Hrn. Nationalrat Brenner [/font][font face="Times-Roman" size="9.500"]und Mitunterzeichnern, vom 15. März 1893. [/font] +[font face="Times-Bold" size="10.000"]53[/font][font face="Times-Roman" size="9.500"](51). [/font][font face="Times-Bold" size="10.000"]Motion von Hrn. Nationalrat Ador [/font][font face="Times-Roman" size="9.500"]und Mitunterzeichnern, vom 15. Juni 1892. [/font] +[font face="Times-Bold" size="10.000"]54[/font][font face="Times-Roman" size="9.500"](53). [/font][font face="Times-Bold" size="10.000"]Motion von Hrn. Nationalrat Curti [/font][font face="Times-Roman" size="9.500"]und Mitunterzeichnern, vom 20. Juni 1892. [/font] +[font face="Times-Roman" size="9.500"]57. [/font][font face="Times-Bold" size="10.000"]Motion von Hrn. Nationalrat Aeby [/font][font face="Times-Roman" size="9.500"]und Mitunterzeichnern, vom 5. Juni 1893. [/font] +[font face="Times-Roman" size="9.500"]58. [/font][font face="Times-Bold" size="10.000"]Begnadigungsgesuch Boffa. [/font][font face="Times-Roman" size="9.500"]Bericht des Bundesrates vom 21. Juni 1893 (Bundesblatt III. 631), betr. das Begnadigungsge [/font] +[font face="Times-Roman" size="9.500"]59. [/font][font face="Times-Italic" size="8.000"]n [/font][font face="Times-Bold" size="10.000"]Petition der Zuckerfabrik „Helvetia" in Monthey, [/font][font face="Times-Roman" size="9.500"]vom 25. Mai 1893, um Ermässigung des Eingangszolles für Boh[/font] +[font face="Times-Bold" size="10.000"]60. Motion von Hrn. Nationalrat Hochstrasser [/font][font face="Times-Roman" size="9.500"]und Mitunterzeichner, vom 10. Juni 1893. [/font] +[font face="Times-Bold" size="10.000"]61. Interpellation von Hrn. Nationalrat Jeanhenry [/font][font face="Times-Roman" size="9.500"]und Mitunterzeichnern, vom 16. Juni 1893. [/font] +[font face="Times-Bold" size="10.000"]62. n Petition des [/font][font face="Times-Bold" size="11.000"]Schweiz. [/font][font face="Times-Bold" size="10.000"]Typographenbundes. [/font][font face="Times-Roman" size="9.500"]Bericht des Bundesrates vom 19. Juni 1893 (Bundesblatt III. 544) betr [/font] +[font face="Times-Bold" size="10.000"]63. [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Getreide und Fouragemagazine in Bern. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 19. Juni 1893 (Bundesblatt III. 539) [/font] +[font face="Times-Bold" size="10.000"]64. [/font][font face="Times-BoldItalic" size="10.500"]s [/font][font face="Times-Bold" size="10.000"]Postund Telegraphengebäude in Zürich. [/font][font face="Times-Roman" size="9.500"]Botschaft und Beschlusses-Entwurf vom 19. Juni 1893 (Bundesblatt III 534) [/font] +[font face="Times-Bold" size="10.000"]65. Motion von Hrn. Ständerat Héridier, [/font][font face="Times-Roman" size="9.500"]vom 21. Juni 1893. [/font] +[font face="Times-Bold" size="10.000"]66. [/font][font face="Times-BoldItalic" size="10.500"]n [/font][font face="Times-Bold" size="10.000"]Vereinbarung mit Frankreich betr. die Handelsreisenden. [/font][font face="Times-Roman" size="9.500"]Botschaft und [/font][font face="Times-Roman" size="9.000"]Beschlusses[/font][font face="Times-Roman" size="9.500"]-Entwurf vom 19. Juni 1893 [/font] +[font face="Times-Bold" size="10.000"]67. Bundesgericht, Neubestellung. [/font] +[font face="Times-Bold" size="10.000"]68. 8 Nachtragscredite für das Bundesgericht pro 1893. [/font][font face="Times-Roman" size="9.500"]Botschaft und [/font][font face="Times-Roman" size="7.500"]Beschlusses-[/font][font face="Times-Roman" size="9.500"]Entwurf vom 23. Juni 1893 (Bundesblatt [/font] +[font face="Times-Bold" size="10.000"]69. s Reorganisation und Besoldung der Beamten des Eisenbahndepartements. [/font] +[font face="Times-Bold" size="10.000"]70 [/font][font face="Times-Bold" size="10.500"]Motion der HH. Nationalräte Fonjallaz, Ceresole [/font][font face="Times-Roman" size="9.500"]und Mitunterzeichner, vom [/font][font face="Times-Bold" size="10.500"]26. [/font][font face="Times-Roman" size="9.500"]Juni [/font][font face="Times-Bold" size="10.500"]1893. [/font] diff --git a/src/python/utils_pandas.py b/src/python/utils_pandas.py index 2b7f1b727bc1e3125da392531d669eb8bf5fe0f3..d7a9fd5c047e9af702ae107d0928d0483dd17613 100644 --- a/src/python/utils_pandas.py +++ b/src/python/utils_pandas.py @@ -47,6 +47,7 @@ def split_in_data_and_id(data_frame, id_col_names): def rows_unique(df, on): unique_elems = df[on].drop_duplicates() all_elems_unique = (len(unique_elems) == len(df)) + print('Length original df {} and after dropping {}'.format(len(df), len(unique_elems))) return all_elems_unique def one_hot_dataframe_to_y(df, class_column_names):