From 642823c564af61f2024f7c1179783af3000e9f80 Mon Sep 17 00:00:00 2001
From: schmanna <schmanna@student.ethz.ch>
Date: Fri, 28 Feb 2020 01:26:45 +0100
Subject: [PATCH] updated setup (csv per year)

---
 src/python/CONSTANTS.py            |   2 +-
 src/python/def_classes.py          |   2 +-
 src/python/exhaustive_labelling.py | 144 ++++++++++++++++++-----------
 src/python/utils_pandas.py         |  51 ++++++++++
 4 files changed, 144 insertions(+), 55 deletions(-)

diff --git a/src/python/CONSTANTS.py b/src/python/CONSTANTS.py
index 3e03ab84..780e4949 100644
--- a/src/python/CONSTANTS.py
+++ b/src/python/CONSTANTS.py
@@ -80,7 +80,7 @@ class TypeOfTrainedAnnotationProblemConstants:
         self.heuristic_label_path = os.path.join(self.problem_folder, "heuristic_label")
         self.human_label_path = os.path.join(self.problem_folder, "human_label.csv")
         self.predictions_path = os.path.join(self.problem_folder, "predictions.csv")
-        self.exhaustive_human_label_path = os.path.join(self.problem_folder, "exhaustive_human_label.csv")
+        self.exhaustive_human_label_folder_path = os.path.join(self.problem_folder, "exhaustive_label")
 
         self.path_containing_test_document_ids = TestSetsLocation.get_appropriate_test_file_id_path(self.path_to_input_files)
 
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 2439079d..37fee03f 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -235,7 +235,7 @@ class Document:
         if 'imgobj' not in self.__dict__.keys():
             self.pdf2imgobj()
 
-        if 'XML_main' not in self.__dict__.keys():
+        if 'main' not in self.__dict__.keys():
             name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outxml + '.tar.gz'
             if os.path.isfile(name_tar):
                 name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + '.xml'
diff --git a/src/python/exhaustive_labelling.py b/src/python/exhaustive_labelling.py
index cb9b42e1..ebcdbfeb 100644
--- a/src/python/exhaustive_labelling.py
+++ b/src/python/exhaustive_labelling.py
@@ -10,6 +10,7 @@ import def_classes as defc
 import CONSTANTS
 import pandas as pd
 import numpy as np
+import pickle
 import xml.etree.ElementTree as ET
 import matplotlib.pyplot as plt
 
@@ -27,63 +28,86 @@ def test_newCoords(image, labelled_bboxes):
     plt.show()
 
 
+# merge human labels and classifier predictions into one csv file per year
 def set_Up():
-    # TODO!
-    # get both human and other predictions and merge into pd
-    # combine_human_label_with_predictions but both positive and negative
-    # seperate csv for each year?
-    # where is updated human labelled file?
-
-    pred_df = utils_data.load_pd_file(CONSTANTS.title.predictions_path)
-
-    # create new column to indicate if page has been labelled by a human
-    pred_df['Labeled'] = "False"
-    pred_df['Human_title'] = None
-    pred_df['linetext'] = "Filler"
-    utils_data.save_pd_to_file(pred_df, CONSTANTS.title.exhaustive_human_label_path)
-
-
-""" returns:
-        1. image of page (PpmImageFile)
-        2. all bounding boxes and predictions for the next page
-           (list of dictionaries with keys:
-               1. 'bbox': top left point & bottom right point (np.array)
-               2. 'title_prob': prediction probability that line is a title
-               3. 'text': textline
-        3. metaData of current page (dict with keys: "Year", "File_ID", "Page_ID")
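+    # each yearly csv holds the classifier predictions for rows without a human
+    # label and the human decisions (confidence 1.0 / 0.0) for rows that have one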
+    years = [1891, 1995]
+    range_years = range(years[0], years[1] + 1)
+    for year in range_years:
+        print(year)
+        pred_df = utils_data.load_pd_file(CONSTANTS.title.predictions_path)
+        pred_classifier_year_df = pred_df.loc[pred_df['year'] == year]
+        pred_human_df = utils_data.load_pd_file(CONSTANTS.title.human_label_path)
+        features = utils_data.load_pd_file(os.path.join(CONSTANTS.title.features_path, str(year) + ".pickle"))
+        merged_df = utils_pandas.combine_human_label_with_predictions_getAllRows(pred_human_df, pred_classifier_year_df, features,
+                                                                                 CONSTANTS.id_columns_for_line_based_classification,
+                                                                                 selection_class_name='title',
+                                                                                 additional_data_cols=['linetext'])
+
+        # new column to indicate whether the page has been labelled by a human and an
+        # empty column for title corrections; order by file_id and page_id, since
+        # combine_human_label_with_predictions_getAllRows returns the human labels at the top
+        merged_df['Labeled'] = "False"
+        merged_df['Corrected_title'] = None
+        merged_df = merged_df.sort_values(by=['file_id', 'page_id'])
+        utils_data.save_pd_to_file(merged_df, os.path.join(CONSTANTS.title.exhaustive_human_label_folder_path, str(year) + ".csv"))
+
+"""
+    :param year: first year whose csv is searched
+    :param beginRow: row index in that year's csv at which the search starts
+
+    returns:
+        1. image of page (PpmImageFile)
+        2. all bounding boxes and predictions for the next page
+           (list of dictionaries with keys:
+               1. 'bbox': top left point & bottom right point (np.array)
+               2. 'title_prob': prediction probability that line is a title
+               3. 'text': textline)
+        3. metaData of current page (dict with keys: "year", "file_id", "page_id",
+           "indexRowBeg", "indexRowEnd")
+
+    if there is nothing left to label, each output variable is None
 """
-def load_nextPageToLabel():
-
-    pred_df = utils_data.load_pd_file(CONSTANTS.title.exhaustive_human_label_path)
-    file_id = page_id = year = -1
-
-    # get all bboxes of current page (first non human labelled page) and the predicted labels
-    print("searching for next page")
-    labelled_bboxes = []
-    for j in range(len(pred_df)):
-        if not pred_df.at[j,'Labeled']:
-            file_id = pred_df.at[j,'file_id']
-            page_id = pred_df.at[j,'page_id']
-            year = pred_df.at[j,'year']
-            for i in range(j, len(pred_df)):
-                row = pred_df.iloc[i]
-                file_id_curr = row['file_id']
-                page_id_curr = row['page_id']
-                if file_id_curr == file_id and page_id_curr == page_id:
-                    bbox = np.array(row['bbox'].split(',')).astype(np.float64)
-                    labelled_bboxes.append({"bbox" : bbox,"title_prob" : row['pred_title'], "text" : row['linetext']})
-                else: break
-            break
+# call load_nextPageToLabel with the "indexRowEnd" value from the metaData of the previous load call!
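+# e.g. a hypothetical driver loop would resume with:
+#     im, bboxes, metaData = load_nextPageToLabel(metaData["year"], metaData["indexRowEnd"])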
+def load_nextPageToLabel(year=1891, beginRow=0):
+    years = [year, 1995]
+    range_years = range(years[0], years[1] + 1)
+    labelled_bboxes = metaData = im = None
+    for year in range_years:
+        print(year)
+        pred_df = utils_data.load_pd_file(os.path.join(CONSTANTS.title.exhaustive_human_label_folder_path, str(year) + ".csv"))
+        file_id = page_id = -1
+        rowIdx_Beg = rowIdx_End = None
+        # get all bboxes of the current page (first page not yet labelled by a human) and the predicted labels
+        print("searching for next page")
+        labelled_bboxes = []
+        for j in range(beginRow, len(pred_df)):
+            # 'Labeled' is stored as the string "False"/"True" by set_Up and save_labelledPage
+            if pred_df.at[j, 'Labeled'] == "False":
+                rowIdx_Beg = j
+                file_id = pred_df.at[j, 'file_id']
+                page_id = pred_df.at[j, 'page_id']
+                year = pred_df.at[j, 'year']
+                for i in range(j, len(pred_df)):
+                    row = pred_df.iloc[i]
+                    file_id_curr = row['file_id']
+                    page_id_curr = row['page_id']
+                    if file_id_curr == file_id and page_id_curr == page_id:
+                        rowIdx_End = i
+                        bbox = np.array(row['bbox'].split(',')).astype(np.float64)
+                        labelled_bboxes.append({"bbox": bbox, "title_prob": row['confidence'], "text": row['linetext']})
+                    else:
+                        break
+                break
+        if rowIdx_Beg is not None:
+            break
+        # beginRow only applies to the csv of the first year searched
+        beginRow = 0
+
+    if rowIdx_Beg is None:
+        # nothing left to label in any year
+        return None, None, None
 
     # get image of page
     print("getting image")
+    print(file_id)
     file_aux = str(year) + '/' + str(file_id) + '.pdf'
     doc = defc.Document(file_aux, CONSTANTS.session_folder_database)
     defc.Document.pdf2imgobj(doc)
     im = doc.imgobj[page_id]
+    print(doc)
 
     # get bbox of page
     XML_main_corr = doc._open_xml(suffix_xml='_data', name_outcorrxml='04_correctedxml')
+    print(XML_main_corr)
     bbox_page = np.array(XML_main_corr[page_id].attrib['bbox'].split(',')).astype(np.float64)
 
     # correct bbox coordinates to 00 at topleft of page
@@ -93,7 +117,12 @@ def load_nextPageToLabel():
         coord2 = plot_tools.transform_coord(coord2, dim_page = bbox_page, invert_xy = True)
         box['bbox'] = coord2
 
-    metaData = {"year" : year, "file_id" : file_id, "page_id" : page_id}
+    metaData = {"year" : year, "file_id" : file_id, "page_id" : page_id, "indexRowBeg" : rowIdx_Beg, "indexRowEnd" : rowIdx_End}
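+    # "indexRowBeg"/"indexRowEnd" record the csv row span of this page, so that
+    # save_labelledPage can write the human corrections back to exactly these rows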
+
+    #file_obj = open('example.obj', 'wb')
+    #pickle.dump(im, file_obj)
+    #pickle.dump(labelled_bboxes, file_obj)
+    #pickle.dump(metaData, file_obj)
 
     return im, labelled_bboxes, metaData
 
@@ -105,22 +134,31 @@
         1. 'bbox': top left point & bottom right point (np.array)
         2. 'title_prob': 1 or 0 if labelled as title or not <-- to update
         3. 'text': textline <-- to update
-    2. metaData of current page (dict with keys: "Year", "File_ID", "Page_ID")
+    2. metaData of current page (dict with keys: "year", "file_id", "page_id",
+       "indexRowBeg", "indexRowEnd")
 """
 def save_labelledPage(human_labelled_bboxes, metaData):
-    pred_df = utils_data.load_pd_file(CONSTANTS.title.exhaustive_human_label_path)
+    pred_df = utils_data.load_pd_file(os.path.join(CONSTANTS.title.exhaustive_human_label_folder_path, str(metaData['year']) + ".csv"))
     pred_df.loc[(pred_df['file_id'] == metaData['file_id']) & (pred_df['page_id'] == metaData['page_id']), 'Labeled'] = "True"
 
-    # TODO update labels
-    #for box in human_labelled_bboxes:
-
+    # update labels row by row: the boxes arrive in the same order as the csv rows of the page
+    idx = metaData['indexRowBeg']
+    for box in human_labelled_bboxes:
+        pred_df.at[idx, 'confidence'] = box['title_prob']
+        pred_df.at[idx, 'Corrected_title'] = box['text']
+        idx += 1
+
+    utils_data.save_pd_to_file(pred_df, os.path.join(CONSTANTS.title.exhaustive_human_label_folder_path, str(metaData['year']) + ".csv"))
-    utils_data.save_pd_to_file(pred_df, CONSTANTS.title.exhaustive_human_label_path)
 
 
 # TESTING
+#set_Up()
+image, bboxes, metaData = load_nextPageToLabel()
+#test_newCoords(image, bboxes)
+
+
+
 """
-set_Up()
+
 image, bboxes, metaData = load_nextPageToLabel()
 test_newCoords(image, bboxes)
 save_labelledPage(5, {"year" : 1955, "file_id" : 110001039, "page_id" : 0})
diff --git a/src/python/utils_pandas.py b/src/python/utils_pandas.py
index 9c8a9e6f..2b7f1b72 100644
--- a/src/python/utils_pandas.py
+++ b/src/python/utils_pandas.py
@@ -112,6 +112,57 @@ def combine_human_label_with_predictions(human_label, predictions, features, id_
 
     return final_selected_columns
 
+
+
+
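+# variant of combine_human_label_with_predictions that keeps both positive and
+# negative rows, so every text line appears in the output with a confidence value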
+def combine_human_label_with_predictions_getAllRows(human_label, predictions, features, id_cols,
+                                                    selection_class_name, additional_data_cols=None):
+    """
+    combine classifier predictions with human labels. further add optional columns from the features
+
+    :param human_label: pandas dataframe with the human labels
+    :param predictions: pandas dataframe with the classifier predictions
+    :param id_cols: list of strings with the names of the id columns
+    :param selection_class_name: which class counts as 'positive'; the confidence column refers to this class
+    :param features: pandas dataframe with the features
+    :param additional_data_cols: which additional columns from the features to include in the final belief file
+    :return: a dataframe with the current belief for all text elements, not only those that belong to the selection_class_name
+    """
+    if additional_data_cols is None:
+        additional_data_cols = list()
+
+    predictions_not_human_labeled = rows_left_not_in_right(predictions, human_label, on=id_cols).copy()\
+        .rename({"prediction_time": "time"}, axis=1)
+    predictions_not_human_labeled["decision"] = "prediction"
+    predictions_not_human_labeled["confidence"] = predictions_not_human_labeled[CONSTANTS.prediction_column_template.format(selection_class_name)]
+
+    human_label_true = human_label[human_label[CONSTANTS.human_label_column_template.format(selection_class_name)] == True].copy()\
+        .rename({"human_label_time": "time"}, axis=1)
+    human_label_true["decision"] = "human"
+    human_label_true["confidence"] = 1.0
+
+    human_label_false = human_label[human_label[CONSTANTS.human_label_column_template.format(selection_class_name)] == False].copy()\
+        .rename({"human_label_time": "time"}, axis=1)
+    human_label_false["decision"] = "human"
+    human_label_false["confidence"] = 0.0
+
+    columns_to_take = id_cols + additional_data_cols + ["time", "decision", "confidence"]
+
+    columns_already_in_prediction_data = [c for c in columns_to_take if (c in predictions_not_human_labeled.columns) and (c in human_label_true.columns)]
+
+    pred = predictions_not_human_labeled[columns_already_in_prediction_data]
+    hh_true = human_label_true[columns_already_in_prediction_data]
+    hh_false = human_label_false[columns_already_in_prediction_data]
+    final = pd.concat([hh_true, hh_false, pred], axis=0)
+
+    final_with_features = pd.merge(final, features, on=id_cols)
+
+    final_selected_columns = final_with_features.loc[:, columns_to_take]
+
+    return final_selected_columns
+
 def rows_left_not_in_right(left, right, on):
     """
     :param left: dataframe
-- 
GitLab