From 642823c564af61f2024f7c1179783af3000e9f80 Mon Sep 17 00:00:00 2001
From: schmanna <schmanna@student.ethz.ch>
Date: Fri, 28 Feb 2020 01:26:45 +0100
Subject: [PATCH] updated setup (csv per year)

---
 src/python/CONSTANTS.py            |   2 +-
 src/python/def_classes.py          |   2 +-
 src/python/exhaustive_labelling.py | 144 ++++++++++++++++++-----------
 src/python/utils_pandas.py         |  51 ++++++++++
 4 files changed, 144 insertions(+), 55 deletions(-)

diff --git a/src/python/CONSTANTS.py b/src/python/CONSTANTS.py
index 3e03ab84..780e4949 100644
--- a/src/python/CONSTANTS.py
+++ b/src/python/CONSTANTS.py
@@ -80,7 +80,7 @@ class TypeOfTrainedAnnotationProblemConstants:
         self.heuristic_label_path = os.path.join(self.problem_folder, "heuristic_label")
         self.human_label_path = os.path.join(self.problem_folder, "human_label.csv")
         self.predictions_path = os.path.join(self.problem_folder, "predictions.csv")
-        self.exhaustive_human_label_path = os.path.join(self.problem_folder, "exhaustive_human_label.csv")
+        self.exhaustive_human_label_folder_path = os.path.join(self.problem_folder, "exhaustive_label")
 
         self.path_containing_test_document_ids = TestSetsLocation.get_appropriate_test_file_id_path(self.path_to_input_files)
 
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 2439079d..37fee03f 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -235,7 +235,7 @@ class Document:
         if 'imgobj' not in self.__dict__.keys():
             self.pdf2imgobj()
 
-        if 'XML_main' not in self.__dict__.keys():
+        if 'main' not in self.__dict__.keys():
             name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outxml + '.tar.gz'
             if os.path.isfile(name_tar):
                 name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + '.xml'
diff --git a/src/python/exhaustive_labelling.py b/src/python/exhaustive_labelling.py
index cb9b42e1..ebcdbfeb 100644
--- a/src/python/exhaustive_labelling.py
+++ b/src/python/exhaustive_labelling.py
@@ -10,6 +10,7 @@ import def_classes as defc
 import CONSTANTS
 import pandas as pd
 import numpy as np
+import pickle
 import xml.etree.ElementTree as ET
 import matplotlib.pyplot as plt
 
@@ -27,63 +28,86 @@ def test_newCoords(image, labelled_bboxes):
     plt.show()
 
 
+# merge human labels and classifier predictions into one csv file per year
 def set_Up():
-    # TODO!
-    # get both human and other predictions and merge into pd
-    # combine_human_label_with_predictions but both positive and negative
-    # seperate csv for each year?
-    # where is updated human labelled file?
-
-    pred_df = utils_data.load_pd_file(CONSTANTS.title.predictions_path)
-
-    # create new column to indicate if page has been labelled by a human
-    pred_df['Labeled'] = "False"
-    pred_df['Human_title'] = None
-    pred_df['linetext'] = "Filler"
-    utils_data.save_pd_to_file(pred_df, CONSTANTS.title.exhaustive_human_label_path)
-
-
-""" returns:
-        1. image of page (PpmImageFile)
-        2. all bounding boxes and predictions for the next page
-           (list of dictionaries with keys:
-               1. 'bbox': top left point & bottom right point (np.array)
-               2. 'title_prob': prediction probability that line is a title
-               3. 'text': textline
-        3. metaData of current page (dict with keys: "Year", "File_ID", "Page_ID")
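+    # each yearly csv holds the classifier predictions for rows without a human
+    # label and the human decisions (confidence 1.0 / 0.0) for rows that have one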
+    years = [1891, 1995]
+    range_years = range(years[0], years[1] + 1)
+    for year in range_years:
+        print(year)
+        pred_df = utils_data.load_pd_file(CONSTANTS.title.predictions_path)
+        pred_classifier_year_df = pred_df.loc[pred_df['year'] == year]
+        pred_human_df = utils_data.load_pd_file(CONSTANTS.title.human_label_path)
+        features = utils_data.load_pd_file(os.path.join(CONSTANTS.title.features_path, str(year) + ".pickle"))
+        merged_df = utils_pandas.combine_human_label_with_predictions_getAllRows(pred_human_df, pred_classifier_year_df, features,
+                                                                                 CONSTANTS.id_columns_for_line_based_classification,
+                                                                                 selection_class_name='title',
+                                                                                 additional_data_cols=['linetext'])
+
+        # new column to indicate whether the page has been labelled by a human and an
+        # empty column for title corrections; order by file_id and page_id, since
+        # combine_human_label_with_predictions_getAllRows returns the human labels at the top
+        merged_df['Labeled'] = "False"
+        merged_df['Corrected_title'] = None
+        merged_df = merged_df.sort_values(by=['file_id', 'page_id'])
+        utils_data.save_pd_to_file(merged_df, os.path.join(CONSTANTS.title.exhaustive_human_label_folder_path, str(year) + ".csv"))
+
+"""
+    :param year: first year whose csv is searched
+    :param beginRow: row index in that year's csv at which the search starts
+
+    returns:
+        1. image of page (PpmImageFile)
+        2. all bounding boxes and predictions for the next page
+           (list of dictionaries with keys:
+               1. 'bbox': top left point & bottom right point (np.array)
+               2. 'title_prob': prediction probability that line is a title
+               3. 'text': textline)
+        3. metaData of current page (dict with keys: "year", "file_id", "page_id",
+           "indexRowBeg", "indexRowEnd")
+
+    if there is nothing left to label, each output variable is None
 """
-def load_nextPageToLabel():
-
-    pred_df = utils_data.load_pd_file(CONSTANTS.title.exhaustive_human_label_path)
-    file_id = page_id = year = -1
-
-    # get all bboxes of current page (first non human labelled page) and the predicted labels
-    print("searching for next page")
-    labelled_bboxes = []
-    for j in range(len(pred_df)):
-        if not pred_df.at[j,'Labeled']:
-            file_id = pred_df.at[j,'file_id']
-            page_id = pred_df.at[j,'page_id']
-            year = pred_df.at[j,'year']
-            for i in range(j, len(pred_df)):
-                row = pred_df.iloc[i]
-                file_id_curr = row['file_id']
-                page_id_curr = row['page_id']
-                if file_id_curr == file_id and page_id_curr == page_id:
-                    bbox = np.array(row['bbox'].split(',')).astype(np.float64)
-                    labelled_bboxes.append({"bbox" : bbox,"title_prob" : row['pred_title'], "text" : row['linetext']})
-                else: break
-            break
+# call load_nextPageToLabel with the "indexRowEnd" value from the metaData of the previous load call!
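+# e.g. a hypothetical driver loop would resume with:
+#     im, bboxes, metaData = load_nextPageToLabel(metaData["year"], metaData["indexRowEnd"])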
+def load_nextPageToLabel(year=1891, beginRow=0):
+    years = [year, 1995]
+    range_years = range(years[0], years[1] + 1)
+    labelled_bboxes = metaData = im = None
+    for year in range_years:
+        print(year)
+        pred_df = utils_data.load_pd_file(os.path.join(CONSTANTS.title.exhaustive_human_label_folder_path, str(year) + ".csv"))
+        file_id = page_id = -1
+        rowIdx_Beg = rowIdx_End = None
+        # get all bboxes of the current page (first page not yet labelled by a human) and the predicted labels
+        print("searching for next page")
+        labelled_bboxes = []
+        for j in range(beginRow, len(pred_df)):
+            # 'Labeled' is stored as the string "False"/"True" by set_Up and save_labelledPage
+            if pred_df.at[j, 'Labeled'] == "False":
+                rowIdx_Beg = j
+                file_id = pred_df.at[j, 'file_id']
+                page_id = pred_df.at[j, 'page_id']
+                year = pred_df.at[j, 'year']
+                for i in range(j, len(pred_df)):
+                    row = pred_df.iloc[i]
+                    file_id_curr = row['file_id']
+                    page_id_curr = row['page_id']
+                    if file_id_curr == file_id and page_id_curr == page_id:
+                        rowIdx_End = i
+                        bbox = np.array(row['bbox'].split(',')).astype(np.float64)
+                        labelled_bboxes.append({"bbox": bbox, "title_prob": row['confidence'], "text": row['linetext']})
+                    else:
+                        break
+                break
+        if rowIdx_Beg is not None:
+            break
+        # beginRow only applies to the csv of the first year searched
+        beginRow = 0
+
+    if rowIdx_Beg is None:
+        # nothing left to label in any year
+        return None, None, None
 
     # get image of page
     print("getting image")
+    print(file_id)
     file_aux = str(year) + '/' + str(file_id) + '.pdf'
     doc = defc.Document(file_aux, CONSTANTS.session_folder_database)
     defc.Document.pdf2imgobj(doc)
     im = doc.imgobj[page_id]
+    print(doc)
 
     # get bbox of page
     XML_main_corr = doc._open_xml(suffix_xml='_data', name_outcorrxml='04_correctedxml')
+    print(XML_main_corr)
     bbox_page = np.array(XML_main_corr[page_id].attrib['bbox'].split(',')).astype(np.float64)
 
     # correct bbox coordinates to 00 at topleft of page
@@ -93,7 +117,12 @@ def load_nextPageToLabel():
         coord2 = plot_tools.transform_coord(coord2, dim_page = bbox_page, invert_xy = True)
         box['bbox'] = coord2
 
-    metaData = {"year" : year, "file_id" : file_id, "page_id" : page_id}
+    metaData = {"year" : year, "file_id" : file_id, "page_id" : page_id, "indexRowBeg" : rowIdx_Beg, "indexRowEnd" : rowIdx_End}
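+    # "indexRowBeg"/"indexRowEnd" record the csv row span of this page, so that
+    # save_labelledPage can write the human corrections back to exactly these rows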
+
+    #file_obj = open('example.obj', 'wb')
+    #pickle.dump(im, file_obj)
+    #pickle.dump(labelled_bboxes, file_obj)
+    #pickle.dump(metaData, file_obj)
 
     return im, labelled_bboxes, metaData
 
@@ -105,22 +134,31 @@
         1. 'bbox': top left point & bottom right point (np.array)
         2. 'title_prob': 1 or 0 if labelled as title or not <-- to update
         3. 'text': textline <-- to update
-    2. metaData of current page (dict with keys: "Year", "File_ID", "Page_ID")
+    2. metaData of current page (dict with keys: "year", "file_id", "page_id",
+       "indexRowBeg", "indexRowEnd")
 """
 def save_labelledPage(human_labelled_bboxes, metaData):
-    pred_df = utils_data.load_pd_file(CONSTANTS.title.exhaustive_human_label_path)
+    pred_df = utils_data.load_pd_file(os.path.join(CONSTANTS.title.exhaustive_human_label_folder_path, str(metaData['year']) + ".csv"))
     pred_df.loc[(pred_df['file_id'] == metaData['file_id']) & (pred_df['page_id'] == metaData['page_id']), 'Labeled'] = "True"
 
-    # TODO update labels
-    #for box in human_labelled_bboxes:
-
+    # update labels row by row: the boxes arrive in the same order as the csv rows of the page
+    idx = metaData['indexRowBeg']
+    for box in human_labelled_bboxes:
+        pred_df.at[idx, 'confidence'] = box['title_prob']
+        pred_df.at[idx, 'Corrected_title'] = box['text']
+        idx += 1
+
+    utils_data.save_pd_to_file(pred_df, os.path.join(CONSTANTS.title.exhaustive_human_label_folder_path, str(metaData['year']) + ".csv"))
-    utils_data.save_pd_to_file(pred_df, CONSTANTS.title.exhaustive_human_label_path)
 
 
 # TESTING
+#set_Up()
+image, bboxes, metaData = load_nextPageToLabel()
+#test_newCoords(image, bboxes)
+
+
+
 """
-set_Up()
+
 image, bboxes, metaData = load_nextPageToLabel()
 test_newCoords(image, bboxes)
 save_labelledPage(5, {"year" : 1955, "file_id" : 110001039, "page_id" : 0})
diff --git a/src/python/utils_pandas.py b/src/python/utils_pandas.py
index 9c8a9e6f..2b7f1b72 100644
--- a/src/python/utils_pandas.py
+++ b/src/python/utils_pandas.py
@@ -112,6 +112,57 @@ def combine_human_label_with_predictions(human_label, predictions, features, id_
 
     return final_selected_columns
 
+
+
+
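+# variant of combine_human_label_with_predictions that keeps both positive and
+# negative rows, so every text line appears in the output with a confidence value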
+def combine_human_label_with_predictions_getAllRows(human_label, predictions, features, id_cols,
+                                                    selection_class_name, additional_data_cols=None):
+    """
+    combine classifier predictions with human labels. further add optional columns from the features
+
+    :param human_label: pandas dataframe with the human labels
+    :param predictions: pandas dataframe with the classifier predictions
+    :param id_cols: list of strings with the names of the id columns
+    :param selection_class_name: which class counts as 'positive'; the confidence column refers to this class
+    :param features: pandas dataframe with the features
+    :param additional_data_cols: which additional columns from the features to include in the final belief file
+    :return: a dataframe with the current belief for all text elements, not only those that belong to the selection_class_name
+    """
+    if additional_data_cols is None:
+        additional_data_cols = list()
+
+    predictions_not_human_labeled = rows_left_not_in_right(predictions, human_label, on=id_cols).copy()\
+        .rename({"prediction_time": "time"}, axis=1)
+    predictions_not_human_labeled["decision"] = "prediction"
+    predictions_not_human_labeled["confidence"] = predictions_not_human_labeled[CONSTANTS.prediction_column_template.format(selection_class_name)]
+
+    human_label_true = human_label[human_label[CONSTANTS.human_label_column_template.format(selection_class_name)] == True].copy()\
+        .rename({"human_label_time": "time"}, axis=1)
+    human_label_true["decision"] = "human"
+    human_label_true["confidence"] = 1.0
+
+    human_label_false = human_label[human_label[CONSTANTS.human_label_column_template.format(selection_class_name)] == False].copy()\
+        .rename({"human_label_time": "time"}, axis=1)
+    human_label_false["decision"] = "human"
+    human_label_false["confidence"] = 0.0
+
+    columns_to_take = id_cols + additional_data_cols + ["time", "decision", "confidence"]
+
+    columns_already_in_prediction_data = [c for c in columns_to_take if (c in predictions_not_human_labeled.columns) and (c in human_label_true.columns)]
+
+    pred = predictions_not_human_labeled[columns_already_in_prediction_data]
+    hh_true = human_label_true[columns_already_in_prediction_data]
+    hh_false = human_label_false[columns_already_in_prediction_data]
+    final = pd.concat([hh_true, hh_false, pred], axis=0)
+
+    final_with_features = pd.merge(final, features, on=id_cols)
+
+    final_selected_columns = final_with_features.loc[:, columns_to_take]
+
+    return final_selected_columns
+
 def rows_left_not_in_right(left, right, on):
     """
     :param left: dataframe
-- 
GitLab