From e30aa8a0eea934987f4e036235a909471170a1f8 Mon Sep 17 00:00:00 2001
From: Lilian Gasser <gasserli@ethz.ch>
Date: Mon, 28 Jan 2019 17:06:11 +0100
Subject: [PATCH] new method in def_classes to get council and date,
 check_discussion now works with corrected meta data

---
 src/python/def_classes.py             | 40 +++++++++++++++++++--------
 src/python/run_extract_discussions.py | 11 ++++++--
 src/python/utils_annot.py             | 14 ++++++++++
 3 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 8e2a480b..1c3c2f06 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -48,6 +48,7 @@ class Document:
         self.name_wo_ext = os.path.splitext(self.name_file)[0]
         self.folder_database = folder_database
         self._meta_ext()
+        self._meta_corr_ext()
         self._xml_ext()
 
     def _meta_ext(self):
@@ -57,6 +58,12 @@ class Document:
         name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz'
         self.name_meta = [name_tar, name_file, name_file_db]
 
+    def _meta_corr_ext(self):
+    # Paths used for the correction and the extraction of the metadata: sets self.name_meta_corr = [tar.gz path, '<year>/<id_doc>_metacorr.xml']
+        name_file = str(self.year) + '/' + self.id_doc + '_metacorr.xml'
+        name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz'  # NOTE(review): built from name_inmeta, while check_discussion/get_council_date extract from name_outmeta -- confirm which archive is intended
+        self.name_meta_corr = [name_tar, name_file]
+
     def _xml_ext(self, suffix_xml = '_data', name_outcorrxml = '04_correctedxml'):
     # For the extraction, correction and annotation of the xmls
     # TODO for extraction and annotation
@@ -647,15 +654,6 @@ class Document:
             fig.savefig(name_fig, format = format_fig, dpi = dpi)
             plt.close(fig)
 
-    def check_discussion(self):
-        utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
-        flag_discussion = utils_annot.check_if_discussion(self.name_meta[1])
-        command = 'rm -rf ./' + str(self.year)
-        #print(command)
-        utils_proc.call_with_out(command)
-
-        return flag_discussion
-
     def _plot_save_labels(self, im_met, str_title, str_name, ind_page, groups, colors, folder_save = '',
                            flag_plot = 1, flag_save_figs = 0, flag_legend = 1, dpi = 200):
         #print(groups)
@@ -706,15 +704,33 @@ class Document:
 
 
 
-    def check_discussion(self):
-        utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
-        flag_discussion = utils_annot.check_if_discussion(self.name_meta[1])
+    def check_discussion(self, name_outmeta = '03_correctedmeta'):
+        # Returns the flag from utils_annot.check_if_discussion; the name_outmeta argument only applies if self.name_outmeta is not set yet
+        if 'name_outmeta' not in self.__dict__.keys():
+            self.name_outmeta = name_outmeta
+        # extract the corrected metadata file (self.name_meta_corr[1]) and evaluate it
+        utils_proc.tar_extractfile(self.name_meta_corr[1], self.folder_database, name_file = self.name_outmeta)
+        flag_discussion = utils_annot.check_if_discussion(self.name_meta_corr[1])
+        # clean up: delete the ./<year> folder (presumably where the file was extracted to -- confirm)
         command = 'rm -rf ./' + str(self.year)
         #print(command)
         utils_proc.call_with_out(command)
 
         return flag_discussion
 
+    def get_council_date(self, name_outmeta = '03_correctedmeta'):
+        # Returns the tuple (str_council, str_date) from utils_annot.get_council_and_date; name_outmeta only applies if self.name_outmeta is not set yet
+        if 'name_outmeta' not in self.__dict__.keys():
+            self.name_outmeta = name_outmeta
+        # extract the corrected metadata file (self.name_meta_corr[1]) and read council and date from it
+        utils_proc.tar_extractfile(self.name_meta_corr[1], self.folder_database, name_file = self.name_outmeta)
+        (str_council, str_date) = utils_annot.get_council_and_date(self.name_meta_corr[1])
+        # clean up: delete the ./<year> folder (presumably where the file was extracted to -- confirm)
+        command = 'rm -rf ./' + str(self.year)
+        #print(command)
+        utils_proc.call_with_out(command)
+
+        return (str_council, str_date)
 
 
     def annotate_xml(self, flag_save = 1, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', name_outannotxml='05_annotatedxml'):
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 618f7b2c..b707f8cc 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -26,7 +26,7 @@ from utils_proc import call_with_out
 # specify input and output files
 
 # needed for running in atom, can be ignored
-year = '1971'
+year = '1893'
 input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
@@ -113,7 +113,7 @@ utils_proc.compress_tar(output_annotatedxml)
 
 #%%
 # to test for one file
-file_tarpath = './1971/20000010_datacorr.xml'
+file_tarpath = './1893/20026592_datacorr.xml'
 
 id_doc = file_tarpath.split('/')[-1][:8]
 
@@ -126,12 +126,17 @@ if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20
 
     file_doc.df_lastnames = df_lastnames
     file_doc.list_notnames = list_notnames
+    (str_council, str_date) = file_doc.get_council_date()  # council and date as returned for this document
+    file_doc.str_council = str_council  # stored on the document before annotate_xml() is called
+    file_doc.str_date = str_date  # NOTE(review): presumably read by annotate_xml -- confirm
     file_doc.annotate_xml()
 
 
 #%%
 
-
+file_doc = defc.Document(infile_aux, folder_database)
+file_doc.get_council_date()
 #id_doc
 
 #len(files_to_process)
+file_doc.check_discussion()
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index f90590ae..e5ea8ff8 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -49,6 +49,20 @@ def check_if_discussion(path_meta_xml_file,
 
     return True
 
+# function to get council and date of a document from its metadata xml file
+def get_council_and_date(path_meta_xml_file):
+    # returns (str_council, str_date): the RAT and DATUM attributes of the META_FROM_DB element
+    # parse, get root and then part of interest (META_FROM_DB, searched under the first child of the root)
+    XML_tree = ET.parse(path_meta_xml_file)
+    XML_root = XML_tree.getroot()
+    XML_poi = XML_root[0].find('META_FROM_DB')
+    # NOTE(review): if ET is xml.etree.ElementTree, find() gives None when the element is missing and the lookups below fail -- confirm inputs always contain it
+    # get council and date
+    str_council = XML_poi.attrib['RAT']
+    str_date = XML_poi.attrib['DATUM']
+
+    return (str_council, str_date)
+
 # helper function to get text without font information
 # example for font information: [font face="11.718" size="Times-Roman"] sometext [/font]
 # input:
-- 
GitLab