From e30aa8a0eea934987f4e036235a909471170a1f8 Mon Sep 17 00:00:00 2001 From: Lilian Gasser <gasserli@ethz.ch> Date: Mon, 28 Jan 2019 17:06:11 +0100 Subject: [PATCH] new method in def_classes to get council and date, check_discussion now works with corrected meta data --- src/python/def_classes.py | 40 +++++++++++++++++++-------- src/python/run_extract_discussions.py | 11 ++++++-- src/python/utils_annot.py | 14 ++++++++++ 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/src/python/def_classes.py b/src/python/def_classes.py index 8e2a480b..1c3c2f06 100644 --- a/src/python/def_classes.py +++ b/src/python/def_classes.py @@ -48,6 +48,7 @@ class Document: self.name_wo_ext = os.path.splitext(self.name_file)[0] self.folder_database = folder_database self._meta_ext() + self._meta_corr_ext() self._xml_ext() def _meta_ext(self): @@ -57,6 +58,12 @@ class Document: name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz' self.name_meta = [name_tar, name_file, name_file_db] + def _meta_corr_ext(self): + # Both for the correction and the extraction of the metadata information + name_file = str(self.year) + '/' + self.id_doc + '_metacorr.xml' + name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz' + self.name_meta_corr = [name_tar, name_file] + def _xml_ext(self, suffix_xml = '_data', name_outcorrxml = '04_correctedxml'): # For the extraction, correction and annotation of the xmls # TODO for extraction and annotation @@ -647,15 +654,6 @@ class Document: fig.savefig(name_fig, format = format_fig, dpi = dpi) plt.close(fig) - def check_discussion(self): - utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) - flag_discussion = utils_annot.check_if_discussion(self.name_meta[1]) - command = 'rm -rf ./' + str(self.year) - #print(command) - utils_proc.call_with_out(command) - - return flag_discussion - def _plot_save_labels(self, im_met, str_title, str_name, ind_page, groups, colors, folder_save = '', flag_plot = 1, flag_save_figs = 0, flag_legend = 1, dpi = 200): #print(groups) @@ -706,15 +704,33 @@ class Document: - def check_discussion(self): - utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta) - flag_discussion = utils_annot.check_if_discussion(self.name_meta[1]) + def check_discussion(self, name_outmeta = '03_correctedmeta'): + + if 'name_outmeta' not in self.__dict__.keys(): + self.name_outmeta = name_outmeta + + utils_proc.tar_extractfile(self.name_meta_corr[1], self.folder_database, name_file = self.name_outmeta) + flag_discussion = utils_annot.check_if_discussion(self.name_meta_corr[1]) + command = 'rm -rf ./' + str(self.year) #print(command) utils_proc.call_with_out(command) return flag_discussion + def get_council_date(self, name_outmeta = '03_correctedmeta'): + + if 'name_outmeta' not in self.__dict__.keys(): + self.name_outmeta = name_outmeta + + utils_proc.tar_extractfile(self.name_meta_corr[1], self.folder_database, name_file = self.name_outmeta) + (str_council, str_date) = utils_annot.get_council_and_date(self.name_meta_corr[1]) + + command = 'rm -rf ./' + str(self.year) + #print(command) + utils_proc.call_with_out(command) + + return (str_council, str_date) def annotate_xml(self, flag_save = 1, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', name_outannotxml='05_annotatedxml'): diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 618f7b2c..b707f8cc 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -26,7 +26,7 @@ from utils_proc import call_with_out # specify input and output files # needed for running in atom, can be ignored -year = '1971' +year = '1893' input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle" input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz" input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz" @@ -113,7 +113,7 @@ utils_proc.compress_tar(output_annotatedxml) #%% # to test for one file -file_tarpath = './1971/20000010_datacorr.xml' +file_tarpath = './1893/20026592_datacorr.xml' id_doc = file_tarpath.split('/')[-1][:8] @@ -126,12 +126,17 @@ if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20 file_doc.df_lastnames = df_lastnames file_doc.list_notnames = list_notnames + (str_council, str_date) = file_doc.get_council_date() + file_doc.str_council = str_council + file_doc.str_date = str_date file_doc.annotate_xml() #%% - +file_doc = defc.Document(infile_aux, folder_database) +file_doc.get_council_date() #id_doc #len(files_to_process) +file_doc.check_discussion() diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index f90590ae..e5ea8ff8 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -49,6 +49,20 @@ def check_if_discussion(path_meta_xml_file, return True +# function to get date and council +def get_council_and_date(path_meta_xml_file): + + # parse, get root and then part of interest + XML_tree = ET.parse(path_meta_xml_file) + XML_root = XML_tree.getroot() + XML_poi = XML_root[0].find('META_FROM_DB') + + # get council and date + str_council = XML_poi.attrib['RAT'] + str_date = XML_poi.attrib['DATUM'] + + return (str_council, str_date) + # helper function to get text without font information # example for font information: [font face="11.718" size="Times-Roman"] sometext [/font] # input: -- GitLab