diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index f8e07833543f350fe3f1a5877f63986f0a898038..cb2f626087d76ebc4ab1baf78b52b45cd42ed4ed 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -718,6 +718,26 @@ class Document:
 
         return flag_discussion
 
+
+    def get_document_title(self, list_attributes, name_outmeta = '03_correctedmeta'):
+        """Get the titles of the document from its corrected metadata.
+
+        The values found for list_attributes are stored in self.list_titles.
+        name_outmeta is only used if self.name_outmeta has not been set before.
+        """
+
+        if 'name_outmeta' not in self.__dict__:
+            self.name_outmeta = name_outmeta
+
+        try:
+            utils_proc.tar_extractfile(self.name_meta_corr[1], self.folder_database, name_file = self.name_outmeta)
+            self.list_titles = utils_annot.get_document_title_(self.name_meta_corr[1], list_attributes)
+        finally:
+            # remove ./<year> (presumably the extraction folder) even if reading the titles failed
+            command = 'rm -rf ./' + str(self.year)
+            utils_proc.call_with_out(command)
+
+
     def get_council_date(self, name_outmeta = '03_correctedmeta'):
 
         if 'name_outmeta' not in self.__dict__.keys():
diff --git a/src/python/extract_document_titles.py b/src/python/extract_document_titles.py
new file mode 100644
index 0000000000000000000000000000000000000000..3efe540f94411a00b61067d64e65d83247780bc3
--- /dev/null
+++ b/src/python/extract_document_titles.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Code to get the titles of all documents and save them in a csv file
+# to run:
+# renku run --isolation python src/python/extract_document_titles.py data/AB/ 03_correctedmeta data/lists/all_titles.csv
+
+import csv
+import os
+import sys
+
+import utils_proc
+import def_classes as defc
+
+# first and last year of interest (both included)
+years = [1891, 1995]
+
+# columns of the csv file; all but the first two are names of xml attributes
+list_attributes = ['year', 'number', 'TITEL_NORMAL_DE', 'TITEL_NORMAL_FR', 'TITEL_ORIGINAL_DE', 'TITEL_ORIGINAL_FR']
+
+
+def get_titles_of_year(year, folder_database, suffix_correctedmeta):
+    """Return one row [year, id, titles ...] for every document of the given year."""
+
+    # pull the archive with the corrected metadata of that year
+    input_file = os.path.join(folder_database, str(year), suffix_correctedmeta + '.tar.gz')
+    command = 'git lfs pull -I ' + input_file
+    utils_proc.call_with_out(command)
+
+    # get list of files to process and sort that list
+    files_to_process, _ = utils_proc.get_list(str(year), folder_database, suffix_correctedmeta)
+    files_to_process.sort()
+
+    list_rows = []
+    for file_tarpath in files_to_process:
+        # get id: first 8 characters of the file name
+        id_doc = file_tarpath.split('/')[-1][:8]
+
+        # instantiate document object (always from original pdf)
+        file_aux = str(year) + '/' + id_doc + '.pdf'
+        file_doc = defc.Document(file_aux, folder_database)
+
+        # year and id first, then the titles in the order of list_attributes
+        file_doc.get_document_title(list_attributes[2:])
+        list_rows.append([str(year), id_doc] + list(file_doc.list_titles))
+
+    return list_rows
+
+
+def main(argv):
+    """Collect the titles of all years of interest and write them to a csv file."""
+
+    # specify input and output parameters
+    try:
+        folder_database, suffix_correctedmeta, output_titles = argv[1:4]
+    except ValueError:
+        sys.exit('usage: ' + argv[0] + ' folder_database suffix_correctedmeta output_titles')
+
+    # initialize list of all titles with the header
+    list_all_titles = [list_attributes]
+
+    # for each year
+    for year in range(years[0], years[1] + 1):
+        print(year)
+        list_all_titles.extend(get_titles_of_year(year, folder_database, suffix_correctedmeta))
+
+    # save csv file; the csv module needs newline='' and utf-8 keeps the accents intact
+    with open(output_titles, 'w', newline='', encoding='utf-8') as fo:
+        wr = csv.writer(fo, dialect='excel')
+        wr.writerows(list_all_titles)
+
+
+if __name__ == '__main__':
+    main(sys.argv)
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 7d8ce8053992f159caae552c4d84bed90bcae5af..ce4a88568f880975da15996fe02c47e85d150c8d 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -50,6 +50,26 @@ def check_if_discussion(path_meta_xml_file,
     return True
 
 
+# function to get the titles of a document
+def get_document_title_(path_meta_xml_file, list_attributes):
+    """Return the values of list_attributes from the ADS_TEXTEINHEIT element.
+
+    Attributes that are missing in the xml file are reported as '(empty)'.
+    """
+
+    # parse, get root and then part of interest
+    XML_tree = ET.parse(path_meta_xml_file)
+    XML_root = XML_tree.getroot()
+    XML_poi = XML_root[0].find('ADS_TEXTEINHEIT')
+
+    # without the element there is no title at all
+    if XML_poi is None:
+        return ['(empty)'] * len(list_attributes)
+
+    # get titles
+    return [XML_poi.attrib.get(attribute, '(empty)') for attribute in list_attributes]
+
+
 # function to get date and council
 def get_council_and_date(path_meta_xml_file):
 