diff --git a/.gitattributes b/.gitattributes index 322c7e41fa0cb7d9a137a0bbd8e7aa2ca252c288..bcb7c4d4d23b73c05031572b9e29f12153e4b397 100644 --- a/.gitattributes +++ b/.gitattributes @@ -254,6 +254,7 @@ data/AB/1983/02_extractedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1991/02_extractedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1991/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/lists/dict_overlaps.pickle filter=lfs diff=lfs merge=lfs -text +data/lists/all_titles.csv filter=lfs diff=lfs merge=lfs -text data/AB/1978/02_extractedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1979/02_extractedxml.tar.gz filter=lfs diff=lfs merge=lfs -text data/AB/1975/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/.renku/workflow/d7f5834807064a1f8ed8026be247a3be_python.cwl b/.renku/workflow/d7f5834807064a1f8ed8026be247a3be_python.cwl new file mode 100644 index 0000000000000000000000000000000000000000..1226793d34017c9012dd77961743a84a3740a1d3 --- /dev/null +++ b/.renku/workflow/d7f5834807064a1f8ed8026be247a3be_python.cwl @@ -0,0 +1,60 @@ +arguments: [] +baseCommand: +- python +class: CommandLineTool +cwlVersion: v1.0 +hints: [] +inputs: + input_1: + default: + class: File + path: ../../src/python/extract_document_titles.py + inputBinding: + position: 1 + separate: true + shellQuote: true + streamable: false + type: File + input_2: + default: + class: Directory + listing: [] + path: ../../data/AB + inputBinding: + position: 2 + separate: true + shellQuote: true + streamable: false + type: Directory + input_3: + default: 03_correctedmeta + inputBinding: + position: 3 + separate: true + shellQuote: true + streamable: false + type: string + input_4: + default: data/lists/all_titles.csv + inputBinding: + position: 4 + separate: true + shellQuote: true + streamable: false + type: string +outputs: + output_0: + outputBinding: + glob: $(inputs.input_4) + streamable: false + type: File 
def get_document_title(self, list_attributes, name_outmeta = '03_correctedmeta'):
    """Read the titles of this document from its corrected meta file.

    The result is stored in ``self.list_titles``; nothing is returned.

    Parameters
    ----------
    list_attributes : list of str
        Names of the title attributes to read, in the order in which the
        values should appear in ``self.list_titles``.
    name_outmeta : str
        Name of the archive holding the corrected meta files.  Only used
        when the instance has no ``name_outmeta`` attribute yet.
    """

    # an archive name already set on the instance wins over the argument
    if 'name_outmeta' not in self.__dict__:
        self.name_outmeta = name_outmeta

    # NOTE(review): presumably unpacks the meta file below ./<year> in the
    # working directory (that is the folder removed afterwards) - confirm
    # against utils_proc.tar_extractfile
    utils_proc.tar_extractfile(self.name_meta_corr[1], self.folder_database, name_file = self.name_outmeta)

    try:
        self.list_titles = utils_annot.get_document_title_(self.name_meta_corr[1], list_attributes)
    finally:
        # remove the extracted folder even when reading the titles fails,
        # otherwise a single broken meta file leaves ./<year> behind
        command = 'rm -rf ./' + str(self.year)
        utils_proc.call_with_out(command)
# Collect the titles of all documents and save them in one csv file.
# Usage:
#   renku run --isolation python src/python/extract_document_titles.py data/AB/ 03_correctedmeta data/lists/all_titles.csv
# Arguments: database folder, name of the corrected meta archive (without
# '.tar.gz'), path of the csv file to write.

import csv
import os
import sys

import utils_proc
import def_classes as defc

# years of interest (both bounds are included)
years = [1891, 1995]
range_years = range(years[0], years[1] + 1)

# specify input and output parameters
folder_database = sys.argv[1]
suffix_correctedmeta = sys.argv[2]
output_titles = sys.argv[3]

# columns of the csv file; everything after 'year' and 'number' is a title
# attribute that is read from the meta file of each document
list_attributes = ['year', 'number', 'TITEL_NORMAL_DE', 'TITEL_NORMAL_FR', 'TITEL_ORIGINAL_DE', 'TITEL_ORIGINAL_FR']

# list of all rows, starting with the header
list_all_titles = [list_attributes]

# for each year
for year in range_years:
    print(year)

    # fetch the archive of that year from git lfs; os.path.join keeps the
    # path valid whether or not folder_database ends with a slash
    input_file = os.path.join(folder_database, str(year), suffix_correctedmeta + '.tar.gz')
    command = 'git lfs pull -I ' + input_file
    utils_proc.call_with_out(command)

    # get list of files to process and sort that list
    files_to_process, _ = utils_proc.get_list(str(year), folder_database, suffix_correctedmeta)
    files_to_process.sort()

    # for each file
    for file_tarpath in files_to_process:
        # the document id is the first 8 characters of the file name
        id_doc = file_tarpath.split('/')[-1][:8]

        # instantiate document object (always from original pdf)
        file_aux = str(year) + '/' + id_doc + '.pdf'
        file_doc = defc.Document(file_aux, folder_database)

        # read the title attributes; the result lands in file_doc.list_titles
        file_doc.get_document_title(list_attributes[2:])

        # one row per document: year, id, then the titles
        list_all_titles.append([str(year), id_doc] + list(file_doc.list_titles))

# save csv file; newline='' is required by the csv module so that the writer
# controls the line endings itself, utf-8 keeps accented titles intact on
# every platform
with open(output_titles, 'w', newline='', encoding='utf-8') as fo:
    wr = csv.writer(fo, dialect='excel')
    wr.writerows(list_all_titles)
# function to get the titles of a document
def get_document_title_(path_meta_xml_file, list_attributes):
    """Read title attributes from the meta xml file of a document.

    The attributes are looked up on the ``ADS_TEXTEINHEIT`` element found
    directly below the first child of the xml root.

    Parameters
    ----------
    path_meta_xml_file : str
        Path of the meta xml file to parse.
    list_attributes : list of str
        Names of the attributes to read.

    Returns
    -------
    list of str
        One entry per requested attribute, in the same order.  An attribute
        that is not present is reported as ``'(empty)'``; the same holds for
        every attribute when the file has no ``ADS_TEXTEINHEIT`` element.

    Errors raised by ``ET.parse`` (unreadable or malformed file) are not
    caught here.
    """

    # parse and get root
    XML_root = ET.parse(path_meta_xml_file).getroot()

    # part of interest; a root without children or a first child without
    # ADS_TEXTEINHEIT is treated like an element that has no attributes
    XML_poi = XML_root[0].find('ADS_TEXTEINHEIT') if len(XML_root) else None
    attributes = {} if XML_poi is None else XML_poi.attrib

    # get titles
    return [attributes.get(attribute, '(empty)') for attribute in list_attributes]