From e6452cd569a86ce9d0a4fd64cded7cf88f3ae9b4 Mon Sep 17 00:00:00 2001
From: Lilian Gasser <gasserli@ethz.ch>
Date: Tue, 5 Feb 2019 18:49:11 +0100
Subject: [PATCH] add document titles

---
 src/python/def_classes.py             | 14 ++++++
 src/python/extract_document_titles.py | 64 +++++++++++++++++++++++++++
 src/python/utils_annot.py             | 20 +++++++++
 3 files changed, 98 insertions(+)
 create mode 100644 src/python/extract_document_titles.py

diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index f8e07833..cb2f6260 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -718,6 +718,20 @@ class Document:
 
         return flag_discussion
 
+
+    def get_document_title(self, list_attributes, name_outmeta = '03_correctedmeta'):
+        """Store the titles given by list_attributes in self.list_titles."""
+        # name_outmeta is only set if the object does not have it yet
+        if 'name_outmeta' not in vars(self):
+            self.name_outmeta = name_outmeta
+
+        path_meta = self.name_meta_corr[1]
+        utils_proc.tar_extractfile(path_meta, self.folder_database, name_file = self.name_outmeta)
+        self.list_titles = utils_annot.get_document_title_(path_meta, list_attributes)
+        # delete ./<year> again (presumably created by the extraction above)
+        utils_proc.call_with_out('rm -rf ./' + str(self.year))
+
+
     def get_council_date(self, name_outmeta = '03_correctedmeta'):
 
         if 'name_outmeta' not in self.__dict__.keys():
diff --git a/src/python/extract_document_titles.py b/src/python/extract_document_titles.py
new file mode 100644
index 00000000..3efe540f
--- /dev/null
+++ b/src/python/extract_document_titles.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Code to get the titles of all documents and save them in a csv file
+# to run:
+# renku run --isolation python src/python/extract_document_titles.py data/AB/ 03_correctedmeta data/lists/all_titles.csv
+
+import sys
+import csv
+
+import utils_proc
+import def_classes as defc
+
+# first and last year of interest, both are included in the range
+years = [1891, 1995]
+range_years = range(years[0], years[-1] + 1)
+
+# command line parameters: database folder, suffix of the corrected meta
+# files and path of the csv file to write
+folder_database = sys.argv[1]
+suffix_correctedmeta = sys.argv[2]
+output_titles = sys.argv[3]
+
+# columns of the csv file: year and number, followed by the title attributes
+list_attributes = ['year', 'number', 'TITEL_NORMAL_DE', 'TITEL_NORMAL_FR', 'TITEL_ORIGINAL_DE', 'TITEL_ORIGINAL_FR']
+
+# the list of all titles starts with the header row
+list_all_titles = [list_attributes]
+
+# go through the years one by one
+for year in range_years:
+    print(year)
+    str_year = str(year)
+
+    # pull the archive of that year from git lfs
+    input_file = folder_database + str_year + '/' + suffix_correctedmeta + '.tar.gz'
+    command = 'git lfs pull -I ' + input_file
+    utils_proc.call_with_out(command)
+
+    # files of that year, to be processed in sorted order
+    files_to_process, _ = utils_proc.get_list(str_year, folder_database, suffix_correctedmeta)
+    files_to_process.sort()
+
+    # go through the files one by one
+    for file_tarpath in files_to_process:
+        # the id is given by the first 8 characters of the file name
+        id_doc = file_tarpath.rsplit('/', 1)[-1][:8]
+
+        # the document object is always instantiated from the original pdf
+        name_pdf = str_year + '/' + id_doc + '.pdf'
+        file_doc = defc.Document(name_pdf, folder_database)
+
+        # get the titles, i.e. all attributes after year and number
+        file_doc.get_document_title(list_attributes[2:])
+
+        # row of that document: year and id, followed by its titles
+        row_titles = [str_year, id_doc]
+        row_titles.extend(file_doc.list_titles)
+        list_all_titles.append(row_titles)
+
+# save csv file; newline='' is required by the csv module, utf-8 for the titles
+with open(output_titles, 'w', newline='', encoding='utf-8') as fo:
+    wr = csv.writer(fo, dialect='excel')
+    wr.writerows(list_all_titles)
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 7d8ce805..ce4a8856 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -50,6 +50,26 @@ def check_if_discussion(path_meta_xml_file,
 
     return True
 
+# function to get the titles of a document
+def get_document_title_(path_meta_xml_file, list_attributes):
+    """Return the titles of a document as a list, one entry per attribute.
+
+    The attributes named in list_attributes are read from the
+    ADS_TEXTEINHEIT element below the first child of the XML root. An
+    attribute (or the whole element) that is missing yields '(empty)'.
+    """
+    # parse, get root and then part of interest
+    XML_root = ET.parse(path_meta_xml_file).getroot()
+    XML_poi = XML_root[0].find('ADS_TEXTEINHEIT')
+
+    # find() returns None if there is no such element: treat it like an
+    # element without any title instead of failing on None.attrib
+    attributes = XML_poi.attrib if XML_poi is not None else {}
+
+    # get titles, in the order of list_attributes
+    return [attributes.get(attribute, '(empty)') for attribute in list_attributes]
+
+
 # function to get date and council
 def get_council_and_date(path_meta_xml_file):
 
-- 
GitLab