From 6dffcc06fa680cab337c94b778e9e6c04739c434 Mon Sep 17 00:00:00 2001
From: Lilian Gasser <gasserli@ethz.ch>
Date: Fri, 4 Jan 2019 10:01:48 +0100
Subject: [PATCH] WIP extract discussions from bash file

---
 src/python/run_extract_discussions.py | 67 ++++++++++++---------------
 src/sh/extract_discussions_yearly.sh  |  2 +-
 2 files changed, 31 insertions(+), 38 deletions(-)

diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 434c10d1..0a1fd5fd 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -3,8 +3,9 @@
 
 # Code to extract discussions from corrected XML files
 #%%
-%load_ext autoreload
-%autoreload 2
+# IPython magics are commented out so the file also runs as a plain Python script (e.g. from Atom)
+#%load_ext autoreload
+#%autoreload 2
 
 import pickle
 import time
@@ -18,6 +19,7 @@ import utils_proc
 import utils_annot
 
 #%%
+# specify input and output files
 
 # needed for running in atom, can be ignored
 input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle"
@@ -26,10 +28,10 @@ input_correctedmeta = "data/AB/1893/03_correctedmeta.tar.gz"
 output_annotatedxml = "data/AB/1893/05_annotatedxml.tar.gz"
 
 # detect arguments
-#input_lastnames = sys.argv[1]
-#input_correctedxml = sys.argv[2]
-#input_correctedmeta = sys.argv[3]
-#output_annotatedxml = sys.argv[4]
+input_lastnames = sys.argv[1]
+input_correctedxml = sys.argv[2]
+input_correctedmeta = sys.argv[3]
+output_annotatedxml = sys.argv[4]
 
 #%%
 # extract suffixes, year, folder_database
@@ -40,13 +42,6 @@ folder_database = input_correctedxml.split(year)[0]
 suffix_correctedmeta = '_metacorr'
 #suffix_correctedxml = '_datacorr'
 
-# print some output
-print(year)
-print(type(year))
-#print(input_lastnames)
-#print(input_correctedxml)
-#print(input_correctedmeta)
-print(folder_database)
 #%%
 # TODO pull necessary data from here
 #!!git lfs pull origin -I input_correctedxml
@@ -64,14 +59,16 @@ files_to_process.sort()
 print(files_to_process)
 
 # open dataframe of last names from pickle file
+# (there is one file of lastnames per year)
 with open(input_lastnames, 'rb') as f:
     df_lastnames = pickle.load(f)
 
-df_lastnames.columns
+print(df_lastnames.columns)
+
 #%%
 # for each file
 for file_tarpath in files_to_process[66:]:
-#    print(file_tarpath)
+    #print(file_tarpath)
     id_doc = file_tarpath.split('/')[-1][:8]
 
     # instantiate document object (always from original pdf)
@@ -93,30 +90,26 @@ utils_proc.compress_tar(output_annotatedxml)
 
 
 #%%
-
-file_tarpath = './1893/20026537_datacorr.xml'
-
-id_doc = file_tarpath.split('/')[-1][:8]
-metafile_tarpath = './{}/{}{}.xml'.format(year, id_doc, suffix_correctedmeta)
-
-# instantiate document object (always from original pdf)
-infile_aux = year + '/' + id_doc + '.pdf'
-file_doc = defc.Document(infile_aux, folder_database)
-
-if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
-    print(id_doc + '\n')
-
-    file_doc.df_lastnames = df_lastnames
-    file_doc.annotate_xml()
-
-    name_xml = 'data/AB/1893/id_doc_previewannotated-class.xml'
-    tree = ET.ElementTree(file_doc.XML_main_annot)
-    tree.write(name_xml, encoding = 'utf-8')
-
+## to test for one file
+#file_tarpath = './1893/20026538_datacorr.xml'
+#
+#id_doc = file_tarpath.split('/')[-1][:8]
+#
+## instantiate document object (always from original pdf)
+#infile_aux = year + '/' + id_doc + '.pdf'
+#file_doc = defc.Document(infile_aux, folder_database)
+#
+#if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
+    #print(id_doc + '\n')
+#
+    #file_doc.df_lastnames = df_lastnames
+    #file_doc.annotate_xml()
+#
+#
 #%%
 
 
 
-id_doc
+#id_doc
 
-len(files_to_process)
+#len(files_to_process)
diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh
index 4df0e302..9e8bef33 100755
--- a/src/sh/extract_discussions_yearly.sh
+++ b/src/sh/extract_discussions_yearly.sh
@@ -6,5 +6,5 @@ year_end=1891
 for year in $(seq $year_start $year_end)
 do
     echo $year
-    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/$year/04_correctedxml.tar.gz data/AB/$year/03_correctedmeta.tar.gz data/ data/AB/$year/05_annotatedxml.tar.gz
+    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/AB/${year}/05_annotatedxml.tar.gz
 done
-- 
GitLab