From 82ea87c063e19a88e7565a78fc7bcfe24b52b45c Mon Sep 17 00:00:00 2001
From: Lilian Gasser <gasserli@ethz.ch>
Date: Wed, 5 Dec 2018 18:11:27 +0100
Subject: [PATCH] Add #%% cell markers and a single-file debug cell; disable argv parsing

---
 src/python/run_extract_discussions.py | 62 ++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 16 deletions(-)

diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 4e2c1531..0dc23f86 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+#%%
+# %load_ext autoreload  # IPython magic: run manually in the kernel; the bare form is a SyntaxError under python3
+# %autoreload 2
 
 import os
 import pickle
@@ -14,6 +17,7 @@ sys.path.append('src/python/')
 import def_classes as defc
 import utils_proc
 import utils_annot
+#%%
 
 # needed for running in atom, can be ignored
 input_lastnames = "data/politicians/lastnames/1891_lastnames.pickle"
@@ -22,19 +26,22 @@ input_correctedmeta = "data/AB/1891/03_correctedmeta.tar.gz"
 output_annotatedxml = "data/AB/1891/05_annotatedxml.tar.gz"
 
 # detect arguments
-input_lastnames = sys.argv[1]
-input_correctedxml = sys.argv[2]
-input_correctedmeta = sys.argv[3]
-output_annotatedxml = sys.argv[4]
+#input_lastnames = sys.argv[1]
+#input_correctedxml = sys.argv[2]
+#input_correctedmeta = sys.argv[3]
+#output_annotatedxml = sys.argv[4]
 
-# extract suffix, year, folder_database
+#%%
+
+# extract suffixes, year, folder_database
 suffix_tar_correctedxml = input_correctedxml.split('/')[-1].split('.tar.gz')[0]
 suffix_tar_correctedmeta = input_correctedmeta.split('/')[-1].split('.tar.gz')[0]
-year = (input_correctedxml.split('/')[-2])
+year = input_correctedxml.split('/')[-2]
 folder_database = input_correctedxml.split(year)[0]
 suffix_correctedmeta = '_metacorr'
 suffix_correctedxml = '_datacorr'
 
+#%%
 # print some output
 print(year)
 print(type(year))
@@ -42,7 +49,7 @@ print(input_correctedxml)
 print(input_lastnames)
 print(input_correctedmeta)
 print(folder_database)
-
+#%%
 # pull necessary data
 # TODO pull necessary data from here
 #!!git lfs pull origin -I input_correctedxml
@@ -54,7 +61,7 @@ print(folder_database)
 
 # get dictionary of discussions
 # -----------------------------
-
+#%%
 start_time_discussions = time.time()
 print('start to identify discussions of the year', year, '\n\n\n')
 
@@ -71,8 +78,7 @@ list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr'])
 list_stopwords.extend(stopwords.words('french'))
 list_stopwords.extend(['ils', 'les', 'celle'])
 
-# string for this year
-str_year = str(year)
+#%%
 # initialize empty dictionary for that year
 dict_year = {}
 
@@ -82,6 +88,7 @@ files_to_process.sort()
 print(files_to_process)
 
  #meta_files, _ = utils_proc.get_list(year, folder_database, suffix_tar_correctedmeta)
+#%%
 
 # open dataframe of last names from pickle file
 with open(input_lastnames, 'rb') as f:
@@ -89,25 +96,48 @@ with open(input_lastnames, 'rb') as f:
 
 print(df_lastnames)
 df_lastnames.columns
+#%%
 
 file_tarpath = './1891/20026440_datacorr.xml'
+
+file_number = file_tarpath.split('/')[-1][:8]
+metafile_tarpath = './{}/{}{}.xml'.format(year, file_number, suffix_correctedmeta)
+
+# instantiate document object (always from original pdf)
+infile_aux = year + '/' + file_number + '.pdf'
+file_doc = defc.Document(infile_aux, folder_database)
+file_doc
+
+
+if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']):
+    print(file_number + '\n')
+
+
+
+
+
+
+
+
+
+
+
+#%%
 # for each file
 for file_tarpath in files_to_process:
 #    print(file_tarpath)
     file_number = file_tarpath.split('/')[-1][:8]
-    metafile_tarpath = './{}/{}{}.xml'.format(str_year, file_number, suffix_correctedmeta)
 
     # instantiate document object (always from original pdf)
-    infile_aux = str_year + '/' + file_number + '.pdf'
-    file_doc = defc.Document(infile_aux, folder_database)
-
-
+    file_aux = year + '/' + file_number + '.pdf'
+    file_doc = defc.Document(file_aux, folder_database)
 
     # ----> CONTINUE HERE NEXT TIME!!!!!!
-    #path_number = path_data + str_year + '/' + number + '/'
     # if document is a discussion
     if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']):
         print(file_number + '\n')
+
+#%%
         ## get dictionary with text
         #dict_text = hf.get_text_onefile(path_number + number + '_datacorr.xml')
         ## exclude parts from previous and next document
-- 
GitLab