From babfb25f4befae6e64b7f279eeee51b959f40e2e Mon Sep 17 00:00:00 2001
From: Lili Gasser <gasserli@ethz.ch>
Date: Thu, 7 Feb 2019 10:27:15 +0000
Subject: [PATCH] get rid of print statements

---
 src/python/def_classes.py             | 14 +++++-----
 src/python/run_extract_discussions.py | 23 +++-------------
 src/python/utils_annot.py             | 38 ++++++++++++++++-----------
 3 files changed, 33 insertions(+), 42 deletions(-)

diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index cb2f6260..96897ce7 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -732,7 +732,7 @@ class Document:
         utils_proc.call_with_out(command)
 
 
-    def get_council_date(self, name_outmeta = '03_correctedmeta'):
+    def _get_council_date(self, name_outmeta = '03_correctedmeta'):
 
         if 'name_outmeta' not in self.__dict__.keys():
             self.name_outmeta = name_outmeta
@@ -750,7 +750,7 @@ class Document:
     # - dict_overlaps: dictionary with overlaps
     # output:
     # - (first_entry, last_entry): tuple of first and last textbox id
-    def get_first_last_textbox(self, dict_overlaps_year):
+    def _get_first_last_textbox(self, dict_overlaps_year):
 
         # initialize to impossible values
         first_entry = -1
@@ -781,7 +781,7 @@ class Document:
             self.name_outxml = name_outxml
 
         if 'XML_main_corr' not in self.__dict__.keys():
-            print('no main corr')
+            #print('no main corr')
             name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
             if os.path.isfile(name_tar):
                 name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml'
@@ -799,13 +799,13 @@ class Document:
                     #pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml,
                     #name_outcorrxml = self.name_outcorrxml)
 
-        print('we have a main corr XML file')
+        #print('we have a main corr XML file')
 
         # get council and date
-        self.get_council_date()
+        self._get_council_date()
 
         # get start and end of document
-        entries = self.get_first_last_textbox(self.dict_overlaps_year)
+        entries = self._get_first_last_textbox(self.dict_overlaps_year)
 
         # update if document starts/ends as on pdf
         if entries[0] == -1:
@@ -841,9 +841,7 @@ class Document:
 
         self.name_outannotxml = name_outannotxml
         self.name_annot_corr = [name_tar, name_xml]
-#        self._xml_ext(suffix_xml, self.name_outannotxml)
         command = 'rm -rf ./' + str(self.year)
-        #print(command)
         utils_proc.call_with_out(command)
 
         print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time)))
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 2ad33fdf..a7689b9b 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -9,16 +9,12 @@
 
 import pickle
 import time
-import xml.etree.ElementTree as ET
 
 import sys
 sys.path.append('src/python/')
 
 import def_classes as defc
 import utils_proc
-import utils_annot
-
-import os
 
 from utils_proc import call_with_out
 
@@ -35,7 +31,7 @@ input_overlaps = "data/lists/dict_overlaps.pickle"
 output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz"
 
 #%%
-# detect arguments
+# detect arguments from sh file
 input_lastnames = sys.argv[1]
 input_correctedxml = sys.argv[2]
 input_correctedmeta = sys.argv[3]
@@ -50,33 +46,22 @@ suffix_tar_correctedxml = input_correctedxml.split('/')[-1].split('.tar.gz')[0]
 #suffix_tar_correctedmeta = input_correctedmeta.split('/')[-1].split('.tar.gz')[0]
 year = input_correctedxml.split('/')[-2]
 folder_database = input_correctedxml.split(year)[0]
-suffix_correctedmeta = '_metacorr'
-#suffix_correctedxml = '_datacorr'
 
 #%%
-# TODO: make it work!
 # git lfs pull necessary data
+# does not work in atom
 for lfsfile in [input_correctedxml, input_correctedmeta, input_overlaps]:
     command = 'git lfs pull -I ' + lfsfile
     call_with_out(command)
-
-print(input_lastnames.split(year)[0])
     
 command = 'git lfs pull -I ' + input_lastnames.split(year)[0]
 call_with_out(command)   
     
 #%%
-# TODO: exclude overlaps --> after annotation
-
-
-#%%
-start_time_discussions = time.time()
-print('start to identify discussions of the year', year, '\n')
-
 # extract list of files
 files_to_process, _ = utils_proc.get_list(year, folder_database, suffix_tar_correctedxml)
 files_to_process.sort()
-print('files to process loaded:', files_to_process)
+print('files to process loaded')
 
 # open dataframe of last names from pickle file
 # (there is one file of lastnames per year)
@@ -112,7 +97,7 @@ for file_tarpath in files_to_process:
 
     # if document is a discussion
     if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
-        print(id_doc + '\n')
+        #print(id_doc + '\n')
         file_doc.df_lastnames = df_lastnames
         file_doc.list_notnames = list_notnames
         file_doc.dict_overlaps_year = dict_overlaps_year
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index ce4a8856..a6e4c9cf 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -431,8 +431,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                     # set flag
                     this_is_speech = True
                     if bln_print:
-                        print('found a name:', text_start, list_oi, str_name, str_role, '\n')
-                    print('found a name:', text_start, list_oi, ind_tl_colon, str_name, str_role, list_uniqueID, '\n')
+                        print('found a name:', text_start, list_oi, ind_tl_colon, str_name, str_role, list_uniqueID, '\n')
 
     return XML_new, this_is_speech
 
@@ -513,9 +512,10 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
     def get_string(term, df_names, str_name, list_uniqueID):
         # get name type
         name_type = df_names['nameType'].loc[df_names['shortName']==term].iloc[0]
-        if name_type != 'simple':
-            print(df_names[df_names['shortName']==term])
-        print(term, name_type)
+        if bln_print:
+            if name_type != 'simple':
+                print(df_names[df_names['shortName']==term])
+            print(term, name_type)
 
         # extract uniqueID and complete name for this term
         list_temp = []
@@ -528,7 +528,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
             list_temp = list(df_names.loc[(df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
             str_completeName = term + ' (CANTON MISSING)'
 
-        print(list_temp, str_completeName)
+        if bln_print:
+            print(list_temp, str_completeName)
 
         # set or update unique ID and name
         # if no unique ID and name has been assigned so far
@@ -686,7 +687,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
                 if term in list_:
                     str_canton = term
                     canton_type = type_
-                    print('!!! is a canton', term, list_oi, str_name, str_role)
+                    if bln_print:
+                        print('!!! is a canton', term, list_oi, str_name, str_role)
                     break
 
             # if person was not uniquely identified, check for misspellings
@@ -699,7 +701,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
                     if term_approx:
                         str_canton = term_approx
                         canton_type = type_
-                        print('!!! is a canton', term, list_oi, str_name, str_role)
+                        if bln_print:
+                            print('!!! is a canton', term, list_oi, str_name, str_role)
                         break
 
             # if a canton or similar was found
@@ -728,15 +731,18 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
 
 
                 else:
-                    print(canton_type, str_canton, str_name, df_temp)
+                    if bln_print:
+                        print(canton_type, str_canton, str_name, df_temp)
                     list_temp = list(df_temp.loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
                     str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0]
 
-                print(list_temp, list_uniqueID, str_completeName)
+                if bln_print:
+                    print(list_temp, list_uniqueID, str_completeName)
 
                 if len(list_temp) > 0:
                     list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
-                    print(str_completeName)
+                    if bln_print:
+                        print(str_completeName)
                     if 'CANTON MISSING' in str_completeName:
                         str_name = add_to_string('', str_completeName)
                     elif str_completeName.split(' ')[0] == str_name:
@@ -745,13 +751,15 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
                         str_name = add_to_string(str_name, str_completeName)
 
             else:
-                print('could not be identified as a canton:', term, list_oi, str_name, str_role)
+                if bln_print:
+                    print('could not be identified as a canton:', term, list_oi, str_name, str_role)
 
         # if term is first name
         # needed when people are referenced by FirstName LastName, e.g. Simon Kohler
         elif term in list_all_firstnames:
             str_firstname = term
-            print('found a first name', str_firstname)
+            if bln_print:
+                print('found a first name', str_firstname)
 
         # if term is not easily mistaken as a name (avoid false positives)
         elif term not in list_notnames:
@@ -780,7 +788,6 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
             # TODO check for false positives of these procedures
             if name_type == 'canton':
                 # check if person can be identified from firstname
-                print(str_firstname)
                 if str_firstname:
                     df_temp = df_names.loc[(df_names['shortName']==str_name.split(' ')[0]) & (df_names['FirstName']==str_firstname)]
                     if df_temp.shape[0] == 1:
@@ -831,7 +838,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
                         else:
                             str_name = add_to_string(str_name, str_completeName)
 
-                    print(str_date, df_temp.shape, df_temp_before.shape, df_temp_after.shape)
+                    if bln_print:
+                        print(str_date, df_temp.shape, df_temp_before.shape, df_temp_after.shape)
 
 
                 # TODO: function to update list unique ID and str_name
-- 
GitLab