From 8473096a602923d39a826cf28bce071e81023f14 Mon Sep 17 00:00:00 2001
From: Lilian Gasser <gasserli@ethz.ch>
Date: Tue, 22 Jan 2019 17:02:27 +0100
Subject: [PATCH] clean utils_annot

---
 src/python/utils_annot.py | 96 ++++++++++++++++-----------------------
 1 file changed, 39 insertions(+), 57 deletions(-)

diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 3b1ab135..b19c0640 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -224,6 +224,14 @@ def get_complete_text(textbox):
 # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
 def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, bln_print=False):
 
+    # lists of roles
+    list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
+                  'Berichterstatter', 'Berichterstatterin', 'rapporteur',
+                  'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
+                  'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
+                  'Vizepräsident']
+    list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
+
     # initialize flag
     this_is_speech = False
 
@@ -234,7 +242,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
     # at the beginning of a textbox and identifiying a name or a role in front
     # of that colon
     if ind_tl_colon >= 0:
-#    if ':' in text[:100]:
         # extract the index of the colon in the text
         colon_index_text = text.index(':')
 
@@ -259,12 +266,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
 
         # remove single characters
         # TODO: might need to be changed for fractions (some fractions are abbreviated as single letters)
+        # TODO: maybe exempt 'I' and 'A' from this filter to account for Appenzell
         list_oi = [term for term in list_oi if len(term)>1]
 
-#        # for every term
-#        for term in list_oi:
-        # if possible, find a name in a list
-        str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, df_names, list_notnames, bln_print=False)
+        # if possible, find a name from the list
+        str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False)
         if bln_print:
             print('name', str_name, 'role', str_role)
 
@@ -300,14 +306,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                 thattext = XML_new[ind_p][ind_t][0].text
                 colon_index = thattext.index(':')
 
-#                print(thattext)
-
                 try:
                     # write speaker to first line
                     XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend
 
                     # get start of speech with correct font start
-#                    print(thattext[colon_index+1:])
                     if thattext[colon_index+1:].startswith('[font'):
                         startspeech = thattext[colon_index+1:]
                     elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
@@ -317,8 +320,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                     else:
                         startspeech = thattext[colon_index+1:]
 
-#                    print(startspeech)
-
                     # write beginning of speech to second line
                     # (create new ET element if necessary)
                     if len(list(XML_new[ind_p][ind_t])) > 1:
@@ -343,7 +344,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                 colon_index = thattext.index(':')
 
                 # get start of speech with correct font start
-#                    print(thattext[colon_index+1:])
                 if thattext[colon_index+1:].startswith('[font'):
                     startspeech = thattext[colon_index+1:]
                 elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
@@ -424,9 +424,9 @@ def flatten(l):
 # - list_uniqueID: list with one or several uniqueIDs
 # - list_tupels: list of tupels containing all types of names
 # TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
-def find_names(list_oi, df_names, list_notnames, bln_print=False):
+def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False):
 
-    def get_string(term, str_name, str_role, list_uniqueID, str_canton):
+    def get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton):
         name_type = ''
         # if it is one of the simple names
         if term in list(df_names['name_short'].loc[df_names['type']=='simple']):
@@ -457,8 +457,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
             str_name = add_to_string(str_name, correct_name)
             name_type = 'comp'
         # if it contains a canton
-        # !!! also pass list_oi to look for canton
-        # !!! how to handle for people mentioned in text???
+        # TODO: how to handle people mentioned in the text?
         elif term in list(df_names['name_short'].loc[df_names['type']=='canton']):
             if bln_print:
                 print('contains a canton', term)
@@ -480,46 +479,35 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
                 print(list_temp, list_uniqueID)
                 print(type(list_temp), type(list_uniqueID))
                 print(isinstance(list_uniqueID, list))
-            # if no unique ID has been assigned so far
-            if len(list_uniqueID) == 0:
-                list_uniqueID = list_temp
-            # if there are already one or several people and have a new person, we update
-            elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
-                list_uniqueID.append(list_temp)
-
-            ## if we already have several possible people, e.g. because of canton
-            #elif isinstance(int_uniqueID, tuple):
-                #print('I should be here')
-                ## and refound the uniqueID of one of those, don't update
-                #if temp in int_uniqueID:
-                    #pass
-                ## and update if we don't have that uniqueID yet
-                #else:
-                    #int_uniqueID = (int_uniqueID, temp)
-            ## if a person with that uniqueID exists already, don't update
-            #elif isinstance(int(int_uniqueID), int) and temp == int_uniqueID:
-                #print('but end up here.. not even.....')
-                #pass
-            ## if a different unique ID has been assigned already
-            #else:
-                #int_uniqueID = (int_uniqueID, temp)
+
+            list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
 
         return str_name, str_role, list_uniqueID, name_type
 
+    def update_list_uniqueID(list_uniqueID, list_temp):
+        # if no unique ID has been assigned so far
+        if len(list_uniqueID) == 0:
+            list_uniqueID = list_temp
+        # if one or several people are already assigned and list_temp shares no ID with them, append it
+        elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
+            list_uniqueID.append(list_temp)
+
+        return list_uniqueID
+
     # function to find correct term (in case of misspellings, etc.)
-    def get_approximate_term(term, array_all_names):
+    def get_approximate_term(term, array_all):
         # TODO: probably need to improve this procedure
         #       - find better values ....
 
         # initialize string
         term_approx = ''
 
-        # get normalize array
-        array_normalized = array_all_names[normalized_damerau_levenshtein_distance_ndarray(term, array_all_names) <= 0.35]
+        # get normalized array
+        array_normalized = array_all[normalized_damerau_levenshtein_distance_ndarray(term, array_all) <= 0.35]
         array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized)
 
         # get absolute array
-        array_absolute = array_all_names[damerau_levenshtein_distance_ndarray(term, array_all_names) <= 2]
+        array_absolute = array_all[damerau_levenshtein_distance_ndarray(term, array_all) <= 2]
         array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute)
         if bln_print:
             print(term)
@@ -560,14 +548,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
     str_canton = ''
     name_type = ''
 
-    # lists of roles
-    list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
-                  'Berichterstatter', 'Berichterstatterin', 'rapporteur',
-                  'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
-                  'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
-                  'Vizepräsident']
-    list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
-
     # extract list and array of last names
     list_all_names = list(df_names['name_short'])
     array_all_names = np.array(df_names['name_short'])
@@ -580,18 +560,19 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
 
         if term in list_roles:
             # get correct name and uniqueID, or role, for that term
-            str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
+            str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
 
             if bln_print:
                 print('found a role', term)
 
             # TODO: also look for similar terms (misspellings)
+            # TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter
 
         elif term in list_roles_ext:
             pass
             # TODO: extract whether it is minority or majority and save that information
 
-        # can not happen for the first term
+        # cannot happen for the first term
         elif name_type == 'canton':
             list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
             canton_type = ''
@@ -614,7 +595,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
 
             else:
                 print('might be a canton', term, list_oi, str_name, str_role)
-                # TODO: maybe: go to next elif?
 
             # if a canton or similar was found
             if canton_type:
@@ -630,7 +610,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
                     try:
                         if len(list_cities) == 1:
                             str_citizenship = list_cities[0]
-                    # except:
+                    except:
                         print('found no or more than one person with citizenship', str_canton, str_name)
                         pass
 
@@ -639,7 +619,9 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
                 else:
                     list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
                 print(list_temp, list_uniqueID)
-                list_uniqueID = list_temp
+
+                if len(list_temp) > 0:
+                    list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
 
         # if term is not easily mistaken as a name (avoid false positives)
         elif term not in list_notnames:
@@ -647,7 +629,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
             # if term is in the list of all names
             if term in list_all_names:
                 # get correct name and uniqueID, or role, for that term
-                str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
+                str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
 
                 if bln_print:
                     print('=== correct name', term)
@@ -659,7 +641,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
 
                 # if one was found, get correct name, etc.
                 if term_approx:
-                    str_name, str_role, list_uniqueID, name_type = get_string(term_approx, str_name, str_role, list_uniqueID, str_canton)
+                    str_name, str_role, list_uniqueID, name_type = get_string(term_approx, df_names, str_name, str_role, list_uniqueID, str_canton)
                     if bln_print:
                         print('=== approximate name', str_name, term_approx)
 
-- 
GitLab