clean utils_annot

8473096a · Lili Gasser · 135db555 · 8473096a
Commit 8473096a authored 6 years ago by Lili Gasser
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -224,6 +224,14 @@ def get_complete_text(textbox):
 # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
 def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, bln_print=False):

+    # lists of roles
+    list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
+                  'Berichterstatter', 'Berichterstatterin', 'rapporteur',
+                  'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
+                  'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
+                  'Vizepräsident']
+    list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
+
    # initialize flag
    this_is_speech = False

@@ -234,7 +242,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
    # at the beginning of a textbox and identifying a name or a role in front
    # of that colon
    if ind_tl_colon >= 0:
-#    if ':' in text[:100]:
        # extract the index of the colon in the text
        colon_index_text = text.index(':')

@@ -259,12 +266,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_

        # remove single characters
        # TODO: might need to be changed for fractions (some fractions are abbreviated as single letters)
+        # TODO: maybe exclude I and A to account for Appenzell
        list_oi = [term for term in list_oi if len(term)>1]

-#        # for every term
-#        for term in list_oi:
-        # if possible, find a name in a list
-        str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, df_names, list_notnames, bln_print=False)
+        # if possible, find a name from the list
+        str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False)
        if bln_print:
            print('name', str_name, 'role', str_role)

@@ -300,14 +306,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                thattext = XML_new[ind_p][ind_t][0].text
                colon_index = thattext.index(':')

-#                print(thattext)
-
                try:
                    # write speaker to first line
                    XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend

                    # get start of speech with correct font start
-#                    print(thattext[colon_index+1:])
                    if thattext[colon_index+1:].startswith('[font'):
                        startspeech = thattext[colon_index+1:]
                    elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
@@ -317,8 +320,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                    else:
                        startspeech = thattext[colon_index+1:]

-#                    print(startspeech)
-
                    # write beginning of speech to second line
                    # (create new ET element if necessary)
                    if len(list(XML_new[ind_p][ind_t])) > 1:
@@ -343,7 +344,6 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                colon_index = thattext.index(':')

                # get start of speech with correct font start
-#                    print(thattext[colon_index+1:])
                if thattext[colon_index+1:].startswith('[font'):
                    startspeech = thattext[colon_index+1:]
                elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
@@ -424,9 +424,9 @@ def flatten(l):
 # - list_uniqueID: list with one or several uniqueIDs
 # - list_tupels: list of tuples containing all types of names
 # TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
-def find_names(list_oi, df_names, list_notnames, bln_print=False):
+def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False):

-    def get_string(term, str_name, str_role, list_uniqueID, str_canton):
+    def get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton):
        name_type = ''
        # if it is one of the simple names
        if term in list(df_names['name_short'].loc[df_names['type']=='simple']):
@@ -457,8 +457,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
            str_name = add_to_string(str_name, correct_name)
            name_type = 'comp'
        # if it contains a canton
-        # !!! also pass list_oi to look for canton
-        # !!! how to handle for people mentioned in text???
+        # TODO: how should people who are merely mentioned in the text be handled?
        elif term in list(df_names['name_short'].loc[df_names['type']=='canton']):
            if bln_print:
                print('contains a canton', term)
@@ -480,46 +479,35 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
                print(list_temp, list_uniqueID)
                print(type(list_temp), type(list_uniqueID))
                print(isinstance(list_uniqueID, list))
-            # if no unique ID has been assigned so far
-            if len(list_uniqueID) == 0:
-                list_uniqueID = list_temp
-            # if there are already one or several people and have a new person, we update
-            elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
-                list_uniqueID.append(list_temp)
-
-            ## if we already have several possible people, e.g. because of canton
-            #elif isinstance(int_uniqueID, tuple):
-                #print('I should be here')
-                ## and refound the uniqueID of one of those, don't update
-                #if temp in int_uniqueID:
-                    #pass
-                ## and update if we don't have that uniqueID yet
-                #else:
-                    #int_uniqueID = (int_uniqueID, temp)
-            ## if a person with that uniqueID exists already, don't update
-            #elif isinstance(int(int_uniqueID), int) and temp == int_uniqueID:
-                #print('but end up here.. not even.....')
-                #pass
-            ## if a different unique ID has been assigned already
-            #else:
-                #int_uniqueID = (int_uniqueID, temp)
+
+            list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)

        return str_name, str_role, list_uniqueID, name_type

+    def update_list_uniqueID(list_uniqueID, list_temp):  # merge newly found IDs (list_temp) into list_uniqueID and return the result
+        # nothing has been assigned so far: take over list_temp as it is
+        if len(list_uniqueID) == 0:
+            list_uniqueID = list_temp  # rebinds to the same list object, no copy is made
+        # IDs are already assigned and none of the new IDs is among them: add the new ones
+        elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
+            list_uniqueID.append(list_temp)  # appended in place as one nested element, so the caller's list object changes too
+
+        return list_uniqueID  # returned unchanged when list_temp overlaps the (flattened) IDs already present
+
    # function to find correct term (in case of misspellings, etc.)
-    def get_approximate_term(term, array_all_names):
+    def get_approximate_term(term, array_all):
        # TODO: probably need to improve this procedure
        #       - find better values ....

        # initialize string
        term_approx = ''

-        # get normalize array
-        array_normalized = array_all_names[normalized_damerau_levenshtein_distance_ndarray(term, array_all_names) <= 0.35]
+        # get normalized array
+        array_normalized = array_all[normalized_damerau_levenshtein_distance_ndarray(term, array_all) <= 0.35]
        array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized)

        # get absolute array
-        array_absolute = array_all_names[damerau_levenshtein_distance_ndarray(term, array_all_names) <= 2]
+        array_absolute = array_all[damerau_levenshtein_distance_ndarray(term, array_all) <= 2]
        array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute)
        if bln_print:
            print(term)
@@ -560,14 +548,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
    str_canton = ''
    name_type = ''

-    # lists of roles
-    list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
-                  'Berichterstatter', 'Berichterstatterin', 'rapporteur',
-                  'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
-                  'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
-                  'Vizepräsident']
-    list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
-
    # extract list and array of last names
    list_all_names = list(df_names['name_short'])
    array_all_names = np.array(df_names['name_short'])
@@ -580,18 +560,19 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):

        if term in list_roles:
            # get correct name and uniqueID, or role, for that term
-            str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
+            str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)

            if bln_print:
                print('found a role', term)

            # TODO: also look for similar terms (misspellings)
+            # TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter

        elif term in list_roles_ext:
            pass
            # TODO: extract whether it is minority or majority and save that information

-        # can not happen for the first term
+        # cannot happen for the first term
        elif name_type == 'canton':
            list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
            canton_type = ''
@@ -614,7 +595,6 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):

            else:
                print('might be a canton', term, list_oi, str_name, str_role)
-                # TODO: maybe: go to next elif?

            # if a canton or similar was found
            if canton_type:
@@ -630,7 +610,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
                    try:
                        if len(list_cities) == 1:
                            str_citizenship = list_cities[0]
-                    # except:
+                    except:
                        print('found no or more than one person with citizenship', str_canton, str_name)
                        pass

@@ -639,7 +619,9 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
                else:
                    list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
                print(list_temp, list_uniqueID)
-                list_uniqueID = list_temp
+
+                if len(list_temp) > 0:
+                    list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)

        # if term is not easily mistaken as a name (avoid false positives)
        elif term not in list_notnames:
@@ -647,7 +629,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):
            # if term is in the list of all names
            if term in list_all_names:
                # get correct name and uniqueID, or role, for that term
-                str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
+                str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)

                if bln_print:
                    print('=== correct name', term)
@@ -659,7 +641,7 @@ def find_names(list_oi, df_names, list_notnames, bln_print=False):

                # if one was found, get correct name, etc.
                if term_approx:
-                    str_name, str_role, list_uniqueID, name_type = get_string(term_approx, str_name, str_role, list_uniqueID, str_canton)
+                    str_name, str_role, list_uniqueID, name_type = get_string(term_approx, df_names, str_name, str_role, list_uniqueID, str_canton)
                    if bln_print:
                        print('=== approximate name', str_name, term_approx)