WIP trying to split speaker from text

59b1071d · Lili Gasser · ae40dd41 · 59b1071d · 59b1071d
Commit 59b1071d authored 6 years ago by Lili Gasser
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -86,7 +86,7 @@ for file_tarpath in files_to_process[66:]:
 # Commands to get the compressed version of the file
-#data/AB/${year}/02_extractedxml.tar.gz
+#data/AB/${year}/05_annotatedxml.tar.gz
 utils_proc.compress_tar(output_annotatedxml)

--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -94,7 +94,7 @@ def get_annotated_xml(XML_root, df_lastnames):
                        print(textbox.tag, textbox.attrib)
                        # get complete text of that textbox
-                        complete_text = get_complete_text(textbox)
+                        complete_text, ind_tl_colon = get_complete_text(textbox)
                        # identify and label language in XML
                        dict_lang = identify_language(complete_text)
@@ -109,7 +109,7 @@ def get_annotated_xml(XML_root, df_lastnames):
                            print(complete_text)
-                            XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, df_lastnames, list_stopwords, bln_print=False)
+                            XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False)
                            if this_is_speech:
                                prev_is_speech = True
 #                                print('stopped after finding speech start')
@@ -166,6 +166,7 @@ def get_textbox_type(textbox):
 # - textbox
 # output:
 # - complete_text: string
+# - ind_tl_colon: index of the textline containing a colon, -1 if none was found (needed to label the speech start)
 def get_complete_text(textbox):
    # helper function to get text without font information
@@ -177,16 +178,28 @@ def get_complete_text(textbox):
                newtext += text[1:-1]
        #print(newtext)
        return newtext
    # initialize empty string
    complete_text = ''
+    # initialize index of textline colon to impossible value
+    ind_tl_colon = -1
    # for every textline in that textbox
    for ind_tl, textline in enumerate(textbox):
        if textline.tag == 'textline':
-            # append text to string
+            # get that text
-            complete_text += get_text(textline.text)
+            thattext = get_text(textline.text)
+            # append that text to string
+            complete_text += thattext
-    return complete_text
+            # only within the first three entries of the textbox (ind_tl 0-2), check for colon
+            if ind_tl < 3:
+                if ':' in thattext:
+                    ind_tl_colon = ind_tl
+    return complete_text, ind_tl_colon
 # function to label speech starts
@@ -197,7 +210,7 @@ def get_complete_text(textbox):
 # - bln_print: whether to print during execution, default False
 # output:
 # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
-def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln_print=False):
+def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, bln_print=False):
    # initialize flag
    this_is_speech = False
@@ -211,7 +224,8 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln
    # very consistently, a speaker can be identified by looking for a colon
    # at the beginning of a textbox and identifying a name or a role in front
    # of that colon
-    if ':' in text[:100]:
+    if ind_tl_colon >= 0:
+#    if ':' in text[:100]:
        # extract the index of the colon in the text
        colon_index_text = text.index(':')
@@ -264,7 +278,27 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln
            # add speaker to first textline
            XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, list_uniqueID, str_canton)
            # TODO: split speaker from text (check on which line and split that line accordingly)
+            # TODO account for splitting of [font ...] ... [/font]
+            if ind_tl_colon == 0:
+                thattext = XML_new[ind_p][ind_t][0].text
+                colon_index = thattext.index(':')
+                try:
+                    XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1]
+                    XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:] + ' ' + XML_new[ind_p][ind_t][1].text
+                except:
+                    print('error in self.input_file when splitting speaker')
+                    pass
+            if ind_tl_colon == 1:
+                thattext = XML_new[ind_p][ind_t][1].text
+                colon_index = thattext.index(':')
+                XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1]
+                XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:]
 #            dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text],
 #                    text[colon_index_text+1:])