diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index ba1cbd8c101db6dea6a6d967c1c5db84bbf07007..434c10d10f13f6f29be0d09073c44becd61b7bb0 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -86,7 +86,7 @@ for file_tarpath in files_to_process[66:]:
 
 
 # Commands to get the compressed version of the file
-#data/AB/${year}/02_extractedxml.tar.gz
+#data/AB/${year}/05_annotatedxml.tar.gz
 utils_proc.compress_tar(output_annotatedxml)
 
 
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 2d9310c9929e64998b0c5835ed6eddd1bafb1de6..a330b3fcd9cb5c8fa09a0f6c8231d3b8c966b558 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -94,7 +94,7 @@ def get_annotated_xml(XML_root, df_lastnames):
                         print(textbox.tag, textbox.attrib)
 
                         # get complete text of that textbox
-                        complete_text = get_complete_text(textbox)
+                        complete_text, ind_tl_colon = get_complete_text(textbox)
 
                         # identify and label language in XML
                         dict_lang = identify_language(complete_text)
@@ -109,7 +109,7 @@ def get_annotated_xml(XML_root, df_lastnames):
 
                             print(complete_text)
 
-                            XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, df_lastnames, list_stopwords, bln_print=False)
+                            XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False)
                             if this_is_speech:
                                 prev_is_speech = True
 #                                print('stopped after finding speech start')
@@ -166,6 +166,7 @@ def get_textbox_type(textbox):
 # - textbox
 # output:
 # - complete_text: string
+# - ind_tl_colon: index of the textline near the start of the textbox that contains a colon, -1 if there is none (used by label_speechstart)
 def get_complete_text(textbox):
 
     # helper function to get text without font information
@@ -177,16 +178,28 @@ def get_complete_text(textbox):
                 newtext += text[1:-1]
         #print(newtext)
         return newtext
+
     # initialize empty string
     complete_text = ''
 
+    # initialize index of the textline containing the colon to -1, meaning no colon found
+    ind_tl_colon = -1
+
     # for every textline in that textbox
     for ind_tl, textline in enumerate(textbox):
         if textline.tag == 'textline':
-            # append text to string
-            complete_text += get_text(textline.text)
+            # get that text
+            thattext = get_text(textline.text)
+
+            # append that text to string
+            complete_text += thattext
 
-    return complete_text
+            # check for a colon only in textlines at child index 0, 1 or 2 of the textbox
+            if ind_tl < 3:
+                if ':' in thattext:
+                    ind_tl_colon = ind_tl
+
+    return complete_text, ind_tl_colon
 
 
 # function to label speech starts
@@ -197,7 +210,7 @@ def get_complete_text(textbox):
 # - bln_print: whether to print during execution, default False
 # output:
 # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
-def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln_print=False):
+def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, bln_print=False):
 
     # initialize flag
     this_is_speech = False
@@ -211,7 +224,8 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln
     # very consistently, a speaker can be identified by looking for a colon
     # at the beginning of a textbox and identifiying a name or a role in front
     # of that colon
-    if ':' in text[:100]:
+    if ind_tl_colon >= 0:
+#    if ':' in text[:100]:
         # extract the index of the colon in the text
         colon_index_text = text.index(':')
 
@@ -264,7 +278,27 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln
 
             # add speaker to first textline
             XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, list_uniqueID, str_canton)
+
             # TODO: split speaker from text (check on which line and split that line accordingly)
+            # TODO: account for splitting of [font ...] ... [/font]
+            if ind_tl_colon == 0:
+                thattext = XML_new[ind_p][ind_t][0].text
+                colon_index = thattext.index(':')
+                try:
+                    XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1]
+                    XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:] + ' ' + XML_new[ind_p][ind_t][1].text
+                except:
+                    print('error in self.input_file when splitting speaker')
+                    pass
+            if ind_tl_colon == 1:
+                thattext = XML_new[ind_p][ind_t][1].text
+                colon_index = thattext.index(':')
+                XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1]
+                XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:]
+
+
+
+
 #            dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text],
 #                    text[colon_index_text+1:])