diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 32c85bf240e4bfce0b82ba578541110b8a1198fa..495b23ceb7f8b38e2849c5d16119e76cf177d48f 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -4,8 +4,8 @@
 # Code to extract discussions from corrected XML files
 #%%
 # to work with atom
-#%load_ext autoreload
-#%autoreload 2
+%load_ext autoreload
+%autoreload 2
 
 import pickle
 import time
@@ -26,11 +26,13 @@ from utils_proc import call_with_out
 # specify input and output files
 
 # needed for running in atom, can be ignored
-input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle"
-input_correctedxml = "data/AB/1893/04_correctedxml.tar.gz"
-input_correctedmeta = "data/AB/1893/03_correctedmeta.tar.gz"
-output_annotatedxml = "data/AB/1893/05_annotatedxml.tar.gz"
+year = '1891'
+input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle"
+input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
+input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
+output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz"
 
+#%%
 # detect arguments
 input_lastnames = sys.argv[1]
 input_correctedxml = sys.argv[2]
@@ -55,7 +57,7 @@ for lfsfile in [input_correctedxml, input_correctedmeta, input_rawmeta]:
     #print(command)
     call_with_out(command)
 
-
+#%%
 # TODO: exclude overlaps --> after annotation
 
 
@@ -93,6 +95,7 @@ for file_tarpath in files_to_process:
         file_doc.annotate_xml()
 
 # Commands to get the compressed version of the file
+# (compressed file is around 5 times smaller than uncompressed file)
 #data/AB/${year}/05_annotatedxml.tar.gz
 utils_proc.compress_tar(output_annotatedxml)
 
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 8f8fc03144ac406791d549efaa2098c7a97cf472..e41d934ac7914234ab6fae1f425123999971c413 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -49,9 +49,17 @@ def check_if_discussion(path_meta_xml_file,
 
     return True
 
-
+# helper function to get text without font information
+# example for font information: [font face="11.718" size="Times-Roman"] sometext [/font]
+# input:
+# - sometext: string
+# output:
+# - newtext: modified string
 def get_text(sometext):
+    # initialize
     newtext = ''
+
+    # find text between font information
     for text in re.findall('\].*?\[',sometext):
         #print(text)
         if text.startswith(']') and text.endswith('['):
@@ -59,6 +67,7 @@ def get_text(sometext):
     #print(newtext)
     return newtext
 
+
 # function to annotated corrected XML
 def get_annotated_xml(XML_root, df_lastnames, bln_print=False):
 
@@ -112,9 +121,6 @@ def get_annotated_xml(XML_root, df_lastnames, bln_print=False):
 
                         if textbox_texttype in ['text_col1', 'text_col2']:
 
-                            if bln_print:
-                                print(complete_text)
-
                             XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False)
                             if this_is_speech:
                                 prev_is_speech = True
@@ -146,7 +152,6 @@ def get_textbox_type(textbox):
     # for every textline in that textbox
     for ind_tl, textline in enumerate(textbox):
         if textline.tag == 'textline':
-#            print(textline.tag, textline.attrib)
 
             # count types
             if textline.attrib['type'] not in dict_type.keys():
@@ -154,16 +159,17 @@ def get_textbox_type(textbox):
             else:
                 dict_type[textline.attrib['type']] += 1
 
-#    print(dict_type)
     # list of all types with maximum count
     list_types = [type for type, count in dict_type.items() if count == max(dict_type.values())]
-#    print(list_types)
+
     # if only one with maximum value
     if len(list_types) == 1:
         textbox_type = list_types[0]
+
     # if several with same maximum value
     else:
         textbox_type = 'notdistinct'
+
     return textbox_type
 
 
@@ -227,6 +233,9 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
     list_uniqueID = []
     str_canton = ''
 
+    # closing font tag; appended to the speaker line after it is cut at the colon
+    fontend = '[/font]'
+
     # very consistently, a speaker can be identified by looking for a colon
     # at the beginning of a textbox and identifiying a name or a role in front
     # of that colon
@@ -286,31 +295,69 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
             # add attribute speech_start to textbox
             XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start'
 
-            # add speaker to first textline
+            # add speaker as attribute to first textline
             XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, list_uniqueID, str_canton)
 
-            # TODO: split speaker from text (check on which line and split that line accordingly)
-            # TODO account for splitting of [font ...] ... [/font]
+            # update text of XML (speaker is on first line, actual speech start on second line of speech_start textbox)
+            # if colon is on first line
             if ind_tl_colon == 0:
+                # get text of that line and colon index
                 thattext = XML_new[ind_p][ind_t][0].text
                 colon_index = thattext.index(':')
+
+                # get the last opening [font ...] tag found in thattext (prepended to the speech part if it lacks one)
+                fontstart = re.findall('\[font.*?\]', thattext)[-1]
+
                 try:
-                    XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1]
-                    XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:] + ' ' + XML_new[ind_p][ind_t][1].text
+                    # write speaker to first line
+                    XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend
+
+                    # get start of speech with correct font start
+                    if thattext[colon_index+1:].startswith('[font'):
+                        startspeech = thattext[colon_index+1:]
+                    elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+                        startspeech = ''
+                    else:
+                        startspeech = fontstart + thattext[colon_index+1:]
+
+                    # write beginning of speech to second line
+                    # (create new ET element if necessary)
+                    if len(list(XML_new[ind_p][ind_t])) > 1:
+                        XML_new[ind_p][ind_t][1].text = startspeech + ' ' + XML_new[ind_p][ind_t][1].text
+                    else:
+                        XML_new[ind_p][ind_t].append(copy.deepcopy(XML_new[ind_p][ind_t][0]))
+                        XML_new[ind_p][ind_t][1].attrib.pop('speaker')
+                        XML_new[ind_p][ind_t][1].text = startspeech
                 except:
                     print('error in self.input_file when splitting speaker')
+                    #print(thattext)
+                    #print(len(list(XML_new[ind_p][ind_t])))
+                    #print(list(XML_new[ind_p][ind_t]))
+                    #print(XML_new[ind_p][ind_t])
+                    #print('gefundener Name:', str_name, str_role)
                     pass
+
+            # if colon is on second line
             if ind_tl_colon == 1:
+                # get text of that line and colon index
                 thattext = XML_new[ind_p][ind_t][1].text
                 colon_index = thattext.index(':')
-                XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1]
-                XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:]
-
 
+                # get the last opening [font ...] tag found in thattext (prepended to the speech part if it lacks one)
+                fontstart = re.findall('\[font.*?\]', thattext)[-1]
 
+                # get start of speech with correct font start
+                if thattext[colon_index+1:].startswith('[font'):
+                    startspeech = thattext[colon_index+1:]
+                elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+                    startspeech = ''
+                else:
+                    startspeech = fontstart + thattext[colon_index+1:]
 
-#            dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text],
-#                    text[colon_index_text+1:])
+                # write speaker to first line
+                XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1] + fontend
+                # write beginning of speech to second line
+                XML_new[ind_p][ind_t][1].text = startspeech
 
             # set flag
             this_is_speech = True
@@ -488,8 +535,8 @@ def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bl
                   'Bundesrat', 'Bundesrath', 'BundesrÃ¤tin', 'conseiller fÃ©dÃ©ral',
                   'VizeprÃ¤sident']
 
-    list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'Gallen', 'StGallen',
-                     'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter']
+    list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'Gallen', 'StGallen',
+                     'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter', 'Biffer', 'biffer', 'RÃ©diger', 'rÃ©diger', 'Wer', 'FÃ¤llen']
 
     list_places = get_list_cantons(df_names)