diff --git a/src/python/bla_tryreadxml.py b/src/python/bla_tryreadxml.py
index dcee82d9c9b6c507aafaa81df11b0b16335c710f..dfeaa6f14cdb06002c1cc87e67ef2b40fd53f7d1 100644
--- a/src/python/bla_tryreadxml.py
+++ b/src/python/bla_tryreadxml.py
@@ -1,14 +1,58 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #%%
+%load_ext autoreload
+%autoreload 2
+
 import xml.etree.ElementTree as ET
 import re
+import pickle
+import string
+from nltk.corpus import stopwords
+from nltk.tokenize import RegexpTokenizer
+import copy
+
+import sys
+sys.path.append('src/python/')
+import utils_annot
 
-xml_file = '/home/lili/Downloads/1891/20026448_datacorr.xml'
+# NOTE(review): '-/w+' looks like a typo for '-\w+'; as written, hyphenated terms are split at the hyphen - confirm before changing
+tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+')
+xml_file = 'data/AB/1893/1893/20026528_datacorr.xml'
+input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle"
 
 XML_tree = ET.parse(xml_file)
 XML_root = XML_tree.getroot()
 
+# list of german and french stopwords, extended by a few titles and abbreviations
+list_stopwords = stopwords.words('german')
+list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr'])
+list_stopwords.extend(stopwords.words('french'))
+list_stopwords.extend(['ils', 'les', 'celle'])
+
+# add a few terms to list_stopwords that are easily mistaken for last names
+list_stopwords.extend(['art', 'rath', 'alinea', 'stimmen', 'stimme', 'hans', 'walter', 'werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'gallen', 'stgallen',
+                       'kasse', 'fasse', 'sitten', 'herren', 'herr', 'alter'])
+
+# list of votation terms; NOTE(review): terms containing parentheses or an apostrophe can presumably never equal a token of the tokenizer above - confirm
+# TODO: make it work for accented characters (é, etc.)
+list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt',
+                      'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)',
+                      'Votation', 'Vote', 'votation', #'(Adoptï¿½s)', 'adoptï¿½s', 'adoptï¿½e', 'rejetï¿½e',
+                      "D'accord", 'voix']
+
+# open dataframe of last names from pickle file (pickle must only be loaded from trusted files)
+with open(input_lastnames, 'rb') as f:
+    df_lastnames = pickle.load(f)
+
+#%%
+# create new XML as a deep copy of the corrected one, so that XML_root stays unchanged
+XML_new = copy.deepcopy(XML_root)
+
+# initialize flags that are updated while looping over the textboxes
+this_is_speech = False
+prev_is_speech = False
+this_is_vote = False
 
 # for every page
 for ind_p, page in enumerate(XML_root):
@@ -20,12 +64,34 @@ for ind_p, page in enumerate(XML_root):
                 if (textbox.attrib['type_textbox'] == 'text'):
                     print(textbox.tag, textbox.attrib)
 
-                    # for every textline in that textbox
-                    for ind_tl, textline in enumerate(textbox):
-                        if textline.tag == 'textline':
-                            print(textline.tag, textline.attrib)
+                    textbox_texttype = get_textbox_type(textbox)
+                    print(textbox_texttype)
+
+                    if textbox_texttype in ['text_col1', 'text_col2']:
+
+                        complete_text = get_complete_text(textbox)
+                        print(complete_text)
+                        XML_new, this_is_speech = label_speechstart(XML_new, complete_text, df_lastnames, list_stopwords, bln_print=False)
+                        if this_is_speech:
+                            prev_is_speech = True
+                            print('stopped after finding speech start')
+                            continue
+                        XML_new, this_is_vote = label_votations(XML_new, complete_text, list_votationterms, bln_print=False)
+                        if this_is_vote:
+                            prev_is_speech = False
+                            print('stopped after finding vote')
+                            continue
+                        if prev_is_speech and (not this_is_vote):
+                            XML_new = label_speechcont(XML_new)
+
+        print('go to next textbox \n')
+
+# write the annotated XML to file
+name_xml = 'data/AB/1893/id_doc_previewannotated.xml'
+tree = ET.ElementTree(XML_new)
+tree.write(name_xml, encoding = 'utf-8')
+
 
-                            print(get_text(textline.text))
 
 #%%
 sometext = '[font face="8.071" size="Times-Bold"]FÃ¼r  die  Bedaktion  verantwortlich :[/font][font face="7.973" size="Times-BoldItalic"] Sud.  SdMarst[/font][font face="8.071" size="Times-Bold"]  â€”â€¢  Druck  und Expedition  von[/font][font face="7.973" size="Times-BoldItalic"]  Jmi  Ã¨  Ã‰eineft[/font][font face="8.071" size="Times-Bold"]  fa[/font][font face="7.973" size="Times-BoldItalic"]  SeÃ¬rit. [/font]'
@@ -42,7 +108,193 @@ def get_text(sometext):
 get_text(sometext)
 #%%
 
+# helper function to get the type of a textbox
+# corresponds to the majority vote over the types of its textlines
+# input:
+# - textbox: XML element whose children tagged 'textline' carry a 'type' attribute
+# output:
+# - textbox_type: string, the most frequent textline type, or 'notdistinct'
+#   if there is a tie for the maximum or no typed textline at all
+def get_textbox_type(textbox):
+
+    # count how often every textline type occurs
+    dict_type = {}
+    for textline in textbox:
+        # only textline children are relevant for the vote
+        if textline.tag != 'textline':
+            continue
+        # textlines without a type cannot take part in the vote
+        str_type = textline.attrib.get('type')
+        if str_type is None:
+            continue
+        dict_type[str_type] = dict_type.get(str_type, 0) + 1
+
+    # no typed textline: nothing to vote on
+    if not dict_type:
+        return 'notdistinct'
+
+    # the maximum is computed once, not once per candidate type
+    max_count = max(dict_type.values())
+    list_types = [str_type for str_type, count in dict_type.items() if count == max_count]
+
+    # a unique maximum decides the type, a tie is reported as 'notdistinct'
+    if len(list_types) == 1:
+        return list_types[0]
+    return 'notdistinct'
+#%%
+
+# helper function to get the complete text of a textbox
+# input:
+# - textbox: XML element whose 'textline' children hold font-tagged text
+# output:
+# - complete_text: string, text of all textlines without font information
+def get_complete_text(textbox):
+
+    # helper function to get text without font information: the text of
+    # interest sits between a closing ']' and the next opening '[', e.g.
+    # '[font ...]some text[/font]' yields 'some text'
+    def get_text(sometext):
+        # a textline without text contributes nothing
+        if not sometext:
+            return ''
+        # raw string: '\]' and '\[' are regex escapes, not string escapes
+        return ''.join(re.findall(r'\](.*?)\[', sometext))
+
+    # collect the cleaned text of every textline of that textbox
+    list_text = []
+    for textline in textbox:
+        # other children than textlines carry no text of interest
+        if textline.tag == 'textline':
+            list_text.append(get_text(textline.text))
+
+    # joining once avoids repeated string concatenation
+    return ''.join(list_text)
+
+
+#%%
+
+# function to label speech starts
+# relies on the module-level tokenizer and on the loop indices ind_p, ind_t
+# of the calling script to address the textbox in XML_new
+# input:
+# - XML_new: XML tree to be annotated
+# - text: string to be analyzed
+# - df_names: dataframe of politicians
+# - list_stopwords: list of german and french stopwords
+# - bln_print: whether to print the found name, default False
+# output:
+# - XML_new: updated
+# - this_is_speech: True if a speaker name or role was found, False otherwise
+def label_speechstart(XML_new, text, df_names, list_stopwords, bln_print=False):
+
+    # initialize strings, ID and flag (the flag must exist on every return path)
+    str_name = ''
+    str_role = ''
+    int_uniqueID = int(0)
+    str_canton = ''
+    this_is_speech = False
+
+    # very consistently, a speaker can be identified by looking for a colon
+    # at the beginning of a textbox and identifying a name or a role in front
+    # of that colon
+    if ':' in text[:100]:
+        # extract the index of the colon in the text
+        colon_index_text = text.index(':')
+
+        # look at first few terms of that textbox
+        text_start = re.sub(r'[\(\)]','',text[:colon_index_text])
+        list_oi = tokenizer.tokenize(text_start)
+        print('possible speech start: ', list_oi)
+
+        # remove stopwords
+        list_oi = [term for term in list_oi if term.lower() not in list_stopwords]
+
+        # remove punctuation
+        list_oi = [''.join(c for c in s if c not in string.punctuation) for s in list_oi]
+        list_oi = [s for s in list_oi if s]
+
+        # remove numbers
+        list_oi = [term for term in list_oi if not term.isdigit()]
+
+        # remove single characters
+        list_oi = [term for term in list_oi if len(term)>1]
+
+        # for every term, reversed finds canton before it finds name
+        for term in reversed(list_oi):
+            # if possible, find a name in a list
+            str_name, str_role, int_uniqueID, str_canton = utils_annot.find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=True)
+        print('name', str_name, 'role', str_role)
+
+        # get rid of the announcement that the president does not vote (german and french)
+        if set(str_role.split()).intersection(set(['PrÃ¤sident', 'PrÃ¤sidentin', 'PrÃ©sident', 'PrÃ©sidente'])) and not str_name:
+            if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi):
+                print('get rid of PrÃ¤sident stimmt nicht, PrÃ©sident ne vote pas', list_oi)
+                str_role = ''
+
+        # get rid of the listing of votes for a proposal ('... Antrag <Name> stimmen:', 'Votent pour la proposition <Name>:')
+        if str_name:
+            if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1:
+                print('get rid of FÃ¼r den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi)
+                str_name = ''
+
+        # if a name has been found, add it to XML_new
+        if str_name or str_role:
+            # add attribute speech_start to textbox
+            XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start'
+
+            # add speaker to first textline
+            # NOTE(review): the value is a tuple, not a string - confirm it can be serialized when the XML is written
+            XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, int_uniqueID, str_canton)
+            # TODO: split speaker from text (check on which line and split that line accordingly)
+#            dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text],
+#                    text[colon_index_text+1:])
+
+            # set flag
+            this_is_speech = True
+            if bln_print:
+                print('found a name:', list_oi, str_name, str_role, '\n')
+
+    return XML_new, this_is_speech
+# %%
+
+# function to label votation paragraphs
+# !!! error prone, possible improvements see notebook extract_discussions
+# relies on the module-level tokenizer and on the loop indices ind_p, ind_t
+# of the calling script to address the textbox in XML_new
+# input:
+# - XML_new: XML tree to be annotated
+# - text: string to be analyzed
+# - list_votationterms: list of votation terms
+# - bln_print: whether to print during execution, default True
+# output:
+# - XML_new: updated
+# - this_is_vote: True if one of the first terms is a votation term
+def label_votations(XML_new, text, list_votationterms, bln_print=True):
+
+    # only the first 15 terms of the text are considered
+    list_oi = tokenizer.tokenize(text)[:15]
+
+    # any overlap with the typical votation terms counts as a vote
+    set_overlap = set(list_oi) & set(list_votationterms)
+    this_is_vote = bool(set_overlap)
+
+    # label the textbox only in case of a vote
+    if this_is_vote:
+        XML_new[ind_p][ind_t].attrib['text_type'] = 'vote'
+
+    # report the decision together with the inspected terms
+    if bln_print:
+        str_message = 'found a vote:' if this_is_vote else 'not a vote'
+        print(str_message, list_oi)
+
+    return XML_new, this_is_vote
+
+#%%
+# function to label the continuation of a speech, the textbox is addressed
+# via the loop indices ind_p, ind_t of the calling script
+# input: XML_new (XML tree to be annotated), output: XML_new (updated)
+def label_speechcont(XML_new):
+    # the current textbox is marked as speech continuation
+    XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_cont'
 
-                        # for every text (actually just a letter)
-#                            for ind_ch, text in enumerate(textline):
-#                                print(ind_ch, text.text)   #, len(tex0tline), len(XML_new[ind_p][ind_t][ind_tl]))
+    return XML_new
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index afe3f906d197b500d5f2317e974b888178689967..379fb44ac7257b7fcd6a1760b31af3cf11c34401 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -176,7 +176,7 @@ class Document:
     def correct_xml(self, flag_plots = 1, flag_parallel = 0, flag_save_figs = 1,
                     pages = 'all', suffix_xml = '_data', name_outxml = '02_extractedxml',
                     name_outcorrxml = '04_correctedxml', flag_save = 1):
-        
+
         if 'name_outxml' not in self.__dict__.keys():
             self.name_outxml = name_outxml
 
@@ -204,8 +204,8 @@ class Document:
 
         XML_new = ET.Element('pages')
 
-        for ind_abs, ind_page in enumerate(self.n_pages): 
-            
+        for ind_abs, ind_page in enumerate(self.n_pages):
+
             XML_root = ET.Element('pages')
             #print(ind_abs,len(self.XML_main))
             XML_root.append(self.XML_main[ind_abs])
@@ -237,7 +237,7 @@ class Document:
                 # Top and bottom line
                 ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32),
                                                coord_horz.astype(np.uint32))
-                #print(info_font_sizes)                                      
+                #print(info_font_sizes)
                 #####
                 # Label the textboxes based on a set of simple rules that make use of
                 # the margins and the fontsizes
@@ -263,7 +263,7 @@ class Document:
                     im_met2 = plot_tools.plot_horzvertlines(imarray_textblock, coord_horz, coord_vert_def)
                     im_met2 = plot_tools.plot_margins(im_met2, margins, ind_limits, gap_line = 1)
                     im_met3, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines)
-                    im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1])    
+                    im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1])
                     im_met5 = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page)
 
                     # Create figure with 4 subplots, for showing all results
@@ -308,13 +308,13 @@ class Document:
         name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml'
         tree = ET.ElementTree(XML_new)
         tree.write(name_xml, encoding = 'utf-8')
-        
+
         if flag_save:
             name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outcorrxml)
         else:
             print('Not saving to tar')
-            name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz'        
-        
+            name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz'
+
         self.name_outcorrxml = name_outcorrxml
         self.name_xml_corr = [name_tar, name_xml]
         self._xml_ext(suffix_xml, self.name_outcorrxml)
@@ -330,7 +330,7 @@ class Document:
     def _plot_generic_open(self, ind_page, suffix_xml, level_proc = 0,
                            name_outxml = '02_extractedxml'):
         # ind_page has to be a scalar
-        
+
         if 'imgobj' not in self.__dict__.keys():
             self.pdf2imgobj()
         if 'XML_main' not in self.__dict__.keys():
@@ -342,8 +342,8 @@ class Document:
                     XML_tree = ET.parse(h_xml)
                     self.XML_main = XML_tree.getroot()
             else:
-                self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)   
-            ind_abs = np.array([ind_page]).astype(int).reshape((-1,))  
+                self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)
+            ind_abs = np.array([ind_page]).astype(int).reshape((-1,))
         else:
             #print('Run this')
             self._get_pages()
@@ -351,11 +351,11 @@ class Document:
 
         #print(ind_abs, type(ind_abs))
         #print(self.XML_main, len(self.imgobj))
-        
+
         if ind_page > (len(self.XML_main) - 1):
             flag_error = 1
             return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error
-        
+
         XML_root = ET.Element('pages')
         XML_root.append(self.XML_main[ind_abs[0]])
         imarray = np.array(self.imgobj[ind_page])
@@ -392,32 +392,32 @@ class Document:
 
         if level_proc > 4:
             label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \
-                preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes)   
-            
-        if level_proc > 5:            
-            set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, 
-                                                                   list_allcoords_textlines, margins)            
-            
-        if level_proc > 6:            
-            XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, 
-                                                rescale_factor, centrall_ord, ind_page, dim_img)        
-        
+                preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes)
+
+        if level_proc > 5:
+            set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz,
+                                                                   list_allcoords_textlines, margins)
+
+        if level_proc > 6:
+            XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline,
+                                                rescale_factor, centrall_ord, ind_page, dim_img)
+
         # The last value returned is only to say that there was not any error during the execution. Before, if there are too many pages, we
         # send a 1 instead
         flag_error = 0
         return imarray, margins, ind_limits, label_textlines, list_allcoords_textlines, \
             set_of_blocks, XML_enrich, bbox_page, XML_root, ind_abs, flag_error
-            
+
     def _plot_obtainfromxml(self, ind_page, suffix_xml, name_outcorrxml = '04_correctedxml'):
-        
+
         if 'imgobj' not in self.__dict__.keys():
             self.pdf2imgobj()
-        if 'XML_main_corr' not in self.__dict__.keys():            
+        if 'XML_main_corr' not in self.__dict__.keys():
             name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz'
             if os.path.isfile(name_tar):
                 name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml'
                 #print(name_xml)
-                if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]:    
+                if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]:
                     #print('Run this')
                     h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml)
                     XML_tree = ET.parse(h_xml)
@@ -426,13 +426,13 @@ class Document:
                     print('You need to have the tar file to use flag_compute = 0!')
                     flag_error = 1
                     return 0, 0, 0, 0, 0, 0, flag_error
-                    #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)   
+                    #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)
             else:
                 print('You need to have the tar file to use flag_compute = 0!')
                 flag_error = 1
                 return 0, 0, 0, 0, 0, 0, flag_error
-                #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)   
-            ind_abs = np.array([ind_page]).astype(int).reshape((-1,))  
+                #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)
+            ind_abs = np.array([ind_page]).astype(int).reshape((-1,))
         else:
             #print('Run this')
             self._get_pages()
@@ -440,19 +440,19 @@ class Document:
 
         #print(ind_abs, type(ind_abs))
         #print(self.XML_main, len(self.imgobj))
-        
+
         if ind_page > (len(self.XML_main_corr) - 1):
             flag_error = 1
             return 0, 0, 0, 0, 0, 0, flag_error
-        
+
         XML_root = ET.Element('pages')
         XML_root.append(self.XML_main_corr[ind_abs[0]])
         imarray = np.array(self.imgobj[ind_page])
-        
+
         bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64)
-        dim_img = imarray.shape[:2]        
-        
-        ######        
+        dim_img = imarray.shape[:2]
+
+        ######
         # For obtaining label_textlines, list_allcoords_textlines
         coord_textline = np.array([]).reshape((4,0))
         label_textlines = dict()
@@ -465,7 +465,7 @@ class Document:
                     if 'type' in XML_root[0][ind_el][ind_line].attrib:
                         coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64)
                         coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1)
-                    
+
                         type_textl = XML_root[0][ind_el][ind_line].attrib['type']
                         #print(ind_el)
                         if XML_root[0][ind_el].attrib['type_textbox'] == 'line':
@@ -482,18 +482,18 @@ class Document:
                             aux_type = np.array([count])
                             label_textlines[type_textl] = aux_type
                         count += 1
-        
+
         coord_textline, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, coord_textline, bbox_page)
-        
-        ##### 
+
+        #####
         # To obtain set_of_blocks. This variable simply contains the coordinates, and
-        # then a final row indicating the order (here are already ordered), and if it 
-        # is a line, which is indicated with a -1        
+        # then a final row indicating the order (here are already ordered), and if it
+        # is a line, which is indicated with a -1
         set_of_blocks_aux = np.concatenate((coord_textline, np.array(vec_textline_lines).reshape((1,-1))), axis = 0)
         set_of_blocks = dict()
         set_of_blocks[0] = set_of_blocks_aux
         #print(set_of_blocks.shape)
-                        
+
         # The last is the flag_error
         #print(imarray.shape, len(label_textlines), coord_textline.shape, len(set_of_blocks),
         #     len(XML_root), bbox_page.shape)
@@ -501,7 +501,7 @@ class Document:
         return imarray, label_textlines, coord_textline, set_of_blocks, XML_root, bbox_page, flag_error
 #                        imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error
 #                        imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error
-                        
+
 
 
     def plot_orig_textb(self, range_pages = range(1), suffix_xml = '_data',
@@ -519,8 +519,8 @@ class Document:
             else:
                 imarray_textblock, _ = self._draw_textbl(imarray = imarray, XML_root = XML_root)
 
-                self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, 
-                                flag_plot, flag_save_figs)   
+                self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file,
+                                flag_plot, flag_save_figs)
 
             self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file,
                             flag_plot, flag_save_figs)
@@ -537,24 +537,24 @@ class Document:
                                                         name_outxml = self.name_outxml)
             if flag_error:
                 print(str(ind_page) + ': non existing page!')
-            else:            
+            else:
                 im_met = plot_tools.plot_margins(imarray, margins, ind_limits, gap_line = 1)
 
                 self._plot_save(im_met, 'Page margins', 'Margins', ind_page, self.path_file,
-                       flag_plot, flag_save_figs)             
-                
-    def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data', 
+                       flag_plot, flag_save_figs)
+
+    def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data',
                          flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml',
                          name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_legend = 1):
-        
+
         if 'name_outxml' not in self.__dict__.keys():
-            self.name_outxml = name_outxml     
+            self.name_outxml = name_outxml
         if 'name_outcorrxml' not in self.__dict__.keys():
-            self.name_outcorrxml = name_outcorrxml              
-        
+            self.name_outcorrxml = name_outcorrxml
+
         name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
         for ind_page in range_pages:
-            if flag_compute or not os.path.isfile(name_tar): 
+            if flag_compute or not os.path.isfile(name_tar):
                 imarray, _, _, label_textlines, list_allcoords_textlines, _, _, _, _, _, flag_error = \
                     self._plot_generic_open(ind_page, suffix_xml, level_proc = 5,
                                             name_outxml = self.name_outxml)
@@ -562,70 +562,70 @@ class Document:
             else:
                 imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)
                 #print(len(array_elements))
-                
+
             if flag_error:
                 print(str(ind_page) + ': non existing page!')
-            else:             
-                im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines)     
+            else:
+                im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines)
                 self._plot_save_labels(im_met, 'Textlines labelled', 'TextlLabel', ind_page, groups, colors, self.path_file,
                                        flag_plot, flag_save_figs, flag_legend)
-                            
-                
-    def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data', 
+
+
+    def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data',
                            flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml',
                            name_outcorrxml = '04_correctedxml', flag_compute = 0):
-        
+
         if 'name_outxml' not in self.__dict__.keys():
             self.name_outxml = name_outxml
         if 'name_outcorrxml' not in self.__dict__.keys():
-            self.name_outcorrxml = name_outcorrxml            
-        
+            self.name_outcorrxml = name_outcorrxml
+
         name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
         for ind_page in range_pages:
-            if flag_compute or not os.path.isfile(name_tar):            
+            if flag_compute or not os.path.isfile(name_tar):
                 imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error = \
                     self._plot_generic_open(ind_page, suffix_xml, level_proc = 6,
                                             name_outxml = self.name_outxml)
-            else: 
+            else:
                 imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error \
-                    = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)                   
-                                    
+                    = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)
+
             #print(set_of_blocks)
             if flag_error:
                 print(str(ind_page) + ': non existing page!')
-            else:            
-                im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) 
+            else:
+                im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1])
 
                 self._plot_save(im_met, 'Textlines ordered', 'TextlOrder', ind_page, self.path_file,
-                       flag_plot, flag_save_figs)            
-                
-    def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data', 
+                       flag_plot, flag_save_figs)
+
+    def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data',
                         flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml',
                         name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_lines_textl = 1):
         # flag_lines_textl, if 1, plots lines and textboxes, if 2, only lines, if 3, only textboxes
         if 'name_outxml' not in self.__dict__.keys():
             self.name_outxml = name_outxml
         if 'name_outcorrxml' not in self.__dict__.keys():
-            self.name_outcorrxml = name_outcorrxml              
-        
+            self.name_outcorrxml = name_outcorrxml
+
         name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
         for ind_page in range_pages:
-            if flag_compute or not os.path.isfile(name_tar):             
+            if flag_compute or not os.path.isfile(name_tar):
                 imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error = \
                     self._plot_generic_open(ind_page, suffix_xml, level_proc = 7,
                                             name_outxml = self.name_outxml)
-            else:                    
+            else:
                 imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_enrich, bbox_page, flag_error \
-                    = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)                  
-            
+                    = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)
+
             if flag_error:
                 print(str(ind_page) + ': non existing page!')
-            else:            
+            else:
                 im_met = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page, flag_lines_textl)
 
                 self._plot_save(im_met, 'XML corrected', 'XMLcorrect', ind_page, self.path_file,
                        flag_plot, flag_save_figs)
-        
+
     def _plot_save(self, im_met, str_title, str_name, ind_page, folder_save = '',
                    flag_plot = 1, flag_save_figs = 0, dpi = 200):
         if flag_plot:
@@ -638,8 +638,8 @@ class Document:
             name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc)
                         + '_page' + str(ind_page) + '.' + format_fig)
             fig.savefig(name_fig, format = format_fig, dpi = dpi)
-            plt.close(fig)       
-            
+            plt.close(fig)
+
     def _plot_save_labels(self, im_met, str_title, str_name, ind_page, groups, colors, folder_save = '',
                            flag_plot = 1, flag_save_figs = 0, flag_legend = 1, dpi = 200):
         #print(groups)
@@ -654,17 +654,17 @@ class Document:
                     coords[0] = in_coord
                     coords[1] += int(im_met.shape[1]/1.5)
                     coords[2] = in_coord + 10
-                    coords[3] += int(im_met.shape[1]/1.5)                    
-                im_met = plot_tools.lines_box(im_met, coords, colors[ind_g], thick_line = 6)        
+                    coords[3] += int(im_met.shape[1]/1.5)
+                im_met = plot_tools.lines_box(im_met, coords, colors[ind_g], thick_line = 6)
                 coords[0] += inc_page
                 coords[2] += inc_page
-        
+
         if flag_plot:
             fig, axes = plt.subplots(1, 1, figsize=(8, 10))
             axes.axis('off')
-            axes.imshow(im_met) 
+            axes.imshow(im_met)
             plt.title(str_title)
-        
+
         if flag_legend:
             coords = in_coord + np.array([0, 0, 10, 10])
             flag_notinto = 1
@@ -674,20 +674,20 @@ class Document:
                     coords[0] = in_coord
                     coords[1] += int(im_met.shape[1]/1.5)
                     coords[2] = in_coord + 10
-                    coords[3] += int(im_met.shape[1]/1.5)                    
-                plt.text(coords[1] + 10, coords[2], i_g, fontsize = 10, va = 'bottom', ha = 'left')      
+                    coords[3] += int(im_met.shape[1]/1.5)
+                plt.text(coords[1] + 10, coords[2], i_g, fontsize = 10, va = 'bottom', ha = 'left')
                 coords[0] += inc_page
-                coords[2] += inc_page   
-        
+                coords[2] += inc_page
+
         if flag_save_figs:
             format_fig = 'png'
-            name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) 
+            name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc)
                         + '_page' + str(ind_page) + '.' + format_fig)
             fig.savefig(name_fig, format = format_fig, dpi = dpi)
-            plt.close(fig)             
-            
-        
-                           
+            plt.close(fig)
+
+
+
 
 
     def check_discussion(self):
@@ -732,6 +732,7 @@ class Document:
         print('we have a main corr XML file here')
         #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
         XML_corr = utils_annot.get_text_corrected(self.XML_main_corr)
+        self.XML_corr = XML_corr
 
 
 
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 37ca9694c284a5c9bdbde04f189188147e31c6a6..b5cde5d7b26a4758a412c9174a23877f0c5cac27 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -20,10 +20,10 @@ import utils_annot
 #%%
 
 # needed for running in atom, can be ignored
-input_lastnames = "data/politicians/lastnames/1891_lastnames.pickle"
-input_correctedxml = "/home/lili/Downloads/1891/04_correctedxml.tar.gz"
-input_correctedmeta = "data/AB/1891/03_correctedmeta.tar.gz"
-output_annotatedxml = "data/AB/1891/05_annotatedxml.tar.gz"
+input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle"
+input_correctedxml = "data/AB/1893/04_correctedxml.tar.gz"
+input_correctedmeta = "data/AB/1893/03_correctedmeta.tar.gz"
+output_annotatedxml = "data/AB/1893/05_annotatedxml.tar.gz"
 
 # detect arguments
 #input_lastnames = sys.argv[1]
@@ -98,21 +98,21 @@ with open(input_lastnames, 'rb') as f:
 df_lastnames.columns
 #%%
 
-file_tarpath = './1891/20026448_datacorr.xml'
+file_tarpath = './1893/20026528_datacorr.xml'
 
-file_number = file_tarpath.split('/')[-1][:8]
-metafile_tarpath = './{}/{}{}.xml'.format(year, file_number, suffix_correctedmeta)
+id_doc = file_tarpath.split('/')[-1][:8]
+metafile_tarpath = './{}/{}{}.xml'.format(year, id_doc, suffix_correctedmeta)
 
 # instantiate document object (always from original pdf)
-infile_aux = year + '/' + file_number + '.pdf'
+infile_aux = year + '/' + id_doc + '.pdf'
 file_doc = defc.Document(infile_aux, folder_database)
-file_doc
-
+file_doc.name_xml_corr
 
 if (file_doc.check_discussion()) and (file_number not in ['20032463', '20032952', '20014332']):
-    print(file_number + '\n')
+    print(id_doc + '\n')
 
     file_doc.annotate_speakers()
+    file_doc.XML_corr[0][3]
 
 #%%
 
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index a9f27fa6c8b620bf4a6af2d208998e849dd32c92..ebc3804fe80337d63b72299cc981ed7122bf3e15 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -78,9 +78,8 @@ def get_text_corrected(XML_root):
                         for ind_tl, textline in enumerate(textbox):
                             if textline.tag == 'textline':
                                 print(textline.tag, textline.attrib)
-                                print(len(textline.text))
                                 print(get_text(textline.text))
-#                                print('this is text', textline.text)
+                                XML_new[ind_p][ind_t][ind_tl].text = get_text(textline.text)
 
 
 
@@ -425,7 +424,7 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln
                 print('*******************', str_name, term_approx)
 
 
-    return str_name, str_role, int_uniqueID, find_names
+    return str_name, str_role, int_uniqueID, str_canton