diff --git a/src/python/bla_tryreadxml.py b/src/python/bla_tryreadxml.py
index 082289b305219f6495943e8098e924dab8ea6723..e0a855fc6689cf31e6d37306737d0bb85b01801c 100644
--- a/src/python/bla_tryreadxml.py
+++ b/src/python/bla_tryreadxml.py
@@ -4,422 +4,22 @@
 %load_ext autoreload
 %autoreload 2
-import xml.etree.ElementTree as ET
-import re
 import pickle
-import string
-from nltk.corpus import stopwords
-from nltk.tokenize import RegexpTokenizer
-import copy
-import sys
-sys.path.append('src/python/')
-import utils_annot
-tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+')
-
-xml_file = 'data/AB/1893/1893/20026528_datacorr.xml'
-input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle"
-
-XML_tree = ET.parse(xml_file)
-XML_root = XML_tree.getroot()
-
-# list of stopwords
-list_stopwords = stopwords.words('german')
-list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr'])
-list_stopwords.extend(stopwords.words('french'))
-list_stopwords.extend(['ils', 'les', 'celle'])
-
-# add a few terms to list_stopwords that are easily mistaken as last names
-list_stopwords.extend(['art', 'rath', 'alinea', 'stimmen', 'stimme', 'hans', 'walter', 'werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'gallen', 'stgallen',
-                       'kasse', 'fasse', 'sitten', 'herren', 'herr', 'alter'])
-
-# list of votation terms
-# TODO: make it work for é, etc.
-list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt',
-                      'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)',
-                      'Votation', 'Vote', 'votation', #'(Adoptés)', 'adoptés', 'adoptée', 'rejetée',
-                      "D'accord", 'voix']
-
-# open dataframe of last names from pickle file
-with open(input_lastnames, 'rb') as f:
-    df_lastnames = pickle.load(f)
-
-#%%
-# create new XML as a copy of the corrected one
-XML_new = copy.deepcopy(XML_root)
-
-# initialize flags to distinguish speeches from votes
-this_is_speech = False
-prev_is_speech = False
-this_is_vote = False
-
-# for every page
-for ind_p, page in enumerate(XML_root):
-    print(page.tag, page.attrib)
-    # for every textbox on that page
-    for ind_t, textbox in enumerate(page):
-        if (textbox.tag == 'textbox'):
-            if 'type_textbox' in textbox.attrib.keys():
-                if (textbox.attrib['type_textbox'] == 'text'):
-                    print(textbox.tag, textbox.attrib)
-
-                    # get complete text of that textbox
-                    complete_text = get_complete_text(textbox)
-
-                    # identify and label language in XML
-                    dict_lang = identify_language(complete_text)
-                    XML_new = label_language(XML_new, dict_lang)
-
-                    # get texttype of that textbox by majority vote
-                    textbox_texttype = get_textbox_type(textbox)
-                    print(textbox_texttype)
-
-                    if textbox_texttype in ['text_col1', 'text_col2']:
-
-                        print(complete_text)
-                        XML_new, this_is_speech = label_speechstart(XML_new, complete_text, df_lastnames, list_stopwords, bln_print=False)
-                        if this_is_speech:
-                            prev_is_speech = True
-                            print('stopped after finding speech start')
-                            continue
-                        XML_new, this_is_vote = label_votations(XML_new, complete_text, list_votationterms, bln_print=False)
-                        if this_is_vote:
-                            prev_is_speech = False
-                            print('stopped after finding vote')
-                            continue
-                        if prev_is_speech and (not this_is_vote):
-                            XML_new = label_speechcont(XML_new)
-
-                    print('go to next textbox \n')
-
-
-name_xml = 'data/AB/1893/id_doc_previewannotated.xml'
-tree = ET.ElementTree(XML_new)
-tree.write(name_xml, encoding = 'utf-8')
-
-
-#%%
-sometext = '[font face="8.071" size="Times-Bold"]Für die Bedaktion verantwortlich :[/font][font face="7.973" size="Times-BoldItalic"] Sud. SdMarst[/font][font face="8.071" size="Times-Bold"] —• Druck und Expedition von[/font][font face="7.973" size="Times-BoldItalic"] Jmi è Éeineft[/font][font face="8.071" size="Times-Bold"] fa[/font][font face="7.973" size="Times-BoldItalic"] Seìrit. [/font]'
size="Times-BoldItalic"] Sud. SdMarst[/font][font face="8.071" size="Times-Bold"] —• Druck und Expedition von[/font][font face="7.973" size="Times-BoldItalic"] Jmi è Éeineft[/font][font face="8.071" size="Times-Bold"] fa[/font][font face="7.973" size="Times-BoldItalic"] Seìrit. [/font]' - -#re.split('[ | ]', sometext) -def get_text(sometext): - newtext = '' - for text in re.findall('\].*?\[',sometext): - #print(text) - if text.startswith(']') and text.endswith('['): - newtext += text[1:-1] - #print(newtext) - return newtext -get_text(sometext) #%% +path_data = '/home/lili/NLP_DemocraSci/data_from_nlp-democracy/results_overlap/' +# open dictionary of overlaps +with open(path_data + 'DictOverlap1891to1930.pkl', 'rb') as f: + dict_overlaps_1 = pickle.load(f) +with open(path_data + 'DictOverlap1931to1995.pkl', 'rb') as f: + dict_overlaps_2 = pickle.load(f) +with open(path_data + 'DictOverlap1991to1995.pkl', 'rb') as f: + dict_overlaps_3 = pickle.load(f) +dict_overlaps = {**dict_overlaps_1, **dict_overlaps_2, **dict_overlaps_3} -# helper function to get type of textbox_type -# corresponds to majority vote of types of textlines -# input: -# - textbox -# output: -# - textbox_type: string -def get_textbox_type(textbox): - - # initialize empty dictionary - dict_type = {} - - # for every textline in that textbox - for ind_tl, textline in enumerate(textbox): - if textline.tag == 'textline': -# print(textline.tag, textline.attrib) - - # count types - if textline.attrib['type'] not in dict_type.keys(): - dict_type[textline.attrib['type']] = 1 - else: - dict_type[textline.attrib['type']] += 1 - -# print(dict_type) - # list of all types with maximum count - list_types = [type for type, count in dict_type.items() if count == max(dict_type.values())] -# print(list_types) - # if only one with maximum value - if len(list_types) == 1: - textbox_type = list_types[0] - # if several with same maximum value - else: - textbox_type = 'notdistinct' - return textbox_type -#%% - -# helper function to get complete text of a textbox -# input: -# - textbox -# output: -# - complete_text: string -def get_complete_text(textbox): - - # helper function to get text without font information - def get_text(sometext): - newtext = '' - for text in re.findall('\].*?\[',sometext): - #print(text) - if text.startswith(']') and text.endswith('['): - newtext += text[1:-1] - #print(newtext) - return newtext - # initialize empty string - complete_text = '' - - # for every textline in that textbox - for ind_tl, textline in enumerate(textbox): - if textline.tag == 'textline': - # append text to string - complete_text += get_text(textline.text) - - return complete_text - - -#%% - -# function to label speech starts -# input: -# - text: stringt to be analyzed -# - df_names: dataframe of politicians -# - list_stopwords: list of german and french stopwords -# - bln_print: whether to print during execution, default False -# output: -# - (str_name, str_role, int_uniqueID, str_canton): tuple with strings and ID -# TODO: speakers with double get recognized twice (1893, 20026528, p2, Scherrer-Füllemann) -def label_speechstart(XML_new, text, df_names, list_stopwords, bln_print=False): - - # initialize strings and ID - str_name = '' - str_role = '' - int_uniqueID = int(0) - str_canton = '' - - # very consistently, a speaker can be identified by looking for a colon - # at the beginning of a textbox and identifiying a name or a role in front - # of that colon - if ':' in text[:100]: - # extract the index of the colon in the text - colon_index_text = 
-# helper function to get type of textbox_type
-# corresponds to majority vote of types of textlines
-# input:
-# - textbox
-# output:
-# - textbox_type: string
-def get_textbox_type(textbox):
-
-    # initialize empty dictionary
-    dict_type = {}
-
-    # for every textline in that textbox
-    for ind_tl, textline in enumerate(textbox):
-        if textline.tag == 'textline':
-#            print(textline.tag, textline.attrib)
-
-            # count types
-            if textline.attrib['type'] not in dict_type.keys():
-                dict_type[textline.attrib['type']] = 1
-            else:
-                dict_type[textline.attrib['type']] += 1
-
-#    print(dict_type)
-    # list of all types with maximum count
-    list_types = [type for type, count in dict_type.items() if count == max(dict_type.values())]
-#    print(list_types)
-    # if only one with maximum value
-    if len(list_types) == 1:
-        textbox_type = list_types[0]
-    # if several with same maximum value
-    else:
-        textbox_type = 'notdistinct'
-    return textbox_type
-#%%
-
-# helper function to get complete text of a textbox
-# input:
-# - textbox
-# output:
-# - complete_text: string
-def get_complete_text(textbox):
-
-    # helper function to get text without font information
-    def get_text(sometext):
-        newtext = ''
-        for text in re.findall('\].*?\[',sometext):
-            #print(text)
-            if text.startswith(']') and text.endswith('['):
-                newtext += text[1:-1]
-        #print(newtext)
-        return newtext
-
-    # initialize empty string
-    complete_text = ''
-
-    # for every textline in that textbox
-    for ind_tl, textline in enumerate(textbox):
-        if textline.tag == 'textline':
-            # append text to string
-            complete_text += get_text(textline.text)
-
-    return complete_text
-
-
-#%%
-
-# function to label speech starts
-# input:
-# - text: string to be analyzed
-# - df_names: dataframe of politicians
-# - list_stopwords: list of german and french stopwords
-# - bln_print: whether to print during execution, default False
-# output:
-# - (str_name, str_role, int_uniqueID, str_canton): tuple with strings and ID
-# TODO: speakers with double names get recognized twice (1893, 20026528, p2, Scherrer-Füllemann)
-def label_speechstart(XML_new, text, df_names, list_stopwords, bln_print=False):
-
-    # initialize strings and ID
-    str_name = ''
-    str_role = ''
-    int_uniqueID = int(0)
-    str_canton = ''
-
-    # very consistently, a speaker can be identified by looking for a colon
-    # at the beginning of a textbox and identifying a name or a role in front
-    # of that colon
-    if ':' in text[:100]:
-        # extract the index of the colon in the text
-        colon_index_text = text.index(':')
-
-        # look at first few terms of that textbox
-        text_start = re.sub(r'[\(\)]','',text[:colon_index_text])
-        list_oi = tokenizer.tokenize(text_start)
-        print('possible speech start: ', list_oi)
-
-        # remove stopwords
-        list_oi = [term for term in list_oi if term.lower() not in list_stopwords]
-
-        # remove punctuation
-        list_oi = [''.join(c for c in s if c not in string.punctuation) for s in list_oi]
-        list_oi = [s for s in list_oi if s]
-
-        # remove lower case terms
-#        list_oi = [term for term in list_oi if not term.islower()]
-
-        # remove numbers
-        list_oi = [term for term in list_oi if not term.isdigit()]
-
-        # remove single characters
-        list_oi = [term for term in list_oi if len(term)>1]
-
-        # for every term, reversed finds canton before it finds name
-        for term in reversed(list_oi):
-            # if possible, find a name in a list
-            str_name, str_role, int_uniqueID, str_canton = utils_annot.find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=True)
-        print('name', str_name, 'role', str_role)
-
-        # get rid of doubled double names
-
-        # get rid of 'Präsident stimmt nicht Président ne vote pas'
-        if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name:
-            if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi):
-                print('get rid of Präsident stimmt nicht, Président ne vote pas', list_oi)
-                str_role = ''
-
-        # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":'
-        if str_name:
-            if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1:
-                print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi)
-                str_name = ''
-
-        # if a name has been found, add it to XML_new
-        if str_name or str_role:
-            # add attribute speech_start to textbox
-            XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start'
-
-            # add speaker to first textline
-            XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, int_uniqueID, str_canton)
-            # TODO: split speaker from text (check on which line and split that line accordingly)
-#            dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text],
-#                                                                              text[colon_index_text+1:])
-
-            # set flag
-            this_is_speech = True
-            if bln_print:
-                print('found a name:', list_oi, str_name, str_role, '\n')
-        else:
-            # set flag
-            this_is_speech = False
-
-    return XML_new, this_is_speech
-# %%
-
-# function to extract votation paragraphs
-# !!! error prone, possible improvements see notebook extract_discussions
-# input:
-# - XML_new:
-# - text: string
-# - list_votationterms: list of votation terms
-# - bln_print: whether to print during execution, default False
-# output:
-# - XML_new: updated
-def label_votations(XML_new, text, list_votationterms, bln_print=True):
-
-    # get first terms of that text
-    list_oi = tokenizer.tokenize(text)[:15]
-#    if len(set(list_oi).intersection(set(list_votationterms))) > 1:
-    # if there is an overlap with typical votation terms:
-    if set(list_oi).intersection(set(list_votationterms)):
-        # add attribute vote to textbox
-        XML_new[ind_p][ind_t].attrib['text_type'] = 'vote'
-
-        # set flag
-        this_is_vote = True
-        if bln_print:
-            print('found a vote:', list_oi)
-    else:
-        #pass
-        # set flag
-        this_is_vote = False
-        if bln_print:
-            print('not a vote', list_oi)
-
-    return XML_new, this_is_vote
-
-#%%
-
-
-def label_speechcont(XML_new):
-
-    XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_cont'
-
-    return XML_new
-
-#%%
-# two functions for language identification
-# Author: Luis Salamanca
-# small modifications by Lili Gasser
-# Using stopwords
-# input:
-# - text: string
-# - valid_lang: tuple of valid languages
-# output:
-# - dict_language_counts: dictionary of stopword counts for each valid language
-def identify_language(text, valid_lang = ('german', 'french', 'italian')):
-
-    # tokenize
-    tokens = text.split(' ')
-    # all lowercase
-    test_words = [word.lower() for word in tokens]
-    # make a set
-    test_words_set = set(test_words)
-
-    # initialize dictionary of language elements
-    dict_language_counts = {}
-
-    # iterate through languages of stopwords
-    for language in stopwords.fileids():
-        if language in valid_lang:
-            # get stopword set
-            stopwords_set = set(stopwords.words(language))
-            # get intersection between text of interest and stopword set for this language
-            common_elements = test_words_set.intersection(stopwords_set)
-            # save number of common elements to dictionary
-            dict_language_counts[language] = len(common_elements)
-
-    return dict_language_counts
-
-
-# Simply, given the number of occurrences of the stopwords, it assigns a label
-# to a specific textbox, also considering the possibility of textboxes
-# mixing languages. For this case, the value ratio_similar is intended
-# input:
-# - XML_new: XML file to update
-# - aux_dict_l: corresponds to dict_language_counts
-# output:
-# - lang_max: string
-def label_language(XML_new, aux_dict_l):
-
-    # specify a similarity ratio
-    ratio_similar = 0.8
-    # if there are counts, determine language
-    if sum(aux_dict_l.values()):
-        aux_dict_l_norm = {k: v / total for total in (sum(aux_dict_l.values()),) for k, v in aux_dict_l.items()}
-        lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key]))
-        lang_max = ''
-        count_l = 0
-        for lang in aux_dict_l_norm.keys():
-            if (aux_dict_l_norm[lang] > aux_dict_l_norm[lang_max_aux] * ratio_similar):
-                if count_l > 0:
-                    lang_max += '_'
-                lang_max += lang
-                count_l += 1
-        if count_l > 1:
-            lang_max = 'mixed_' + lang_max
-    else:
-        lang_max = 'languageNotIdentified'
-
-    # add attribute to textbox
-    XML_new[ind_p][ind_t].attrib['language'] = lang_max
-
-    return XML_new
-
-
-#%%
-
-int_uniqueID = (123, 123)
-print(type(int_uniqueID))
-print(isinstance(int_uniqueID, tuple))
-
-
-tpl = ()
-tpl2 = (tpl, 2)
-tpl2
-tpl.append(2)
-
-
-lst = []
-list2 = [lst, 2]
-list2
-lst.append(2)
-lst
-
-
-lst
-tuple(lst)
-(1, 2, lst, 3)
-('a', 'b', 'c', lst)
-
-[2]
-[1, 2, 3].append([4])
-lst = [1,2,3]
-lst
-lst.append(3)
-lst
-lst.append([4, 5, 6])
-lst
-len(lst)
-set(lst)
-
-
-lst = [2, 3]
+print(dict_overlaps.keys())
+print(dict_overlaps[1891])
-list_temptemp = []
-for item in lst:
-    list_temptemp.extend(item)
+with open(path_data + 'dict_overlaps.pickle', 'wb') as f:
+    pickle.dump(dict_overlaps, f)
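+# the combined dictionary saved here is read back by Document.annotate_xml
+# (see the def_classes.py changes below)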
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 041d1b66a7ca485b37934aab52aacf3a45d4acf7..4e55da1317e3683ec3799e015085be48a3b57f55 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -724,13 +724,40 @@ class Document:
         self.name_outmeta = name_outmeta
 
         utils_proc.tar_extractfile(self.name_meta_corr[1], self.folder_database, name_file = self.name_outmeta)
-        (str_council, str_date) = utils_annot.get_council_and_date(self.name_meta_corr[1])
+        (self.str_council, self.str_date) = utils_annot.get_council_and_date(self.name_meta_corr[1])
 
         command = 'rm -rf ./' + str(self.year)
         #print(command)
         utils_proc.call_with_out(command)
 
-        return (str_council, str_date)
+
+    # function to find the first and last textbox of a document (used to exclude textboxes overlapping with neighboring documents)
+    # input:
+    # - dict_overlaps: dictionary with overlaps
+    # output:
+    # - (first_entry, last_entry): tuple of first and last textbox id
+    def get_first_last_textbox(self, dict_overlaps):
+
+        # get yearly dictionary
+        dict_overlaps_year = dict_overlaps[self.year]
+
+        # initialize to sentinel values used when no overlap is found
+        first_entry = -1
+        last_entry = 1000
+
+        # get index of textbox from first and last page
+        # the overlap dictionary only contains an entry if an overlap was detected
+        if self.id_doc in dict_overlaps_year.keys():
+            for entry, array in dict_overlaps_year[self.id_doc].items():
+                if entry == 'first':
+                    first_entry = int(array[0])
+                if entry == 'last':
+                    last_entry = int(array[0])
+
+        return (first_entry, last_entry)
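+
+    # assumed layout of dict_overlaps, as consumed above (values illustrative):
+    #   dict_overlaps[1893]['20026528'] == {'first': array([3]), 'last': array([41])}
+    # i.e. per year and document id, an array holding the id of the overlapping
+    # textbox on the document's first respectively last page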
+
+
 
 
     def annotate_xml(self, flag_save = 1, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', name_outannotxml='05_annotatedxml'):
@@ -764,17 +791,25 @@
             print('we have a main corr XML file')
 
             # get council and date
-            (str_council, str_date) = self.get_council_date()
-            self.str_council = str_council
-            self.str_date = str_date
+            self.get_council_date()
+
+            # get start and end of document
+            path_data = '/home/lili/NLP_DemocraSci/data_from_nlp-democracy/results_overlap/'
+            with open(path_data + 'dict_overlaps.pickle', 'rb') as f:
+                dict_overlaps = pickle.load(f)
+            self.entries = self.get_first_last_textbox(dict_overlaps)
+            print(self.entries)
 
             # file to track speakers
             self.name_speakers = '_'.join((str(self.year), self.id_doc, 'speakers.txt'))
-            with open('data/lists/speakers/' + self.name_speakers, 'w') as f:
-                f.write(' '.join((str(self.year), self.id_doc, str_date, '\n')))
+            path_speakers = 'data/lists/speakers/'
+            if not os.path.exists(path_speakers):
+                os.makedirs(path_speakers)
+            with open(path_speakers + self.name_speakers, 'w') as f:
+                f.write(' '.join((str(self.year), self.id_doc, self.str_date, '\n')))
 
             #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
-            XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, self.name_speakers, bln_print=False)
+            XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, self.name_speakers, self.entries, bln_print=False)
             self.XML_main_annot = XML_main_annot
 
             # save xml file
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 512cc94b0289e23292c75a59926ed6008973b885..812e0ff5ee2419f17bcd58fbb2868f64ae4fb1c5 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -115,7 +115,7 @@ with open(input_notnames) as f:
     list_notnames = [term.rstrip() for term in list_notnames]
 
 # to test for one file
-file_tarpath = './1936/20031986_datacorr.xml'
+file_tarpath = './1936/20031982_datacorr.xml'
 
 id_doc = file_tarpath.split('/')[-1][:8]
 
@@ -123,6 +123,7 @@ id_doc = file_tarpath.split('/')[-1][:8]
 
 infile_aux = year + '/' + id_doc + '.pdf'
 file_doc = defc.Document(infile_aux, folder_database)
 
+
 if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
     print(id_doc + '\n')
 
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 8e8f554f55a3104cd32874d39502269394382d49..a4cf53ff2146fbc49add9c4fb28657e71c39c3cc 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -84,7 +84,7 @@ def get_text(sometext):
 
 
 # function to annotate corrected XML
-def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, str_file_speakers, bln_print=False):
+def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, str_file_speakers, entries, bln_print=False):
 
     # list of votation terms
     list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt',
@@ -100,6 +100,7 @@ def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_da
 
     # create new XML as a copy of the corrected one
    XML_new = copy.deepcopy(XML_root)
+    last_page = len(XML_root)
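+    # index bookkeeping: the 'last' overlap entry is only checked on the final
+    # page (ind_p == last_page - 1) in the page loop below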
 
     # initialize flags to distinguish speeches from votes
     this_is_speech = False
@@ -108,10 +109,24 @@
 
     # for every page
     for ind_p, page in enumerate(XML_root):
+        print('page index:', ind_p)
         if bln_print:
             print(page.tag, page.attrib)
+
         # for every textbox on that page
         for ind_t, textbox in enumerate(page):
+
+            try:
+                if ind_p == 0 and entries[0] == int(textbox.attrib['id']):
+                    with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+                        f.write(' '.join(('<<<=====================', 'the document starts here', '\n\n')))
+
+                if ind_p == last_page - 1 and entries[1] == int(textbox.attrib['id']):
+                    with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+                        f.write(' '.join(('=====================>>>', 'the document ends here', '\n\n')))
+            except KeyError:
+                pass
+
             if (textbox.tag == 'textbox'):
                 if 'type_textbox' in textbox.attrib.keys():
                     if (textbox.attrib['type_textbox'] == 'text'):
diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh
index e769de17cdb4db4b746d143be02e283bcf0e9e50..9495dc345a9e7425d8adfc2de7e9bd62206e128a 100755
--- a/src/sh/extract_discussions_yearly.sh
+++ b/src/sh/extract_discussions_yearly.sh
@@ -6,5 +6,6 @@ year_end=1891
 
 for year in $(seq $year_start $year_end)
 do
     echo $year
+    # renku run --isolation python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/lists/not_names.txt data/AB/${year}/05_annotatedxml.tar.gz
 done