diff --git a/data/lists/not_names.txt b/data/lists/not_names.txt
index a3d5eaf6a98727e2531b45f0b4d6a6a274fad5e4..be0ee8d508e6fbc8e0b8c943e8507bd2fa4dafc6 100644
--- a/data/lists/not_names.txt
+++ b/data/lists/not_names.txt
@@ -39,6 +39,7 @@ Masse
 Minister
 neben
 nehmen
+nen
 neu
 nicht
 Rath
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 4e55da1317e3683ec3799e015085be48a3b57f55..f8e07833543f350fe3f1a5877f63986f0a898038 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -736,10 +736,7 @@ class Document:
     # - dict_overlaps: dictionary with overlaps
     # output:
     # - (first_entry, last_entry): tuple of first and last textbox id
-    def get_first_last_textbox(self, dict_overlaps):
-
-        # get yearly dictionary
-        dict_overlaps_year = dict_overlaps[self.year]
+    def get_first_last_textbox(self, dict_overlaps_year):
 
         # initialize to impossible values
         first_entry = -1
@@ -794,33 +791,36 @@ class Document:
         self.get_council_date()
 
         # get start and end of document
-        path_data = '/home/lili/NLP_DemocraSci/data_from_nlp-democracy/results_overlap/'
-        with open (path_data + 'dict_overlaps.pickle', 'rb') as f:
-            dict_overlaps = pickle.load(f)
-        self.entries = self.get_first_last_textbox(dict_overlaps)
-        print(self.entries)
+        entries = self.get_first_last_textbox(self.dict_overlaps_year)
+
+        # update if document starts/ends as on pdf
+        if entries[0] == -1:
+            entries = (0, entries[1])
+        last_page = len(self.XML_main_corr)
+        if entries[1] == 1000:
+            entries = (entries[0], len([textbox for textbox in self.XML_main_corr[last_page-1] if textbox.tag == 'textbox']) - 1)
+        self.entries = entries
 
         # file to track speakers
-        self.name_speakers = '_'.join((str(self.year), self.id_doc, 'speakers.txt'))
-        path_speakers = 'data/lists/speakers/'
-        if not os.path.exists(path_speakers):
-            os.makedirs(path_speakers)
-        with open(path_speakers + self.name_speakers, 'w') as f:
+        self.name_speakers = '_'.join((self.id_doc, 'speakers.txt'))
+        name_txt = './' + str(self.year) + '/' + self.name_speakers
+        if not os.path.exists('./' + str(self.year)):
+            os.makedirs('./' + str(self.year))
+        with open(name_txt, 'w') as f:
             f.write(' '.join((str(self.year), self.id_doc, self.str_date, '\n')))
 
         #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
-        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, self.name_speakers, self.entries, bln_print=False)
+        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, name_txt, self.entries, bln_print=False)
         self.XML_main_annot = XML_main_annot
 
         # save xml file
         name_xml = './' + str(self.year) + '/' + self.name_wo_ext + '.xml'
         tree = ET.ElementTree(XML_main_annot)
-        if not os.path.exists('./' + str(self.year)):
-            os.makedirs('./' + str(self.year))
         tree.write(name_xml, encoding = 'utf-8')
 
         if flag_save:
             name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outannotxml)
+            name_tar = utils_proc.addto_tar(name_txt, self.folder_database, name_file = name_outannotxml)
         else:
             print('Not saving to tar')
             name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outannotxml + '.tar.gz'
@@ -833,8 +833,3 @@ class Document:
             utils_proc.call_with_out(command)
 
         print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time)))
-
-
-        command = 'rm -rf ./' + str(self.year)
-        #print(command)
-#        utils_proc.call_with_out(command)
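For reviewers skimming the def_classes.py hunks above: the new sentinel handling in annotate_xml() reduces to the following standalone sketch. The helper name resolve_entries is hypothetical; the -1/1000 sentinels and the fallback logic are copied from the diff, and XML_main_corr is assumed to be an XML tree whose pages contain textbox children.

```python
# Hypothetical helper mirroring the fallback added in annotate_xml():
# sentinel values returned by get_first_last_textbox() mean "no overlap
# found", so the document is assumed to span the whole PDF.
def resolve_entries(entries, XML_main_corr):
    if entries[0] == -1:
        # document starts on the first textbox of the first page
        entries = (0, entries[1])
    if entries[1] == 1000:
        # document ends on the last textbox of the last page
        last_page = XML_main_corr[len(XML_main_corr) - 1]
        n_textboxes = len([tb for tb in last_page if tb.tag == 'textbox'])
        entries = (entries[0], n_textboxes - 1)
    return entries
```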
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 812e0ff5ee2419f17bcd58fbb2868f64ae4fb1c5..b0b56828cf9f14d7df7f83610e11ddd024f4db41 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -31,6 +31,7 @@ input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
 input_notnames = "data/lists/not_names.txt"
+input_overlaps = "data/lists/dict_overlaps.pickle"
 output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz"
 
 #%%
@@ -39,7 +40,8 @@ input_lastnames = sys.argv[1]
 input_correctedxml = sys.argv[2]
 input_correctedmeta = sys.argv[3]
 input_notnames = sys.argv[4]
-output_annotatedxml = sys.argv[5]
+input_overlaps = sys.argv[5]
+output_annotatedxml = sys.argv[6]
 
 #%%
 # extract suffixes, year, folder_database
@@ -75,13 +77,23 @@ print('files to process loaded:', files_to_process)
 # (there is one file of lastnames per year)
 with open(input_lastnames, 'rb') as f:
     df_lastnames = pickle.load(f)
+df_lastnames = df_lastnames.fillna('')
 print('dataframe with lastnames loaded')
 
+# open list of terms that are easily mistaken as names
 with open(input_notnames) as f:
     list_notnames = f.readlines()
 
 list_notnames = [term.rstrip() for term in list_notnames]
+print('list of notnames loaded')
+
+# open dictionary of overlaps from pickle file
+with open(input_overlaps, 'rb') as f:
+    dict_overlaps = pickle.load(f)
+dict_overlaps_year = dict_overlaps[int(year)]
+print('dictionary of overlaps loaded')
+
 
 #%%
 # for each file
 
@@ -98,6 +110,7 @@ for file_tarpath in files_to_process:
         print(id_doc + '\n')
         file_doc.df_lastnames = df_lastnames
         file_doc.list_notnames = list_notnames
+        file_doc.dict_overlaps_year = dict_overlaps_year
 
         file_doc.annotate_xml()
 # Commands to get the compressegid version of the file
@@ -115,7 +128,7 @@ with open(input_notnames) as f:
 list_notnames = [term.rstrip() for term in list_notnames]
 
 # to test for one file
-file_tarpath = './1936/20031982_datacorr.xml'
+file_tarpath = './1936/20031998_datacorr.xml'
 
 id_doc = file_tarpath.split('/')[-1][:8]
 
@@ -129,6 +142,7 @@ if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20
 
     file_doc.df_lastnames = df_lastnames
     file_doc.list_notnames = list_notnames
+    file_doc.dict_overlaps_year = dict_overlaps_year
 
     file_doc.annotate_xml()
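With this change the script takes six positional arguments, with the overlaps pickle in slot five. A sketch of the resulting invocation, with paths taken from the defaults above and an illustrative year:

```python
import subprocess

year = '1891'  # illustrative; any processed year works
subprocess.run([
    'python', 'src/python/run_extract_discussions.py',
    'data/politicians/lastnames/' + year + '_MPs.pickle',  # sys.argv[1]
    'data/AB/' + year + '/04_correctedxml.tar.gz',         # sys.argv[2]
    'data/AB/' + year + '/03_correctedmeta.tar.gz',        # sys.argv[3]
    'data/lists/not_names.txt',                            # sys.argv[4]
    'data/lists/dict_overlaps.pickle',                     # sys.argv[5], new
    'data/AB/' + year + '/05_annotatedxml.tar.gz',         # sys.argv[6]
], check=True)
```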
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index a4cf53ff2146fbc49add9c4fb28657e71c39c3cc..7d8ce8053992f159caae552c4d84bed90bcae5af 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -84,7 +84,7 @@ def get_text(sometext):
 
 
 # function to annotated corrected XML
-def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, str_file_speakers, entries, bln_print=False):
+def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, name_txt, entries, bln_print=False):
 
     # list of votation terms
     list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt',
@@ -100,7 +100,7 @@ def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_da
     # create new XML as a copy of the corrected one
     XML_new = copy.deepcopy(XML_root)
 
-    last_page = len(XML_root)
+    ind_last_page = len(XML_root) - 1
 
     # initialize flags to distinguish speeches from votes
     this_is_speech = False
@@ -109,20 +109,22 @@ def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_da
 
     # for every page
     for ind_p, page in enumerate(XML_root):
-        print('page index:', ind_p)
         if bln_print:
             print(page.tag, page.attrib)
 
         # for every textbox on that page
         for ind_t, textbox in enumerate(page):
 
+            # specify start and end of document
             try:
                 if ind_p == 0 and entries[0] == int(textbox.attrib['id']):
-                    with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+                    XML_new = label_docstartend(XML_new, ind_p, ind_t, 'doc_start')
+                    with open(name_txt, 'a') as f:
                         f.write(' '.join(('<<<=====================', 'the document starts here', '\n\n')))
-                if ind_p == last_page - 1 and entries[1] == int(textbox.attrib['id']):
-                    with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+                if ind_p == ind_last_page and entries[1] == int(textbox.attrib['id']):
+                    XML_new = label_docstartend(XML_new, ind_p, ind_t, 'doc_end')
+                    with open(name_txt, 'a') as f:
                         f.write(' '.join(('=====================>>>', 'the document ends here', '\n\n')))
             except KeyError:
                 pass
 
@@ -150,11 +152,11 @@ def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_da
 
             if textbox_texttype in ['text_col1', 'text_col2']:
 
-                XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, str_council, str_date, str_file_speakers, bln_print=False)
+                XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, str_council, str_date, name_txt, bln_print=False)
                 if this_is_speech:
                     prev_is_speech = True
                     continue
-                XML_new, this_is_vote = label_votations(XML_new, ind_p, ind_t, complete_text, list_votationterms, str_file_speakers, bln_print=False)
+                XML_new, this_is_vote = label_votations(XML_new, ind_p, ind_t, complete_text, list_votationterms, name_txt, bln_print=False)
                 if this_is_vote:
                     prev_is_speech = False
                     continue
@@ -249,7 +251,7 @@ def get_complete_text(textbox):
 # - bln_print: whether to print during execution, default False
 # output:
 # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
-def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, str_council, str_date, str_file_speakers, bln_print=False):
+def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, str_council, str_date, name_txt, bln_print=False):
 
     # lists of roles
     list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
@@ -332,7 +334,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                 print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi)
             str_name = ''
 
-        with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+        with open(name_txt, 'a') as f:
             f.write(' '.join(('page', str(ind_p + 1), str(list_oi), '\n')))
             f.write(' '.join(('name:', str_name, '\n')))
             f.write(' '.join(('role:', str_role, '\n')))
@@ -424,7 +426,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
 # - bln_print: whether to print during execution, default False
 # output:
 # - XML_new: updated
-def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, str_file_speakers, bln_print=True):
+def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, name_txt, bln_print=True):
 
     # get first terms of that text
     list_oi = tokenizer.tokenize(text)[:15]
@@ -437,7 +439,7 @@ def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, str_file_sp
         # set flag
         this_is_vote = True
 
-        with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+        with open(name_txt, 'a') as f:
             f.write(' '.join(('page', str(ind_p + 1), text, '\n')))
             f.write(' '.join(('is a vote', '\n\n')))
 
@@ -461,6 +463,17 @@ def label_speechcont(XML_new, ind_p, ind_t):
 
     return XML_new
 
+
+# function to label start and end of document
+# only adds label to corresponding textbox
+# type_ is either 'doc_start' or 'doc_end'
+def label_docstartend(XML_new, ind_p, ind_t, type_):
+
+    XML_new[ind_p][ind_t].attrib[type_] = 'here'
+
+    return XML_new
+
+
 # helper function to flatten nested irregular list
 def flatten(l):
     for el in l:
@@ -695,6 +708,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
 
             else:
+                print(canton_type, str_canton, str_name, df_temp)
                 list_temp = list(df_temp.loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
                 str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0]
 
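The new label_docstartend helper only sets a sentinel attribute, so downstream consumers can recover the document span with a simple scan. The following reader is hypothetical (not part of this change) and assumes the page/textbox nesting used throughout the diff:

```python
# Hypothetical consumer of the doc_start/doc_end markers written by
# label_docstartend(): each marker is an attribute set to 'here' on
# exactly one textbox element.
def find_doc_span(XML_root):
    start, end = None, None
    for ind_p, page in enumerate(XML_root):
        for ind_t, textbox in enumerate(page):
            if textbox.attrib.get('doc_start') == 'here':
                start = (ind_p, ind_t)
            if textbox.attrib.get('doc_end') == 'here':
                end = (ind_p, ind_t)
    return start, end
```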
diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh
index 9495dc345a9e7425d8adfc2de7e9bd62206e128a..290554ff2d4182ddffd659afb3e34f39f3dd9be0 100755
--- a/src/sh/extract_discussions_yearly.sh
+++ b/src/sh/extract_discussions_yearly.sh
@@ -3,9 +3,17 @@
 year_start=1891
 year_end=1891
 
 for year in $(seq $year_start $year_end)
 do
     echo $year
+
+    input_lastnames=data/politicians/lastnames/${year}_lastnames.pickle
+    input_correctedxml=data/AB/${year}/04_correctedxml.tar.gz
+    input_correctedmeta=data/AB/${year}/03_correctedmeta.tar.gz
+    input_notnames=data/lists/not_names.txt
+    input_overlaps=data/lists/dict_overlaps.pickle
+    output_annotatedxml=data/AB/${year}/05_annotatedxml.tar.gz
+
     # renku run --isolation -
-    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/lists/not_names.txt data/AB/${year}/05_annotatedxml.tar.gz
+    python src/python/run_extract_discussions.py $input_lastnames $input_correctedxml $input_correctedmeta $input_notnames $input_overlaps $output_annotatedxml
done
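A note on the new data dependency: the only structural fact this diff establishes about data/lists/dict_overlaps.pickle is that it is a pickled dict keyed by integer year (see dict_overlaps[int(year)] in run_extract_discussions.py); the shape of each per-year value is not visible here. A minimal loading sketch under that assumption:

```python
import pickle

# assumed: top-level keys are integer years; the per-year payload is
# whatever get_first_last_textbox() expects (not shown in this diff)
with open('data/lists/dict_overlaps.pickle', 'rb') as f:
    dict_overlaps = pickle.load(f)

dict_overlaps_year = dict_overlaps[1891]  # e.g. for the 1891 run
```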