diff --git a/.gitattributes b/.gitattributes
index 69c57ff2c0190283158eabd6960d27fe9cffc6ed..c9a687eeb7fb05ad93c356eb8fe3c2e6c148236e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -166,3 +166,5 @@ data/AB/1967/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
 data/AB/1968/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
 data/AB/1969/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
 data/train_NER/20190109_train_NER.tar.gz filter=lfs diff=lfs merge=lfs -text
+data/AB/1970/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
+data/AB/1971/04_correctedxml.tar.gz filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 81899c1deb45d25dae2346fbaba8fcb57610eabe..f17696717cfd74906dad9a8b441cc8f59bf68623 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -24,7 +24,7 @@ image_build:
 
 dot:
   stage: build
-  image: renku/renku-python:0.2.0
+  image: renku/renku-python:v0.3.3
   script:
     - renku log --format dot $(git ls-files --no-empty-directory --recurse-submodules) > graph.dot
   artifacts:
diff --git a/.renku/workflow/3c8d4b7f4a2e4742b98b1e1cbd1aa493_python.cwl b/.renku/workflow/3c8d4b7f4a2e4742b98b1e1cbd1aa493_python.cwl
new file mode 100644
index 0000000000000000000000000000000000000000..752abc673e19029f2e3ee7aa0b3083755eba0533
--- /dev/null
+++ b/.renku/workflow/3c8d4b7f4a2e4742b98b1e1cbd1aa493_python.cwl
@@ -0,0 +1,51 @@
+arguments: []
+baseCommand:
+- python
+class: CommandLineTool
+cwlVersion: v1.0
+hints: []
+inputs:
+  input_1:
+    default:
+      class: File
+      path: ../../src/python/run_correctxml.py
+    inputBinding:
+      position: 1
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: File
+  input_2:
+    default:
+      class: File
+      path: ../../data/AB/1970/02_extractedxml.tar.gz
+    inputBinding:
+      position: 2
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: File
+  input_3:
+    default: data/AB/1970/04_correctedxml.tar.gz
+    inputBinding:
+      position: 3
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+outputs:
+  output_0:
+    outputBinding:
+      glob: $(inputs.input_3)
+    streamable: false
+    type: File
+permanentFailCodes: []
+requirements:
+- class: InlineJavascriptRequirement
+- class: InitialWorkDirRequirement
+  listing:
+  - entry: '$({"listing": [], "class": "Directory"})'
+    entryname: data/AB/1970
+    writable: true
+successCodes: []
+temporaryFailCodes: []
diff --git a/.renku/workflow/bc7b832a372149e2986b571e0e8fd144_python.cwl b/.renku/workflow/bc7b832a372149e2986b571e0e8fd144_python.cwl
new file mode 100644
index 0000000000000000000000000000000000000000..db26d337488a99b3b883ef2a729eefaed948aa48
--- /dev/null
+++ b/.renku/workflow/bc7b832a372149e2986b571e0e8fd144_python.cwl
@@ -0,0 +1,51 @@
+arguments: []
+baseCommand:
+- python
+class: CommandLineTool
+cwlVersion: v1.0
+hints: []
+inputs:
+  input_1:
+    default:
+      class: File
+      path: ../../src/python/run_correctxml.py
+    inputBinding:
+      position: 1
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: File
+  input_2:
+    default:
+      class: File
+      path: ../../data/AB/1971/02_extractedxml.tar.gz
+    inputBinding:
+      position: 2
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: File
+  input_3:
+    default: data/AB/1971/04_correctedxml.tar.gz
+    inputBinding:
+      position: 3
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+outputs:
+  output_0:
+    outputBinding:
+      glob: $(inputs.input_3)
+    streamable: false
+    type: File
+permanentFailCodes: []
+requirements:
+- class: InlineJavascriptRequirement
+- class: InitialWorkDirRequirement
+  listing:
+  - entry: '$({"listing": [], "class": "Directory"})'
+    entryname: data/AB/1971
+    writable: true
+successCodes: []
+temporaryFailCodes: []
diff --git a/data/AB/1970/04_correctedxml.tar.gz b/data/AB/1970/04_correctedxml.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..66ffa3eb3585962e60062c84e964b6c29d1a892e
--- /dev/null
+++ b/data/AB/1970/04_correctedxml.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcfe774bc163f3d2507bdd42e18ed08efc1c80e601bfb579e7e7ee2496b1cf87
+size 9925496
diff --git a/data/AB/1971/04_correctedxml.tar.gz b/data/AB/1971/04_correctedxml.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..66cc5664e06f6b82cf70446543bc85ba5154dfc3
--- /dev/null
+++ b/data/AB/1971/04_correctedxml.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8a222f55fcfcf9f0d28e0e27cd9e804f9401adfc8fd01ec31b6d2651598f34e
+size 26675254
diff --git a/src/python/bla_tryreadxml.py b/src/python/bla_tryreadxml.py
new file mode 100644
index 0000000000000000000000000000000000000000..082289b305219f6495943e8098e924dab8ea6723
--- /dev/null
+++ b/src/python/bla_tryreadxml.py
@@ -0,0 +1,425 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#%%
+# enable autoreload only when run in IPython (this file is organised as #%% cells);
+# guarded so that the file can also be imported or run as a plain script
+try:
+    get_ipython().run_line_magic('load_ext', 'autoreload')
+    get_ipython().run_line_magic('autoreload', '2')
+except NameError:
+    pass
+
+import xml.etree.ElementTree as ET
+import re
+import pickle
+import string
+from nltk.corpus import stopwords
+from nltk.tokenize import RegexpTokenizer
+import copy
+
+import sys
+sys.path.append('src/python/')
+import utils_annot
+
+# tokenizer for words (including hyphenated names) and dollar amounts
+tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+')
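+# e.g. tokenizer.tokenize('Scherrer-Füllemann (Zürich): stimmt nicht')
+#      -> ['Scherrer-Füllemann', 'Zürich', 'stimmt', 'nicht']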
+
+xml_file = 'data/AB/1893/1893/20026528_datacorr.xml'
+input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle"
+
+XML_tree = ET.parse(xml_file)
+XML_root = XML_tree.getroot()
+
+# list of stopwords
+list_stopwords = stopwords.words('german')
+list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr'])
+list_stopwords.extend(stopwords.words('french'))
+list_stopwords.extend(['ils', 'les', 'celle'])
+
+# add a few terms to list_stopwords that are easily mistaken for last names
+list_stopwords.extend(['art', 'rath', 'alinea', 'stimmen', 'stimme', 'hans', 'walter', 'werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'gallen', 'stgallen',
+                       'kasse', 'fasse', 'sitten', 'herren', 'herr', 'alter'])
+
+# list of votation terms
+# TODO: make it work for é, etc.
+list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt',
+                      'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)',
+                      'Votation', 'Vote', 'votation', #'(Adoptés)', 'adoptés', 'adoptée', 'rejetée',
+                      "D'accord", 'voix']
+
+# open dataframe of last names from pickle file
+with open(input_lastnames, 'rb') as f:
+    df_lastnames = pickle.load(f)
+
+#%%
+# create new XML as a copy of the corrected one
+XML_new = copy.deepcopy(XML_root)
+
+# initialize flags to distinguish speeches from votes
+this_is_speech = False
+prev_is_speech = False
+this_is_vote = False
+
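+# NOTE: the helper functions used in the loop below (get_complete_text, get_textbox_type,
+# identify_language, label_language, label_speechstart, label_votations, label_speechcont)
+# are defined in later cells of this file; execute those cells first when running interactively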
+# for every page
+for ind_p, page in enumerate(XML_root):
+    print(page.tag, page.attrib)
+    # for every textbox on that page
+    for ind_t, textbox in enumerate(page):
+        if (textbox.tag == 'textbox'):
+            if 'type_textbox' in textbox.attrib.keys():
+                if (textbox.attrib['type_textbox'] == 'text'):
+                    print(textbox.tag, textbox.attrib)
+
+                    # get complete text of that textbox
+                    complete_text = get_complete_text(textbox)
+
+                    # identify and label language in XML
+                    dict_lang = identify_language(complete_text)
+                    XML_new = label_language(XML_new, dict_lang)
+
+                    # get texttype of that textbox by majority vote
+                    textbox_texttype = get_textbox_type(textbox)
+                    print(textbox_texttype)
+
+                    if textbox_texttype in ['text_col1', 'text_col2']:
+
+                        print(complete_text)
+                        XML_new, this_is_speech = label_speechstart(XML_new, complete_text, df_lastnames, list_stopwords, bln_print=False)
+                        if this_is_speech:
+                            prev_is_speech = True
+                            print('stopped after finding speech start')
+                            continue
+                        XML_new, this_is_vote = label_votations(XML_new, complete_text, list_votationterms, bln_print=False)
+                        if this_is_vote:
+                            prev_is_speech = False
+                            print('stopped after finding vote')
+                            continue
+                        if prev_is_speech and (not this_is_vote):
+                            XML_new = label_speechcont(XML_new)
+
+        print('go to next textbox \n')
+
+
+name_xml = 'data/AB/1893/id_doc_previewannotated.xml'
+tree = ET.ElementTree(XML_new)
+tree.write(name_xml, encoding = 'utf-8')
+
+
+
+#%%
+sometext = '[font face="8.071" size="Times-Bold"]Für  die  Bedaktion  verantwortlich :[/font][font face="7.973" size="Times-BoldItalic"] Sud.  SdMarst[/font][font face="8.071" size="Times-Bold"]  —•  Druck  und Expedition  von[/font][font face="7.973" size="Times-BoldItalic"]  Jmi  è  Éeineft[/font][font face="8.071" size="Times-Bold"]  fa[/font][font face="7.973" size="Times-BoldItalic"]  Seìrit. [/font]'
+
+#re.split('[ | ]', sometext)
+def get_text(sometext):
+    newtext = ''
+    for text in re.findall(r'\].*?\[', sometext):
+        #print(text)
+        if text.startswith(']') and text.endswith('['):
+            newtext += text[1:-1]
+    #print(newtext)
+    return newtext
+get_text(sometext)
+#%%
+
+# helper function to get the type of a textbox
+# corresponds to a majority vote over the types of its textlines
+# input:
+# - textbox
+# output:
+# - textbox_type: string
+def get_textbox_type(textbox):
+
+    # initialize empty dictionary
+    dict_type = {}
+
+    # for every textline in that textbox
+    for ind_tl, textline in enumerate(textbox):
+        if textline.tag == 'textline':
+#            print(textline.tag, textline.attrib)
+
+            # count types
+            if textline.attrib['type'] not in dict_type.keys():
+                dict_type[textline.attrib['type']] = 1
+            else:
+                dict_type[textline.attrib['type']] += 1
+
+#    print(dict_type)
+    # list of all types with maximum count
+    list_types = [tb_type for tb_type, count in dict_type.items() if count == max(dict_type.values())]
+#    print(list_types)
+    # if only one with maximum value
+    if len(list_types) == 1:
+        textbox_type = list_types[0]
+    # if several with same maximum value
+    else:
+        textbox_type = 'notdistinct'
+    return textbox_type
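+# e.g. textlines of types ['text_col1', 'text_col1', 'footnote'] give 'text_col1',
+# while a tie such as ['text_col1', 'footnote'] gives 'notdistinct'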
+#%%
+
+# helper function to get complete text of a textbox
+# input:
+# - textbox
+# output:
+# - complete_text: string
+def get_complete_text(textbox):
+
+    # helper function to get text without font information
+    def get_text(sometext):
+        newtext = ''
+        for text in re.findall(r'\].*?\[', sometext):
+            #print(text)
+            if text.startswith(']') and text.endswith('['):
+                newtext += text[1:-1]
+        #print(newtext)
+        return newtext
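+    # e.g. get_text('[font size="8"]Hello [/font][font size="8"]World[/font]') -> 'Hello World'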
+    # initialize empty string
+    complete_text = ''
+
+    # for every textline in that textbox
+    for ind_tl, textline in enumerate(textbox):
+        if textline.tag == 'textline':
+            # append text to string
+            complete_text += get_text(textline.text)
+
+    return complete_text
+
+
+#%%
+
+# function to label speech starts
+# input:
+# - XML_new: XML to update
+# - text: string to be analyzed
+# - df_names: dataframe of politicians
+# - list_stopwords: list of german and french stopwords
+# - bln_print: whether to print during execution, default False
+# output:
+# - XML_new: updated XML
+# - this_is_speech: boolean indicating whether a speech start was found
+# TODO: speakers with double names get recognized twice (1893, 20026528, p2, Scherrer-Füllemann)
+def label_speechstart(XML_new, text, df_names, list_stopwords, bln_print=False):
+
+    # initialize strings and ID
+    str_name = ''
+    str_role = ''
+    int_uniqueID = int(0)
+    str_canton = ''
+
+    # very consistently, a speaker can be identified by looking for a colon
+    # near the beginning of a textbox and identifying a name or a role in front
+    # of that colon
+    if ':' in text[:100]:
+        # extract the index of the colon in the text
+        colon_index_text = text.index(':')
+
+        # look at first few terms of that textbox
+        text_start = re.sub(r'[\(\)]','',text[:colon_index_text])
+        list_oi = tokenizer.tokenize(text_start)
+        if bln_print:
+            print('possible speech start: ', list_oi)
+
+        # remove stopwords
+        list_oi = [term for term in list_oi if term.lower() not in list_stopwords]
+
+        # remove punctuation
+        list_oi = [''.join(c for c in s if c not in string.punctuation) for s in list_oi]
+        list_oi = [s for s in list_oi if s]
+
+        # remove lower case terms
+#        list_oi = [term for term in list_oi if not term.islower()]
+
+        # remove numbers
+        list_oi = [term for term in list_oi if not term.isdigit()]
+
+        # remove single characters
+        list_oi = [term for term in list_oi if len(term)>1]
+
+        # for every term, reversed finds canton before it finds name
+        for term in reversed(list_oi):
+            # if possible, find a name in a list
+            str_name, str_role, int_uniqueID, str_canton = utils_annot.find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=True)
+        if bln_print:
+            print('name', str_name, 'role', str_role)
+
+        # get rid of doubled double names
+
+
+        # get rid of 'Präsident stimmt nicht Président ne vote pas'
+        if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name:
+            if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi):
+                print('get rid of Präsident stimmt nicht, Président ne vote pas', list_oi)
+                str_role = ''
+
+        # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":'
+        if str_name:
+            if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1:
+                print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi)
+                str_name = ''
+
+        # if a name or a role has been found, add it to XML_new
+        # (ind_p and ind_t are globals set by the enclosing page/textbox loop)
+        if str_name or str_role:
+            # add attribute speech_start to textbox
+            XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start'
+
+            # add speaker to first textline
+            # (attribute values must be strings for ElementTree to serialize the XML)
+            XML_new[ind_p][ind_t][0].attrib['speaker'] = str((str_name, str_role, int_uniqueID, str_canton))
+            # TODO: split speaker from text (check on which line and split that line accordingly)
+#            dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text],
+#                    text[colon_index_text+1:])
+
+            # set flag
+            this_is_speech = True
+            if bln_print:
+                print('found a name:', list_oi, str_name, str_role, '\n')
+    else:
+        # set flag
+        this_is_speech = False
+
+    return XML_new, this_is_speech
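+# e.g. for a textbox starting with the (hypothetical) line 'Meyer (Zürich): Ich beantrage ...',
+# the text before the colon is cleaned and tokenized to ['Meyer', 'Zürich'] and passed in reverse
+# order to utils_annot.find_names; whether a speaker is actually recognized depends on df_names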
+# %%
+
+# function to label votation paragraphs
+# !!! error prone, for possible improvements see notebook extract_discussions
+# note: ind_p and ind_t are globals set by the enclosing page/textbox loop
+# input:
+# - XML_new: XML to update
+# - text: string
+# - list_votationterms: list of votation terms
+# - bln_print: whether to print during execution, default False
+# output:
+# - XML_new: updated XML
+# - this_is_vote: boolean indicating whether the textbox is a vote
+def label_votations(XML_new, text, list_votationterms, bln_print=False):
+
+    # get first terms of that text
+    list_oi = tokenizer.tokenize(text)[:15]
+#        if len(set(list_oi).intersection(set(list_votationterms))) > 1:
+    # if there is an overlap with typical votation terms:
+    if set(list_oi).intersection(set(list_votationterms)):
+        # add attribute vote to textbox
+        XML_new[ind_p][ind_t].attrib['text_type'] = 'vote'
+
+        # set flag
+        this_is_vote = True
+        if bln_print:
+            print('found a vote:', list_oi)
+    else:
+        #pass
+        # set flag
+        this_is_vote = False
+        if bln_print:
+            print('not a vote', list_oi)
+
+    return XML_new, this_is_vote
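+# e.g. a textbox starting with 'Abstimmung. Vote' tokenizes to ['Abstimmung', 'Vote'], both of
+# which appear in list_votationterms, so the textbox is tagged with text_type='vote'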
+
+#%%
+
+
+
+# function to label a textbox as speech continuation
+# note: ind_p and ind_t are globals set by the enclosing page/textbox loop
+def label_speechcont(XML_new):
+
+    XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_cont'
+
+    return XML_new
+
+#%%
+# two functions for language identification
+# Author: Luis Salamanca
+# small modifications by Lili Gasser
+# Using stopwords
+# input:
+# - text: string
+# - valid_lang: tuple of valid languages
+# output:
+# - dict_language_counts: dictionary of stopword counts for each valid language
+def identify_language(text, valid_lang = ('german', 'french', 'italian')):
+
+    # tokenize
+    tokens = text.split(' ')
+    # all lowercase
+    test_words = [word.lower() for word in tokens]
+    # make a set
+    test_words_set = set(test_words)
+
+    # initialize dictionary of language elements
+    dict_language_counts = {}
+
+    # iterate through languages of stopwords
+    for language in stopwords.fileids():
+        if language in valid_lang:
+            # get stopword set
+            stopwords_set = set(stopwords.words(language))
+            # get intersection between text of interest and stopword set for this language
+            common_elements = test_words_set.intersection(stopwords_set)
+            # save number of common elements to dictionary
+            dict_language_counts[language] = len(common_elements)
+
+    return dict_language_counts
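+# e.g. identify_language('und der die le la et il') counts the stopwords shared with each language,
+# giving roughly {'german': 3, 'french': 4, 'italian': 3} (exact counts depend on NLTK's stopword lists)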
+
+
+# Given the number of occurrences of the stopwords, this function assigns a language
+# label to a specific textbox, also considering the possibility of textboxes
+# mixing languages. For this case, the value ratio_similar is used.
+# note: ind_p and ind_t are globals set by the enclosing page/textbox loop
+# input:
+# - XML_new: XML file to update
+# - aux_dict_l: corresponds to dict_language_counts
+# output:
+# - XML_new: updated XML with a 'language' attribute set on the textbox
+def label_language(XML_new, aux_dict_l):
+
+    # specify a similarity ratio
+    ratio_similar = 0.8
+    # if there are counts, determine language
+    total = sum(aux_dict_l.values())
+    if total:
+        aux_dict_l_norm = {k: v / total for k, v in aux_dict_l.items()}
+        lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key]))
+        lang_max = ''
+        count_l = 0
+        for lang in aux_dict_l_norm.keys():
+            if (aux_dict_l_norm[lang] >  aux_dict_l_norm[lang_max_aux] * ratio_similar):
+                if count_l > 0:
+                    lang_max += '_'
+                lang_max += lang
+                count_l += 1
+        if count_l > 1:
+            lang_max = 'mixed_' + lang_max
+    else:
+        lang_max = 'languageNotIdentified'
+
+    # add attribute to textbox
+    XML_new[ind_p][ind_t].attrib['language'] = lang_max
+
+    return XML_new
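+# e.g. with aux_dict_l = {'german': 8, 'french': 7}: the normalized shares are 0.53 and 0.47,
+# and since 0.47 > 0.53 * ratio_similar (= 0.43), both languages pass the threshold and the
+# textbox is labelled 'mixed_german_french'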
+
+
+
+#%%
+
+int_uniqueID = (123, 123)
+print(type(int_uniqueID))
+print(isinstance(int_uniqueID, tuple))
+
+
+tpl = ()
+tpl2 = (tpl, 2)
+tpl2
+# tuples are immutable: the next line raises AttributeError ('tuple' object has no attribute 'append')
+tpl.append(2)
+
+
+lst = []
+list2 = [lst, 2]
+list2
+lst.append(2)
+lst
+
+
+lst
+tuple(lst)
+(1, 2, lst, 3)
+('a', 'b', 'c', lst)
+
+[2]
+# list.append mutates in place and returns None, so the result of this expression is discarded
+[1, 2, 3].append([4])
+lst = [1,2,3]
+lst
+lst.append(3)
+lst
+lst.append([4, 5, 6])
+lst
+len(lst)
+set(lst)
+
+
+lst = [2, 3]
+
+list_temptemp = []
+for item in lst:
+    # extend() expects an iterable, so this raises TypeError for int items (use append for scalars)
+    list_temptemp.extend(item)
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 8808fb4602673e53ee2b98b6f238c937df9630ac..cce7282200b409284b6b30a108244026631e6ab1 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -15,7 +15,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import xml.etree.ElementTree as ET
 import copy
-import time 
+import time
 import tarfile
 import pickle
 
@@ -33,12 +33,12 @@ import preproc_docs
 # Definition of classes and methods associated
 
 class Document:
-    
+
     limit_year = 1950
     flag_end_run = 1
     name_inpdf = '00_rawpdfs'
     name_inmeta = '01_rawmeta'
-    
+
     def __init__(self, input_file, folder_database):
         self.year = int(input_file.split('/')[-2])
         self.id_doc = input_file.split('/')[-1].split('.')[0]
@@ -48,14 +48,22 @@ class Document:
         self.name_wo_ext = os.path.splitext(self.name_file)[0]
         self.folder_database = folder_database
         self._meta_ext()
-        
+        self._xml_ext()
+
     def _meta_ext(self):
     # Both for the correction and the extraction of the metadata information
         name_file = str(self.year) + '/' + self.id_doc + '.xml'
         name_file_db = str(self.year) + '/' + self.id_doc + '.db'
         name_tar = self.folder_database + str(self.year) + '/' + self.name_inmeta + '.tar.gz'
-        self.name_meta = [name_tar, name_file, name_file_db]        
-    
+        self.name_meta = [name_tar, name_file, name_file_db]
+
+    def _xml_ext(self, suffix_xml = '_data', name_outcorrxml = '04_correctedxml'):
+    # Builds the tar/xml path pairs for the extraction, correction and annotation of the xmls
+    # TODO: so far only for the corrected xml; extend for extraction and annotation
+        name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml'
+        name_tar = self.folder_database + str(self.year) + '/' + name_outcorrxml + '.tar.gz'
+        self.name_xml_corr = [name_tar, name_xml]
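+        # e.g. for year 1971, a hypothetical document id '20000001' and the default suffix '_data',
+        # this sets name_xml_corr = [folder_database + '1971/04_correctedxml.tar.gz',
+        #                            '1971/20000001_datacorr.xml']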
+
     def meta_correct(self, name_outmeta = '03_correctedmeta'):
         utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
         utils_proc.tar_extractfile(self.name_meta[2], self.folder_database, name_file = self.name_inmeta)
@@ -65,13 +73,13 @@ class Document:
         command = 'rm -rf ./' + str(self.year)
         #print(command)
         utils_proc.call_with_out(command)
-    
+
     def pdf2imgobj(self, resolution = 100):
-        
+
         self.resolution = resolution
         utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf)
         self.imgobj = convert_from_path(self.input_file, dpi = resolution)
-        command = 'rm -rf ./' + str(self.year)     
+        command = 'rm -rf ./' + str(self.year)
         utils_proc.call_with_out(command)
 
     def _get_pages(self, pages = 'all'):
@@ -82,7 +90,7 @@ class Document:
         elif isinstance(pages,str):
             self.n_pages = np.array(pages.split(',')).astype(np.uint32)
         else:
-            self.n_pages = np.array(pages)  
+            self.n_pages = np.array(pages)
 
     def pdf2xml(self, pages = 'all', suffix_xml = '_data', flag_save = 1,
                 name_outxml = '02_extractedxml'):
@@ -90,7 +98,7 @@ class Document:
         if 'imgobj' not in self.__dict__.keys():
             self.pdf2imgobj()
         self._get_pages(pages = pages)
-        
+
         utils_proc.tar_extractfile(self.input_file, self.folder_database, name_file = self.name_inpdf)
         name_xml = utils_proc.pdf2xml(self.input_file, page_n = self.n_pages + 1, suffix_str = suffix_xml,
                                       flag_end = self.flag_end_run)
@@ -126,58 +134,58 @@ class Document:
                 imarray = np.array(self.imgobj[ind_page])
             else:
                 return print('Not possible! - You need to convert first the pdf to image\n')
-        
+
         if XML_root == None:
             XML_root = ET.Element('pages')
             ind_abs = np.argwhere(self.n_pages == ind_page)
             XML_root.append(XML_main[ind_abs])
-        
+
         bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64)
-        
+
         imarray_textb = np.copy(imarray)
-        
+
         if textb_textl == 1:
             coord_textboxes = np.array([]).reshape((4,0))
             for ind_el in range(0, len(XML_root[0])):
                 if XML_root[0][ind_el].tag == 'textbox':
                     coord_textbox_aux = np.array(XML_root[0][ind_el].attrib['bbox'].split(',')).astype(np.float64)
                     coord_textboxes = np.concatenate((coord_textboxes, np.array(coord_textbox_aux).reshape((4,1))), axis = 1)
-                    imarray_textb = plot_tools.highlight_text(imarray_textb, coord_textbox_aux, 
-                                                                      bbox_page, color_vec = 'blue', alpha = True, 
-                                                                      filled = False, thick_line = 6) 
+                    imarray_textb = plot_tools.highlight_text(imarray_textb, coord_textbox_aux,
+                                                                      bbox_page, color_vec = 'blue', alpha = True,
+                                                                      filled = False, thick_line = 6)
             return imarray_textb, coord_textboxes
-        elif textb_textl == 2:   
+        elif textb_textl == 2:
             imarray_textl = np.copy(imarray)
             coord_textline = np.array([]).reshape((4,0))
-            all_font_sizes = np.array([])  
+            all_font_sizes = np.array([])
             for ind_el in range(0, len(XML_root[0])):
                 for ind_line in range(0, len(XML_root[0][ind_el])):
                     if XML_root[0][ind_el][ind_line].tag == 'textline':
                         coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64)
                         if len(XML_root[0][ind_el][ind_line]):
-                            all_font_sizes = np.concatenate((all_font_sizes, 
+                            all_font_sizes = np.concatenate((all_font_sizes,
                                                              np.array([XML_root[0][ind_el][ind_line][0].attrib['size']]).astype(np.float64)))
                         coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1)
-                        imarray_textl = plot_tools.highlight_text(imarray_textl, coord_textline_aux, bbox_page, 
-                                                       color_vec = 'red', alpha = True, filled = False, thick_line = 6)  
-                        
-            all_font_sizes, counts_all_font_sizes = np.unique(all_font_sizes, return_counts=True)        
+                        imarray_textl = plot_tools.highlight_text(imarray_textl, coord_textline_aux, bbox_page,
+                                                       color_vec = 'red', alpha = True, filled = False, thick_line = 6)
+
+            all_font_sizes, counts_all_font_sizes = np.unique(all_font_sizes, return_counts=True)
             info_font_sizes = np.concatenate((all_font_sizes.reshape((1,all_font_sizes.shape[0])),
-                                              counts_all_font_sizes.reshape((1,all_font_sizes.shape[0])).astype(np.float64)))                        
-            
-            return imarray_textb, coord_textline, all_font_sizes, info_font_sizes        
-    
+                                              counts_all_font_sizes.reshape((1,all_font_sizes.shape[0])).astype(np.float64)))
+
+            return imarray_textb, coord_textline, all_font_sizes, info_font_sizes
+
     def correct_xml(self, flag_plots = 1, flag_parallel = 0, flag_save_figs = 1,
                     pages = 'all', suffix_xml = '_data', name_outxml = '02_extractedxml',
                     name_outcorrxml = '04_correctedxml', flag_save = 1):
-        
+
         if 'name_outxml' not in self.__dict__.keys():
             self.name_outxml = name_outxml
-        
+
         start_time = time.time()
         if 'imgobj' not in self.__dict__.keys():
             self.pdf2imgobj()
-            
+
         if 'XML_main' not in self.__dict__.keys():
             name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outxml + '.tar.gz'
             if os.path.isfile(name_tar):
@@ -189,141 +197,142 @@ class Document:
             else:
                 # TODO if already exists 02_extractedxml
                 self.pdf2xml(pages = pages, suffix_xml = suffix_xml)
-        
+
         self._get_pages(pages = pages)
         flag_central = 1
         if self.year > self.limit_year:
             flag_central = 0
         flag_2col = 1
-        
+
         XML_new = ET.Element('pages')
-                
-        for ind_abs, ind_page in enumerate(self.n_pages): 
-            
+
+        for ind_abs, ind_page in enumerate(self.n_pages):
+
             XML_root = ET.Element('pages')
             #print(ind_abs,len(self.XML_main))
             XML_root.append(self.XML_main[ind_abs])
             imarray = np.array(self.imgobj[ind_page])
-        
+
             if XML_root[0][0].tag == 'textbox':
                 bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64)
                 dim_img = imarray.shape[:2]
                 _, rescale_factor = plot_tools.adapt_coordtoimg(imarray, bbox_page, bbox_page)
-            
+
                 # Image with textboxes highlighted
                 imarray_textblock, coord_textboxes = self._draw_textbl(imarray = imarray, XML_root = XML_root)
-    
-                # Image with textlines highlighted, BUT also, array with all textlines 
+
+                # Image with textlines highlighted, BUT also, array with all textlines
                 # coordinates, and the fontsizes, required for later
                 _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root,
-                                                                           textb_textl = 2)                
-                
+                                                                           textb_textl = 2)
+
                 #####
                 # Central vertical line and horizontal lines, through Hough transform
-                coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, 
-                                                                          flag_2col, flag_central)                    
-                                        
+                coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page,
+                                                                          flag_2col, flag_central)
+
                 #####
                 # Obtain lateral margins
-                margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), 
+                margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32),
                                                coord_horz.astype(np.uint32))
-                        
+
                 # Top and bottom line
-                ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), 
+                ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32),
                                                coord_horz.astype(np.uint32))
-                #print(info_font_sizes)                                      
+                #print(info_font_sizes)
                 #####
-                # Label the textboxes based on a set of simple rules that make use of 
+                # Label the textboxes based on a set of simple rules that make use of
                 # the margins and the fontsizes
                 label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \
                     preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes) # info_font_sizes_est
-                                            
+
                 #####
                 # Order the textlines, taken all them together, in order to later merge
                 # in a single textbox textlines that so far form different textboxes
-                set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, 
+                set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz,
                                                           list_allcoords_textlines, margins)
-                
-                # Given the ordered textlines, group them in new textboxes, creating a 
+
+                # Given the ordered textlines, group them in new textboxes, creating a
                 # XML, This uses some criteria of distance between paragraphs
-                XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, 
+                XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline,
                                                     rescale_factor, centrall_ord, ind_page, dim_img)
-                
+
                 # Append to the new XML
                 XML_new.append(XML_enrich[0])
-                
-                
+
+
                 if flag_plots:
                     im_met2 = plot_tools.plot_horzvertlines(imarray_textblock, coord_horz, coord_vert_def)
                     im_met2 = plot_tools.plot_margins(im_met2, margins, ind_limits, gap_line = 1)
                     im_met3, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines)
-                    im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1])    
+                    im_met4 = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1])
                     im_met5 = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page)
-                    
+
                     # Create figure with 4 subplots, for showing all results
                     if flag_save_figs:
                         path_output_img = self.path_file + '/previews'
                         if flag_save_figs:
                             if not os.path.exists(path_output_img):
                                 os.makedirs(path_output_img)
-                
+
                     if flag_parallel:
                         if flag_save_figs:
                             name_pickle = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.pkl'
                             with open(name_pickle, 'wb') as f:  # Python 3: open(..., 'wb')
                                 pickle.dump([im_met2, im_met3, im_met4, im_met5], f)
-                    
+
                     else:
                         fig, axes = plt.subplots(1, 4, figsize=(30, 10))
                         ax = axes.ravel()
                         ax[0].axis('off')
-                        ax[0].imshow(im_met2) 
+                        ax[0].imshow(im_met2)
                         ax[1].axis('off')
                         ax[1].imshow(im_met3)
                         ax[2].axis('off')
-                        ax[2].imshow(im_met4)        
+                        ax[2].imshow(im_met4)
                         ax[3].axis('off')
                         ax[3].imshow(im_met5)
-                        
+
                         if flag_save_figs:
                             format_fig = 'png'
                             name_fig = path_output_img + '/' + self.name_wo_ext + '_page' + str(ind_page) + '.' + format_fig
                             fig.savefig(name_fig, format = format_fig, dpi = 200)
                             plt.close(fig)
-                
-        name_xml_prev = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corrprev.xml'  
-        
+
+        name_xml_prev = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corrprev.xml'
+
         tree = ET.ElementTree(XML_new)
         self.XML_main_corr = XML_new
         if not os.path.exists('./' + str(self.year)):
             os.makedirs('./' + str(self.year))
         tree.write(name_xml_prev, encoding = 'utf-8')
         XML_new = preproc_docs.get_text_onefile(self.XML_main_corr)
-        name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml' 
+        name_xml = str(self.year) + '/' + self.name_wo_ext + suffix_xml + 'corr.xml'
         tree = ET.ElementTree(XML_new)
         tree.write(name_xml, encoding = 'utf-8')
-        
+
         if flag_save:
             name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outcorrxml)
         else:
             print('Not saving to tar')
-            name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz'        
-        
+            name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz'
+
         self.name_outcorrxml = name_outcorrxml
         self.name_xml_corr = [name_tar, name_xml]
+        self._xml_ext(suffix_xml, self.name_outcorrxml)
         command = 'rm -rf ./' + str(self.year)
         #print(command)
-        utils_proc.call_with_out(command)        
-        
-        print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time)))  
+        utils_proc.call_with_out(command)
+
+        print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time)))
 
         #XML_tree = ET.parse(name_xml)
         #self.XML_main = XML_tree.getroot()
-    
+
     def _plot_generic_open(self, ind_page, suffix_xml, level_proc = 0,
                            name_outxml = '02_extractedxml'):
         # ind_page has to be a scalar
-        
+
         if 'imgobj' not in self.__dict__.keys():
             self.pdf2imgobj()
         if 'XML_main' not in self.__dict__.keys():
@@ -335,8 +344,8 @@ class Document:
                     XML_tree = ET.parse(h_xml)
                     self.XML_main = XML_tree.getroot()
             else:
-                self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)   
-            ind_abs = np.array([ind_page]).astype(int).reshape((-1,))  
+                self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)
+            ind_abs = np.array([ind_page]).astype(int).reshape((-1,))
         else:
             #print('Run this')
             self._get_pages()
@@ -344,7 +353,7 @@ class Document:
 
         #print(ind_abs, type(ind_abs))
         #print(self.XML_main, len(self.imgobj))
-        
+
         if ind_page > (len(self.XML_main) - 1):
             flag_error = 1
             return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error
@@ -357,65 +366,65 @@ class Document:
         XML_root = ET.Element('pages')
         XML_root.append(self.XML_main[ind_abs[0]])
         imarray = np.array(self.imgobj[ind_page])
-        
+
         bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64)
         dim_img = imarray.shape[:2]
-        
+
         _, coord_textline, all_font_sizes, info_font_sizes = self._draw_textbl(imarray = imarray, XML_root = XML_root,
-                                                                               textb_textl = 2)        
-        margins = [] 
-        ind_limits = [] 
+                                                                               textb_textl = 2)
+        margins = []
+        ind_limits = []
         label_textlines = []
         list_allcoords_textlines = []
         set_of_blocks = []
         XML_enrich = []
-        
+
         if level_proc > 0:
             coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page, 
                                                                                    flag_2col, flag_central)  
             
         if level_proc > 1:            
             _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page)
-           
-        if level_proc > 2:            
+
+        if level_proc > 2:
             #####
             # Obtain lateral margins
-            margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), 
-                                           coord_horz.astype(np.uint32))                
-            
-        if level_proc > 3:            
+            margins = preproc_docs.lateral_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32),
+                                           coord_horz.astype(np.uint32))
+
+        if level_proc > 3:
             # Top and bottom line
-            ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32), 
+            ind_limits = preproc_docs.bottomtop_margins(imarray, bbox_page, coord_vert_def.astype(np.uint32),
                                            coord_horz.astype(np.uint32))
-            
-        if level_proc > 4:            
+
+        if level_proc > 4:
             label_textlines, list_allcoords_textlines, relative_ref_textline, all_heights, vec_labels_textline = \
-                preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes)   
-            
-        if level_proc > 5:            
-            set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz, 
-                                                                   list_allcoords_textlines, margins)            
-            
-        if level_proc > 6:            
-            XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline, 
-                                                rescale_factor, centrall_ord, ind_page, dim_img)        
-        
+                preproc_docs.label_textblocks(np.copy(imarray), XML_root[0], bbox_page, margins, coord_vert_def, info_font_sizes)
+
+        if level_proc > 5:
+            set_of_blocks, centrall_ord = preproc_docs.order_textl(rescale_factor, coord_vert_def, coord_horz,
+                                                                   list_allcoords_textlines, margins)
+
+        if level_proc > 6:
+            XML_enrich = preproc_docs.group_textl_create_xml(XML_root, set_of_blocks, relative_ref_textline, vec_labels_textline,
+                                                rescale_factor, centrall_ord, ind_page, dim_img)
+
         # The last value returned is only to say that there was not any error during the execution. Before, if there are too many pages, we
         # send a 1 instead
         flag_error = 0
         return imarray, margins, ind_limits, label_textlines, list_allcoords_textlines, \
             set_of_blocks, XML_enrich, bbox_page, XML_root, ind_abs, flag_error
-            
+
     def _plot_obtainfromxml(self, ind_page, suffix_xml, name_outcorrxml = '04_correctedxml'):
-        
+
         if 'imgobj' not in self.__dict__.keys():
             self.pdf2imgobj()
-        if 'XML_main_corr' not in self.__dict__.keys():            
+        if 'XML_main_corr' not in self.__dict__.keys():
             name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outcorrxml + '.tar.gz'
             if os.path.isfile(name_tar):
                 name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml'
                 #print(name_xml)
-                if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]:    
+                if name_xml in utils_proc.get_list(self.year, self.folder_database, name_outcorrxml)[0]:
                     #print('Run this')
                     h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, name_outcorrxml)
                     XML_tree = ET.parse(h_xml)
@@ -424,13 +433,13 @@ class Document:
                     print('You need to have the tar file to use flag_compute = 0!')
                     flag_error = 1
                     return 0, 0, 0, 0, 0, 0, flag_error
-                    #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)   
+                    #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)
             else:
                 print('You need to have the tar file to use flag_compute = 0!')
                 flag_error = 1
                 return 0, 0, 0, 0, 0, 0, flag_error
-                #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)   
-            ind_abs = np.array([ind_page]).astype(int).reshape((-1,))  
+                #self.pdf2xml(pages = 'all', suffix_xml = suffix_xml, flag_save = 0)
+            ind_abs = np.array([ind_page]).astype(int).reshape((-1,))
         else:
             #print('Run this')
             self._get_pages()
@@ -438,19 +447,19 @@ class Document:
 
         #print(ind_abs, type(ind_abs))
         #print(self.XML_main, len(self.imgobj))
-        
+
         if ind_page > (len(self.XML_main_corr) - 1):
             flag_error = 1
             return 0, 0, 0, 0, 0, 0, flag_error
-        
+
         XML_root = ET.Element('pages')
         XML_root.append(self.XML_main_corr[ind_abs[0]])
         imarray = np.array(self.imgobj[ind_page])
-        
+
         bbox_page = np.array(XML_root[0].attrib['bbox'].split(',')).astype(np.float64)
-        dim_img = imarray.shape[:2]        
-        
-        ######        
+        dim_img = imarray.shape[:2]
+
+        ######
         # For obtaining label_textlines, list_allcoords_textlines
         coord_textline = np.array([]).reshape((4,0))
         label_textlines = dict()
@@ -463,7 +472,7 @@ class Document:
                     if 'type' in XML_root[0][ind_el][ind_line].attrib:
                         coord_textline_aux = np.array(XML_root[0][ind_el][ind_line].attrib['bbox'].split(',')).astype(np.float64)
                         coord_textline = np.concatenate((coord_textline, np.array(coord_textline_aux).reshape((4,1))), axis = 1)
-                    
+
                         type_textl = XML_root[0][ind_el][ind_line].attrib['type']
                         #print(ind_el)
                         if XML_root[0][ind_el].attrib['type_textbox'] == 'line':
@@ -480,18 +489,18 @@ class Document:
                             aux_type = np.array([count])
                             label_textlines[type_textl] = aux_type
                         count += 1
-        
+
         coord_textline, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, coord_textline, bbox_page)
-        
-        ##### 
+
+        #####
         # To obtain set_of_blocks. This variable simply contains the coordinates, and
-        # then a final row indicating the order (here are already ordered), and if it 
-        # is a line, which is indicated with a -1        
+        # then a final row indicating the order (here are already ordered), and if it
+        # is a line, which is indicated with a -1
         set_of_blocks_aux = np.concatenate((coord_textline, np.array(vec_textline_lines).reshape((1,-1))), axis = 0)
         set_of_blocks = dict()
         set_of_blocks[0] = set_of_blocks_aux
         #print(set_of_blocks.shape)
-                        
+
         # The last is the flag_error
         #print(imarray.shape, len(label_textlines), coord_textline.shape, len(set_of_blocks),
         #     len(XML_root), bbox_page.shape)
@@ -499,58 +508,60 @@ class Document:
         return imarray, label_textlines, coord_textline, set_of_blocks, XML_root, bbox_page, flag_error
 #                        imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error
 #                        imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error
-                        
 
-    def plot_orig_textb(self, range_pages = range(1), suffix_xml = '_data', 
+
+
+    def plot_orig_textb(self, range_pages = range(1), suffix_xml = '_data',
                          flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'):
-                
+
         if 'name_outxml' not in self.__dict__.keys():
             self.name_outxml = name_outxml
-            
+
         for ind_page in range_pages:
             imarray, margins, ind_limits, _, _, \
                 _, _, _, XML_root, _, flag_error = self._plot_generic_open(ind_page, suffix_xml, level_proc = 0,
                                                                name_outxml = self.name_outxml)
-            
             if flag_error:
                 print(str(ind_page) + ': non existing page!')
             else:
                 imarray_textblock, _ = self._draw_textbl(imarray = imarray, XML_root = XML_root)
 
-                self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file, 
-                                flag_plot, flag_save_figs)   
+                self._plot_save(imarray_textblock, 'Textboxes original', 'OrigTextb', ind_page, self.path_file,
+                                flag_plot, flag_save_figs)
 
-    def plot_margins_doc(self, range_pages = range(1), suffix_xml = '_data', 
+    def plot_margins_doc(self, range_pages = range(1), suffix_xml = '_data',
                          flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml'):
-        
+
         if 'name_outxml' not in self.__dict__.keys():
             self.name_outxml = name_outxml
-        
+
         for ind_page in range_pages:
             imarray, margins, ind_limits, _, _, \
                 _, _, _, _, _, flag_error= self._plot_generic_open(ind_page, suffix_xml, level_proc = 4,
                                                         name_outxml = self.name_outxml)
-
             if flag_error:
                 print(str(ind_page) + ': non existing page!')
-            else:            
+            else:
                 im_met = plot_tools.plot_margins(imarray, margins, ind_limits, gap_line = 1)
 
                 self._plot_save(im_met, 'Page margins', 'Margins', ind_page, self.path_file,
-                       flag_plot, flag_save_figs)             
-                
-    def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data', 
+                       flag_plot, flag_save_figs)
+
+    def plot_boxes_labels(self, range_pages = range(1), suffix_xml = '_data',
                          flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml',
                          name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_legend = 1):
-        
+
         if 'name_outxml' not in self.__dict__.keys():
-            self.name_outxml = name_outxml     
+            self.name_outxml = name_outxml
         if 'name_outcorrxml' not in self.__dict__.keys():
-            self.name_outcorrxml = name_outcorrxml              
-        
+            self.name_outcorrxml = name_outcorrxml
+
         name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
         for ind_page in range_pages:
-            if flag_compute or not os.path.isfile(name_tar): 
+            if flag_compute or not os.path.isfile(name_tar):
                 imarray, _, _, label_textlines, list_allcoords_textlines, _, _, _, _, _, flag_error = \
                     self._plot_generic_open(ind_page, suffix_xml, level_proc = 5,
                                             name_outxml = self.name_outxml)
@@ -558,80 +569,80 @@ class Document:
             else:
                 imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)
                 #print(len(array_elements))
-                
+
             if flag_error:
                 print(str(ind_page) + ': non existing page!')
-            else:             
-                im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines)     
+            else:
+                im_met, groups, colors = plot_tools.plot_labelled_boxes(imarray,label_textlines, list_allcoords_textlines)
                 self._plot_save_labels(im_met, 'Textlines labelled', 'TextlLabel', ind_page, groups, colors, self.path_file,
                                        flag_plot, flag_save_figs, flag_legend)
-                            
-                
-    def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data', 
+
+
+    def plot_textl_ordered(self, range_pages = range(1), suffix_xml = '_data',
                            flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml',
                            name_outcorrxml = '04_correctedxml', flag_compute = 0):
-        
+
         if 'name_outxml' not in self.__dict__.keys():
             self.name_outxml = name_outxml
         if 'name_outcorrxml' not in self.__dict__.keys():
-            self.name_outcorrxml = name_outcorrxml            
-        
+            self.name_outcorrxml = name_outcorrxml
+
         name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
         for ind_page in range_pages:
-            if flag_compute or not os.path.isfile(name_tar):            
+            if flag_compute or not os.path.isfile(name_tar):
                 imarray, _, _, _, list_allcoords_textlines, set_of_blocks, _, _, _, _, flag_error = \
                     self._plot_generic_open(ind_page, suffix_xml, level_proc = 6,
                                             name_outxml = self.name_outxml)
-            else: 
+            else:
                 imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_root, bbox_page, flag_error \
-                    = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)                   
-                                    
+                    = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)
+
             #print(set_of_blocks)
             if flag_error:
                 print(str(ind_page) + ': non existing page!')
-            else:            
-                im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1]) 
+            else:
+                im_met = plot_tools.plot_orderedtextl(imarray,set_of_blocks,list_allcoords_textlines.shape[1])
 
                 self._plot_save(im_met, 'Textlines ordered', 'TextlOrder', ind_page, self.path_file,
-                       flag_plot, flag_save_figs)            
-                
-    def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data', 
+                       flag_plot, flag_save_figs)
+
+    def plot_XMLcorrect(self, range_pages = range(1), suffix_xml = '_data',
                         flag_plot = 1, flag_save_figs = 0, name_outxml = '02_extractedxml',
                         name_outcorrxml = '04_correctedxml', flag_compute = 0, flag_lines_textl = 1):
         # flag_lines_textl, if 1, plots lines and textboxes, if 2, only lines, if 3, only textboxes
         if 'name_outxml' not in self.__dict__.keys():
             self.name_outxml = name_outxml
         if 'name_outcorrxml' not in self.__dict__.keys():
-            self.name_outcorrxml = name_outcorrxml              
-        
+            self.name_outcorrxml = name_outcorrxml
+
         name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
         for ind_page in range_pages:
-            if flag_compute or not os.path.isfile(name_tar):             
+            if flag_compute or not os.path.isfile(name_tar):
                 imarray, _, _, _, _, _, XML_enrich, bbox_page, _, _, flag_error = \
                     self._plot_generic_open(ind_page, suffix_xml, level_proc = 7,
                                             name_outxml = self.name_outxml)
-            else:                    
+            else:
                 imarray, label_textlines, list_allcoords_textlines, set_of_blocks, XML_enrich, bbox_page, flag_error \
-                    = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)                  
-            
+                    = self._plot_obtainfromxml(ind_page, suffix_xml, name_outcorrxml = name_outcorrxml)
+
             if flag_error:
                 print(str(ind_page) + ': non existing page!')
-            else:            
+            else:
                 im_met = plot_tools.plot_correctedXML(imarray, XML_enrich, bbox_page, flag_lines_textl)
 
                 self._plot_save(im_met, 'XML corrected', 'XMLcorrect', ind_page, self.path_file,
                        flag_plot, flag_save_figs)
-        
+
     def _plot_save(self, im_met, str_title, str_name, ind_page, folder_save = '',
                    flag_plot = 1, flag_save_figs = 0, dpi = 200):
         if flag_plot:
             fig, axes = plt.subplots(1, 1, figsize=(8, 10))
             axes.axis('off')
-            axes.imshow(im_met) 
+            axes.imshow(im_met)
             plt.title(str_title)
         if flag_save_figs:
             format_fig = 'png'
-            name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) 
+            name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc)
                         + '_page' + str(ind_page) + '.' + format_fig)
             fig.savefig(name_fig, format = format_fig, dpi = dpi)
             plt.close(fig)       
@@ -659,17 +670,17 @@ class Document:
                     coords[0] = in_coord
                     coords[1] += int(im_met.shape[1]/1.5)
                     coords[2] = in_coord + 10
-                    coords[3] += int(im_met.shape[1]/1.5)                    
-                im_met = plot_tools.lines_box(im_met, coords, colors[ind_g], thick_line = 6)        
+                    coords[3] += int(im_met.shape[1]/1.5)
+                im_met = plot_tools.lines_box(im_met, coords, colors[ind_g], thick_line = 6)
                 coords[0] += inc_page
                 coords[2] += inc_page
-        
+
         if flag_plot:
             fig, axes = plt.subplots(1, 1, figsize=(8, 10))
             axes.axis('off')
-            axes.imshow(im_met) 
+            axes.imshow(im_met)
             plt.title(str_title)
-        
+
         if flag_legend:
             coords = in_coord + np.array([0, 0, 10, 10])
             flag_notinto = 1
@@ -679,17 +690,90 @@ class Document:
                     coords[0] = in_coord
                     coords[1] += int(im_met.shape[1]/1.5)
                     coords[2] = in_coord + 10
-                    coords[3] += int(im_met.shape[1]/1.5)                    
-                plt.text(coords[1] + 10, coords[2], i_g, fontsize = 10, va = 'bottom', ha = 'left')      
+                    coords[3] += int(im_met.shape[1]/1.5)
+                plt.text(coords[1] + 10, coords[2], i_g, fontsize = 10, va = 'bottom', ha = 'left')
                 coords[0] += inc_page
-                coords[2] += inc_page   
-        
+                coords[2] += inc_page
+
         if flag_save_figs:
             format_fig = 'png'
-            name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) 
+            name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc)
                         + '_page' + str(ind_page) + '.' + format_fig)
             fig.savefig(name_fig, format = format_fig, dpi = dpi)
-            plt.close(fig)             
-            
-        
-                           
\ No newline at end of file
+            plt.close(fig)
+
+
+
+
+
+    def check_discussion(self):
+        utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
+        flag_discussion = utils_annot.check_if_discussion(self.name_meta[1])
+        command = 'rm -rf ./' + str(self.year)
+        #print(command)
+        utils_proc.call_with_out(command)
+
+        return flag_discussion
+
+
+
+    def annotate_xml(self, flag_save = 1, suffix_xml='_data', name_outxml = '02_extractedxml', name_outcorrxml='04_correctedxml', name_outannotxml='05_annotatedxml'):
+
+        start_time = time.time()
+        if 'name_outcorrxml' not in self.__dict__.keys():
+            self.name_outcorrxml = name_outcorrxml
+
+        if 'name_outxml' not in self.__dict__.keys():
+            self.name_outxml = name_outxml
+
+        if 'XML_main_corr' not in self.__dict__.keys():
+            print('no main corr')
+            name_tar = self.folder_database + '/' + str(self.year) + '/' + self.name_outcorrxml + '.tar.gz'
+            if os.path.isfile(name_tar):
+                name_xml = './' + str(self.year) + '/' + str(self.id_doc) + suffix_xml + 'corr.xml'
+                if name_xml in utils_proc.get_list(self.year, self.folder_database, self.name_outcorrxml)[0]:
+                    h_xml = utils_proc.get_handlerfile(name_xml, self.folder_database, self.name_outcorrxml)
+                    XML_tree = ET.parse(h_xml)
+                    self.XML_main_corr = XML_tree.getroot()
+                #else:
+                    #self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0,
+                        #pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml,
+                        #name_outcorrxml = self.name_outcorrxml)
+            #else:
+                ## TODO if already exists 02_extractedxml
+                #self.correct_xml(flag_plots = 0, flag_parallel = 0, flag_save_figs = 0,
+                    #pages = 'all', suffix_xml = '_data', name_outxml = self.name_outxml,
+                    #name_outcorrxml = self.name_outcorrxml)
+
+
+        print('we have a main corr XML file')
+        #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
+        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, bln_print=False)
+        self.XML_main_annot = XML_main_annot
+
+        # save xml file
+        name_xml = './' + str(self.year) + '/' + self.name_wo_ext + '.xml'
+        tree = ET.ElementTree(XML_main_annot)
+        if not os.path.exists('./' + str(self.year)):
+            os.makedirs('./' + str(self.year))
+        tree.write(name_xml, encoding = 'utf-8')
+
+        if flag_save:
+            name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outannotxml)
+        else:
+            print('Not saving to tar')
+            name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outannotxml + '.tar.gz'
+
+        self.name_outannotxml = name_outannotxml
+        self.name_annot_corr = [name_tar, name_xml]
+#        self._xml_ext(suffix_xml, self.name_outannotxml)
+        command = 'rm -rf ./' + str(self.year)
+        #print(command)
+        utils_proc.call_with_out(command)
+
+        print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time)))
+
+
+        command = 'rm -rf ./' + str(self.year)
+        #print(command)
+#        utils_proc.call_with_out(command)
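+
+    # Usage sketch (an assumption for illustration, not part of this patch's
+    # driver scripts): the two new methods are meant to be called per document,
+    # roughly as
+    #   doc = Document(year + '/' + id_doc + '.pdf', folder_database)
+    #   if doc.check_discussion():
+    #       doc.df_lastnames = df_lastnames  # per-year dataframe of last names
+    #       doc.annotate_xml(flag_save=1)
+    # See run_extract_discussions.py below for the actual loop over all files.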
diff --git a/src/python/preproc_docs.py b/src/python/preproc_docs.py
index 8b4349f037ab126cf60cad2098616713d79a2a0d..b9c9bbb0fb8f88e699daad571d980044a24063bb 100644
--- a/src/python/preproc_docs.py
+++ b/src/python/preproc_docs.py
@@ -6,7 +6,7 @@ Created on Fri Sep 28 13:39:10 2018
 @author: luissalamanca
 """
 
-# File for all the functions used for preprocessing. 
+# File for all the functions used for preprocessing.
 
 import numpy as np
 import os
@@ -46,42 +46,42 @@ import tables
 HEIGHT_CHAR = 12
 WIDTH_CHAR = 6
 
-def find_mainHorandCentral_Hough(img, coord, dim_bbox_page, 
+def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
                                  flag_2col, flag_central = 1):
     # Using the coordinates of the boxes, we put the rest to 0, and then estimate
     # the central line
-    # Here, since we use the image, we have to rely again on a ref00 in topleft, and 
+    # Here, since we use the image, we have to rely again on a ref00 in topleft, and
     # the corners in topleftbottomright
-    # We also look for horizontal lines 
+    # We also look for horizontal lines
     # We assume that we will only have one vertical line, and then many horizontal
     # lines, either spanning the whole image, or at both sides of the central line
-            
+
     coord, rescale_factor = adapt_coordtoimg(img, coord, dim_bbox_page)
     img_aux = np.abs(255 - img[:,:,0])
     img_aux[img_aux < 20] = 0
     img_aux[img_aux >= 20] = 255
     img_aux_in = np.copy(img_aux)
-    
-    
+
+
     width_resc = WIDTH_CHAR * rescale_factor[0,1]
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     gap_central = int(4 * width_resc)
     top_bbox_red = 0 #int(height_resc/2)
-    
+
     for ind in range(coord.shape[1]):
         img_aux[(coord[0,ind] + top_bbox_red):coord[2,ind],coord[1,ind]:coord[3,ind]] = 0
-    
+
     # Also remove possible marks and artefacts at the edges
     img_aux[:,:int(img_aux.shape[1]/20)] = 0
     img_aux[:int(img_aux.shape[0]/20),:] = 0
     img_aux[int(19 * img_aux.shape[0]/20):,:] = 0
-    img_aux[:,int(19 * img_aux.shape[1]/20):] = 0    
-    
+    img_aux[:,int(19 * img_aux.shape[1]/20):] = 0
+
     img_prev = np.copy(img_aux)
-    
+
     img_aux_rem = remove_small_objects(label(img_aux), 2 * width_resc)
     #img_aux = dilation(img_aux_rem, selem = np.ones((11,11)))
-    img_aux = dilation(img_aux_rem, selem = np.ones((5,5))) 
+    img_aux = dilation(img_aux_rem, selem = np.ones((5,5)))
     max_val = np.max(img_aux)
     if max_val > 0:
         img_aux_norm = (255 * img_aux/max_val).astype(np.uint8)
@@ -90,13 +90,13 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
     else:
         img_aux[:] = 0
     #print(np.unique(img_aux))
-    
+
     # Remove big objects, like the shields and other logos
     #img_label = label(img_aux)
 
     edges = canny(img_aux, 2, 1, 25)
     #img_cent = np.copy(img_aux)
-    
+
     if flag_2col:
         if flag_central:
             img_cent = np.copy(img_prev)
@@ -108,37 +108,37 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
             #lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = 2 * width_resc,
             #                                 line_gap = width_resc)
             lines_vert = probabilistic_hough_line(edges_cent, theta = theta, line_length = int(2 * width_resc),
-                                             line_gap = int(width_resc))            
+                                             line_gap = int(width_resc))
         else:
             sum_img_aux_in = np.sum(img_aux_in, axis = 0)
             sum_img_aux_in = sum_img_aux_in[int(2*img_aux.shape[1]/5):int(3*img_aux.shape[1]/5)]
-            
+
             #plt.plot(sum_img_aux_in)
             #sum_img_aux_in[sum_img_aux_in < np.max(sum_img_aux_in)/10] = 0
             # We need to subtract the baseline value, in order to account for
             # central headers and stuff like that
             sum_img_aux_in = sum_img_aux_in - np.min(sum_img_aux_in)
-            #not_end_vect = 1 
+            #not_end_vect = 1
             #while not_end_vect:
             ind_min_start = np.argwhere((sum_img_aux_in) < np.mean(sum_img_aux_in)/10)
-            ind_min_end = int(2*img_aux.shape[1]/5) + np.max(ind_min_start)   
+            ind_min_end = int(2*img_aux.shape[1]/5) + np.max(ind_min_start)
             ind_min_start = int(2*img_aux.shape[1]/5) + np.min(ind_min_start)
             ind_central = int((ind_min_start + ind_min_end)/2)
-            coord_vert_def = np.array([1, ind_central - int(width_resc/2), 
+            coord_vert_def = np.array([1, ind_central - int(width_resc/2),
                                    img_aux_in.shape[0], ind_central + int(width_resc/2)])
-            #print(lines_vert,img_aux.shape)  
-        
-    theta = np.linspace(-5*pi/8, -3* pi/8,num = 90)    
-    #theta = np.linspace(-9*pi/16, -7*pi/16,num = 90)    
+            #print(lines_vert,img_aux.shape)
+
+    theta = np.linspace(-5*pi/8, -3* pi/8,num = 90)
+    #theta = np.linspace(-9*pi/16, -7*pi/16,num = 90)
     #lines_horz = probabilistic_hough_line(edges, theta = theta, line_length = 2 * width_resc,
-    #                                 line_gap = width_resc)      
+    #                                 line_gap = width_resc)
     lines_horz = probabilistic_hough_line(edges, theta = theta, line_length = int(2 * width_resc),
-                                     line_gap = int(width_resc))  
-        
-    # These lines are given in a standard xy coordinate, with the corner in the 
-    # bottom left    
+                                     line_gap = int(width_resc))
+
+    # These lines are given in standard xy coordinates, with the origin at the
+    # bottom left
     lines_horz = np.transpose(np.asarray(lines_horz).reshape((len(lines_horz),4)))
-    
+
 
     lines_horz = np.concatenate((np.minimum(lines_horz[1,:],lines_horz[3,:]).reshape((1,lines_horz.shape[1])),
                                  np.minimum(lines_horz[0,:],lines_horz[2,:]).reshape((1,lines_horz.shape[1])),
@@ -150,15 +150,15 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
                                      np.minimum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])),
                                      np.maximum(lines_vert[1,:],lines_vert[3,:]).reshape((1,lines_vert.shape[1])),
                                      np.maximum(lines_vert[0,:],lines_vert[2,:]).reshape((1,lines_vert.shape[1])))).astype(np.int32)
-        
-    
+
+
     #lines_horz = transform_coord(lines_horz, dim_page = img_aux.shape, invert_xy = True)
-    #lines_vert = transform_coord(lines_vert, dim_page = img_aux.shape, invert_xy = True)      
-      
+    #lines_vert = transform_coord(lines_vert, dim_page = img_aux.shape, invert_xy = True)
+
     # First clean the vertical from unexpected outliers
     if flag_central:
-        sum_rows = np.sum(img_cent, axis = 0)/255    
-        ind_central = int(2*img.shape[1]/5) + np.argmax(sum_rows[int(2*img.shape[1]/5):int(3*img.shape[1]/5)]) 
+        sum_rows = np.sum(img_cent, axis = 0)/255
+        ind_central = int(2*img.shape[1]/5) + np.argmax(sum_rows[int(2*img.shape[1]/5):int(3*img.shape[1]/5)])
         ind_valid = np.intersect1d(np.argwhere([(ind_central - gap_central) < aux_l1 < (ind_central + gap_central) for aux_l1 in lines_vert[1,:]]),
                                     np.argwhere([(ind_central - gap_central) < aux_l2 < (ind_central + gap_central) for aux_l2 in lines_vert[3,:]]))
         if len(ind_valid):
@@ -169,19 +169,19 @@ def find_mainHorandCentral_Hough(img, coord, dim_bbox_page,
             coord_vert_def = np.array([0, img_aux.shape[1]/2 - width_resc, height_resc, img_aux.shape[1]/2 + width_resc])
 
     #ind_central = np.mean(coord_vert_def[[1,3]])
-    
+
     # And now, just iterate over the horizontal lines, merging them if required.
     return clean_horz_vert_lines(lines_horz, coord_vert_def, width_resc, height_resc,
                                  ind_central, gap_central, img_aux.shape)
-    
-    
+
+
 def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_resc,
                           ind_central, gap_central, dim_page):
     # We just iterate over all the horizontal lines, merging them if required
     coord_horz = np.array([]).reshape((4,0)).astype(np.int32)
     min_length_line = 2 * width_resc
-    
-    while coord_horz_pre.size > 3:        
+
+    while coord_horz_pre.size > 3:
         if coord_horz_pre.shape[1] == 1:
             coord_horz = np.concatenate((coord_horz, coord_horz_pre[:,0].reshape((4,1))), axis = 1)
             coord_horz_pre = np.array([])
@@ -190,33 +190,33 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res
             #print(coord_horz_curr)
             coord_horz_check = coord_horz_pre[:,1:]
             flag_stay = 1
-            while flag_stay:                
+            while flag_stay:
                 # Boxes to the right
                 ind_val1 = np.intersect1d(np.argwhere((abs(coord_horz_check[1,:] - coord_horz_curr[3]) < (width_resc * 10))),
                                          np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc))))
                 # Boxes to the left
                 ind_val2 = np.intersect1d(np.argwhere((abs(coord_horz_check[3,:] - coord_horz_curr[1]) < (width_resc * 10))),
-                                         np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc))))  
-                                
+                                         np.argwhere((abs(coord_horz_check[0,:] - coord_horz_curr[0]) < (height_resc))))
+
                 ind_val = np.unique(np.concatenate((ind_val1,ind_val2)))
                 if len(ind_val) > 0:
                     for i_b in range(len(ind_val)):
-                        coord_horz_curr = np.array([np.min((coord_horz_curr[0],coord_horz_check[0,ind_val[i_b]])), 
+                        coord_horz_curr = np.array([np.min((coord_horz_curr[0],coord_horz_check[0,ind_val[i_b]])),
                                                     np.min((coord_horz_curr[1],coord_horz_check[1,ind_val[i_b]])),
-                                                    np.max((coord_horz_curr[2],coord_horz_check[2,ind_val[i_b]])), 
+                                                    np.max((coord_horz_curr[2],coord_horz_check[2,ind_val[i_b]])),
                                                     np.max((coord_horz_curr[3],coord_horz_check[3,ind_val[i_b]]))])
                     coord_horz_check = coord_horz_check[:,np.setdiff1d(np.arange(coord_horz_check.shape[1]),
                                                                        ind_val)]
-                    #coord_horz_check = np.delete(coord_horz_check, ind_val, 1)    
+                    #coord_horz_check = np.delete(coord_horz_check, ind_val, 1)
                     if coord_horz_check.shape[1] == 0:
                         flag_stay = 0
                         coord_horz = np.concatenate((coord_horz, coord_horz_curr.reshape((4,1))), axis = 1)
                         coord_horz_pre = np.array([])
-                else: 
+                else:
                     flag_stay = 0
                     coord_horz = np.concatenate((coord_horz, coord_horz_curr.reshape((4,1))), axis = 1)
                     coord_horz_pre = coord_horz_check[:,:]
-    
+
     # Remove overlapping boxes
     coord_horz_def = np.array([]).reshape((4,0))
     while coord_horz.size > 3:
@@ -226,15 +226,15 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res
                                               np.argwhere((width_resc/2 + coord_horz_curr[2]) > coord_horz[2,:]),
                                               np.argwhere((width_resc/2 + coord_horz_curr[3]) > coord_horz[3,:])))
         ind_overlap = np.setdiff1d(ind_overlap,0)
-        
+
         coord_horz_def = np.concatenate((coord_horz_def, coord_horz_curr.reshape((4,1))), axis = 1)
         coord_horz = coord_horz[:,np.setdiff1d(np.arange(1,coord_horz.shape[1]),ind_overlap)]
         #coord_horz = np.delete(coord_horz, ind_overlap, 1)
-        
+
         if coord_horz.size == 4:
             coord_horz_def = np.concatenate((coord_horz_def, coord_horz.reshape((4,1))), axis = 1)
             coord_horz = np.array([0])
-    
+
     ind_val_long = np.argwhere((coord_horz_def[3,:] - coord_horz_def[1,:]) > (3 * (coord_horz_def[2,:] - coord_horz_def[0,:])))
     coord_horz_def = coord_horz_def[:,ind_val_long].reshape((4,ind_val_long.shape[0]))
 
@@ -245,35 +245,35 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res
         coord_horz_def = coord_horz_def[:, ind_val].reshape((4,ind_val.shape[0]))
     else:
         coord_horz_def = np.array([]).reshape((4,0))
-    
-   
+
+
     # To identify the topline
     '''
     ind_topline = identify_topline(coord_horz_def, width_resc, dim_page)
-    if str_page == 'firsts':                               
+    if str_page == 'firsts':
         # We correct the top of the vertical line in case it is cutting some of the horizontal lines
         ind_val_horz = reduce(np.intersect1d, (np.argwhere(coord_horz_def[1,:] < (ind_central - gap_central)),
                                                np.argwhere(coord_horz_def[3,:] > (ind_central + gap_central)),
                                                np.argwhere(coord_horz_def[0,:] > coord_vert_def[0])))
         ind_val_horz = np.setdiff1d(ind_val_horz, ind_topline)
-        
+
         coord_vert_def = np.array([np.max(np.concatenate((np.array([coord_vert_def[0]]),coord_horz_def[2,ind_val_horz]))),coord_vert_def[1],
-                                   coord_vert_def[2],coord_vert_def[3]])  
+                                   coord_vert_def[2],coord_vert_def[3]])
     elif str_page == 'lasts':
-        # We correct the bottom of the vertical line in case it is cutting some of the horizontal lines        
+        # We correct the bottom of the vertical line in case it is cutting some of the horizontal lines
         ind_val_horz = reduce(np.intersect1d, (np.argwhere(coord_horz_def[1,:] < (ind_central - gap_central)),
                                                np.argwhere(coord_horz_def[3,:] > (ind_central + gap_central)),
-                                               np.argwhere(coord_horz_def[2,:] < coord_vert_def[2])))        
+                                               np.argwhere(coord_horz_def[2,:] < coord_vert_def[2])))
         ind_val_horz = np.setdiff1d(ind_val_horz, ind_topline)
-        
+
         coord_vert_def = np.array([coord_vert_def[0],coord_vert_def[1],
-                                   np.min(np.concatenate((np.array([coord_vert_def[2]]),coord_horz_def[0,ind_val_horz]))),coord_vert_def[3]])  
+                                   np.min(np.concatenate((np.array([coord_vert_def[2]]),coord_horz_def[0,ind_val_horz]))),coord_vert_def[3]])
     '''
-    
+
     coord_vert_def[1] = np.max((coord_vert_def[1], int(ind_central - width_resc)))
     coord_vert_def[3] = np.min((coord_vert_def[3], int(ind_central + width_resc)))
-    
-    # Finally, remove short central lines, likely artefacts of the calculation 
+
+    # Finally, remove short central lines, likely artefacts of the calculation
     # of the central vertical line
     length_lines = coord_horz_def[3,:] - coord_horz_def[1,:]
     ind_wrong = reduce(np.intersect1d, (np.argwhere(length_lines < 2* min_length_line),
@@ -283,98 +283,98 @@ def clean_horz_vert_lines(coord_horz_pre, coord_vert_def, width_resc, height_res
     if len(ind_val):
         coord_horz_def = coord_horz_def[:, ind_val].reshape((4,ind_val.shape[0]))
     else:
-        coord_horz_def = np.array([]).reshape((4,0))    
-    
-    return coord_vert_def, coord_horz_def  
+        coord_horz_def = np.array([]).reshape((4,0))
+
+    return coord_vert_def, coord_horz_def
 
 def identify_topline(coord_horz, width_resc, dim_page):
     # Two rules for identifying the top line
     ind_topline = reduce(np.intersect1d, (np.argwhere(coord_horz[2,:] < dim_page[0]/8),
                                                np.argwhere((coord_horz[3,:] - coord_horz[1,:]) > width_resc * 60)))
-    
+
     return ind_topline
 
 def lateral_margins(img, dim_bbox_page, coord_vert, coord_horz):
-    
+
     coord, rescale_factor = adapt_coordtoimg(img, dim_bbox_page, dim_bbox_page)
     width_resc = WIDTH_CHAR * rescale_factor[0,1]
     gap_central = int(3 * width_resc)
     thres_margin = 0.1
-    
+
     img_aux = np.abs(255 - img[:,:,0])
     for ind in range(coord_horz.shape[1]):
         img_aux[coord_horz[0,ind]:coord_horz[2,ind],coord_horz[1,ind]:coord_horz[3,ind]] = 0
-    
+
     img_aux[coord_vert[0]:coord_vert[2],coord_vert[1]:coord_vert[3]] = 0
     central_line = (coord_vert[1] + coord_vert[3])/2
-    
+
     # Also remove possible marks and artefacts at the edges
     img_aux[:,:gap_central] = 0
     img_aux[:int(gap_central/2),:] = 0
     img_aux[(img_aux.shape[1] - gap_central):,:] = 0
-    img_aux[:,(img_aux.shape[1] - int(gap_central/2)):] = 0 
-    
+    img_aux[:,(img_aux.shape[1] - int(gap_central/2)):] = 0
+
     sum_imarray_aux = np.sum(img_aux, axis = 0)
     sum_imarray_aux = 1000*sum_imarray_aux.astype(np.float64)/np.max(sum_imarray_aux)
     mean_val_rows_left = np.mean(sum_imarray_aux[:int(central_line - gap_central)])
     mean_val_rows_right = np.mean(sum_imarray_aux[int(central_line + gap_central):])
-        
+
     left_margin = np.min(np.argwhere(sum_imarray_aux > thres_margin * mean_val_rows_left))
     right_margin = np.max(np.argwhere(sum_imarray_aux > thres_margin * mean_val_rows_right))
-    
+
     return left_margin, right_margin, left_margin/rescale_factor[0,1], right_margin/rescale_factor[0,1]
 
 def bottomtop_margins(img, dim_bbox_page, coord_vert, coord_horz):
 
     val_thres = 300 # In this case we don't use the mean of sum_cols because we have
-                    
+
     coord, rescale_factor = adapt_coordtoimg(img, dim_bbox_page, dim_bbox_page)
     img_aux = np.abs(255 - img[:,:,0])
-    
+
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     width_resc = WIDTH_CHAR * rescale_factor[0,1]
     gap_central = int(3 * width_resc)
-        
+
     for ind in range(coord_horz.shape[1]):
         img_aux[coord_horz[0,ind]:coord_horz[2,ind],coord_horz[1,ind]:coord_horz[3,ind]] = 0
-    
+
     img_aux[coord_vert[0]:coord_vert[2],coord_vert[1]:coord_vert[3]] = 0
-    
+
     sum_cols = np.sum(img_aux, axis = 1)/255
     sum_cols = 1000 * sum_cols/np.max(sum_cols)
-    
+
     # Now, limit by using the horizontal lines
     ind_topline = identify_topline(coord_horz, width_resc, img_aux.shape)
-    
+
     if len(ind_topline) > 0:
         ind_min_textbox = np.max(coord_horz[2,ind_topline])
         sum_cols[:ind_min_textbox] = 0
-        
+
     #plt.figure()
     #plt.plot(sum_cols)
-    ind_limits = np.array([np.min(np.argwhere(sum_cols > val_thres)), 
+    ind_limits = np.array([np.min(np.argwhere(sum_cols > val_thres)),
                            np.max(np.argwhere(sum_cols > val_thres))])
-    
-    return ind_limits  
+
+    return ind_limits
 
 def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_font_sizes):
-    
+
     # In xml_page the levels are: xml_page[i][j][k], i for blocks, j for textlines
     # and k for characters
-    
+
     coord, rescale_factor = adapt_coordtoimg(img, bbox_page, bbox_page)
     list_coords_blocks = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_page[:-2]]).astype(np.float64))
     list_coords_blocks, rescale_factor = adapt_coordtoimg(img, list_coords_blocks, bbox_page)
-    
+
     font_main_block = info_font_sizes[0, np.argmax(info_font_sizes[1,:])]
     thres_font = font_main_block/5 # To compensate for error in the fontsize between columns
     width_resc = WIDTH_CHAR * rescale_factor[0,1]
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     gap_central = int(2 * width_resc)
     indentation = int(4 * width_resc)
-    
+
     ind_central = (coord_vert_def[3] + coord_vert_def[1])/2
-    
+
     # First pass just to discover main blocks
     list_col1 = list()
     list_col2 = list()
@@ -383,25 +383,25 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
     all_mean_heights = np.array([]).reshape((1,0))
     list_allcoords_textlines = np.array([]).reshape((4,0))
     relative_ref_textline = np.array([], dtype = np.uint32).reshape((3,0))
-    
+
     count_text = 0
-    
+
     for ind_block in range(len(xml_page)-2):
         xml_block = xml_page[ind_block]
         list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:]
                                                       if 'bbox' in o.attrib]).astype(np.float64))
-        #list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:]]).astype(np.float64))    
+        #list_coords_textline = np.transpose(np.array([o.attrib['bbox'].split(',') for o in xml_block[:]]).astype(np.float64))
         if len(list_coords_textline)>3:
             list_coords_textline
             list_coords_textline, rescale_factor = adapt_coordtoimg(img, list_coords_textline, bbox_page)
             list_allcoords_textlines = np.concatenate((list_allcoords_textlines, list_coords_textline), axis = 1)
             relative_ref_textline_aux = np.zeros((3,list_coords_textline.shape[1]))
-            
+
             relative_ref_textline_aux[0,:] = count_text + np.arange(list_coords_textline.shape[1])
             relative_ref_textline_aux[1,:] = ind_block
             relative_ref_textline_aux[2,:] = np.arange(list_coords_textline.shape[1])
             relative_ref_textline = np.concatenate((relative_ref_textline,relative_ref_textline_aux.astype(np.uint32)), axis = 1)
-            
+
             for ind_textl in range(list_coords_textline.shape[1]):
                 all_heights = np.array([])
                 xml_textline = xml_block[ind_textl]
@@ -412,10 +412,10 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
                             all_heights = np.append(all_heights, float(xml_text.attrib['size']))
                     #fontsize = fontsize_fromtextline(img[bbox_textline[0]:bbox_textline[2],
                     #                                        bbox_textline[1]:bbox_textline[3],0])
-        
+
                     fontsize = np.average(all_heights)
                     all_mean_heights = np.append(all_mean_heights, fontsize)
-                    
+
                     # Normal font
                     #if ((font_main_block - thres_font) < mean_height < (font_main_block + thres_font)):
                     if ((font_main_block - thres_font) < fontsize < (font_main_block + thres_font)):
@@ -426,18 +426,18 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
                             if len(xml_block[0]) < 12:
                                 list_pagen.append(count_text)
                             else:
-                                list_textinheader.append(count_text)                    
+                                list_textinheader.append(count_text)
                         elif ((bbox_textline[1] > (margins[0] - indentation)) and (bbox_textline[3] < (ind_central + gap_central))):
                             list_col1.append(count_text)
                         # Right side of the central line
-                        elif ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central))): 
+                        elif ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central))):
                             list_col2.append(count_text)
                     count_text += 1
-                
+
     discovered_blocks = np.concatenate((np.array(list_col1),np.array(list_col2),
                                         np.array(list_pagen),np.array(list_textinheader)))
     blocks_left = np.setdiff1d(np.arange(list_allcoords_textlines.shape[1]),discovered_blocks)
-    
+
     if len(list_col1):
         bbox_col1 = np.array([np.min(list_allcoords_textlines[0,list_col1]),
                               np.min(list_allcoords_textlines[1,list_col1]),
@@ -445,17 +445,17 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
                               np.max(list_allcoords_textlines[3,list_col1])])
     else:
         bbox_col1 = np.array([0,0,10,10]) # Dummy value
-        
+
     if len(list_col2):
         bbox_col2 = np.array([np.min(list_allcoords_textlines[0,list_col2]),
                               np.min(list_allcoords_textlines[1,list_col2]),
                               np.max(list_allcoords_textlines[2,list_col2]),
                               np.max(list_allcoords_textlines[3,list_col2])])
     else:
-        bbox_col2 = np.array([0,0,10,10]) # Dummy value    
+        bbox_col2 = np.array([0,0,10,10]) # Dummy value
 
     list_header = list()
-    list_header_singlecol = list()   
+    list_header_singlecol = list()
     list_footnote = list()
     list_notidentified = list()
     for ind_textline in blocks_left:
@@ -463,7 +463,7 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
         if xml_textline.tag == 'textline':
             bbox_textline = list_allcoords_textlines[:,ind_textline]
             # Small fontsize and below current bboxes of main blocks
-            if ((all_mean_heights[ind_textline] < (font_main_block - thres_font)) and 
+            if ((all_mean_heights[ind_textline] < (font_main_block - thres_font)) and
                 (bbox_textline[2] > bbox_col1[2]) and (bbox_textline[2] > bbox_col2[2])):
                 list_footnote.append(ind_textline)
             # Large fontsizes
@@ -473,7 +473,7 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
                     list_header.append(ind_textline)
                 # To the left or right of the central line
                 elif (((bbox_textline[1] > (margins[0] - indentation)) and (bbox_textline[3] < (ind_central + gap_central))) or
-                      ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central)))): 
+                      ((bbox_textline[1] > (ind_central - indentation)) and (bbox_textline[3] < (margins[1] + gap_central)))):
                     list_header_singlecol.append(ind_textline)
             # Standard fontsize
             elif ((font_main_block - thres_font) < all_mean_heights[ind_textline] < (font_main_block + thres_font)):
@@ -483,13 +483,13 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
                     list_col1.append(ind_textline)
                 # Contained into the bbox of the right column
                 elif (((bbox_col2[0] - height_resc) < bbox_textline[0]) and ((bbox_col2[1] - width_resc) < bbox_textline[1])
-                        and ((bbox_col2[2] + height_resc) > bbox_textline[2]) and ((bbox_col2[3] + width_resc) > bbox_textline[3])):                    
+                        and ((bbox_col2[2] + height_resc) > bbox_textline[2]) and ((bbox_col2[3] + width_resc) > bbox_textline[3])):
                     list_col2.append(ind_textline)
             else:
                 list_notidentified.append(ind_textline)
-    
+
     label_textlines = dict()
-    label_textlines['text_col1'] = list_col1    
+    label_textlines['text_col1'] = list_col1
     label_textlines['text_col2'] = list_col2
     label_textlines['footnote'] = list_footnote
     label_textlines['pagen'] = list_pagen
@@ -497,8 +497,8 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
     label_textlines['header'] = list_header
     label_textlines['header_singlecol'] = list_header_singlecol
     label_textlines['notidentified'] = list_notidentified
-    
-    vec_labels_textline = np.zeros(list_allcoords_textlines.shape[1]).astype(np.str)             
+
+    vec_labels_textline = np.zeros(list_allcoords_textlines.shape[1]).astype(np.str)
     vec_labels_textline[list_col1] = 'text_col1'
     vec_labels_textline[list_col2] = 'text_col2'
     vec_labels_textline[list_footnote] = 'footnote'
@@ -507,56 +507,56 @@ def label_textblocks(img, xml_page, bbox_page, margins, coord_vert_def, info_fon
     vec_labels_textline[list_header] = 'header'
     vec_labels_textline[list_header_singlecol] = 'header_singlecol'
     vec_labels_textline[list_notidentified] = 'notidentified'
-    
-    # relative_ref_textline: three rows with the following, the aboslute reference 
+
+    # relative_ref_textline: three rows containing the absolute reference
     # for the textline, the number of the block, and the number of the textline inside
     # that block
     return label_textlines, list_allcoords_textlines, relative_ref_textline, all_mean_heights, vec_labels_textline
 
-          
+
 def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textlines, margins):
-    # Two steps, first ordering the textlines, grouping them in big blocks separated 
+    # Two steps, first ordering the textlines, grouping them in big blocks separated
     # by horizontal lines. Then, inside these groups, we group them in textboxes,
     # incorporating this into the XML
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     widht_resc = WIDTH_CHAR * rescale_factor[0,1]
-    
+
     gap_central = 3 * widht_resc
     gap_row = height_resc/2
-    
-    # This parameters is intended for removing artefacts such as small dots in the 
+
+    # This parameter is intended for removing artefacts such as small dots in the
     # text. But we have to be careful, as we can remove valuable characters.
     # I first set a value of 3 * width_resc/4
-    min_width_textl = 6 * widht_resc/4 
-        
+    min_width_textl = 6 * widht_resc/4
+
     central_line = (coord_vert_def[3] + coord_vert_def[1])/2
     array_coords_textl = np.concatenate((list_allcoords_textlines[:,:],
                                         np.arange(list_allcoords_textlines.shape[1]).reshape((1,list_allcoords_textlines.shape[1]))))
-    
+
     # Clean from too-thin lines, which are probably just artefacts
     all_widths = array_coords_textl[3,:] - array_coords_textl[1,:]
     ind_valid = np.argwhere(all_widths > min_width_textl)
     array_coords_textl = array_coords_textl[:,ind_valid].reshape((5,len(ind_valid)))
-    
+
     ind_centralines = np.intersect1d(np.argwhere(coord_horz[1,:] < (central_line - gap_central)),
                                      np.argwhere(coord_horz[3,:] > (central_line + gap_central)))
     ind_sepfootnotes = np.intersect1d(np.argwhere(coord_horz[1,:] < (margins[0] + 2 * widht_resc)),
-                                      np.argwhere(coord_horz[3,:] < (central_line - gap_central))) 
+                                      np.argwhere(coord_horz[3,:] < (central_line - gap_central)))
     ind_centralines = np.union1d(ind_centralines,ind_sepfootnotes)
     ind_collines = np.setdiff1d(np.arange(coord_horz.shape[1]),ind_centralines)
-    
+
     array_coords_centrall = coord_horz[:,ind_centralines]
     array_coords_coll = coord_horz[:,ind_collines]
     array_coords_coll = np.concatenate((array_coords_coll,
                                         -1 * np.ones(array_coords_coll.shape[1]).reshape((1,array_coords_coll.shape[1]))))
-    
-    not_visited = 1    
+
+    not_visited = 1
     toprow = 0
     count_b = 0
     set_of_blocks = dict()
     array_coords_centrall_ord = np.array([]).reshape((4,0))
     while not_visited:
-        
+
         if array_coords_centrall.size > 3:
             bottomrow = np.min(array_coords_centrall[0,:])
             ind_bottomrow = np.argmin(array_coords_centrall[0,:])
@@ -573,47 +573,47 @@ def order_textl(rescale_factor, coord_vert_def, coord_horz, list_allcoords_textl
         coord_cat = np.concatenate((array_coords_textl[:,ind_textl_proc].reshape(5,len(ind_textl_proc)),
                                     array_coords_coll[:,ind_lines_proc].reshape(5,len(ind_lines_proc))),
                                     axis = 1)
-        
+
         if coord_cat.size > 0:
             flag_col = 1
-            ind_currcord = topmost_left_box(coord_cat, gap_row, max_col = central_line)   
+            ind_currcord = topmost_left_box(coord_cat, gap_row, max_col = central_line)
             if ind_currcord == -1:
-                ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line)  
+                ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line)
                 flag_col = 2
-                
+
             order_coords = np.array([]).reshape(5,0)
             while coord_cat.size > 4:
                 order_coords = np.concatenate((order_coords,coord_cat[:,ind_currcord].reshape(5,1)), axis = 1)
                 curr_coord = coord_cat[:,ind_currcord]
                 coord_cat = np.delete(coord_cat,ind_currcord,1)
                 if coord_cat.size > 4:
-                    if flag_col == 1:                
+                    if flag_col == 1:
                         ind_currcord = next_textline_samerow(coord_cat, gap_row, curr_coord, max_col = central_line)
-                        
+
                         if ind_currcord == -1:
                             ind_currcord = next_textline_samecol(coord_cat, gap_row, max_col = central_line)
-                        
+
                         if ind_currcord == -1 :
-                            ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line)   
+                            ind_currcord = topmost_left_box(coord_cat, gap_row, min_col = central_line)
                             flag_col = 2
-                                            
+
                     elif flag_col == 2:
                         ind_currcord = next_textline_samerow(coord_cat, gap_row, curr_coord, min_col = central_line)
-                        
+
                         if ind_currcord == -1:
                             ind_currcord = next_textline_samecol(coord_cat, gap_row, min_col = central_line)
-                        
+
                         if ind_currcord == -1 :
                             flag_col = 1
                             ind_currcord = 0
-                    
+
         else:
-             order_coords = np.array([]).reshape(5,0) 
-             
+             order_coords = np.array([]).reshape(5,0)
+
         toprow = np.copy(bottomrow)
         set_of_blocks[count_b] = order_coords
         count_b += 1
-        
+
     return set_of_blocks, array_coords_centrall_ord
 
 def topmost_left_box(coords, gap_row, min_col = 0, max_col = 10000):
@@ -629,13 +629,13 @@ def topmost_left_box(coords, gap_row, min_col = 0, max_col = 10000):
         return curr_ind
     else:
         return -1
-    
+
 def next_textline_samerow(coords, gap_row, curr_coord, min_col = 0, max_col = 10000):
     curr_row = curr_coord[2]
     #ind_valid = np.intersect1d(np.argwhere(coords[1,:] < max_col),
-    #                          np.argwhere(coords[3,:] > min_col))    
+    #                          np.argwhere(coords[3,:] > min_col))
     ind_valid = np.intersect1d(np.argwhere(coords[3,:] < (max_col + gap_row)),
-                              np.argwhere(coords[1,:] > (min_col - gap_row)))      
+                              np.argwhere(coords[1,:] > (min_col - gap_row)))
     if len(ind_valid):
         min_row = np.intersect1d(np.argwhere(coords[2,ind_valid] > (curr_row - gap_row)),
                                  np.argwhere(coords[2,ind_valid] < (curr_row + gap_row)))
@@ -646,41 +646,41 @@ def next_textline_samerow(coords, gap_row, curr_coord, min_col = 0, max_col = 10
         else:
             return -1
     else:
-        return -1 
-    
+        return -1
+
 def next_textline_samecol(coords, gap_row, min_col = 0, max_col = 10000):
     #print(coords, max_col, min_col)
     #ind_valid = np.intersect1d(np.argwhere(coords[1,:] < max_col),
-    #                           np.argwhere(coords[3,:] > min_col))    
+    #                           np.argwhere(coords[3,:] > min_col))
     ind_valid = np.intersect1d(np.argwhere(coords[3,:] < (max_col + gap_row)),
-                              np.argwhere(coords[1,:] > (min_col - gap_row)))      
+                              np.argwhere(coords[1,:] > (min_col - gap_row)))
     if len(ind_valid):
         min_row = np.min(coords[2,ind_valid])
         min_row = np.intersect1d(np.argwhere(coords[2,ind_valid] > (min_row - gap_row)),
-                                 np.argwhere(coords[2,ind_valid] < (min_row + gap_row)))     
+                                 np.argwhere(coords[2,ind_valid] < (min_row + gap_row)))
         ind_valid_min = ind_valid[min_row]
         ind_next_textl = ind_valid_min[np.argmin(coords[1,ind_valid_min])]
         return ind_next_textl
     else:
-        return -1     
+        return -1
 
 
 def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescale_factor,
                            centrall_ord, ind_page, dim_img):
-    
+
     height_resc = HEIGHT_CHAR * rescale_factor[0,1]
     widht_resc = WIDTH_CHAR * rescale_factor[0,1]
-        
+
     max_inrow_sep = 4 * widht_resc
-    max_incol_sep = 1 * height_resc    
+    max_incol_sep = 1 * height_resc
     gap_row = height_resc/2
     similarity_fonts = 0.95
     indentation = 2 * widht_resc
-    
-    centrall_ord_trans = transform_coord_toorig(centrall_ord, dim_page = dim_img, invert_xy = True, 
+
+    centrall_ord_trans = transform_coord_toorig(centrall_ord, dim_page = dim_img, invert_xy = True,
                                                 rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft',
                                                 refCorners = 'topleftbottomright')
-    
+
     # Start creating the xml
     xml_e = []
     xml_e = ET.Element('pages')
@@ -689,21 +689,21 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal
     page_el.attrib['bbox'] = xml_t[0].attrib['bbox']
     page_el.attrib['rotate'] = '0'
     xml_e.append(page_el)
-    
+
     val_type_col1 = {'text_col1', 'notidentified', 'header_singlecol', 'text_inheader'}
     val_type_col2 = {'text_col2', 'notidentified', 'header_singlecol', 'text_inheader'}
-    
-    
+
+
     count_b = 0
     text_b = ET.SubElement(page_el, 'textbox')
-    text_b.attrib['id'] = str(count_b)    
-    text_b.attrib['block'] = '0'    
+    text_b.attrib['id'] = str(count_b)
+    text_b.attrib['block'] = '0'
     for ind_b in range(len(set_of_blocks)):
         all_el = set_of_blocks[ind_b].astype(np.int64)
         all_bbox = np.array([]).reshape((4,0))
         for ind_c in range(all_el.shape[1]):
-            curr_el = all_el[:,ind_c]  
-            flag_copy_textb = 1   
+            curr_el = all_el[:,ind_c]
+            flag_copy_textb = 1
             # If it is a textline with text
             if curr_el[4] > -1:
                 all_bbox = np.concatenate((all_bbox, curr_el[:4].reshape((4,1))), axis = 1)
@@ -713,12 +713,12 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal
                 text_l.attrib['type'] = type_textl
                 text_b.append(text_l)
                 type_textbox = 'text'
-                
+
                 # To check if it satisfies the conditions for being a new textbox
                 if ind_c < (all_el.shape[1] - 1):
-                    next_el = all_el[:,ind_c + 1] 
-                    if next_el[4] > -1:                  
-                        if (((type_textl in val_type_col1) and (labels_textl[int(next_el[4])] in val_type_col1)) 
+                    next_el = all_el[:,ind_c + 1]
+                    if next_el[4] > -1:
+                        if (((type_textl in val_type_col1) and (labels_textl[int(next_el[4])] in val_type_col1))
                             or ((type_textl in val_type_col2) and (labels_textl[int(next_el[4])] in val_type_col2))
                             or (type_textl == labels_textl[int(next_el[4])])):
                             # Object to the right or beneath
@@ -728,33 +728,33 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal
                                 # Accounting for footnotes or other stuff
                                 curr_fontsize = curr_el[3] - curr_el[1]
                                 next_fontsize = next_el[3] - next_el[1]
-                                if ((curr_fontsize - next_fontsize * similarity_fonts) < curr_fontsize < 
+                                if ((curr_fontsize - next_fontsize * similarity_fonts) < curr_fontsize <
                                     (curr_fontsize + next_fontsize * similarity_fonts)):
                                     # Finally, account for indentation
                                     if ((np.min(all_bbox[1,:]) + indentation) > next_el[1]):
                                         flag_copy_textb = 0
-                                
-                # Attributes and stuff in case we need to store as textbox      
+
+                # Attributes to set in case we need to store this as a textbox
                 if flag_copy_textb:
                     bbox_text_b = np.array([np.min(all_bbox[0,:]),np.min(all_bbox[1,:]),
                                             np.max(all_bbox[2,:]),np.max(all_bbox[3,:])])
-                    bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, 
+                    bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True,
                             rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft',
                             refCorners = 'topleftbottomright')
                     all_bbox = np.array([]).reshape((4,0))
-            # Instead, if we have a line        
+            # Instead, if we have a line
             else:
                 bbox_text_b = curr_el[:4]
                 text_l = ET.SubElement(text_b, 'textline')
                 text_l.attrib['type'] = 'col_lines'
-                bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True, 
+                bbox_text_b = transform_coord_toorig(bbox_text_b.reshape((4,1)), dim_page = dim_img, invert_xy = True,
                                             rescale = True, scale_fact = rescale_factor.reshape((2,1)), ref00 = 'topleft',
                                             refCorners = 'topleftbottomright')
                 text_l.attrib['bbox'] = np.array2string(bbox_text_b[:].reshape((1,4)), precision = 3, separator = ',')[2:-2]
                 type_textbox = 'line'
-            
+
             # Creating the new textbox
-            if flag_copy_textb:    
+            if flag_copy_textb:
                 text_b.attrib['bbox'] = np.array2string(bbox_text_b[:].reshape((1,4)), precision = 3, separator = ',')[2:-2]
                 text_b.attrib['type_textbox'] = type_textbox
                 count_b += 1
@@ -762,10 +762,10 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal
                     pass
                 else:
                     text_b = ET.SubElement(page_el, 'textbox')
-                    text_b.attrib['id'] = str(count_b)    
-                    text_b.attrib['block'] = str(ind_b) 
+                    text_b.attrib['id'] = str(count_b)
+                    text_b.attrib['block'] = str(ind_b)
                 all_bbox = np.array([]).reshape((4,0))
-                
+
         if (ind_b < (len(set_of_blocks) - 1)):
             text_l = ET.SubElement(text_b, 'textline')
             text_l.attrib['type'] = 'central_lines'
@@ -775,20 +775,20 @@ def group_textl_create_xml(xml_t, set_of_blocks, ref_textl, labels_textl, rescal
             text_b.attrib['type_textbox'] = 'line'
             count_b += 1
             text_b = ET.SubElement(page_el, 'textbox')
-            text_b.attrib['id'] = str(count_b)    
-            text_b.attrib['block'] = str(ind_b) 
+            text_b.attrib['id'] = str(count_b)
+            text_b.attrib['block'] = str(ind_b)
             all_bbox = np.array([]).reshape((4,0))
-    
-    
-    # Just add the two final elements from the original xml    
+
+
+    # Just add the two final elements from the original xml
     page_el.append(xml_t[0][-2]) # Figure
     page_el.append(xml_t[0][-2]) # Layout
-    
+
     return xml_e
 
- 
+
 def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_col2')):
-    
+
     # helper function to clean text
     # !!! so far only removing new lines and primitive dehyphenation
     def clean_text(text):
@@ -796,10 +796,11 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c
         text = text.replace('\n', ' ')
 
         # account for hyphenation (not completely correct...)
+        # TODO: needs to be improved
         text = text.replace('- ', '')
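+        # A possible refinement (sketch, not applied here; would need `import re`):
+        # only merge when the continuation starts with a lowercase letter, so that
+        # genuine hyphenated compounds keep their hyphen:
+        #   text = re.sub(r'(\w)- (\w)',
+        #                 lambda m: m.group(1) + m.group(2)
+        #                 if m.group(2).islower() else m.group(0), text)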
 
         return text
-    
+
     # initialize textbox count and empty dictionary
 
     XML_new = copy.deepcopy(XML_root)
@@ -814,7 +815,7 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c
             if (textbox.tag == 'textbox'):
                 if 'type_textbox' in textbox.attrib.keys():
                     if (textbox.attrib['type_textbox'] == 'text'):
-                        
+
                         # initialize string
 
                         #print(textbox.tag, textbox.attrib)
@@ -827,7 +828,7 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c
                             if textline.tag == 'textline':
                             #print(textline.tag, textline.attrib)
                             # for every text (actually just a letter)
-            
+
                                 for ind_ch, text in enumerate(textline):
                                     #print(ind_ch, text.text, len(textline), len(XML_new[ind_p][ind_t][ind_tl]))
                                     # extend string
@@ -847,7 +848,6 @@ def get_text_onefile(XML_root, flag_all = 1, valid_types = ('text_col1', 'text_c
                                 complete_text += '[/font]'
                                 complete_text = clean_text(complete_text)
                                 XML_new[ind_p][ind_t][ind_tl].text = complete_text
-            
-        
+
+
     return XML_new
-    
\ No newline at end of file
diff --git a/src/python/run_correct_meta.py b/src/python/run_correct_meta.py
index 6fa7429475d93fb1b5f1bad4c16ea43e85e6d5f6..0a3788677845ea3fc351755623e82f8353f127a7 100755
--- a/src/python/run_correct_meta.py
+++ b/src/python/run_correct_meta.py
@@ -38,7 +38,7 @@ files_proc, _ = utils_proc.get_list(year_tocomp, folder_database, name_tar_file)
 
 list_proc = list()
 for infile in files_proc:
-    
+
     # 8 is always the length of the id code
     infile_aux = year_tocomp + '/' + infile.split('/')[-1][:8] + '.pdf'
     if infile_aux not in list_proc:
@@ -50,5 +50,5 @@ for infile in files_proc:
             print('Meta corrected %s' % infile)
         except:
             print("Meta to correct %s prompted an error" % infile)
-    
+
 print('Total time for correcting meta of year %d: %f' % (int(year_tocomp) ,(time.time() - t1)))
diff --git a/src/python/run_correctxml.py b/src/python/run_correctxml.py
index 45543847c34d783fb8792afb882c159f7a69ed5a..ff0710cb6d532daa9e60fb5a5d9e5c25923e7507 100644
--- a/src/python/run_correctxml.py
+++ b/src/python/run_correctxml.py
@@ -38,9 +38,9 @@ files_proc, _ = utils_proc.get_list(year_tocomp, folder_database, name_tar_file)
 
 list_proc = list()
 for infile in files_proc:
-    
+
     # 8 is always the length of the id code
-    infile_aux = year_tocomp + '/' + infile.split('/')[-1][:8] + '.pdf'   
+    infile_aux = year_tocomp + '/' + infile.split('/')[-1][:8] + '.pdf'
     if infile_aux not in list_proc:
         list_proc.append(infile_aux)
         d1 = defc.Document(infile_aux, folder_database)
@@ -54,5 +54,5 @@ for infile in files_proc:
 # Commands to get the compressed version of the file
 #data/AB/${year}/02_extractedxml.tar.gz
 utils_proc.compress_tar(output_file)            
-            
-print('Total time for correction of year %d: %f' % (int(year_tocomp) ,(time.time() - t1)))
\ No newline at end of file
+
+print('Total time for correction of year %d: %f' % (int(year_tocomp) ,(time.time() - t1)))
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 2a5821f533a54c6293413f396ec4fa859bcb6bf9..495b23ceb7f8b38e2849c5d16119e76cf177d48f 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -1,199 +1,128 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-import os
+# Code to extract discussions from corrected XML files
+#%%
+# to work with atom
+%load_ext autoreload
+%autoreload 2
+
 import pickle
-import re
-import pandas as pd
-from nltk.corpus import stopwords
 import time
+import xml.etree.ElementTree as ET
 
-import hf_extractdiscussions as hf
+import sys
+sys.path.append('src/python/')
 
-# specify input values
-years = [1891, 1995]
-range_years = range(years[0], years[1] + 1)
+import def_classes as defc
+import utils_proc
+import utils_annot
 
-# paths
-#path_start = '/home/lili/NLP_DemocraSci/nlp-democracy/'
-path_data = '/data/complete_data/AB/'
-path_output = '/data/output/'
+import os
 
-# open dictionary of last names from pickle file
-with open('/home/lili/nlp-democracy/output/MPs/MPs_lastnames.pickle', 'rb') as f:
-    dict_lastnames = pickle.load(f)
+from utils_proc import call_with_out
+
+#%%
+# specify input and output files
+
+# needed for running in atom, can be ignored
+year = '1891'
+input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle"
+input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
+input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
+output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz"
+
+#%%
+# detect arguments
+input_lastnames = sys.argv[1]
+input_correctedxml = sys.argv[2]
+input_correctedmeta = sys.argv[3]
+output_annotatedxml = sys.argv[4]
+
+#%%
+# extract suffixes, year, folder_database
+suffix_tar_correctedxml = input_correctedxml.split('/')[-1].split('.tar.gz')[0]
+#suffix_tar_correctedmeta = input_correctedmeta.split('/')[-1].split('.tar.gz')[0]
+year = input_correctedxml.split('/')[-2]
+folder_database = input_correctedxml.split(year)[0]
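+# e.g. for input_correctedxml = 'data/AB/1891/04_correctedxml.tar.gz':
+# suffix_tar_correctedxml = '04_correctedxml', year = '1891', folder_database = 'data/AB/'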
+suffix_correctedmeta = '_metacorr'
+#suffix_correctedxml = '_datacorr'
+# needed to instantiate object for every document
+input_rawmeta = folder_database + '/' + year + '/' + '01_rawmeta.tar.gz'
+
+#%%
+# git lfs pull necessary data
+for lfsfile in [input_correctedxml, input_correctedmeta, input_rawmeta]:
+    command = 'git lfs pull -I ' + lfsfile
+    #print(command)
+    call_with_out(command)
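+# ('git lfs pull -I <path>' only fetches the LFS objects matching the given path
+# instead of all tracked files)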
+
+#%%
+# TODO: exclude overlaps --> after annotation
+
+
+#%%
+start_time_discussions = time.time()
+print('start to identify discussions of the year', year, '\n')
 
-# open dictionary of overlaps
-with open('/data/complete_data/Results_overlap/DictOverlap1891to1930.pkl', 'rb') as f:
-    dict_overlaps_1 = pickle.load(f)
-with open('/data/complete_data/Results_overlap/DictOverlap1931to1995.pkl', 'rb') as f:
-    dict_overlaps_2 = pickle.load(f)
-with open('/data/complete_data/Results_overlap/DictOverlap1991to1995.pkl', 'rb') as f:
-    dict_overlaps_3 = pickle.load(f)
-dict_overlaps = {**dict_overlaps_1, **dict_overlaps_2, **dict_overlaps_3}
-print(dict_overlaps.keys())
+# extract list of files
+files_to_process, _ = utils_proc.get_list(year, folder_database, suffix_tar_correctedxml)
+files_to_process.sort()
+print('files to process loaded:', files_to_process)
 
-# get dictionary of discussions
-# -----------------------------
+# open dataframe of last names from pickle file
+# (there is one file of lastnames per year)
+with open(input_lastnames, 'rb') as f:
+    df_lastnames = pickle.load(f)
 
-start_time_discussions = time.time()
+print('dataframe with lastnames loaded')
+
+#%%
+# for each file
+# TODO !!!! get rid of [66:]
+for file_tarpath in files_to_process:
+    #print(file_tarpath)
+    id_doc = file_tarpath.split('/')[-1][:8]
+
+    # instantiate document object (always from original pdf)
+    file_aux = year + '/' + id_doc + '.pdf'
+    file_doc = defc.Document(file_aux, folder_database)
+
+    # if document is a discussion
+    if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
+        print(id_doc + '\n')
+        file_doc.df_lastnames = df_lastnames
+        file_doc.annotate_xml()
+
+# Commands to get the compressed version of the file
+# (compressed file is around 5 times smaller than uncompressed file)
+#data/AB/${year}/05_annotatedxml.tar.gz
+utils_proc.compress_tar(output_annotatedxml)
+
+
+
+
+#%%
+## to test for one file
+#file_tarpath = './1893/20026526_datacorr.xml'
+#
+#id_doc = file_tarpath.split('/')[-1][:8]
+#
+## instantiate document object (always from original pdf)
+#infile_aux = year + '/' + id_doc + '.pdf'
+#file_doc = defc.Document(infile_aux, folder_database)
+#
+#if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
+    #print(id_doc + '\n')
+#
+    #file_doc.df_lastnames = df_lastnames
+    #file_doc.annotate_xml()
+
+
+#%%
+
+
+
+#id_doc
 
-# list of votation terms
-list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt', 
-                      'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)', 
-                      'Votation', 'Vote', 'votation', #'(Adoptés)', 'adoptés', 'adoptée', 'rejetée',
-                      "D'accord", 'voix']
-
-# list of stopwords
-list_stopwords = stopwords.words('german')
-list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr'])
-list_stopwords.extend(stopwords.words('french'))
-list_stopwords.extend(['ils', 'les', 'celle'])
-
-print('start to identify discussions of the years', years, '\n\n\n')
-
-# initialize empty dictionary for all documents
-dict_docs = {}
-
-# for every year
-for year in range_years:
-    start_time = time.time()
-
-    str_year = str(year)
-    print(year, '\n', 30*'=')
-    # initialize empty dictionary for that year
-    dict_year = {}
-    # extract list of numbers for that year
-    list_numbers = next(os.walk(path_data + str_year))[1]#os.listdir(path_data + str_year)
-    list_numbers.sort()
-    # extract list of lastnames for that year and generate dataframe from it
-    lists_lastnames = dict_lastnames[int(year)]
-    df_lastnames = hf.get_df_from_lists_names(lists_lastnames)
-    # extract overlaps of that year
-    dict_overlaps_year = dict_overlaps[year]
-    # for each number, i.e. document
-    for number in list_numbers:
-        path_number = path_data + str_year + '/' + number + '/'
-        # if document is a discussion
-        if (hf.check_if_discussion(path_number + number + '.xml')) and (number not in ['20032463', '20032952', '20014332']):
-            print(number + '\n')
-            # get dictionary with text
-            dict_text = hf.get_text_onefile(path_number + number + '_datacorr.xml')
-            # exclude parts from previous and next document
-            if number in dict_overlaps_year:
-                dict_text = hf.exclude_overlaps(dict_text, dict_overlaps_year[number])
-            # get all discussionstarts
-            dict_discussionstarts = hf.get_discussion_starts(dict_text, df_lastnames, list_stopwords, bln_print=True)
-            # get votation paragraphs
-            dict_votations = hf.get_votations(dict_text, list_votationterms)
-            # put all discussions together in dictionary
-            dict_discussions, list_keys = hf.get_discussions(dict_text, dict_discussionstarts, dict_votations) 
-            # save that discussions dictionary to the yearly dictionary
-            dict_year[number] = dict_discussions
-
-            #print('\n\n')
-    # save that yearly dictionary to the dictionary for all documents
-    dict_docs[year] = dict_year
-    # dump that discussions dictionary in the yearly folder
-    path_year = path_output + 'AB/' + str_year + '/'
-    os.makedirs(path_year, exist_ok=True)
-    with open(path_year + 'dict_discussions.pickle', 'wb') as f:
-        pickle.dump(dict_year, f)
-
-    print("Time to extract discussions for year %s: %s minutes\n" % (year, (time.time() - start_time)/60))
-
-
-# dump dictionary of documents to a pickle file
-year_start = str(list(dict_docs.keys())[0])
-year_end = str(list(dict_docs.keys())[-1])
-with open(path_output + 'dict_discussions_' + year_start + '-' + year_end + '.pickle', 'wb') as f:
-    pickle.dump(dict_docs, f)
-
-print("Time to extract all discussions: %s minutes\n" % ((time.time() - start_time_discussions)/60))
-
-
-# Language identification with Luis' method
-# -----------------------------------------
-
-print('start to identify languages of the years', years, '\n\n\n')
-start_time_languages = time.time()
-
-
-# initialize empty dictionaries
-dict_languages = {}
-dict_german = {}
-dict_french = {}
-dict_italian = {}
-
-# for every year
-for year in range_years:
-    str_year = str(year)
-    start_time = time.time()
-    
-    # initialize empty dictionaries for that year
-    dict_year = {}
-    dict_year_german = {}
-    dict_year_french = {}    
-    dict_year_italian = {}
-    print(year)
-        
-    # load pickle dump for that year
-    with open(path_output + 'AB/' + str_year + '/dict_discussions.pickle', 'rb') as f:
-        dict_disc_year = pickle.load(f) 
-    
-    # for every document in that year
-    for number in dict_disc_year:   #dict_docs[year]:
-        # initiaze empty dictionaries for that document
-        dict_number = {}
-        dict_number_german = {}
-        dict_number_french = {}    
-        dict_number_italian = {}
-        print(number)
- 
-        # tokenize discussion
-        dict_tokenized = hf.tokenize_dictionary(dict_disc_year[number], hf.tokenizer)
-        
-        # identify language
-        dict_lang = hf.identify_lang(dict_tokenized)
-        #print(dict_lang)
-        
-        # assign language
-        for tupel, value in dict_lang.items():
-            #print(tupel)
-            lang = hf.label_language(value)
-            dict_number[tupel] = lang
-            if lang == 'german':
-                dict_number_german[tupel] = dict_disc_year[number][tupel]
-            elif lang == 'french':
-                dict_number_french[tupel] = dict_disc_year[number][tupel]
-            elif lang == 'italian':
-                dict_number_italian[tupel] = dict_disc_year[number][tupel]
-            else:
-                pass
-                #print(lang, value, dict_docs[year][number][tupel])
-        
-        # add to dictionaries of that year
-        dict_year[number] = dict_number
-        dict_year_german[number] = dict_number_german
-        dict_year_french[number] = dict_number_french
-        dict_year_italian[number] = dict_number_italian
-        
-    # add to overall dictionaries
-    dict_languages[year] = dict_year
-    dict_german[year] = dict_year_german
-    dict_french[year] = dict_year_french
-    dict_italian[year] = dict_year_italian
-
-    print("Time to identify languages for discussions of year %s: %s minutes\n" % (year, (time.time() - start_time)/60))
-    
-    # dump for that year
-    with open(path_output + 'AB/' + str_year + '/dict_languages.pickle', 'wb') as f:
-        pickle.dump(dict_year, f)
-    with open(path_output + 'AB/' + str_year + '/dict_discussions_german.pickle', 'wb') as f:
-        pickle.dump(dict_year_german, f)
-    with open(path_output + 'AB/' + str_year + '/dict_discussions_french.pickle', 'wb') as f:
-        pickle.dump(dict_year_french, f)
-    with open(path_output + 'AB/' + str_year + '/dict_discussions_italian.pickle', 'wb') as f:
-        pickle.dump(dict_year_italian, f)         
-
-print("Time to identify languages for all discussions: %s minutes\n" % ((time.time() - start_time_languages)/60))
+#len(files_to_process)
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 0f555b4887d557a001caabd9b90ab8b76e9b45a0..e41d934ac7914234ab6fae1f425123999971c413 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -9,6 +9,11 @@ import numpy as np
 import pandas as pd
 import string
 import re
+import os, sys
+sys.path.append('src/python/')
+import utils_proc
+import copy
+import collections.abc
 
 
 # function to check whether a file containts discussions
@@ -44,191 +49,385 @@ def check_if_discussion(path_meta_xml_file,
 
     return True
 
+# helper function to get text without font information
+# example for font information: [font face="11.718" size="Times-Roman"] sometext [/font]
+# input:
+# - sometext: string
+# output:
+# - newtext: modified string
+def get_text(sometext):
+    # initialize
+    newtext = ''
+
+    # find text between font information
+    for text in re.findall('\].*?\[',sometext):
+        #print(text)
+        if text.startswith(']') and text.endswith('['):
+            newtext += text[1:-1]
+    #print(newtext)
+    return newtext
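+# e.g. get_text('[font face="11.718" size="Times-Roman"] sometext [/font]')
+# returns ' sometext ' (the surrounding spaces are kept)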
+
+
+# function to annotate the corrected XML
+def get_annotated_xml(XML_root, df_lastnames, bln_print=False):
+
+    # list of votation terms
+    # TODO: make it work for é, etc.
+    list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt',
+                          'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)',
+                          'Votation', 'Vote', 'votation', #'(Adoptés)', 'adoptés', 'adoptée', 'rejetée',
+                          "D'accord", 'voix']
+
+    # list of stopwords
+    list_stopwords = stopwords.words('german')
+    list_stopwords.extend(['dass', 'resp', 'daran', 'dr', 'herr', 'herrn', 'hr'])
+    list_stopwords.extend(stopwords.words('french'))
+    list_stopwords.extend(['ils', 'les', 'celle'])
+
+    # create new XML as a copy of the corrected one
+    XML_new = copy.deepcopy(XML_root)
+
+    # initialize flags to distinguish speeches from votes
+    this_is_speech = False
+    prev_is_speech = False
+    this_is_vote = False
+
+    # for every page
+    for ind_p, page in enumerate(XML_root):
+        if bln_print:
+            print(page.tag, page.attrib)
+        # for every textbox on that page
+        for ind_t, textbox in enumerate(page):
+            if (textbox.tag == 'textbox'):
+                if 'type_textbox' in textbox.attrib.keys():
+                    if (textbox.attrib['type_textbox'] == 'text'):
+                        if bln_print:
+                            print(textbox.tag, textbox.attrib)
+
+                        # get complete text of that textbox
+                        complete_text, ind_tl_colon = get_complete_text(textbox)
+                        if bln_print:
+                            print(complete_text[:100])
+
+                        # identify and label language in XML
+                        dict_lang = identify_language(complete_text)
+                        XML_new = label_language(XML_new, ind_p, ind_t, dict_lang)
+
+                        # get texttype of that textbox by majority vote
+                        # TODO add that type to XML
+                        textbox_texttype = get_textbox_type(textbox)
+                        if bln_print:
+                            print(textbox_texttype)
+
+                        if textbox_texttype in ['text_col1', 'text_col2']:
+
+                            XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False)
+                            if this_is_speech:
+                                prev_is_speech = True
+#                                print('stopped after finding speech start')
+                                continue
+                            XML_new, this_is_vote = label_votations(XML_new, ind_p, ind_t, complete_text, list_votationterms, bln_print=False)
+                            if this_is_vote:
+                                prev_is_speech = False
+#                                print('stopped after finding vote')
+                                continue
+                            if prev_is_speech and (not this_is_vote):
+                                XML_new = label_speechcont(XML_new, ind_p, ind_t)
+
+
+    return XML_new
+
+
+# helper function to get type of textbox_type
+# corresponds to majority vote of types of textlines
+# input:
+# - textbox
+# output:
+# - textbox_type: string
+def get_textbox_type(textbox):
 
+    # initialize empty dictionary
+    dict_type = {}
 
+    # for every textline in that textbox
+    for ind_tl, textline in enumerate(textbox):
+        if textline.tag == 'textline':
 
+            # count types
+            if textline.attrib['type'] not in dict_type.keys():
+                dict_type[textline.attrib['type']] = 1
+            else:
+                dict_type[textline.attrib['type']] += 1
 
+    # list of all types with maximum count
+    list_types = [type for type, count in dict_type.items() if count == max(dict_type.values())]
 
+    # if only one with maximum value
+    if len(list_types) == 1:
+        textbox_type = list_types[0]
 
+    # if several with same maximum value
+    else:
+        textbox_type = 'notdistinct'
 
+    return textbox_type
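+# e.g. textlines typed ['text_col1', 'text_col1', 'text_col2'] give 'text_col1',
+# while a tie such as ['text_col1', 'text_col2'] gives 'notdistinct'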
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# functions from hf_extractdiscussions.property
-# ==============================================================================
-
-# function to exclude overlapping textboxes between documents
+# helper function to get complete text of a textbox
 # input:
-# - dict_text: dictionary of texts of one document
-# - dict_overlaps_year: dictionary with overlaps
+# - textbox
 # output:
-# - dict_text: modified dict_text
-def exclude_overlaps(dict_text, dict_overlaps):
-    # initialize to impossible values
-    first_entry = -1
-    last_entry = 1000
+# - complete_text: string
+# - ind_tl_colon: index of textline with colon (needed for label speech start)
+def get_complete_text(textbox):
 
-    # get index of textbox from first and last page
-    # the overlap dictionary only contains an entry, if an overlap was detected
-    for entry, array in dict_overlaps.items():
-        if entry == 'first':
-            first_entry = int(array[0])
-        if entry == 'last':
-            last_entry = int(array[0])
+    # helper function to get text without font information
+    def get_text(sometext):
+        newtext = ''
+        for text in re.findall('\].*?\[',sometext):
+            #print(text)
+            if text.startswith(']') and text.endswith('['):
+                newtext += text[1:-1]
+        #print(newtext)
+        return newtext
 
-    # get list of keys for first and last page
-    list_first_page = [key for key in dict_text if key.split(',')[1] == '0']
-    last_page = max([int(key.split(',')[1]) for key in dict_text])
-    list_last_page = [key for key in dict_text if key.split(',')[1] == str(last_page)]
+    # initialize empty string
+    complete_text = ''
 
-    # modify dict_text on first page
-    for key in list_first_page:
-        if int(key.split(',')[2]) < first_entry:
-            dict_text[key] = ''
+    # initialize index of textline colon to impossible value
+    ind_tl_colon = -1
 
-    # ... and on last page
-    for key in list_last_page:
-        if int(key.split(',')[2]) > last_entry:
-            dict_text[key] = ''
+    # for every textline in that textbox
+    for ind_tl, textline in enumerate(textbox):
+        if textline.tag == 'textline':
+            # get that text
+            thattext = get_text(textline.text)
 
-    return dict_text
+            # append that text to string
+            complete_text += thattext
 
+            # in the first few textlines of the textbox, check for a colon
+            if ind_tl < 3:
+                if ':' in thattext:
+                    ind_tl_colon = ind_tl
 
-# tokenizer
-tokenizer_canton = RegexpTokenizer(r'\w+')    # only leaves words
-#tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+|\S+')
-# last part \S+ is needed to get colon, \S stands for white space
-tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+')
+    return complete_text, ind_tl_colon
 
-# function to extract discussion starts
-# !!! maybe we only need a list of discussion starts
+
+# function to label speech starts
 # input:
-# - dict_text: dictionary with text of one file
-# - list_names: list of MPs
+# - XML_new: XML to update; ind_p, ind_t: page and textbox indices
+# - text: string to be analyzed
+# - ind_tl_colon: index of the textline containing the colon
+# - df_names: dataframe of politicians
 # - list_stopwords: list of german and french stopwords
 # - bln_print: whether to print during execution, default False
 # output:
-# - dict_discussionstarts: dictionary with discussion starts
-def get_discussion_starts(dict_text, df_names, list_stopwords, bln_print=False):
-
-    # initialize empty dictionary
-    dict_discussionstarts = {}
-
-    # add a few terms to list_stopwords that are easily mistaken as last names
-    list_stopwords.extend(['art', 'rath', 'alinea', 'stimmen', 'stimme', 'hans', 'walter', 'werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'gallen', 'stgallen',
-                           'kasse', 'fasse', 'sitten', 'herren', 'herr', 'alter'])
-
-    # for every textbox
-    for key, text in dict_text.items():
-        if ':' in text[:100]:
-            # extract the index of the colon in the text
-            colon_index_text = text.index(':')
-
-            # look at first few terms of that textbox
-            text_start = re.sub(r'[\(\)]','',text[:colon_index_text])
-            print('text_start', text_start)
-            list_oi = tokenizer.tokenize(text_start)
-            print('asdf', list_oi)
-
-            # shorten to part before colon
-            list_oi2 = list_oi
-
-            # remove stopwords
-            list_oi2 = [term for term in list_oi2 if term.lower() not in list_stopwords]
-
-            # remove punctuation
-            list_oi2 = [''.join(c for c in s if c not in string.punctuation) for s in list_oi2]
-            list_oi2 = [s for s in list_oi2 if s]
+# - XML_new: updated XML
+# - this_is_speech: flag indicating whether the textbox is a speech start
+def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, bln_print=False):
+
+    # initialize flag
+    this_is_speech = False
+
+    # initialize strings and ID
+    str_name = ''
+    str_role = ''
+    list_uniqueID = []
+    str_canton = ''
+
+    # font text end
+    fontend = '[/font]'
+
+    # very consistently, a speaker can be identified by looking for a colon
+    # at the beginning of a textbox and identifying a name or a role in front
+    # of that colon
+    if ind_tl_colon >= 0:
+#    if ':' in text[:100]:
+        # extract the index of the colon in the text
+        colon_index_text = text.index(':')
+
+        # look at first few terms of that textbox
+        text_start = re.sub(r'[\(\)]','',text[:colon_index_text])
+        list_oi = tokenizer.tokenize(text_start)
+        if bln_print:
+            print('possible speech start: ', list_oi)
+
+        # remove stopwords
+        list_oi = [term for term in list_oi if term.lower() not in list_stopwords]
+
+        # remove punctuation
+        list_oi = [''.join(c for c in s if c not in string.punctuation) for s in list_oi]
+        list_oi = [s for s in list_oi if s]
+
+        # remove lower case terms
+#        list_oi = [term for term in list_oi if not term.islower()]
+
+        # remove numbers
+        list_oi = [term for term in list_oi if not term.isdigit()]
+
+        # remove single characters
+        list_oi = [term for term in list_oi if len(term)>1]
+
+        # for every term
+        for term in list_oi:
+            # if possible, find a name in a list
+            str_name, str_role, list_uniqueID, str_canton = find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False)
+        if bln_print:
+            print('name', str_name, 'role', str_role)
 
-            # remove lower case terms
-#            list_oi2 = [term for term in list_oi2 if not term.islower()]
+        # get rid of doubled double names
+        # TODO
 
-            # remove numbers
-            list_oi2 = [term for term in list_oi2 if not term.isdigit()]
+        # get rid of 'Präsident stimmt nicht Président ne vote pas'
+        if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name:
+            if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi):
+                if bln_print:
+                    print('get rid of Präsident stimmt nicht, Président ne vote pas', list_oi)
+                str_role = ''
 
-            # remove single characters
-            list_oi2 = [term for term in list_oi2 if len(term)>1]
+        # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":'
+        if str_name:
+            if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1:
+                if bln_print:
+                    print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi)
+                str_name = ''
+
+        # if a name has been found, add it to XML_new
+        if str_name or str_role:
+            # add attribute speech_start to textbox
+            XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start'
+
+            # add speaker as attribute to first textline
+            XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, list_uniqueID, str_canton)
+
+            # update text of XML (speaker is on first line, actual speech start on second line of speech_start textbox)
+            # if colon is on first line
+            if ind_tl_colon == 0:
+                # get text of that line and colon index
+                thattext = XML_new[ind_p][ind_t][0].text
+                colon_index = thattext.index(':')
+
+                # get last font information of thattext
+                fontstart = re.findall('\[font.*?\]', thattext)[-1]
+
+                try:
+                    # write speaker to first line
+                    XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend
+
+                    # get start of speech with correct font start
+                    if thattext[colon_index+1:].startswith('[font'):
+                        startspeech = thattext[colon_index+1:]
+                    elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+                        startspeech = ''
+                    else:
+                        startspeech = fontstart + thattext[colon_index+1:]
+
+                    # write beginning of speech to second line
+                    # (create new ET element if necessary)
+                    if len(list(XML_new[ind_p][ind_t])) > 1:
+                        XML_new[ind_p][ind_t][1].text = startspeech + ' ' + XML_new[ind_p][ind_t][1].text
+                    else:
+                        XML_new[ind_p][ind_t].append(copy.deepcopy(XML_new[ind_p][ind_t][0]))
+                        XML_new[ind_p][ind_t][1].attrib.pop('speaker')
+                        XML_new[ind_p][ind_t][1].text = startspeech
+                except:
+                    print('error when splitting speaker from speech text')
+                    #print(thattext)
+                    #print(len(list(XML_new[ind_p][ind_t])))
+                    #print(list(XML_new[ind_p][ind_t]))
+                    #print(XML_new[ind_p][ind_t])
+                    #print('gefundener Name:', str_name, str_role)
+                    pass
+
+            # if colon is on second line
+            if ind_tl_colon == 1:
+                # get text of that line and colon index
+                thattext = XML_new[ind_p][ind_t][1].text
+                colon_index = thattext.index(':')
+
+                # get last font information of thattext
+                fontstart = re.findall('\[font.*?\]', thattext)[-1]
+
+                # get start of speech with correct font start
+                if thattext[colon_index+1:].startswith('[font'):
+                    startspeech = thattext[colon_index+1:]
+                elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+                    startspeech = ''
+                else:
+                    startspeech = fontstart + thattext[colon_index+1:]
+
+                # write speaker to first line
+                XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1] + fontend
+                # write beginning of speech to second line
+                XML_new[ind_p][ind_t][1].text = startspeech
+
+            # set flag
+            this_is_speech = True
+            if bln_print:
+                print('found a name:', list_oi, str_name, str_role, '\n')
 
-            # initialize string for name and role
-            str_name = ''
-            str_role = ''
-            int_uniqueID = int(0)
-            str_canton = ''
+    return XML_new, this_is_speech
 
-            # for every term, reversed finds canton before it finds name
-            for term in reversed(list_oi2):
-                # if possible, find a name in a list
-                str_name, str_role, int_uniqueID, str_canton = find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=True)
-            print('name', str_name, 'role', str_role)
 
-            # get rid of 'Präsident stimmt nicht Président ne vote pas'
-            if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name:
-                print('++++++++++ Präsident', list_oi2, list_oi)
-                if set(['stimmt', 'nicht', 'vote', 'pas']).intersection(list_oi):
-                    str_role = ''
-
-            # get rid of 'Für den Antrag "Name" stimmen: Votent pour la proposition "Name":'
-            if str_name:
-                print('++++++++++ Name', list_oi2, list_oi)
-                if len(set(['Antrag', 'stimmen', 'Votent', 'proposition']).intersection(list_oi)) > 1:
-                    str_name = ''
-
-            # if a name has been found
-            if str_name or str_role:
-                # add text to dictionary
-                dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text],
-                        text[colon_index_text+1:])
-                if bln_print:
-                    print('found a name:', list_oi2, str_name, str_role, '\n')
+# function to extract votation paragraphs
+# !!! error prone, possible improvements see notebook extract_discussions
+# input:
+# - XML_new: XML to update
+# - text: string
+# - list_votationterms: list of votation terms
+# - bln_print: whether to print during execution, default False
+# output:
+# - XML_new: updated
+def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, bln_print=False):
 
-    return dict_discussionstarts
+    # get first terms of that text
+    list_oi = tokenizer.tokenize(text)[:15]
+#        if len(set(list_oi).intersection(set(list_votationterms))) > 1:
+    # if there is an overlap with typical votation terms:
+    if set(list_oi).intersection(set(list_votationterms)):
+        # add attribute vote to textbox
+        XML_new[ind_p][ind_t].attrib['text_type'] = 'vote'
+
+        # set flag
+        this_is_vote = True
+        if bln_print:
+            print('found a vote:', list_oi)
+    else:
+        #pass
+        # set flag
+        this_is_vote = False
+        if bln_print:
+            print('not a vote', list_oi)
 
+    return XML_new, this_is_vote
 
 
-# small function to get first item of tupels in a list
-def get_first_item(list_tupels):
-    list_first_item = [tupel[0] for tupel in list_tupels]
-    return list_first_item
+# function to label continuation of speech
+# only adds label to corresponding textbox
+def label_speechcont(XML_new, ind_p, ind_t):
 
-# small function to get last two items of tupels in a list
-def get_last_item(list_tupels):
-    list_last_item = [tupel[-2:] for tupel in list_tupels]
-    return list_last_item
+    XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_cont'
 
+    return XML_new
 
+# helper function to flatten nested irregular list
+def flatten(l):
+    for el in l:
+        if isinstance(el, collections.abc.Iterable) and not isinstance(el, (str, bytes)):
+            yield from flatten(el)
+        else:
+            yield el
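+# e.g. list(flatten([1, [2, [3]], 'ab'])) gives [1, 2, 3, 'ab']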
 
 # function to find names
 # input:
 # - term: term that might be name
 # - str_name: string to which name should be attached
 # - str_role: string to which role should be attached
-# - int_uniqueID: integer for uniqueID
-#                 !!! (if there are several possibilities, this becomes a tuple)
+# - list_uniqueID: list with one or several uniqueIDs
 # - list_tupels: list of tupels containing all types of names
-def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln_print=False):
+# TODO: correctly extract canton! don't do reversed, find name first that might have issue with canton, then look for possible canton
+# TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
+def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False):
 
-    def get_string(term, str_name, str_role, int_uniqueID, str_canton):
+    def get_string(term, str_name, str_role, list_uniqueID, str_canton):
         name_type = ''
         # if it is one of the simple names
         if term in list(df_names['name_short'].loc[df_names['type']=='simple']):
@@ -239,12 +438,16 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln
             str_role = add_to_string(str_role, term)
         # if it is a double name
         elif term in list(df_names['name_short'].loc[df_names['type']=='double']):
-            print(20*'\n', 'DOUBLE NAME')
+            if bln_print:
+                print(20*'\n', 'DOUBLE NAME')
             # get correct name
             correct_name = df_names.loc[(df_names['type']=='double') & (df_names['name_short']== term)].iat[0, df_names.columns.get_loc('name_correct')]
             if bln_print:
                 print('double name', correct_name)
-            str_name = add_to_string(str_name, correct_name)
+            # only add the name if it is not there yet
+            # (if a person is referenced by the complete double name, e.g. Meier-Müller, the name would otherwise be added twice)
+            if correct_name not in str_name.split(' '):
+                str_name = add_to_string(str_name, correct_name)
             name_type = 'double'
         # if it is a composite name
         elif term in list(df_names['name_short'].loc[df_names['type']=='comp']):
@@ -276,23 +479,47 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln
             name_type = 'canton'
 
 
-        temp = ''
+        # extract uniqueID
+        list_temp = []
         if name_type in ['simple', 'double', 'comp']:
-            temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]
+            list_temp = [df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]]
         elif name_type in ['canton']:
-            temp = tuple(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+            list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
 #            if canton_missing:
 #                temp = tuple(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
 #            else:
 #                temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_correct']==str_correct)].iat[0, df_names.columns.get_loc('uniqueIndex')]
 
-        if temp:
-            if int_uniqueID == 0:
-                int_uniqueID = temp
-            else:
-                int_uniqueID = (int_uniqueID, temp)
-
-        return str_name, str_role, int_uniqueID
+        if len(list_temp) > 0:
+            if bln_print:
+                print(list_temp, list_uniqueID)
+                print(type(list_temp), type(list_uniqueID))
+                print(isinstance(list_uniqueID, list))
+            # if no unique ID has been assigned so far
+            if len(list_uniqueID) == 0:
+                list_uniqueID = list_temp
+            # if there already are one or several people and we found a new person, we update
+            elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
+                list_uniqueID.append(list_temp)
+
+            ## if we already have several possible people, e.g. because of canton
+            #elif isinstance(int_uniqueID, tuple):
+                #print('I should be here')
+                ## and refound the uniqueID of one of those, don't update
+                #if temp in int_uniqueID:
+                    #pass
+                ## and update if we don't have that uniqueID yet
+                #else:
+                    #int_uniqueID = (int_uniqueID, temp)
+            ## if a person with that uniqueID exists already, don't update
+            #elif isinstance(int(int_uniqueID), int) and temp == int_uniqueID:
+                #print('but end up here.. not even.....')
+                #pass
+            ## if a different unique ID has been assigned already
+            #else:
+                #int_uniqueID = (int_uniqueID, temp)
+
+        return str_name, str_role, list_uniqueID
 
     # small function to add term to str_name
     def add_to_string(string, term):
@@ -308,8 +535,8 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln
                   'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
                   'Vizepräsident']
 
-    list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'Gallen', 'StGallen',
-                     'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter']
+    list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'Gallen', 'StGallen',
+                     'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter', 'Biffer', 'biffer', 'Rédiger', 'rédiger', 'Wer', 'Fällen']
 
     list_places = get_list_cantons(df_names)
 
@@ -325,7 +552,7 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln
         # if term is in the list of all names and roles
         if term in (list_all_names + list_roles):
             # get correct name and uniqueID, or role, for that term
-            str_name, str_role, int_uniqueID = get_string(term, str_name, str_role, int_uniqueID, str_canton)
+            str_name, str_role, list_uniqueID = get_string(term, str_name, str_role, list_uniqueID, str_canton)
 
             if bln_print:
                 print('=== correct name', term)
@@ -362,166 +589,185 @@ def find_names(term, str_name, str_role, int_uniqueID, df_names, str_canton, bln
                 if bln_print:
                     print('we found several possible names', set_intersection, 'and choose', array_min)
             if term_approx:
-                str_name, str_role, int_uniqueID = get_string(term_approx, str_name, str_role, int_uniqueID, str_canton)
-                print('*******************', str_name, term_approx)
+                str_name, str_role, list_uniqueID = get_string(term_approx, str_name, str_role, list_uniqueID, str_canton)
+                if bln_print:
+                    print('*******************', str_name, term_approx)
+
+
+    return str_name, str_role, list_uniqueID, str_canton
 
 
-    return str_name, str_role, int_uniqueID, find_names
+# two functions for language identification
+# Author: Luis Salamanca
+# small modifications by Lili Gasser
+# Using stopwords
+# input:
+# - text: string
+# - valid_lang: tuple of valid languages
+# output:
+# - dict_language_counts: dictionary of stopword counts for each valid language
+def identify_language(text, valid_lang = ('german', 'french', 'italian')):
 
+    # tokenize
+    tokens = text.split(' ')
+    # all lowercase
+    test_words = [word.lower() for word in tokens]
+    # make a set
+    test_words_set = set(test_words)
 
+    # initialize dictionary of language elements
+    dict_language_counts = {}
 
-# function to get data frame from lists of names
+    # iterate through languages of stopwords
+    for language in stopwords.fileids():
+        if language in valid_lang:
+            # get stopword set
+            stopwords_set = set(stopwords.words(language))
+            # get intersection between text of interest and stopword set for this language
+            common_elements = test_words_set.intersection(stopwords_set)
+            # save number of common elements to dictionary
+            dict_language_counts[language] = len(common_elements)
+
+    return dict_language_counts
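+# note: the text is split on single spaces only, so tokens with attached
+# punctuation (e.g. 'vote.') do not count towards the stopword counts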
+
+
+# Given the number of occurrences of the stopwords, this assigns a language label
+# to a specific textbox, also considering the possibility of textboxes
+# mixing languages. For that case, the value ratio_similar is used.
 # input:
-# - lists_names: lists of names (simple, double, comp, canton)
+# - XML_new: XML file to update
+# - aux_dict_l: corresponds to dict_language_counts
 # output:
-# - df: corresponding dataframe
-def get_df_from_lists_names(lists_names):
-    list_types = ['simple', 'double', 'comp', 'canton']
-    df = pd.DataFrame()
-    for i in range(4):
-        df_temp = pd.DataFrame(lists_names[i],
-                columns = ('name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName'))
-        df_temp['type'] = list_types[i]
-        df = pd.concat([df, df_temp], ignore_index = True)
-    return df
+# - XML_new: updated XML with a 'language' attribute added to the textbox
+def label_language(XML_new, ind_p, ind_t, aux_dict_l):
 
+    # specify a similarity ratio
+    ratio_similar = 0.8
+    # if there are counts, determine language
+    if sum(aux_dict_l.values()):
+        aux_dict_l_norm = {k: v / total for total in (sum(aux_dict_l.values()),) for k, v in aux_dict_l.items()}
+        lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key]))
+        lang_max = ''
+        count_l = 0
+        for lang in aux_dict_l_norm.keys():
+            if (aux_dict_l_norm[lang] >  aux_dict_l_norm[lang_max_aux] * ratio_similar):
+                if count_l > 0:
+                    lang_max += '_'
+                lang_max += lang
+                count_l += 1
+        if count_l > 1:
+            lang_max = 'mixed_' + lang_max
+    else:
+        lang_max = 'languageNotIdentified'
 
+    # add attribute to textbox
+    XML_new[ind_p][ind_t].attrib['language'] = lang_max
 
+    return XML_new
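+# the resulting 'language' attribute is e.g. 'german', 'french', 'italian',
+# a 'mixed_...' combination when several languages are above the ratio, or 'languageNotIdentified'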
 
-# function to extract votation paragraphs
-# !!! maybe we only need a list of votation paragraphs
-# !!! error prone, possible improvements see notebook extract_discussions
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# functions from hf_extractdiscussions
+# ==============================================================================
+# TODO: check whether they are still needed
+
+# function to exclude overlapping textboxes between documents
 # input:
-# - dict_text: dictionary with text of one file
-# - list_names: list of votation terms
-# - bln_print: whether to print during execution, default False
+# - dict_text: dictionary of texts of one document
+# - dict_overlaps_year: dictionary with overlaps
 # output:
-# - dict_votations: dictionary with votations
-def get_votations(dict_text, list_votationterms, bln_print=True):
-    count = 0
-    dict_votations = {}
-    for key, text in dict_text.items():
-        list_oi = tokenizer.tokenize(text)[:15]
-#        if len(set(list_oi).intersection(set(list_votationterms))) > 1:
-        if set(list_oi).intersection(set(list_votationterms)):
-                count += 1
-                dict_votations[key] = text
-                if bln_print:
-                    print(count, 'MATCH', key, list_oi)
-        else:
-            #pass
-            if bln_print:
-                print('-----     ', list_oi)
+# - dict_text: modified dict_text
+def exclude_overlaps(dict_text, dict_overlaps):
+    # initialize to impossible values
+    first_entry = -1
+    last_entry = 1000
+
+    # get index of textbox from first and last page
+    # the overlap dictionary only contains an entry, if an overlap was detected
+    for entry, array in dict_overlaps.items():
+        if entry == 'first':
+            first_entry = int(array[0])
+        if entry == 'last':
+            last_entry = int(array[0])
+
+    # get list of keys for first and last page
+    list_first_page = [key for key in dict_text if key.split(',')[1] == '0']
+    last_page = max([int(key.split(',')[1]) for key in dict_text])
+    list_last_page = [key for key in dict_text if key.split(',')[1] == str(last_page)]
+
+    # modify dict_text on first page
+    for key in list_first_page:
+        if int(key.split(',')[2]) < first_entry:
+            dict_text[key] = ''
+
+    # ... and on last page
+    for key in list_last_page:
+        if int(key.split(',')[2]) > last_entry:
+            dict_text[key] = ''
+
+    return dict_text
+
+
+# tokenizer
+tokenizer_canton = RegexpTokenizer(r'\w+')    # only leaves words
+#tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+|\S+')
+# the trailing \S+ in the commented-out variant is needed to keep the colon (\S matches any non-whitespace character)
+tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+')
+
+
+
+
+# small function to get first item of tupels in a list
+def get_first_item(list_tupels):
+    list_first_item = [tupel[0] for tupel in list_tupels]
+    return list_first_item
+
+# small function to get last two items of tupels in a list
+def get_last_item(list_tupels):
+    list_last_item = [tupel[-2:] for tupel in list_tupels]
+    return list_last_item
 
-    if bln_print:
-        print(count)
 
-    return dict_votations
 
 
 
 
-# function to put discussions together
-# !!! needs improvement when OCRed xml is corrected (details see notebook)
+# function to get data frame from lists of names
 # input:
-# - dict_discussionstarts
-# - dict_votations
+# - lists_names: lists of names (simple, double, comp, canton)
 # output:
-# - dict_discussions: dictionary of discussion parts
-#                        key: integer of discussion start
-#                        value: text until next discussion start or votation paragraph
-def get_discussions(dict_text, dict_discussionstarts, dict_votations):
-
-    # helper function to add text to discussion dictionary
-    def add_to_dict(key, i):
-#        print(key, i)
-        if key not in dict_discussions:
-            dict_discussions[key] = dict_discussionstarts[key]
-        else:
-            if i in list_text_keys_integers:
-                actual_i = list(dict_text.keys())[list_text_keys_integers.index(i)]
-                only_text = dict_discussions[key][1] + dict_text[actual_i]
-                dict_discussions[key] = (dict_discussions[key][0], only_text)
-        list_keys.append(i)
-
-
-    # list of keys for discussion starts and votation paragraphs
-    list_discussionstarts = list(dict_discussionstarts.keys())
-    list_discussionstarts_integers = [int(tpl[0].split(',')[0]) for tpl in dict_discussionstarts.keys()]
-    list_votations_strings = list(dict_votations.keys())
-    list_votations_integers = [int(tpl.split(',')[0]) for tpl in list_votations_strings]
-    list_text_keys_integers = [int(tpl.split(',')[0]) for tpl in dict_text.keys()]
-
-    # initialize empty dictionary for discussions and empty list for all added keys
-    dict_discussions = {}
-    list_keys = []
-
-    # if there are no discussion starts, return empty dictionary and list
-    if not list_discussionstarts:
-        return dict_discussions, list_keys
-
-    # for every discussion start except last
-    for idx, key in enumerate(list_discussionstarts_integers[:-1]):
-        #print(idx, key)
-        # write discussion start to dictionary
-        add_to_dict(list_discussionstarts[idx], key)
-
-        # for every textbox until next discussion start
-        for i in range(key + 1, list_discussionstarts_integers[idx + 1]):
-            # if it is not a votation paragraph, write it to dictionary,
-            if i not in list_votations_integers:
-                add_to_dict(list_discussionstarts[idx], i)
-            # else, stop execution of for loop
-            else:
-                break
-
-    # for last discussion start
-    last_key = list_discussionstarts_integers[-1]
-    # write discussion start to dictionary
-    add_to_dict(list_discussionstarts[-1], last_key)
-    # for every textbox until the end of the document
-    for i in range(last_key + 1, max(list_text_keys_integers) + 1):
-        # if it is not a votation paragraph, write it to dictionary
-        if i not in list_votations_strings:
-            add_to_dict(list_discussionstarts[-1], i)
-        # else, stop execution of for loop
-        else:
-            break
+# - df: corresponding dataframe
+def get_df_from_lists_names(lists_names):
+    list_types = ['simple', 'double', 'comp', 'canton']
+    df = pd.DataFrame()
+    for i in range(4):
+        df_temp = pd.DataFrame(lists_names[i],
+                columns = ('name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName'))
+        df_temp['type'] = list_types[i]
+        df = pd.concat([df, df_temp], ignore_index = True)
+    return df
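+# the resulting dataframe has one row per name variant and a 'type' column
+# with values 'simple', 'double', 'comp' or 'canton'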
+
 
-    return dict_discussions, list_keys
 
 
-# function to check whether a file containts discussions
-# achieved by excluding title pages, table of content, etc.
-# !!! function works well for 1891 - 1900, not checked after that !!!
-def check_if_discussion(path_meta_xml_file,
-        list_attributes  = ['TITEL_NORMAL_DE', 'TITEL_NORMAL_FR', 'TITEL_ORIGINAL_DE', 'TITEL_ORIGINAL_FR'],
-        list_nondiscussion = ['inhaltsverzeichnis', 'inhaltsverzeichniss', 'jahresinhalt', 'rednerliste',
-            'umschlag', 'sachregister', 'titelblatt', 'numerierung'],
-        list_nondiscussion2 = ['table', 'matières', 'répertoire', 'procès-verbaux']):
-    # parse, get root and then part of interest
-    XML_tree = ET.parse(path_meta_xml_file)
-    XML_root = XML_tree.getroot()
-    XML_poi = XML_root[0].find('ADS_TEXTEINHEIT')
 
-    # for each title attribute
-    for attribute in list_attributes:
-        # if xml contains this attribute
-        if attribute in XML_poi.attrib:
-            # get title and generate set with lower case terms
-            title = XML_poi.attrib[attribute]
-            set_title = set([term.lower() for term in title.split()])
-            #print(set_title)
-            # if one of terms is in list_nondiscussion, return False
-            if set_title.intersection(set(list_nondiscussion)):
-                #print('NOOO', path_meta_xml_file)
-                return False
-            # if two terms are in list_nondiscussion2, also return False
-            if len(set_title.intersection(set(list_nondiscussion2))) > 1:
-                #print('NOOO', path_meta_xml_file)
-                return False
 
-    return True
 
 
 
@@ -573,52 +819,6 @@ def dict_only_text(dictionary):
 
 
 
-# two functions for language identification
-# Author: Luis Salamanca
-# small modifications by Lili Gasser
-# Using stopwords
-def identify_lang(dict_text, valid_lang = ('german', 'french', 'italian')):
-
-    language_ratios_textbox = {}
-
-    for i_k in dict_text.keys():
-        tokens = dict_text[i_k][1]
-        test_words = [word.lower() for word in tokens] # lowercase all tokens
-        test_words_set = set(test_words)
-        language_ratios = {}
-        for language in stopwords.fileids():
-            if language in valid_lang:
-                stopwords_set = set(stopwords.words(language)) # For some languages eg. Russian, it would be a wise idea to tokenize the stop words by punctuation too.
-                common_elements = test_words_set.intersection(stopwords_set)
-                language_ratios[language] = len(common_elements) # language "score"
-        language_ratios_textbox[i_k] = language_ratios
-
-    return language_ratios_textbox
-
-
-# Simply, given the number of ocurrences of the stopwords, it assigns a label
-# to a specific textbox, also considering the possibility of textboxes
-# mixing languages. For this case, the value ratio_similar is intended
-
-def label_language(aux_dict_l):
-    ratio_similar = 0.8
-    if sum(aux_dict_l.values()):
-        aux_dict_l_norm = {k: v / total for total in (sum(aux_dict_l.values()),) for k, v in aux_dict_l.items()}
-        lang_max_aux = max(aux_dict_l_norm.keys(), key=(lambda key: aux_dict_l_norm[key]))
-        lang_max = ''
-        count_l = 0
-        for lang in aux_dict_l_norm.keys():
-            if (aux_dict_l_norm[lang] >  aux_dict_l_norm[lang_max_aux] * ratio_similar):
-                if count_l > 0:
-                    lang_max += '_'
-                lang_max += lang
-                count_l += 1
-        if count_l > 1:
-            lang_max = 'mixed_' + lang_max
-    else:
-        lang_max = 'NotIdentified'
-    return lang_max
-
 
 # function to get list of places
 def get_list_cantons(df_names):
diff --git a/src/python/utils_proc.py b/src/python/utils_proc.py
index 62e74547d3d1802fd357f84d812b9e9c28e1c630..55491cf7f6a10a7b58fb57ecfda18747796dac5d 100644
--- a/src/python/utils_proc.py
+++ b/src/python/utils_proc.py
@@ -205,4 +205,4 @@ def correct_metadata(year, id_doc, flag_end):
     
     tree = ET.ElementTree(XML_root_meta)
     tree.write(full_path + '_metacorr.xml', encoding = 'utf-8')
-    return full_path + '_metacorr.xml'
\ No newline at end of file
+    return full_path + '_metacorr.xml'
diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh
new file mode 100755
index 0000000000000000000000000000000000000000..dbec0daf596094cf95f50dad868b250320eab8b9
--- /dev/null
+++ b/src/sh/extract_discussions_yearly.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+year_start=1891
+year_end=1893
+
+for year in $(seq $year_start $year_end)
+do
+    echo $year
+    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/AB/${year}/05_annotatedxml.tar.gz
+done