diff --git a/data/lists/not_names.txt b/data/lists/not_names.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8205b48ecd010c778db9a790dc49ace82d0d1c40
--- /dev/null
+++ b/data/lists/not_names.txt
@@ -0,0 +1,66 @@
+Alinea
+Alter
+Ari
+Art
+besser
+bietet
+drehen
+Fällen
+fasse
+Ferner
+ferner
+findet
+Gallen
+Gründe
+hausen
+Herren
+Herr
+immer
+Kasse
+Kollege
+Kollega
+komme
+Leider
+lieber
+nehmen
+neu
+nicht
+Rath
+Schrit
+Seite
+selber
+Sinne
+später
+Steuer
+StGallen
+Stimmen
+Stimme
+stimmt
+tischen
+Tunnel
+Ueber
+Hans
+Walter
+Werner
+weiterer
+Wer
+wissen
+Ziel
+autre
+Biffer
+biffer
+cerner
+comme
+force
+cause
+dernier
+ouvert
+peu
+pilier
+poser
+projet
+Rédiger
+rédiger
+tirer
+vote
+delle
diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt
new file mode 100644
index 0000000000000000000000000000000000000000..38f3fd03a1853c6f6872ae916bbd851380ede921
--- /dev/null
+++ b/data/lists/wrongly_identified_speakers.txt
@@ -0,0 +1,78 @@
+
+also check for council:
+-----------------------
+1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified, but there is only one SR Baumann
+1925/20029937: Schneider, Berichterstatter --> NR, not SR
+
+
+one MP not active in whole year leads to another not uniquely identified
+-------------------------------------------------------------------------
+1925/20029860ff: Keller-Aargau (in March, there is only one Keller-Aargau; in June, another one joins --> finds two!)
+1925/20029967: Seiler (in December, the second Seiler had already left --> finds two!)
+1925/20029967: Huber (in December, the second Huber had already left --> finds two!)
+1925/20029882: Naine (in June, there is only one Naine; in December, another one joins --> finds two!) also in ...96, ...97, etc.
+
+
+identified as speech start but is in text:
+------------------------------------------
+do I really need to look at the first two lines? maybe one is sufficient?
+look for typical terms such as gestellt, gesagt, etc.
+
+1891/20026455: Ein weiterer Antrag ist derjenige des Hrn. Dufour. Herr Dufour hat den Antrag gestellt:
+1891/20026465: Wir haben nun aber, dem Gedankengang des Hrn. Bühler folgend, die Erklärung gewählt:
+1891/20026489: Um jeden Zweifel zu heben, beantrage ich Ihnen folgende Redaktion des Art. 2 :
+1894/20026607: Müller gegenüber drei anderen durchgedrungen, welche lautete:
+1925/20029903: Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt:
+1925/20029863: ganz gut gehen. Es ist ja wahr, was Herr Brügger gesagt hat: --> finds Lanz and Brügger
+1925/20029917: Mögen Sie nun aber denken wie Herr Oberst Brügger oder mögen Sie denken wie ich: --> identified as speech start but is in text
+1925/20029917: Herr Hauser sagt:
+1925/20029978: Das ist, was Herr Charles Naine gesagt hat. Herr Hugglersagt:
+1925/20029981: Brügger möchte ich sagen:
+1971/20000663: de MM. Knüsel et Leu (there must be more speech starts; this is from a list of cantons and people inside a speech, !!! layout)
+1971/20000007: La seconde réaction qu'a suscité chez moi l'intervention de M. Weber est le doute:
+1971/20000007: Herr Kollega Gut hat es gesagt:
+1971/20000007: Noch eine Antwort an Kollege Clottu
+1971/20000010: Nun noch etwas zu Richard Müller. Erstens
+1971/20000024: Noch ein Wort zu Herrn Ständerat Wenk
+1971/20000024: Herr Kollege Heimann stellt sich schliesslich gegen einen Finanzausgleich mit dem Hinweis
+
+
+wrongly spelled city
+--------------------
+1925/20029963: Jenny Ennend (instead of Ennenda)
+1925/20029995,96: Keller Zurich (instead of Zürich)
+1971/? : Berne instead of Bern
+
+
+doubled double names:
+---------------------
+1971/20000010: Meyer-Boller
+
+
+term very similar to one name is actually another name
+------------------------------------------------------
+1925/20029863: ganz --> finds Lanz, but there is a Ganz
+1971/20000630 and others: Schweizer --> finds Schneider, but there is a Schweizer
+
+
+term is a name
+--------------
+1971/20000010: Ganz wenige Einzelfragen --> finds Ganz
+1971/20000024: Politisch gesehen ist es doch ganz einfach so --> finds Ganz
+
+
+French name with special characters
+-----------------------------------
+1971/20000055: Debétaz
+
+
+Appenzeller
+-----------
+1894/20026597: Sonderegger
+1894/20026618: Sonderegger
+
+
+
+some other persons wrongly identified as MP
+-------------------------------------------
+1925/20029833: Sauser-Hall (not an MP) --> Hauser
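Several of the failure modes above ('ganz' finding Lanz, everyday words like 'Kollege' matching MP names) are exactly what data/lists/not_names.txt is meant to catch: candidate tokens are vetted against it before any fuzzy name matching. A minimal sketch of that vetting step, assuming tokens are taken from the text in front of the colon; the sample line and the short stand-in lists are invented, while the real pipeline loads the full lists from not_names.txt and NLTK's stopwords:

import re

# stand-ins; the pipeline reads these from data/lists/not_names.txt
# and the NLTK stopword lists
list_notnames = ['Kollege', 'Herr', 'nicht', 'immer', 'vote']
list_stopwords = ['der', 'die', 'und', 'le', 'la']

def candidate_terms(line):
    # tokenize roughly like the RegexpTokenizer in utils_annot.py,
    # looking only at the text before the colon
    terms = re.findall(r"\w+(?:-\w+)*", line.split(':')[0])
    # drop digits and single characters, as in label_speechstart
    terms = [t for t in terms if not t.isdigit() and len(t) > 1]
    # drop stopwords and known non-names to avoid false positives
    return [t for t in terms if t not in list_stopwords and t not in list_notnames]

print(candidate_terms('Herr Kollege Meier: Ich stimme nicht zu'))  # -> ['Meier']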
diff --git a/requirements.txt b/requirements.txt
index ae3be3df42108fd3054cad7466f082091af00f57..dec9aa297abfa705fc6e700baca1769ffe3d6f10 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-numpy
+numpy<1.16
 scipy
 pandas
 xlrd
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index cce7282200b409284b6b30a108244026631e6ab1..8e2a480bd129c7676056182cfea23f0029b1a19e 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -107,7 +107,7 @@ class Document:
         else:
             print('Not saving to tar')
             name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outxml + '.tar.gz'
-
+
         self.name_xml = [name_tar, name_xml]
         if flag_save:
             h_xml = utils_proc.get_handlerfile(self.name_xml[1], self.folder_database, name_file = name_outxml)
@@ -119,10 +119,10 @@ class Document:
         self.n_pages = np.arange(len(self.XML_main))
         command = 'rm -rf ./' + str(self.year)
         #print(command)
-        utils_proc.call_with_out(command)
-
-    def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None,
-                     ind_page = 0, textb_textl = 1):
+        utils_proc.call_with_out(command)
+
+    def _draw_textbl(self, imarray = np.array([]), XML_root = None, XML_main = None,
+                     ind_page = 0, textb_textl = 1):
         # The page refers here to the page of the imgobj, which might not correspond
         # to the one of the xml. For that reason we use n_pages to obtain the index
         # for the xml
@@ -357,12 +357,12 @@ class Document:
         if ind_page > (len(self.XML_main) - 1):
             flag_error = 1
             return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, flag_error
-
+
         flag_central = 1
         if self.year > self.limit_year:
             flag_central = 0
-        flag_2col = 1
-
+        flag_2col = 1
+
         XML_root = ET.Element('pages')
         XML_root.append(self.XML_main[ind_abs[0]])
         imarray = np.array(self.imgobj[ind_page])
@@ -380,10 +380,10 @@ class Document:
         XML_enrich = []
 
         if level_proc > 0:
-            coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page,
-                                                                                   flag_2col, flag_central)
-
-        if level_proc > 1:
+            coord_vert_def, coord_horz = preproc_docs.find_mainHorandCentral_Hough(np.copy(imarray), coord_textline, bbox_page,
+                                                                                   flag_2col, flag_central)
+
+        if level_proc > 1:
             _, rescale_factor = preproc_docs.adapt_coordtoimg(imarray, bbox_page, bbox_page)
 
         if level_proc > 2:
@@ -645,8 +645,8 @@ class Document:
             name_fig = (folder_save + '/' + str_name + '_' + str(self.id_doc) + '_page' + str(ind_page) + '.' + format_fig)
             fig.savefig(name_fig, format = format_fig, dpi = dpi)
-            plt.close(fig)
-
+            plt.close(fig)
+
     def check_discussion(self):
         utils_proc.tar_extractfile(self.name_meta[1], self.folder_database, name_file = self.name_inmeta)
         flag_discussion = utils_annot.check_if_discussion(self.name_meta[1])
@@ -748,7 +748,7 @@ class Document:
             print('we have a main corr XML file')
 
             #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
-            XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, bln_print=False)
+            XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, bln_print=False)
             self.XML_main_annot = XML_main_annot
 
             # save xml file
diff --git a/src/python/extractMPs.py b/src/python/extractMPs.py
index cd3504da7a302a2f0560644fa1475c0e311e0e99..01f87b7674b82d4895ce39566b243e5629bf0175 100644
--- a/src/python/extractMPs.py
+++ b/src/python/extractMPs.py
@@ -8,7 +8,7 @@ import sys
 
 input_file = sys.argv[1]          #'./data/politicians/Ratsmitglieder_1848_DE_corr.xlsx'
 output_file_csv = sys.argv[2]     #'./data/politicians/MPs_after1890.csv'
-output_folder_dict = sys.argv[3]
+output_folder_dict = sys.argv[3]
 
 class MPs_Extractor(object):
 
@@ -19,12 +19,12 @@ class MPs_Extractor(object):
         self.output_folder_dict = output_folder_dict
         self.range_years = range(years[0], years[1] + 1)
         self.df_exc = df_exc
-
+
     # function to get lists of lastnames
    # input:
    #    - df_year: dataframe for a year
    # output:
-    #    - list_names:
+    #    - list_names:
    #      contains:
    #        - list of last names that appear only once and cannot be split
    #        - list of last names that are made up of two names such as 'Meier-Müller'
@@ -44,7 +44,7 @@ class MPs_Extractor(object):
             str_comp = 'comp'
             str_canton2 = 'canton'
 
-            # function to split lastname and save meaningful part(s) to list
+            # function to split lastname and save meaningful part(s) to list
             def split_lastname(lastname, uniqueID, tpl_canton, str_canton = ''):
                 # if last name is a composite name, e.g. 'von Arx' and 'de Stoppani'
                 lastname_split = lastname.split()
@@ -70,7 +70,7 @@ class MPs_Extractor(object):
                         list_names.append((str_double, lastname, lastname, uniqueID) + tpl_canton)
                         # write double name without space into list
                         list_names.append((str_double, ''.join(lastname.split('-')), lastname, uniqueID) + tpl_canton)
-                else:
+                else:
                     if str_canton:
                         list_names.append((str_canton2, lastname, str_canton, uniqueID) + tpl_canton)
                     else:
@@ -82,66 +82,67 @@ class MPs_Extractor(object):
                 str_cantonabbr = df_year['CantonAbbreviation'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
                 str_citizenship = df_year['Citizenship'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
                 str_firstname = df_year['FirstName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
-
-                return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname)
+                str_doublename = df_year['DoubleName'].loc[df_year['uniqueIndex']==uniqueID].iloc[0]
+
+                return (str_cantonname, str_cantonabbr, str_citizenship, str_firstname, str_doublename)
 
-            # create empty lists for last names
+            # create empty lists for last names
             list_names = []
-
-            # for every last name
+
+            # for every last name
             for lastname in df_year['LastName'].drop_duplicates():
                 #print('name', lastname, type(lastname))
-
-                # extract all entries with that last name
+
+                # extract all entries with that last name
                 df_temp = df_year.loc[df_after1890['LastName']==lastname]
-                #print(df_temp)
-
-                # if there is an extra double name
+                #print(df_temp)
+
+                # if there is an extra double name
                 if df_temp.iloc[0]['DoubleName'] != '':
-                    # extract unique index
+                    # extract unique index
                     uniqueID = df_temp.iloc[0]['uniqueIndex']
-
+
                     # get canton information for that uniqueID
                     tpl_canton = get_canton(df_year, uniqueID)
 
                     #print('double name', df_temp)
                     doublename = df_temp.iloc[0]['DoubleName']
 
-                    # if last name is a double name, e.g. 'Meier-Müller'
+                    # if last name is a double name, e.g. 'Meier-Müller'
                     lastname_split2 = doublename.replace('-', ' ').split()
                     if len(lastname_split2) > 1:
-                        # write each part of double name into corresponding list
+                        # write each part of double name into corresponding list
                         for item in lastname_split2:
                             list_names.append((str_double, item, lastname, uniqueID) + tpl_canton)
-                        # write double name into list
+                        # write double name into list
                         list_names.append((str_double, doublename, lastname, uniqueID) + tpl_canton)
-                        # write double name without space into list
+                        # write double name without space into list
                         list_names.append((str_double, ''.join(doublename.split('-')), lastname, uniqueID) + tpl_canton)
 
-                # if only one person with that last name
+                # if only one person with that last name
                 if df_temp.drop_duplicates(['uniqueIndex']).shape[0] == 1:
-                    # extract unique index
+                    # extract unique index
                     uniqueID = df_temp.iloc[0]['uniqueIndex']
 
                     # get canton information for that uniqueID
                     tpl_canton = get_canton(df_year, uniqueID)
 
-                    # write complete name to list of last names
+                    # write complete name to list of last names
                     split_lastname(lastname, uniqueID, tpl_canton)
-
-                # if there are several people with the same last name
+
+                # if there are several people with the same last name
                 else:
-                    # write last name and canton to correct list
+                    # write last name and canton to correct list
                     for idx, row in df_temp.drop_duplicates(['uniqueIndex']).iterrows():
-                        # extract unique index
+                        # extract unique index
                         uniqueID = df_temp.loc[idx]['uniqueIndex']
-
+
                         # get canton information for that uniqueID
                         tpl_canton = get_canton(df_year, uniqueID)
 
-                        # write the lastname to the list
-                        split_lastname(lastname, uniqueID, tpl_canton, row['LastName'] + ' (' + row['CantonName'] + ')')
-
+                        # write the lastname to the list
+                        split_lastname(lastname, uniqueID, tpl_canton, row['LastName'] + ' (' + row['CantonName'] + ')')
+
             return list_names
 
     def extract(self):
@@ -172,7 +173,7 @@ class MPs_Extractor(object):
         # group by first and last name, and date of birth
         grouped = df_after1890.groupby(["LastName", "FirstName", "DateOfBirth"])
 
-        # assign first index to all entries of a person
+        # assign first index to all entries of a person
         for list_index in grouped.groups.values():
             df_after1890.loc[list_index, 'uniqueIndex'] = list_index[0]
 
@@ -192,15 +193,15 @@ class MPs_Extractor(object):
             df_year = df_after1890[pd.to_datetime(df_after1890['DateJoining']) <= datetime.datetime(year, 12, 31, 0, 0)]
             df_year = df_year[pd.to_datetime(df_year['DateLeaving']) >= datetime.datetime(year, 1, 1, 0, 0)]
             print(year, df_year.shape)
-
+
             # write df_year to a yearly csv file
 #            str_year = str(year)
 #            df_year.to_csv('home/lili/NLP_DemocraSci/nlp-democracy/output/MPs/MPs_' + str_year + '.csv')
-
+
             # create a pandas dataframe from list of names
             # !!! list contains errors, see definition of function
             list_lastnames = self.get_list_of_lastnames(df_year, df_after1890)
-            df_lastnames = pd.DataFrame(list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName'))
+            df_lastnames = pd.DataFrame(list_lastnames, columns = ('type', 'name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName', 'DoubleName'))
 
             # dump dictionary of last names to a pickle file
 #            path = pathlib.
@@ -213,19 +214,18 @@
 years = [1891, 2016]   #2016
 
 df_exc = pd.DataFrame(columns=['LastName', 'FirstName', 'DoubleName'])
 # exception: Konrad H. Cramer is also referred to as Cramer-Frey. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Cramer', 'FirstName': 'Konrad H.', 'DoubleName': 'Cramer-Frey'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Cramer', 'FirstName': 'Konrad H.', 'DoubleName': 'Cramer-Frey'}
 # exception: Johannes Blumer SG is also referred to as Blumer-Egloff. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Blumer', 'FirstName': 'Johannes', 'DoubleName': 'Blumer-Egloff'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Blumer', 'FirstName': 'Johannes', 'DoubleName': 'Blumer-Egloff'}
 # exception: Adolphe Jordan VD is also referred to as Jordan-Martin. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Jordan', 'FirstName': 'Adolphe', 'DoubleName': 'Jordan-Martin'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Jordan', 'FirstName': 'Adolphe', 'DoubleName': 'Jordan-Martin'}
 # exception: Jakob Schmid LU is also referred to as Schmid-Ronca. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Schmid', 'FirstName': 'Jakob', 'DoubleName': 'Schmid-Ronca'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Schmid', 'FirstName': 'Jakob', 'DoubleName': 'Schmid-Ronca'}
 # exception: Eduard Sulzer ZH is also referred to as Sulzer-Ziegler. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Sulzer', 'FirstName': 'Eduard', 'DoubleName': 'Sulzer-Ziegler'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Sulzer', 'FirstName': 'Eduard', 'DoubleName': 'Sulzer-Ziegler'}
 # exception: Howard Eugster AR is also referred to as Eugster-Züst. Add double name in extra-column
-df_exc.loc[len(df_exc)] = {'LastName': 'Eugster', 'FirstName': 'Howard', 'DoubleName': 'Eugster-Züst'}
+df_exc.loc[len(df_exc)] = {'LastName': 'Eugster', 'FirstName': 'Howard', 'DoubleName': 'Eugster-Züst'}
 #print(df_exc)
 
 mps_extractor = MPs_Extractor(years, input_file, output_file_csv, output_folder_dict, df_exc)
 mps_extractor.extract()
-
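With the extra DoubleName column now carried through get_canton, each double-name exception above expands into four rows of df_lastnames: both parts, the hyphenated form, and the joined form. A sketch of the rows the Cramer/Cramer-Frey entry produces, following get_list_of_lastnames; the uniqueIndex and canton values are placeholders, not taken from the data:

import pandas as pd

# row pattern from get_list_of_lastnames for a DoubleName exception;
# uid and the canton tuple are invented for illustration
columns = ('type', 'name_short', 'name_correct', 'uniqueIndex',
           'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName', 'DoubleName')
uid = 4711
canton = ('Zürich', 'ZH', 'Zürich (ZH)', 'Konrad H.', 'Cramer-Frey')
rows = [('double', short, 'Cramer', uid) + canton
        for short in ('Cramer', 'Frey', 'Cramer-Frey', 'CramerFrey')]
df_lastnames = pd.DataFrame(rows, columns=columns)
print(df_lastnames[['type', 'name_short', 'name_correct', 'DoubleName']])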
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 495b23ceb7f8b38e2849c5d16119e76cf177d48f..86ae417f57d8d0c4e6e08cd1f1f2c91bcf618f44 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -26,10 +26,11 @@ from utils_proc import call_with_out
 
 # specify input and output files
 # needed for running in atom, can be ignored
-year = '1891'
+year = '1971'
 input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
+input_notnames = "data/lists/not_names.txt"
 output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz"
 
 #%%
@@ -37,7 +38,8 @@
 input_lastnames = sys.argv[1]
 input_correctedxml = sys.argv[2]
 input_correctedmeta = sys.argv[3]
-output_annotatedxml = sys.argv[4]
+input_notnames = sys.argv[4]
+output_annotatedxml = sys.argv[5]
 
 #%%
 # extract suffixes, year, folder_database
@@ -51,6 +53,7 @@
 suffix_correctedmeta = '_metacorr'
 input_rawmeta = folder_database + '/' + year + '/' + '01_rawmeta.tar.gz'
 
 #%%
+# TODO: make it work!
 # git lfs pull necessary data
 for lfsfile in [input_correctedxml, input_correctedmeta, input_rawmeta]:
     command = 'git lfs pull -I ' + lfsfile
@@ -77,6 +80,11 @@
 
 print('dataframe with lastnames loaded')
 
+with open(input_notnames) as f:
+    list_notnames = f.readlines()
+
+list_notnames = [term.rstrip() for term in list_notnames]
+
 #%%
 # for each file
 # TODO !!!! get rid of [66:]
@@ -92,6 +100,7 @@
     if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
         print(id_doc + '\n')
 
         file_doc.df_lastnames = df_lastnames
+        file_doc.list_notnames = list_notnames
         file_doc.annotate_xml()
 
 # Commands to get the compressed version of the file
@@ -103,20 +112,21 @@ utils_proc.compress_tar(output_annotatedxml)
 
 
 #%%
-## to test for one file
-#file_tarpath = './1893/20026526_datacorr.xml'
-#
-#id_doc = file_tarpath.split('/')[-1][:8]
-#
-## instantiate document object (always from original pdf)
-#infile_aux = year + '/' + id_doc + '.pdf'
-#file_doc = defc.Document(infile_aux, folder_database)
-#
-#if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
-    #print(id_doc + '\n')
-#
-    #file_doc.df_lastnames = df_lastnames
-    #file_doc.annotate_xml()
+# to test for one file
+file_tarpath = './1971/20000619_datacorr.xml'
+
+id_doc = file_tarpath.split('/')[-1][:8]
+
+# instantiate document object (always from original pdf)
+infile_aux = year + '/' + id_doc + '.pdf'
+file_doc = defc.Document(infile_aux, folder_database)
+
+if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
+    print(id_doc + '\n')
+
+    file_doc.df_lastnames = df_lastnames
+    file_doc.list_notnames = list_notnames
+    file_doc.annotate_xml()
 
 #%%
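Regarding the '# TODO: make it work!' above: one possible direction is to shell out with subprocess and fail loudly when a pull goes wrong, instead of silently continuing with LFS pointer files. This is a sketch only, not the repo's call_with_out helper; the input_* names are the variables defined in the script above:

import subprocess

# possible replacement for the lfs loop: abort on failure so later
# stages never see unpulled pointer files
for lfsfile in [input_correctedxml, input_correctedmeta, input_rawmeta]:
    result = subprocess.run(['git', 'lfs', 'pull', '-I', lfsfile],
                            capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError('git lfs pull failed for ' + lfsfile + ': ' + result.stderr)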
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index e41d934ac7914234ab6fae1f425123999971c413..b19c06404c47e65559ddeeb273fa0bbd271a38fd 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -21,7 +21,7 @@ import collections
 
 # !!! function works well for 1891 - 1900, not checked after that !!!
 def check_if_discussion(path_meta_xml_file,
                         list_attributes = ['TITEL_NORMAL_DE', 'TITEL_NORMAL_FR', 'TITEL_ORIGINAL_DE', 'TITEL_ORIGINAL_FR'],
-                        list_nondiscussion = ['inhaltsverzeichnis', 'inhaltsverzeichniss', 'jahresinhalt', 'rednerliste',
+                        list_nondiscussion = ['inhaltsverzeichnis', 'inhaltsverzeichniss', 'jahresinhalt', 'rednerliste',
                                               'umschlag', 'sachregister', 'titelblatt', 'numerierung'],
                         list_nondiscussion2 = ['table', 'matières', 'répertoire', 'procès-verbaux']):
@@ -69,13 +69,13 @@ def get_text(sometext):
 
 
 # function to annotate corrected XML
-def get_annotated_xml(XML_root, df_lastnames, bln_print=False):
+def get_annotated_xml(XML_root, df_lastnames, list_notnames, bln_print=False):
 
     # list of votation terms
     # TODO: make it work for é, etc.
     list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen',
                           'Abgelehnt', 'Einverstanden', 'Stimmen', '(Eintreten)', '(Nichteintreten)',
-                          'Votation', 'Vote', 'votation',   #'(Adopt�s)', 'adopt�s', 'adopt�e', 'rejet�e',
+                          'Votation', 'Vote', 'votation', '(Adoptés)', 'adoptés', 'adoptée', 'rejetée',
                           "D'accord", 'voix']
 
     # list of stopwords
@@ -121,7 +121,7 @@
 
             if textbox_texttype in ['text_col1', 'text_col2']:
 
-                XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False)
+                XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, bln_print=False)
                 if this_is_speech:
                     prev_is_speech = True
 #                    print('stopped after finding speech start')
@@ -207,7 +207,7 @@ def get_complete_text(textbox):
         complete_text += thattext
 
         # in first two textlines of textbox, check for colon
-        if ind_tl < 3:
+        if ind_tl < 2:
             if ':' in thattext:
                 ind_tl_colon = ind_tl
 
@@ -222,17 +222,19 @@
 # - bln_print: whether to print during execution, default False
 # output:
 # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
-def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, bln_print=False):
+def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, bln_print=False):
+
+    # lists of roles
+    list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
+                  'Berichterstatter', 'Berichterstatterin', 'rapporteur',
+                  'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
+                  'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
+                  'Vizepräsident']
+    list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
 
     # initialize flag
     this_is_speech = False
 
-    # initialize strings and ID
-    str_name = ''
-    str_role = ''
-    list_uniqueID = []
-    str_canton = ''
-
     # font text end
     fontend = '[/font]'
 
@@ -240,7 +242,6 @@
     # at the beginning of a textbox and identifying a name or a role in front
     # of that colon
     if ind_tl_colon >= 0:
-#    if ':' in text[:100]:
 
         # extract the index of the colon in the text
         colon_index_text = text.index(':')
@@ -264,12 +265,12 @@
             list_oi = [term for term in list_oi if not term.isdigit()]
 
             # remove single characters
+            # TODO: might need to be changed for fractions (some fractions are abbreviated as single letters)
+            # TODO: maybe exclude I and A to account for Appenzell
             list_oi = [term for term in list_oi if len(term)>1]
 
-            # for every term
-            for term in list_oi:
-                # if possible, find a name in a list
-                str_name, str_role, list_uniqueID, str_canton = find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False)
+            # if possible, find a name from the list
+            str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False)
 
             if bln_print:
                 print('name', str_name, 'role', str_role)
@@ -305,9 +306,6 @@
             thattext = XML_new[ind_p][ind_t][0].text
             colon_index = thattext.index(':')
 
-            # get last font information of thattext
-            fontstart = re.findall('\[font.*?\]', thattext)[-1]
-
             try:
                 # write speaker to first line
                 XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend
@@ -315,10 +313,12 @@
                 # get start of speech with correct font start
                 if thattext[colon_index+1:].startswith('[font'):
                     startspeech = thattext[colon_index+1:]
-                elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+                elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
                     startspeech = ''
+                elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+                    startspeech = thattext[colon_index+8:]
                 else:
-                    startspeech = fontstart + thattext[colon_index+1:]
+                    startspeech = thattext[colon_index+1:]
 
                 # write beginning of speech to second line
                 # (create new ET element if necessary)
@@ -343,16 +343,15 @@
             thattext = XML_new[ind_p][ind_t][1].text
             colon_index = thattext.index(':')
 
-            # get last font information of thattext
-            fontstart = re.findall('\[font.*?\]', thattext)[-1]
-
             # get start of speech with correct font start
             if thattext[colon_index+1:].startswith('[font'):
                 startspeech = thattext[colon_index+1:]
-            elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+            elif re.match('^[ ]?\[/font\]$', thattext[colon_index+1:]):
                 startspeech = ''
+            elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]):
+                startspeech = thattext[colon_index+8:]
             else:
-                startspeech = fontstart + thattext[colon_index+1:]
+                startspeech = thattext[colon_index+1:]
 
             # write speaker to first line
             XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1] + fontend
@@ -362,7 +361,8 @@
         # set flag
         this_is_speech = True
         if bln_print:
-            print('found a name:', list_oi, str_name, str_role, '\n')
+            print('found a name:', text_start, list_oi, str_name, str_role, '\n')
+            print('found a name:', text_start, list_oi, ind_tl_colon, str_name, str_role, list_uniqueID, '\n')
 
     return XML_new, this_is_speech
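The new branches above distinguish three shapes of text after the colon: the speech starts with its own [font ...] tag; only a closing [/font] remains; or a [/font] is followed by more text, in which case colon_index+8 skips ':[/font]' (note this offset assumes no blank before the tag; with the optional leading space the regex allows, a stray ']' would survive). A toy run-through, with invented markup in the pipeline's bracket style:

import re

# minimal sketch of the three startspeech branches
def split_speech(thattext):
    colon_index = thattext.index(':')
    rest = thattext[colon_index+1:]
    if rest.startswith('[font'):
        return rest                      # speech carries its own font tag
    elif re.match(r'^[ ]?\[/font\]$', rest):
        return ''                        # nothing left but a closing tag
    elif re.match(r'^[ ]?\[/font\]', rest):
        return thattext[colon_index+8:]  # skip ':[/font]'
    else:
        return rest

print(split_speech('[font=9]Meier:[/font] Ich beantrage Ablehnung.'))
# -> ' Ich beantrage Ablehnung.'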
@@ -423,11 +423,10 @@
 # - str_role: string to which role should be attached
 # - list_uniqueID: list with one or several uniqueIDs
 # - list_tupels: list of tupels containing all types of names
-# TODO: correctly extract canton! don't do reversed, find name first that might have issue with canton, then look for possible canton
 # TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
-def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bln_print=False):
+def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False):
 
-    def get_string(term, str_name, str_role, list_uniqueID, str_canton):
+    def get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton):
         name_type = ''
         # if it is one of the simple names
         if term in list(df_names['name_short'].loc[df_names['type']=='simple']):
@@ -439,7 +438,7 @@
         # if it is a double name
         elif term in list(df_names['name_short'].loc[df_names['type']=='double']):
             if bln_print:
-                print(20*'\n', 'DOUBLE NAME')
+                print(5*'\n', 'DOUBLE NAME')
             # get correct name
             correct_name = df_names.loc[(df_names['type']=='double') & (df_names['name_short']== term)].iat[0, df_names.columns.get_loc('name_correct')]
             if bln_print:
@@ -458,23 +457,11 @@
             str_name = add_to_string(str_name, correct_name)
             name_type = 'comp'
         # if it contains a canton
-        # !!! also pass list_oi to look for canton
-        # !!! how to handle for people mentioned in text???
+        # TODO: how to handle for people mentioned in text???
         elif term in list(df_names['name_short'].loc[df_names['type']=='canton']):
             if bln_print:
                 print('contains a canton', term)
-#            canton_missing = False
-#            df_temp = df_names.loc[df_names['name_short']==term]
-#            print('list_correct', df_temp)
-#            print(str_canton)
-#            if str_canton:
-#                str_correct = check_place(df_temp, str_canton)
-#                if str_correct in ['not found', 'too many']:
-#                    str_name = add_to_string(str_name, term + ' (CANTONT MISSING)')
-#                    canton_missing = True
-#                else:
-#                    str_name = add_to_string(str_name, str_temp)
-#                name_type = 'canton'
+
             str_name = add_to_string(str_name, term + ' (CANTON MISSING)')
             name_type = 'canton'
@@ -485,41 +472,66 @@
         # get uniqueID
         if name_type in ['simple', 'double', 'comp']:
             list_temp = [df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]]
         elif name_type in ['canton']:
             list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
-#            if canton_missing:
-#                temp = tuple(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
-#            else:
-#                temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_correct']==str_correct)].iat[0, df_names.columns.get_loc('uniqueIndex')]
+
         if len(list_temp) > 0:
             if bln_print:
                 print(list_temp, list_uniqueID)
                 print(type(list_temp), type(list_uniqueID))
                 print(isinstance(list_uniqueID, list))
-            # if no unique ID has been assigned so far
-            if len(list_uniqueID) == 0:
-                list_uniqueID = list_temp
-            # if there are already one or several people and have a new person, we update
-            elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
-                list_uniqueID.append(list_temp)
-
-            ## if we already have several possible people, e.g. because of canton
-            #elif isinstance(int_uniqueID, tuple):
-                #print('I should be here')
-                ## and refound the uniqueID of one of those, don't update
-                #if temp in int_uniqueID:
-                    #pass
-                ## and update if we don't have that uniqueID yet
-                #else:
-                    #int_uniqueID = (int_uniqueID, temp)
-            ## if a person with that uniqueID exists already, don't update
-            #elif isinstance(int(int_uniqueID), int) and temp == int_uniqueID:
-                #print('but end up here.. not even.....')
-                #pass
-            ## if a different unique ID has been assigned already
-            #else:
-                #int_uniqueID = (int_uniqueID, temp)
-
-        return str_name, str_role, list_uniqueID
+
+            list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
+
+        return str_name, str_role, list_uniqueID, name_type
+
+    def update_list_uniqueID(list_uniqueID, list_temp):
+
+        # if no unique ID has been assigned so far
+        if len(list_uniqueID) == 0:
+            list_uniqueID = list_temp
+        # if there are already one or several people and have a new person, we update
+        elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
+            list_uniqueID.append(list_temp)
+
+        return list_uniqueID
+
+    # function to find correct term (in case of misspellings, etc.)
+    def get_approximate_term(term, array_all):
+        # TODO: probably need to improve this procedure
+        #       - find better values ....
+
+        # initialize string
+        term_approx = ''
+
+        # get normalized array
+        array_normalized = array_all[normalized_damerau_levenshtein_distance_ndarray(term, array_all) <= 0.35]
+        array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized)
+
+        # get absolute array
+        array_absolute = array_all[damerau_levenshtein_distance_ndarray(term, array_all) <= 2]
+        array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute)
+
+        if bln_print:
+            print(term)
+            print(array_normalized, array_normalized_values)
+            print(array_absolute, array_absolute_values)
+
+        # intersection
+        set_intersection = set(array_normalized).intersection(set(array_absolute))
+
+        # if a similar name was found
+        if len(set_intersection) == 1:
+            term_approx = list(set_intersection)[0]
+
+        # or several
+        elif len(set_intersection) > 1:
+            # !!! we only look at normalized values
+            # !!! we don't account for names with same values !!!
+            array_min = array_normalized[array_normalized_values.argmin()]
+            term_approx = array_min
+            if bln_print:
+                print('we found several possible names', set_intersection, 'and choose', array_min)
+
+        return term_approx
+
     # small function to add term to str_name
     def add_to_string(string, term):
@@ -529,70 +541,109 @@
             string += ' ' + term
         return string
 
-    list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
-                  'Berichterstatter', 'Berichterstatterin', 'rapporteur',
-                  'Sprecher', 'Sprecherin', 'porte-parole', 'porteparole',
-                  'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
-                  'Vizepräsident']
-
-    list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'Gallen', 'StGallen',
-                     'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter', 'Biffer', 'biffer', 'Rédiger', 'rédiger', 'Wer', 'Fällen']
-
-    list_places = get_list_cantons(df_names)
+    # initialize strings and ID
+    str_name = ''
+    str_role = ''
+    list_uniqueID = []
+    str_canton = ''
+    name_type = ''
 
-    if bln_print:
-        print('now is about: ------', term)
 
     # extract list and array of last names
     list_all_names = list(df_names['name_short'])
     array_all_names = np.array(df_names['name_short'])
 
-    # if term is not easily mistaken as a name (avoid false positives)
-    if term not in list_notnames:
+    # for every term
+    for term in list_oi:
 
-        # if term is in the list of all names and roles
-        if term in (list_all_names + list_roles):
+        if bln_print:
+            print('now is about: ------', term)
+
+        if term in list_roles:
             # get correct name and uniqueID, or role, for that term
-            str_name, str_role, list_uniqueID = get_string(term, str_name, str_role, list_uniqueID, str_canton)
+            str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
             if bln_print:
-                print('=== correct name', term)
-        # if term in list of cantons
-        elif term in list_places:
-            str_canton = term
-        # if term is not in list_all_names
-        else:
-            # look for similar names based on (normalized) Damerau-Levenshtein distance
-            # !!! probably need to improve this procedure
-            #     - find better values ....
-            if bln_print:
-                print(term)
-            array_normalized = array_all_names[normalized_damerau_levenshtein_distance_ndarray(term, array_all_names) <= 0.35]
-            array_normalized_values = normalized_damerau_levenshtein_distance_ndarray(term, array_normalized)
-            if bln_print:
-                print(array_normalized, array_normalized_values)
-            array_absolute = array_all_names[damerau_levenshtein_distance_ndarray(term, array_all_names) <= 2]
-            array_absolute_values = damerau_levenshtein_distance_ndarray(term, array_absolute)
-            if bln_print:
-                print(array_absolute, array_absolute_values)
-            set_intersection = set(array_normalized).intersection(set(array_absolute))
-            # check if a similar name was found
-            term_approx = ''
-            if len(set_intersection) == 1:
-                term_approx = list(set_intersection)[0]
-                if bln_print:
-                    print('we found the name', set_intersection)
-            elif len(set_intersection) > 1:
-                # !!! we only look at normalized values
-                # !!! we don't account for names with same values !!!
-                array_min = array_normalized[array_normalized_values.argmin()]
-                term_approx = array_min#[0]
-                if bln_print:
-                    print('we found several possible names', set_intersection, 'and choose', array_min)
-            if term_approx:
-                str_name, str_role, list_uniqueID = get_string(term_approx, str_name, str_role, list_uniqueID, str_canton)
+                print('found a role', term)
+
+            # TODO: also look for similar terms (misspellings)
+            # TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter
+
+        elif term in list_roles_ext:
+            pass
+            # TODO: extract whether it is minority or majority and save that information
+
+        # cannot happen for the first term
+        elif name_type == 'canton':
+            list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
+            canton_type = ''
+            if term in list_cantonname:
+                str_canton = term
+                canton_type = 'CantonName'
+                print('!!! is a canton', term, list_oi, str_name, str_role)
+            elif term in list_cantonabbr:
+                str_canton = term
+                canton_type = 'CantonAbbr'
+                print('!!! is a canton', term, list_oi, str_name, str_role)
+            elif term in list_citizenship:
+                str_canton = term
+                canton_type = 'Citizenship'
+                print('!!! is a canton', term, list_oi, str_name, str_role)
+            elif term in list_firstname:
+                str_canton = term
+                canton_type = 'FirstName'
+                print('!!! is a canton', term, list_oi, str_name, str_role)
+            else:
+                print('might be a canton', term, list_oi, str_name, str_role)
+
+            # if a canton or similar was found
+            if canton_type:
+                # get rid of CANTON MISSING
+                str_name = str_name.split(' ')[0]
+                # extract uniqueID
+                # if Citizenship, do proper comparison
+                if canton_type == 'Citizenship':
+                    df_temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name)]
+                    list_cities = [entry for entry in df_temp[canton_type] if str_canton in get_cities([entry])]
+                    print(list_cities)
+                    str_citizenship = ''
+                    try:
+                        if len(list_cities) == 1:
+                            str_citizenship = list_cities[0]
+                    except:
+                        print('found no or more than one person with citizenship', str_canton, str_name)
+                        pass
+
+                    list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+
+                else:
+                    list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+                print(list_temp, list_uniqueID)
+
+                if len(list_temp) > 0:
+                    list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
+
+        # if term is not easily mistaken as a name (avoid false positives)
+        elif term not in list_notnames:
+
+            # if term is in the list of all names
+            if term in list_all_names:
+                # get correct name and uniqueID, or role, for that term
+                str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
 
                 if bln_print:
-                    print('*******************', str_name, term_approx)
+                    print('=== correct name', term)
+
+            # if term is not in list_all_names
+            else:
+                # look for similar names based on (normalized) Damerau-Levenshtein distance
+                term_approx = get_approximate_term(term, array_all_names)
+
+                # if one was found, get correct name, etc.
+                if term_approx:
+                    str_name, str_role, list_uniqueID, name_type = get_string(term_approx, df_names, str_name, str_role, list_uniqueID, str_canton)
+                    if bln_print:
+                        print('=== approximate name', str_name, term_approx)
 
     return str_name, str_role, list_uniqueID, str_canton
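get_approximate_term only accepts a candidate that survives both cut-offs: normalized Damerau-Levenshtein distance <= 0.35 and absolute distance <= 2, with ties resolved by the smallest normalized value. A small demonstration of why both thresholds are needed, using the Schneider/Schweizer confusion documented in wrongly_identified_speakers.txt; the misspelled term is invented, and the ndarray helpers are those of the pyxdameraulevenshtein version the module already imports:

import numpy as np
from pyxdameraulevenshtein import (
    damerau_levenshtein_distance_ndarray,
    normalized_damerau_levenshtein_distance_ndarray,
)

# toy name array; the real one is df_names['name_short']
array_all = np.array(['Schneider', 'Schweizer', 'Meier'])
term = 'Schwcizer'  # invented OCR error

# 'Schneider' passes the normalized cut-off (3/9 ~ 0.33) but not the
# absolute one (distance 3), so only 'Schweizer' survives both
near_norm = array_all[normalized_damerau_levenshtein_distance_ndarray(term, array_all) <= 0.35]
near_abs = array_all[damerau_levenshtein_distance_ndarray(term, array_all) <= 2]
print(set(near_norm) & set(near_abs))  # -> {'Schweizer'}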
@@ -665,14 +716,36 @@ def label_language(XML_new, ind_p, ind_t, aux_dict_l):
 
     return XML_new
 
 
+def get_cities(list_citizenship):
+
+    return [city[:-5] for item in list_citizenship for city in item.split(',')]
+
+
+# function to get list of places
+def get_list_cantons(df_names, str_name = ''):
+
+    if str_name:
+        df_temp = df_names.loc[(df_names['type']=='canton') & (df_names['name_short']==str_name)]
+    else:
+        df_temp = df_names.loc[df_names['type']=='canton']
+        print(df_temp)
+
+    list_cantonname = list(df_temp['CantonName'])
+    for canton in ['Basel-Stadt', 'Basel-Landschaft']:
+        if canton in list_cantonname:
+            list_cantonname.extend(['Basel'])
+    if 'Graubünden' in list_cantonname:
+        list_cantonname.extend(['Bünden'])
+    if 'Bern' in list_cantonname:   # check how this works!!
+        list_cantonname.extend(['Berne'])
+    list_cantonabbr = list(df_temp['CantonAbbreviation'])
+    list_citizenship = list(df_temp['Citizenship'])
+    list_citizenship = get_cities(list_citizenship)
+    list_firstname = list(df_temp['FirstName'])
+
+    return list_cantonname, list_cantonabbr, list_citizenship, list_firstname
 
-
-
-
+# tokenizer: keeps words (including hyphenated ones) and dollar amounts;
+# the commented-out variant ending in |\S+ would also keep the colon
+tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+')
@@ -724,11 +797,6 @@ def exclude_overlaps(dict_text, dict_overlaps):
 
     return dict_text
 
-# tokenizer
-tokenizer_canton = RegexpTokenizer(r'\w+') # only leaves words
-#tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+|\S+')
-# last part \S+ is needed to get colon, \S stands for white space
-tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+')
 
 
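A quick check of the relocated tokenizer above: the '-\w+' alternation keeps hyphenated double names as a single token, while punctuation such as the colon is dropped (the commented-out variant ending in |\S+ would keep it). The sample line is invented:

from nltk.tokenize import RegexpTokenizer

# same pattern as the module-level tokenizer above
tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*|\$[\d\.]+')
print(tokenizer.tokenize('Meier-Müller, Berichterstatter: Ich beantrage Zustimmung.'))
# -> ['Meier-Müller', 'Berichterstatter', 'Ich', 'beantrage', 'Zustimmung']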
@@ -766,13 +834,6 @@ def get_df_from_lists_names(lists_names):
 
 
 
-
-
-
-
-
-
-
 def tokenize_dictionary(dictionary, tokenizer, only_text=False):
     dictionary_tokenized = {}
     # if there is only text, e.g. when we look at all texts of a document at once (level 2 in flattened dictionary)
@@ -815,18 +876,3 @@ def dict_only_text(dictionary):
         dictionary_only_text[key] = tupel[1]
 
     return dictionary_only_text
-
-
-
-
-# function to get list of places
-def get_list_cantons(df_names):
-
-    df_temp = df_names.loc[df_names['type']=='canton']
-    list_cantonname = list(df_temp['CantonName'])
-    list_cantonabbr = list(df_temp['CantonAbbreviation'])
-    list_citizenship = list(df_temp['Citizenship'])
-    list_citizenship = [city[:-5] for item in list_citizenship for city in item.split(',')]
-    list_firstname = list(df_temp['FirstName'])
-
-    return list_cantonname + list_cantonabbr + list_citizenship + list_firstname
diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh
index dbec0daf596094cf95f50dad868b250320eab8b9..e769de17cdb4db4b746d143be02e283bcf0e9e50 100755
--- a/src/sh/extract_discussions_yearly.sh
+++ b/src/sh/extract_discussions_yearly.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 
 year_start=1891
-year_end=1893
+year_end=1891
 
 for year in $(seq $year_start $year_end)
 do
     echo $year
-    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/AB/${year}/05_annotatedxml.tar.gz
+    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/lists/not_names.txt data/AB/${year}/05_annotatedxml.tar.gz
 done