diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 02fdbed765592503a953faa48cafcfe773539096..618f7b2cd2320889e25d4eb1336e1adafc7808f8 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -27,7 +27,7 @@ from utils_proc import call_with_out
 
 # needed for running in atom, can be ignored
 year = '1971'
-input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle"
+input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
 input_notnames = "data/lists/not_names.txt"
@@ -113,7 +113,7 @@ utils_proc.compress_tar(output_annotatedxml)
 
 #%%
 # to test for one file
-file_tarpath = './1971/20000323_datacorr.xml'
+file_tarpath = './1971/20000010_datacorr.xml'
 
 id_doc = file_tarpath.split('/')[-1][:8]
 
@@ -132,7 +132,6 @@ if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20
 #%%
 
 
-
 #id_doc
 
 #len(files_to_process)
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index b19c06404c47e65559ddeeb273fa0bbd271a38fd..56f14ce92d7e7fff0f55f814c96febf60b7228b0 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -419,78 +419,100 @@ def flatten(l):
 # function to find names
 # input:
 # - term: term that might be name
+# - df_names: yearly dataframe with all MPs
 # - str_name: string to which name should be attached
-# - str_role: string to which role should be attached
 # - list_uniqueID: list with one or several uniqueIDs
 # - list_tupels: list of tupels containing all types of names
 # TODO: double names in extra column: string gets written twice, uniqueID is fine (1893, 20026532, Cramer)
 def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln_print=False):
 
-    def get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton):
-        name_type = ''
-        # if it is one of the simple names
-        if term in list(df_names['name_short'].loc[df_names['type']=='simple']):
-            str_name = add_to_string(str_name, term)
-            name_type = 'simple'
-        # if it is a role
-        elif term in list_roles:
-            str_role = add_to_string(str_role, term)
-        # if it is a double name
-        elif term in list(df_names['name_short'].loc[df_names['type']=='double']):
-            if bln_print:
-                print(5*'\n', 'DOUBLE NAME')
-            # get correct name
-            correct_name = df_names.loc[(df_names['type']=='double') & (df_names['name_short']== term)].iat[0, df_names.columns.get_loc('name_correct')]
-            if bln_print:
-                print('double name', correct_name)
-            # only add name if it is not there yet
-            # if a person is referenced by its complete double name, e.g. Meier-Müller, he or she gets two entries
-            if correct_name not in str_name.split(' '):
-                str_name = add_to_string(str_name, correct_name)
-            name_type = 'double'
-        # if it is a composite name
-        elif term in list(df_names['name_short'].loc[df_names['type']=='comp']):
-            # get correct name
-            correct_name = df_names.loc[(df_names['type']=='comp') & (df_names['name_short']== term)].iat[0, df_names.columns.get_loc('name_correct')]
-            if bln_print:
-                print('composite name', correct_name)
-            str_name = add_to_string(str_name, correct_name)
-            name_type = 'comp'
-        # if it contains a canton
-        # TODO: how to handle for people mentioned in text???
-        elif term in list(df_names['name_short'].loc[df_names['type']=='canton']):
-            if bln_print:
-                print('contains a canton', term)
-
-            str_name = add_to_string(str_name, term + ' (CANTON MISSING)')
-            name_type = 'canton'
-
+    def get_string(term, df_names, str_name, list_uniqueID):
+        # get name type
+        name_type = df_names['nameType'].loc[df_names['shortName']==term].iloc[0]
+        print(df_names[df_names['shortName']==term])
+        print(term)
+        print(name_type)
 
         # extract uniqueID
         list_temp = []
+        # TODO: might produce duplicated uniqueID entries for double names
         if name_type in ['simple', 'double', 'comp']:
-            list_temp = [df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]]
+            list_temp = [df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]]
+            str_completeName = df_names['completeName'].loc[df_names['shortName']==term].iloc[0]
+            str_name = add_to_string(str_name, str_completeName)
+
+        # TODO: how to handle people who are only mentioned in the text?
         elif name_type in ['canton']:
-            list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+            list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+            str_name = add_to_string(str_name, term + ' (CANTON MISSING)')
+
+        print(list_temp)
+        print(str_name)
+
+        ## if it is one of the simple names
+        #if term in list(df_names['shortName'].loc[df_names['nameType']=='simple']):
+            #str_name = add_to_string(str_name, term)
+            #name_type = 'simple'
+        ## if it is a double name
+        #elif term in list(df_names['shortName'].loc[df_names['nameType']=='double']):
+            #if bln_print:
+                #print(5*'\n', 'DOUBLE NAME')
+            ## get correct name
+            #correct_name = df_names.loc[(df_names['nameType']=='double') & (df_names['shortName']== term)].iat[0, df_names.columns.get_loc('name_correct')]
+            #if bln_print:
+                #print('double name', correct_name)
+            ## only add name if it is not there yet
+            ## if a person is referenced by its complete double name, e.g. Meier-Müller, he or she gets two entries
+            #if correct_name not in str_name.split(' '):
+                #str_name = add_to_string(str_name, correct_name)
+            #name_type = 'double'
+        ## if it is a composite name
+        #elif term in list(df_names['shortName'].loc[df_names['nameType']=='comp']):
+            ## get correct name
+            #correct_name = df_names.loc[(df_names['nameType']=='comp') & (df_names['shortName']== term)].iat[0, df_names.columns.get_loc('name_correct')]
+            #if bln_print:
+                #print('composite name', correct_name)
+            #str_name = add_to_string(str_name, correct_name)
+            #name_type = 'comp'
+        ## if it contains a canton
+        ## TODO: how to handle for people mentioned in text???
+        #elif term in list(df_names['shortName'].loc[df_names['nameType']=='canton']):
+            #if bln_print:
+                #print('contains a canton', term)
+#
+            #str_name = add_to_string(str_name, term + ' (CANTON MISSING)')
+            #name_type = 'canton'
+#
+#
+        ## extract uniqueID
+        #list_temp = []
+        #if name_type in ['simple', 'double', 'comp']:
+            #list_temp = [df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iat[0, df_names.columns.get_loc('uniqueIndex')]]
+        #elif name_type in ['canton']:
+            #list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==term)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
 
 
         if len(list_temp) > 0:
             if bln_print:
                 print(list_temp, list_uniqueID)
                 print(type(list_temp), type(list_uniqueID))
-                print(isinstance(list_uniqueID, list))
 
-            list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
+            list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
+        else:
+            print('is this even possible??')
 
-        return str_name, str_role, list_uniqueID, name_type
+        return str_name, list_uniqueID, name_type
 
-    def update_list_uniqueID(list_uniqueID, list_temp):
+    def update_list_uniqueID(list_uniqueID, list_temp, name_type):
         # if no unique ID has been assigned so far
         if len(list_uniqueID) == 0:
             list_uniqueID = list_temp
         # if there are already one or several people and have a new person, we update
         elif len(list_uniqueID) > 0 and len(set(list_temp).intersection(set(flatten(list_uniqueID)))) == 0:
             list_uniqueID.append(list_temp)
+        # for canton names: if the single candidate is already known, narrow the list to it
+        if name_type == 'canton' and len(list_temp) == 1 and list_temp[0] in list_uniqueID:
+            list_uniqueID = list_temp
 
         return list_uniqueID
 
@@ -549,8 +571,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
     name_type = ''
 
     # extract list and array of last names
-    list_all_names = list(df_names['name_short'])
-    array_all_names = np.array(df_names['name_short'])
+    list_all_names = list(df_names['shortName'])
+    array_all_names = np.array(df_names['shortName'])
 
     # for every term
     for term in list_oi:
@@ -559,20 +581,19 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
             print('now is about: ------', term)
 
         if term in list_roles:
-            # get correct name and uniqueID, or role, for that term
-            str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
+            # update str_role
+            # TODO: also look for similar terms (misspellings)
+            # TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter
+            str_role = add_to_string(str_role, term)
 
             if bln_print:
                 print('found a role', term)
 
-            # TODO: also look for similar terms (misspellings)
-            # TODO: assign role in English, e.g. Berichterstatter and rapporteur become reporter
-
         elif term in list_roles_ext:
             pass
             # TODO: extract whether it is minority or majority and save that information
 
-        # cannot happen for the first term
+        # cannot happen for the first term in list_oi
         elif name_type == 'canton':
             list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
             canton_type = ''
@@ -603,9 +624,8 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
                 # extract uniqueID
                 # if Citizenship, do proper comparison
                 if canton_type == 'Citizenship':
-                    df_temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name)]
+                    df_temp = df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name)]
                     list_cities = [entry for entry in df_temp[canton_type] if str_canton in get_cities([entry])]
-                    print(list_cities)
                     str_citizenship = ''
                     try:
                         if len(list_cities) == 1:
@@ -614,14 +634,18 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
                         print('found no or more than one person with citizenship', str_canton, str_name)
                         pass
 
-                    list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+                    list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+                    str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[0]
+                    str_name = add_to_string(str_name, str_completeName)
 
                 else:
-                    list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+                    list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+                    str_completeName = df_names['completeName'].loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name) & (df_names[canton_type]==str_canton)].iloc[0]
+                    str_name = add_to_string(str_name, str_completeName)
                 print(list_temp, list_uniqueID)
 
                 if len(list_temp) > 0:
-                    list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp)
+                    list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
 
         # if term is not easily mistaken as a name (avoid false positives)
         elif term not in list_notnames:
@@ -629,7 +653,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
             # if term is in the list of all names
             if term in list_all_names:
                 # get correct name and uniqueID, or role, for that term
-                str_name, str_role, list_uniqueID, name_type = get_string(term, df_names, str_name, str_role, list_uniqueID, str_canton)
+                str_name, list_uniqueID, name_type = get_string(term, df_names, str_name, list_uniqueID)
 
                 if bln_print:
                     print('=== correct name', term)
@@ -641,7 +665,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, bln
 
                 # if one was found, get correct name, etc.
                 if term_approx:
-                    str_name, str_role, list_uniqueID, name_type = get_string(term_approx, df_names, str_name, str_role, list_uniqueID, str_canton)
+                    str_name, list_uniqueID, name_type = get_string(term_approx, df_names, str_name, list_uniqueID)
                     if bln_print:
                         print('=== approximate name', str_name, term_approx)
 
@@ -722,10 +746,10 @@ def get_cities(list_citizenship):
 # function to get list of places
 def get_list_cantons(df_names, str_name = ''):
     if str_name:
-        df_temp = df_names.loc[(df_names['type']=='canton') & (df_names['name_short']==str_name)]
+        df_temp = df_names.loc[(df_names['nameType']=='canton') & (df_names['shortName']==str_name)]
     else:
-        df_temp = df_names.loc[df_names['type']=='canton']
-    print(df_temp)
+        df_temp = df_names.loc[df_names['nameType']=='canton']
+    #print(df_temp)
     list_cantonname = list(df_temp['CantonName'])
     for canton in ['Basel-Stadt', 'Basel-Landschaft']:
         if canton in list_cantonname:
@@ -744,7 +768,6 @@ def get_list_cantons(df_names, str_name = ''):
 
 
 # tokenizer
-# last part \S+ is needed to get colon, \S stands for white space
 tokenizer = RegexpTokenizer(r'\w+(?:-/w+)*|\$[\d\.]+')
 
 
@@ -815,25 +838,6 @@ def get_last_item(list_tupels):
 
 
 
-
-# function to get data frame from lists of names
-# input:
-# - lists_names: lists of names (simple, double, comp, canton)
-# output:
-# - df: corresponding dataframe
-def get_df_from_lists_names(lists_names):
-    list_types = ['simple', 'double', 'comp', 'canton']
-    df = pd.DataFrame()
-    for i in range(4):
-        df_temp = pd.DataFrame(lists_names[i],
-                columns = ('name_short', 'name_correct', 'uniqueIndex', 'CantonName', 'CantonAbbreviation', 'Citizenship', 'FirstName'))
-        df_temp['type'] = list_types[i]
-        df = pd.concat([df, df_temp], ignore_index = True)
-    return df
-
-
-
-
 def tokenize_dictionary(dictionary, tokenizer, only_text=False):
     dictionary_tokenized = {}
     # if there is only text, e.g. when we look at all texts of a document at once (level 2 in flattened dictionary)