diff --git a/data/lists/not_names.txt b/data/lists/not_names.txt
index cd2fce5572016679ab58dc253b5314bdcbca7265..886d368ba157da380457bf44d024fe02d19450f9 100644
--- a/data/lists/not_names.txt
+++ b/data/lists/not_names.txt
@@ -12,6 +12,7 @@ Herr
 Kasse
 nicht
 Rath
+Schrit
 Seite
 selber
 Steuer
diff --git a/data/lists/wrongly_identified_speakers.txt b/data/lists/wrongly_identified_speakers.txt
index 2c33e4078424c9470845645b17210d3830e96cd9..11b584bb751ae251dcf2cc4b6c15ecf4440757bd 100644
--- a/data/lists/wrongly_identified_speakers.txt
+++ b/data/lists/wrongly_identified_speakers.txt
@@ -2,6 +2,7 @@ also check for council:
 -----------------------
 1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified but there is only one SR Baumann
+1925/20029937: Schneider, Berichterstatter --> NR, not SR
 
 one MP not active in whole year, leads to other not uniquely identified
 
@@ -14,7 +15,13 @@ one MP not active in whole year, leads to other not uniquely identified
 
 identified as speech start but is in text:
 ------------------------------------------
+do I really need to look at the first two lines? maybe one is sufficient?
+look for typical terms such as gestellt, gesagt, etc.
+
+1891/20026455: Ein weiterer Antrag ist derjenige des Hrn. Dufour. Herr Dufour hat den Antrag gestellt:
+1891/20026465: Wir haben nun aber, dem Gedankengang des Hrn. Bühler folgend, die Erklärung gewählt:
 1891/20026489: Um jeden Zweifel zu heben, beantrage ich Ihnen folgende Redaktion des Art. 2 :
+1894/20026607: Müller gegenüber drei anderen durchgedrungen, welche lautete:
 1925/20029903: Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt:
 1925/20029863: ganz gut gehen. Es ist ja wahr, was Herr Brügger gesagt hat: --> finds Lanz and Brügger
 1925/20029917: Mögen Sie nun aber denken wie Herr Oberst Brügger oder mögen Sie denken wie ich: --> identified as speech start but is in text
@@ -23,6 +30,18 @@ identified as speech start but is in text:
 1925/20029981: Brügger möchte ich sagen: --> identified as speech start but is in text
 
 
+wrongly spelled city
+--------------------
+1925/20029963: Jenny Ennend (instead of Ennenda)
+1925/20029995,96: Keller Zurich (instead of Zürich)
+
+
+
+Appenzeller
+-----------
+1894/20026597: Sonderegger
+1894/20026618: Sonderegger
+
 some other persons wrongly identified as MP
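
A minimal sketch of the heuristic hinted at in the note above (look only at the first line of a candidate speech start and treat it as quoted text when it is a reporting sentence). The function name and the term list are illustrative assumptions, not part of the existing pipeline:

# hypothetical helper, not part of the pipeline
REPORTING_TERMS = ('gestellt', 'gesagt', 'gewählt', 'lautete', 'beantrage')

def looks_like_quoted_text(first_line):
    # the examples above end in a colon and contain a reporting verb,
    # e.g. "... folgende Kleine Anfrage gestellt:"
    line = first_line.strip()
    return line.endswith(':') and any(t in line.lower() for t in REPORTING_TERMS)

# 1925/20029903 from the list above
print(looks_like_quoted_text('Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt:'))  # True

Whether checking a single line is enough (the question in the note) then comes down to how often the reporting verb only appears on the second line.
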
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index d7fd239333411cce94dc709d9155f1c96863c42e..2864c981f394c9a348da2f2803c66b39a3e4dac0 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -4,8 +4,8 @@
 # Code to extract discussions from corrected XML files
 #%%
 # to work with atom
-#%load_ext autoreload
-#%autoreload 2
+%load_ext autoreload
+%autoreload 2
 
 import pickle
 import time
@@ -26,7 +26,7 @@ from utils_proc import call_with_out
 # specify input and output files
 
 # needed for running in atom, can be ignored
-year = '1925'
+year = '1894'
 input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
@@ -111,20 +111,21 @@ utils_proc.compress_tar(output_annotatedxml)
 
 
 #%%
-## to test for one file
-#file_tarpath = './1893/20026526_datacorr.xml'
-#
-#id_doc = file_tarpath.split('/')[-1][:8]
-#
-## instantiate document object (always from original pdf)
-#infile_aux = year + '/' + id_doc + '.pdf'
-#file_doc = defc.Document(infile_aux, folder_database)
-#
-#if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
-    #print(id_doc + '\n')
-#
-    #file_doc.df_lastnames = df_lastnames
-    #file_doc.annotate_xml()
+# to test for one file
+file_tarpath = './1925/20029981_datacorr.xml'
+
+id_doc = file_tarpath.split('/')[-1][:8]
+
+# instantiate document object (always from original pdf)
+infile_aux = year + '/' + id_doc + '.pdf'
+file_doc = defc.Document(infile_aux, folder_database)
+
+if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
+    print(id_doc + '\n')
+
+    file_doc.df_lastnames = df_lastnames
+    file_doc.list_notnames = list_notnames
+    file_doc.annotate_xml()
 
 #%%
@@ -134,3 +135,16 @@ utils_proc.compress_tar(output_annotatedxml)
 #id_doc
 #len(files_to_process)
 
+list_bla = [1, 2,3]
+list_bla.extend([4, 5])
+list_bla
+
+if 3 in [1,3, 4]:
+    print('yes')
+
+
+#%%
+
+str_name = 'Blumer' # (CANTON MISSING)'
+
+print(str_name.split(' '))
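
The %load_ext autoreload / %autoreload 2 lines uncommented above are IPython magics: they work in Atom/Hydrogen cells but raise a SyntaxError when the file is run as a plain Python script. A small sketch of a guard that keeps the magics in interactive use and degrades gracefully otherwise (assuming IPython is installed in the interactive environment):

# enable autoreload only when an IPython kernel is active
try:
    from IPython import get_ipython
    _ipython = get_ipython()
    if _ipython is not None:
        _ipython.run_line_magic('load_ext', 'autoreload')
        _ipython.run_line_magic('autoreload', '2')
except ImportError:
    pass  # plain Python: skip autoreload

This keeps the "to work with atom" convenience without breaking batch runs of the script.
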
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index f9148b8da008cf3b26d3ce19570949c51ff4bfcb..8be49c7f76fa69ea0469ec99b70efa42f54b72d7 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -522,7 +522,7 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton,
                   'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral', 'Vizepräsident']
 
-    list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names)
+    list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
 
     # extract list and array of last names
     list_all_names = list(df_names['name_short'])
@@ -537,7 +537,21 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton,
         if bln_print:
             print('now is about: ------', term)
 
-        if name_type == 'canton':
+        if term in list_roles:
+            # get correct name and uniqueID, or role, for that term
+            str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
+
+            if bln_print:
+                print('found a role', term)
+
+            # TODO: also look for similar terms (misspellings)
+
+        elif term in list_roles_ext:
+            pass
+            # TODO: extract whether it is minority or majority and save that information
+
+        elif name_type == 'canton':
+            list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
             canton_type = ''
             if term in list_cantonname:
                 str_canton = term
@@ -559,20 +573,34 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton,
                 else:
                     print('might be a canton', term, list_oi, str_name, str_role)
 
+            # if a canton or similar was found
             if canton_type:
                 # get rid of CANTON MISSING
                 str_name = str_name.split(' ')[0]
 
                 # extract uniqueID
-#                list_temp = []
-                list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+                # if Citizenship, do proper comparison on tokenized entries
+                if canton_type == 'Citizenship':
+                    df_temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name)]
+                    list_citizenship = [citizenship for citizenship in df_temp[canton_type] if str_canton in tokenizer_canton.tokenize(citizenship)]
+                    if len(list_citizenship) == 1:
+                        str_citizenship = list_citizenship[0]
+                    else:
+                        # zero or several matches: report it and fall back to an empty string so the lookup below finds no row
+                        print('found no or more than one person with citizenship', str_canton, str_name)
+                        str_citizenship = ''
+
+                    list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+
+                else:
+                    list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
 
                 print(list_temp, list_uniqueID)
                 list_uniqueID = list_temp
 
         # if term is not easily mistaken as a name (avoid false positives)
-        if term not in list_notnames:
+        elif term not in list_notnames:
 
-            # if term is in the list of all names and roles
-            if term in (list_all_names + list_roles):
+            # if term is in the list of all names
+            if term in list_all_names:
                 # get correct name and uniqueID, or role, for that term
                 str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
@@ -841,9 +869,18 @@ def dict_only_text(dictionary):
 
 
 # function to get list of places
-def get_list_cantons(df_names):
-    df_temp = df_names.loc[df_names['type']=='canton']
+def get_list_cantons(df_names, str_name = ''):
+    if str_name:
+        df_temp = df_names.loc[(df_names['type']=='canton') & (df_names['name_short']==str_name)]
+    else:
+        df_temp = df_names.loc[df_names['type']=='canton']
+    print(df_temp)
     list_cantonname = list(df_temp['CantonName'])
+    for canton in ['Basel-Stadt', 'Basel-Landschaft']:
+        if canton in list_cantonname:
+            list_cantonname.extend(['Basel'])
+    if 'Graubünden' in list_cantonname:
+        list_cantonname.extend(['Bünden'])
     list_cantonabbr = list(df_temp['CantonAbbreviation'])
     list_citizenship = list(df_temp['Citizenship'])
     list_citizenship = [city[:-5] for item in list_citizenship for city in item.split(',')]
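
The Citizenship branch added above compares tokens via tokenizer_canton, whose definition is not part of this diff. A standalone sketch of that comparison with a stand-in tokenizer; the entry format 'Ort (XX)' is inferred from the city[:-5] stripping in get_list_cantons, and both the tokenizer pattern and the sample data are assumptions:

import re

# stand-in for tokenizer_canton, which is defined elsewhere in utils_annot.py
def tokenize_canton(text):
    # split a citizenship entry like 'Ennenda (GL)' into ['Ennenda', 'GL']
    return re.findall(r'[^\s(),]+', text)

citizenship_entries = ['Ennenda (GL)', 'Zürich (ZH)']
str_canton = 'Ennenda'

# comparing whole tokens avoids the substring false positives that a plain
# 'str_canton in entry' check would produce for short place names
matches = [entry for entry in citizenship_entries if str_canton in tokenize_canton(entry)]
print(matches)  # ['Ennenda (GL)']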