Skip to content
Snippets Groups Projects
Commit 933b4bab authored by Lili Gasser
Browse files

improve recognizing speakers by citizenship

parent 9f1c770e
No related branches found
No related tags found
No related merge requests found
...@@ -12,6 +12,7 @@ Herr ...@@ -12,6 +12,7 @@ Herr
Kasse Kasse
nicht nicht
Rath Rath
Schrit
Seite Seite
selber selber
Steuer Steuer
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
also check for council: also check for council:
----------------------- -----------------------
1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified but there is only one SR Baumann 1925/20029870 and several more: Baumann, Berichterstatter --> not uniquely identified but there is only one SR Baumann
1925/20029937: Schneider, Berichterstatter --> NR, not SR
one MP not active in whole year, leads to other not uniquely identified one MP not active in whole year, leads to other not uniquely identified
...@@ -14,7 +15,13 @@ one MP not active in whole year, leads to other not uniquely identified ...@@ -14,7 +15,13 @@ one MP not active in whole year, leads to other not uniquely identified
identified as speech start but is in text: identified as speech start but is in text:
------------------------------------------ ------------------------------------------
do I really need to look at the first two lines? maybe one is sufficient?
look for typical terms such as gestellt, gesagt, etc.
1891/20026455: Ein weiterer Antrag ist derjenige des Hrn. Dufour. Herr Dufour hat den Antrag gestellt:
1891/20026465: Wir haben nun aber, dem Gedankengang des Hrn. Bühler folgend, die Erklärung gewählt:
1891/20026489: Um jeden Zweifel zu heben, beantrage ich Ihnen folgende Redaktion des Art. 2 : 1891/20026489: Um jeden Zweifel zu heben, beantrage ich Ihnen folgende Redaktion des Art. 2 :
1894/20026607: Müller gegenüber drei anderen durchgedrungen, welche lautete:
1925/20029903: Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt: 1925/20029903: Nun hat Herr Nationalrat Huber schon am 5. De-zember 1924 folgende Kleine Anfrage gestellt:
1925/20029863: ganz gut gehen. Es ist ja wahr, was Herr Brügger gesagt hat: --> finds Lanz and Brügger 1925/20029863: ganz gut gehen. Es ist ja wahr, was Herr Brügger gesagt hat: --> finds Lanz and Brügger
1925/20029917: Mögen Sie nun aber denken wie Herr Oberst Brügger oder mögen Sie denken wie ich: --> identified as speech start but is in text 1925/20029917: Mögen Sie nun aber denken wie Herr Oberst Brügger oder mögen Sie denken wie ich: --> identified as speech start but is in text
...@@ -23,6 +30,18 @@ identified as speech start but is in text: ...@@ -23,6 +30,18 @@ identified as speech start but is in text:
1925/20029981: Brügger möchte ich sagen: --> identified as speech start but is in text 1925/20029981: Brügger möchte ich sagen: --> identified as speech start but is in text
wrongly spelled city
--------------------
1925/20029963: Jenny Ennend (instead of Ennenda)
1925/20029995,96: Keller Zurich (instead of Zürich)
Appenzeller
-----------
1894/20026597: Sonderegger
1894/20026618: Sonderegger
some other persons wrongly identified as MP some other persons wrongly identified as MP
......
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
# Code to extract discussions from corrected XML files # Code to extract discussions from corrected XML files
#%% #%%
# to work with atom # to work with atom
#%load_ext autoreload %load_ext autoreload
#%autoreload 2 %autoreload 2
import pickle import pickle
import time import time
...@@ -26,7 +26,7 @@ from utils_proc import call_with_out ...@@ -26,7 +26,7 @@ from utils_proc import call_with_out
# specify input and output files # specify input and output files
# needed for running in atom, can be ignored # needed for running in atom, can be ignored
year = '1925' year = '1894'
input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle" input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle"
input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz" input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz" input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
...@@ -111,20 +111,21 @@ utils_proc.compress_tar(output_annotatedxml) ...@@ -111,20 +111,21 @@ utils_proc.compress_tar(output_annotatedxml)
#%% #%%
## to test for one file # to test for one file
#file_tarpath = './1893/20026526_datacorr.xml' file_tarpath = './1925/20029981_datacorr.xml'
#
#id_doc = file_tarpath.split('/')[-1][:8] id_doc = file_tarpath.split('/')[-1][:8]
#
## instantiate document object (always from original pdf) # instantiate document object (always from original pdf)
#infile_aux = year + '/' + id_doc + '.pdf' infile_aux = year + '/' + id_doc + '.pdf'
#file_doc = defc.Document(infile_aux, folder_database) file_doc = defc.Document(infile_aux, folder_database)
#
#if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']): if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20014332']):
#print(id_doc + '\n') print(id_doc + '\n')
#
#file_doc.df_lastnames = df_lastnames file_doc.df_lastnames = df_lastnames
#file_doc.annotate_xml() file_doc.list_notnames = list_notnames
file_doc.annotate_xml()
#%% #%%
...@@ -134,3 +135,16 @@ utils_proc.compress_tar(output_annotatedxml) ...@@ -134,3 +135,16 @@ utils_proc.compress_tar(output_annotatedxml)
#id_doc #id_doc
#len(files_to_process) #len(files_to_process)
list_bla = [1, 2,3]
list_bla.extend([4, 5])
list_bla
if 3 in [1,3, 4]:
print('yes')
#%%
str_name = 'Blumer' # (CANTON MISSING)'
print(str_name.split(' '))
...@@ -522,7 +522,7 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, ...@@ -522,7 +522,7 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton,
'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral', 'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral',
'Vizepräsident'] 'Vizepräsident']
list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names) list_roles_ext = ['Mehrheit', 'Minderheit', 'majorité', 'minorité', 'deutscher', 'deutsche', 'français', 'française', 'Kommission', 'commission']
# extract list and array of last names # extract list and array of last names
list_all_names = list(df_names['name_short']) list_all_names = list(df_names['name_short'])
...@@ -537,7 +537,21 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, ...@@ -537,7 +537,21 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton,
if bln_print: if bln_print:
print('now is about: ------', term) print('now is about: ------', term)
if name_type == 'canton': if term in list_roles:
# get correct name and uniqueID, or role, for that term
str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
if bln_print:
print('found a role', term)
# TODO: also look for similar terms (misspellings)
elif term in list_roles_ext:
pass
# TODO: extract whether it is minority or majority and save that information
elif name_type == 'canton':
list_cantonname, list_cantonabbr, list_citizenship, list_firstname = get_list_cantons(df_names, str_name.split(' ')[0])
canton_type = '' canton_type = ''
if term in list_cantonname: if term in list_cantonname:
str_canton = term str_canton = term
...@@ -559,20 +573,34 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton, ...@@ -559,20 +573,34 @@ def find_names(list_oi, str_name, str_role, list_uniqueID, df_names, str_canton,
else: else:
print('might be a canton', term, list_oi, str_name, str_role) print('might be a canton', term, list_oi, str_name, str_role)
# if a canton or similar was found
if canton_type: if canton_type:
# get rid of CANTON MISSING # get rid of CANTON MISSING
str_name = str_name.split(' ')[0] str_name = str_name.split(' ')[0]
# extract uniqueID # extract uniqueID
# list_temp = [] # if Citizenship, do proper comparison
list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')]) if canton_type == 'Citizenship':
df_temp = df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name)]
list_citizenship = [term for term in df_temp[canton_type] if str_canton in tokenizer_canton.tokenize(term)]
try:
if len(list_citizenship) == 1:
str_citizenship = list_citizenship[0]
except:
print('found no or more than one person with citizenship', str_canton, str_name)
pass
list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_citizenship)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
else:
list_temp = list(df_names.loc[(df_names['type']==name_type) & (df_names['name_short']==str_name) & (df_names[canton_type]==str_canton)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
print(list_temp, list_uniqueID) print(list_temp, list_uniqueID)
list_uniqueID = list_temp list_uniqueID = list_temp
# if term is not easily mistaken as a name (avoid false positives) # if term is not easily mistaken as a name (avoid false positives)
if term not in list_notnames: elif term not in list_notnames:
# if term is in the list of all names and roles # if term is in the list of all names
if term in (list_all_names + list_roles): if term in list_all_names:
# get correct name and uniqueID, or role, for that term # get correct name and uniqueID, or role, for that term
str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton) str_name, str_role, list_uniqueID, name_type = get_string(term, str_name, str_role, list_uniqueID, str_canton)
...@@ -841,9 +869,18 @@ def dict_only_text(dictionary): ...@@ -841,9 +869,18 @@ def dict_only_text(dictionary):
# function to get list of places # function to get list of places
def get_list_cantons(df_names): def get_list_cantons(df_names, str_name = ''):
df_temp = df_names.loc[df_names['type']=='canton'] if str_name:
df_temp = df_names.loc[(df_names['type']=='canton') & (df_names['name_short']==str_name)]
else:
df_temp = df_names.loc[df_names['type']=='canton']
print(df_temp)
list_cantonname = list(df_temp['CantonName']) list_cantonname = list(df_temp['CantonName'])
for canton in ['Basel-Stadt', 'Basel-Landschaft']:
if canton in list_cantonname:
list_cantonname.extend(['Basel'])
if 'Graubünden' in list_cantonname:
list_cantonname.extend(['Bünden'])
list_cantonabbr = list(df_temp['CantonAbbreviation']) list_cantonabbr = list(df_temp['CantonAbbreviation'])
list_citizenship = list(df_temp['Citizenship']) list_citizenship = list(df_temp['Citizenship'])
list_citizenship = [city[:-5] for item in list_citizenship for city in item.split(',')] list_citizenship = [city[:-5] for item in list_citizenship for city in item.split(',')]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment