diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 03355186e36e6c17bca2cb0cfa94da83253d56e3..6ee0909de8db4fe8a185c45145d80cb8a76838c4 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -762,7 +762,7 @@ class Document:
                 #name_outcorrxml = self.name_outcorrxml)
 
             with open('data/lists/notunique.txt', 'a') as f:
-                f.write(' '.join((str(self.year), self.id_doc, '\n')))
+                f.write(' '.join(('\n\n-------', str(self.year), self.id_doc, '\n')))
 
             print('we have a main corr XML file')
             #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 0bb90c94bdaa6425eb85826f40cccdf34d336a9f..e8a827d9f3ca5022cc9dc613054c0902698f9755 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -26,7 +26,7 @@ from utils_proc import call_with_out
 # specify input and output files
 
 # needed for running in atom, can be ignored
-year = '1925'
+year = '1971'
 input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
@@ -115,8 +115,13 @@ utils_proc.compress_tar(output_annotatedxml)
 
 #%%
 
+with open(input_notnames) as f:
+    list_notnames = f.readlines()
+
+list_notnames = [term.rstrip() for term in list_notnames]
+
 # to test for one file
-file_tarpath = './1925/20029981_datacorr.xml'
+file_tarpath = './1971/20000726f_datacorr.xml'
 
 id_doc = file_tarpath.split('/')[-1][:8]
 
@@ -148,3 +153,8 @@ file_doc.check_discussion()
 str_date = '1925-12-09 08:00'
 import datetime
 datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M')
+
+
+
+listilist = ['a', 'b', 'c', 'd']
+listilist[0,2]
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index 525eeb8288f2d24b8f9a2e0c6a4c4f9114d13faf..ceedfc9b8d8c44d20972e5a6d29267f66efa0bb6 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -249,6 +249,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
 
     # initialize flag
     this_is_speech = False
+    flag_print = False
 
     # font text end
     fontend = '[/font]'
@@ -264,10 +265,11 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
 
         text_start = re.sub(r'[\(\)]','',text[:colon_index_text])
         list_oi = tokenizer.tokenize(text_start)
-        if len(list_oi) > 5:
+        if len(list_oi) > 4:
             with open('data/lists/notunique.txt', 'a') as f:
-                f.write(' '.join((str(list_oi), '\n')))
-
+                f.write(' '.join((str(list_oi), str(len(list_oi)), '\n')))
+            flag_print = True
+
         if bln_print:
             print('possible speech start: ', list_oi)
 
@@ -286,17 +288,22 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
 
         # remove single characters
         # TODO: might need to be changed for fractions (some fractions are abbreviated as single letters)
+        # TODO: needs to be changed to include 'I' for Minderheit I 1891/20000093
         # TODO: maybe exclude I and A to account for Appenzell
         list_oi = [term for term in list_oi if len(term)>1]
 
+        if len(list_oi) > 4 or flag_print:
+            with open('data/lists/notunique.txt', 'a') as f:
+                f.write(' '.join((str(list_oi), str(len(list_oi)), '\n')))
+
         # if possible, find a name from the list
         str_name, str_role, list_uniqueID, str_canton = find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str_council, str_date, bln_print=False)
 
         if bln_print:
             print('name', str_name, 'role', str_role)
-        if len(list_uniqueID) > 1:
+        if len(list_uniqueID) > 1 or flag_print:
             with open('data/lists/notunique.txt', 'a') as f:
-                f.write(' '.join((str_name, str_role, str(list_uniqueID), str_date, str(ind_p), '\n')))
+                f.write(' '.join((str_name, str_role, str(list_uniqueID), str_date, str(ind_p), str(list_oi), '\n')))
 
         # get rid of 'Präsident stimmt nicht Président ne vote pas'
         if set(str_role.split()).intersection(set(['Präsident', 'Präsidentin', 'Président', 'Présidente'])) and not str_name:
@@ -613,25 +620,31 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
 
                     # get list of cities
                     list_cities = [entry for entry in df_temp[canton_type] if str_canton in get_cities([entry])]
                     str_citizenship = ''
-                    try:
-                        if len(list_cities) == 1:
-                            str_citizenship = list_cities[0]
-                    except:
-                        print('found no or more than one person with citizenship', str_canton, str_name)
-                        pass
+                    if len(list_cities) == 1:
+                        str_citizenship = list_cities[0]
+                        list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
+                        str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[0]
+                    elif len(list_cities) > 1:
+                        print('found more than one person with citizenship', str_canton, str_name, list_cities)
+                        # TODO what happens with these:?
+                        list_temp = list(df_names.loc[(df_names['nameType']==name_type) & (df_names['shortName']==str_name)].iloc[:, df_names.columns.get_loc('uniqueIndex')])
+                        str_completeName = str_name + ' (CANTON MISSING)'
+                    else:
+                        print('found no person with citizenship', str_canton, str_name, list_cities)
 
-                    list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
-                    str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_citizenship)].iloc[0]
                 else:
                     list_temp = list(df_temp.loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
                     str_completeName = df_temp['completeName'].loc[(df_temp['nameType']==name_type) & (df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0]
 
-                print(list_temp, list_uniqueID)
+                print(list_temp, list_uniqueID, str_completeName)
 
                 if len(list_temp) > 0:
                     list_uniqueID = update_list_uniqueID(list_uniqueID, list_temp, name_type)
-                    if str_completeName.split(' ')[0] == str_name:
+                    print(str_completeName)
+                    if 'CANTON MISSING' in str_completeName:
+                        str_name = add_to_string('', str_completeName)
+                    elif str_completeName.split(' ')[0] == str_name:
                         str_name = add_to_string('', str_completeName)
                     else:
                         str_name = add_to_string(str_name, str_completeName)