diff --git a/data/lists/not_names.txt b/data/lists/not_names.txt
index a3d5eaf6a98727e2531b45f0b4d6a6a274fad5e4..be0ee8d508e6fbc8e0b8c943e8507bd2fa4dafc6 100644
--- a/data/lists/not_names.txt
+++ b/data/lists/not_names.txt
@@ -39,6 +39,7 @@ Masse
 Minister
 neben
 nehmen
+nen
 neu
 nicht
 Rath
diff --git a/src/python/def_classes.py b/src/python/def_classes.py
index 4e55da1317e3683ec3799e015085be48a3b57f55..f8e07833543f350fe3f1a5877f63986f0a898038 100644
--- a/src/python/def_classes.py
+++ b/src/python/def_classes.py
@@ -736,10 +736,7 @@ class Document:
     # - dict_overlaps: dictionary with overlaps
     # output:
     # - (first_entry, last_entry): tuple of first and last textbox id
-    def get_first_last_textbox(self, dict_overlaps):
-
-        # get yearly dictionary
-        dict_overlaps_year = dict_overlaps[self.year]
+    def get_first_last_textbox(self, dict_overlaps_year):
 
         # initialize to impossible values
         first_entry = -1
@@ -794,33 +791,36 @@ class Document:
         self.get_council_date()
 
         # get start and end of document
-        path_data = '/home/lili/NLP_DemocraSci/data_from_nlp-democracy/results_overlap/'
-        with open (path_data + 'dict_overlaps.pickle', 'rb') as f:
-            dict_overlaps = pickle.load(f)
-        self.entries = self.get_first_last_textbox(dict_overlaps)
-        print(self.entries)
+        entries = self.get_first_last_textbox(self.dict_overlaps_year)
+
+        # update if document starts/ends as on pdf
+        if entries[0] == -1:
+            entries = (0, entries[1])
+        last_page = len(self.XML_main_corr)
+        if entries[1] == 1000:
+            entries = (entries[0], len([textbox for textbox in self.XML_main_corr[last_page-1] if textbox.tag == 'textbox']) - 1)
+        self.entries = entries
 
         # file to track speakers
-        self.name_speakers = '_'.join((str(self.year), self.id_doc, 'speakers.txt'))
-        path_speakers = 'data/lists/speakers/'
-        if not os.path.exists(path_speakers):
-            os.makedirs(path_speakers)
-        with open(path_speakers + self.name_speakers, 'w') as f:
+        self.name_speakers = '_'.join((self.id_doc, 'speakers.txt'))
+        name_txt = './' + str(self.year) + '/' + self.name_speakers
+        if not os.path.exists('./' + str(self.year)):
+            os.makedirs('./' + str(self.year))
+        with open(name_txt, 'w') as f:
             f.write(' '.join((str(self.year), self.id_doc, self.str_date, '\n')))
 
         #utils_proc.tar_extractfile(self.name_xml_corr[1], self.folder_database, name_file = self.name_outcorrxml)
-        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, self.name_speakers, self.entries, bln_print=False)
+        XML_main_annot = utils_annot.get_annotated_xml(self.XML_main_corr, self.df_lastnames, self.list_notnames, self.str_council, self.str_date, name_txt, self.entries, bln_print=False)
         self.XML_main_annot = XML_main_annot
 
         # save xml file
         name_xml = './' + str(self.year) + '/' + self.name_wo_ext + '.xml'
         tree = ET.ElementTree(XML_main_annot)
-        if not os.path.exists('./' + str(self.year)):
-            os.makedirs('./' + str(self.year))
         tree.write(name_xml, encoding = 'utf-8')
 
         if flag_save:
             name_tar = utils_proc.addto_tar(name_xml, self.folder_database, name_file = name_outannotxml)
+            name_tar = utils_proc.addto_tar(name_txt, self.folder_database, name_file = name_outannotxml)
         else:
             print('Not saving to tar')
             name_tar = self.folder_database + '/' + str(self.year) + '/' + name_outannotxml + '.tar.gz'
@@ -833,8 +833,3 @@ class Document:
             utils_proc.call_with_out(command)
 
         print("End of file %s - %s seconds -" % (self.input_file, (time.time() - start_time)))
-
-
-        command = 'rm -rf ./' + str(self.year)
-        #print(command)
-#        utils_proc.call_with_out(command)
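For reviewers skimming the def_classes.py hunks above: the new sentinel handling in annotate_xml() reduces to the following standalone sketch. The helper name resolve_entries is hypothetical; the -1/1000 sentinels and the fallback logic are copied from the diff, and XML_main_corr is assumed to be an XML tree whose pages contain textbox children.

```python
# Hypothetical helper mirroring the fallback added in annotate_xml():
# sentinel values returned by get_first_last_textbox() mean "no overlap
# found", so the document is assumed to span the whole PDF.
def resolve_entries(entries, XML_main_corr):
    if entries[0] == -1:
        # document starts on the first textbox of the first page
        entries = (0, entries[1])
    if entries[1] == 1000:
        # document ends on the last textbox of the last page
        last_page = XML_main_corr[len(XML_main_corr) - 1]
        n_textboxes = len([tb for tb in last_page if tb.tag == 'textbox'])
        entries = (entries[0], n_textboxes - 1)
    return entries
```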
diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py
index 812e0ff5ee2419f17bcd58fbb2868f64ae4fb1c5..b0b56828cf9f14d7df7f83610e11ddd024f4db41 100644
--- a/src/python/run_extract_discussions.py
+++ b/src/python/run_extract_discussions.py
@@ -31,6 +31,7 @@ input_lastnames = "data/politicians/lastnames/" + year + "_MPs.pickle"
 input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz"
 input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz"
 input_notnames = "data/lists/not_names.txt"
+input_overlaps = "data/lists/dict_overlaps.pickle"
 output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz"
 
 #%%
@@ -39,7 +40,8 @@ input_lastnames = sys.argv[1]
 input_correctedxml = sys.argv[2]
 input_correctedmeta = sys.argv[3]
 input_notnames = sys.argv[4]
-output_annotatedxml = sys.argv[5]
+input_overlaps = sys.argv[5]
+output_annotatedxml = sys.argv[6]
 
 #%%
 # extract suffixes, year, folder_database
@@ -75,13 +77,23 @@ print('files to process loaded:', files_to_process)
 # (there is one file of lastnames per year)
 with open(input_lastnames, 'rb') as f:
     df_lastnames = pickle.load(f)
+df_lastnames = df_lastnames.fillna('')
 print('dataframe with lastnames loaded')
 
+# open list of terms that are easily mistaken as names
 with open(input_notnames) as f:
     list_notnames = f.readlines()
 
 list_notnames = [term.rstrip() for term in list_notnames]
+print('list of notnames loaded')
+
+# open dictionary of overlaps from pickle file
+with open(input_overlaps, 'rb') as f:
+    dict_overlaps = pickle.load(f)
+dict_overlaps_year = dict_overlaps[int(year)]
+print('dictionary of overlaps loaded')
+
 
 #%%
 # for each file
 
@@ -98,6 +110,7 @@ for file_tarpath in files_to_process:
         print(id_doc + '\n')
         file_doc.df_lastnames = df_lastnames
         file_doc.list_notnames = list_notnames
+        file_doc.dict_overlaps_year = dict_overlaps_year
 
         file_doc.annotate_xml()
 # Commands to get the compressegid version of the file
@@ -115,7 +128,7 @@ with open(input_notnames) as f:
 list_notnames = [term.rstrip() for term in list_notnames]
 
 # to test for one file
-file_tarpath = './1936/20031982_datacorr.xml'
+file_tarpath = './1936/20031998_datacorr.xml'
 
 id_doc = file_tarpath.split('/')[-1][:8]
 
@@ -129,6 +142,7 @@ if (file_doc.check_discussion()) and (id_doc not in ['20032463', '20032952', '20
 
     file_doc.df_lastnames = df_lastnames
     file_doc.list_notnames = list_notnames
+    file_doc.dict_overlaps_year = dict_overlaps_year
 
     file_doc.annotate_xml()
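With this change the script takes six positional arguments, with the overlaps pickle in slot five. A sketch of the resulting invocation, with paths taken from the defaults above and an illustrative year:

```python
import subprocess

year = '1891'  # illustrative; any processed year works
subprocess.run([
    'python', 'src/python/run_extract_discussions.py',
    'data/politicians/lastnames/' + year + '_MPs.pickle',  # sys.argv[1]
    'data/AB/' + year + '/04_correctedxml.tar.gz',         # sys.argv[2]
    'data/AB/' + year + '/03_correctedmeta.tar.gz',        # sys.argv[3]
    'data/lists/not_names.txt',                            # sys.argv[4]
    'data/lists/dict_overlaps.pickle',                     # sys.argv[5], new
    'data/AB/' + year + '/05_annotatedxml.tar.gz',         # sys.argv[6]
], check=True)
```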
diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py
index a4cf53ff2146fbc49add9c4fb28657e71c39c3cc..7d8ce8053992f159caae552c4d84bed90bcae5af 100644
--- a/src/python/utils_annot.py
+++ b/src/python/utils_annot.py
@@ -84,7 +84,7 @@ def get_text(sometext):
 
 
 # function to annotated corrected XML
-def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, str_file_speakers, entries, bln_print=False):
+def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_date, name_txt, entries, bln_print=False):
 
     # list of votation terms
     list_votationterms = ['Abstimmung', 'Schlussabstimmung', 'Generalabstimmung', 'Angenommen', 'Abgelehnt',
@@ -100,7 +100,7 @@ def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_da
     # create new XML as a copy of the corrected one
     XML_new = copy.deepcopy(XML_root)
 
-    last_page = len(XML_root)
+    ind_last_page = len(XML_root) - 1
 
     # initialize flags to distinguish speeches from votes
     this_is_speech = False
@@ -109,20 +109,22 @@ def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_da
 
     # for every page
     for ind_p, page in enumerate(XML_root):
-        print('page index:', ind_p)
         if bln_print:
             print(page.tag, page.attrib)
 
         # for every textbox on that page
         for ind_t, textbox in enumerate(page):
 
+            # specify start and end of document
             try:
                 if ind_p == 0 and entries[0] == int(textbox.attrib['id']):
-                    with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+                    XML_new = label_docstartend(XML_new, ind_p, ind_t, 'doc_start')
+                    with open(name_txt, 'a') as f:
                         f.write(' '.join(('<<<=====================', 'the document starts here', '\n\n')))
-                if ind_p == last_page - 1 and entries[1] == int(textbox.attrib['id']):
-                    with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+                if ind_p == ind_last_page and entries[1] == int(textbox.attrib['id']):
+                    XML_new = label_docstartend(XML_new, ind_p, ind_t, 'doc_end')
+                    with open(name_txt, 'a') as f:
                         f.write(' '.join(('=====================>>>', 'the document ends here', '\n\n')))
             except KeyError:
                 pass
 
@@ -150,11 +152,11 @@ def get_annotated_xml(XML_root, df_lastnames, list_notnames, str_council, str_da
 
             if textbox_texttype in ['text_col1', 'text_col2']:
 
-                XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, str_council, str_date, str_file_speakers, bln_print=False)
+                XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, list_notnames, str_council, str_date, name_txt, bln_print=False)
                 if this_is_speech:
                     prev_is_speech = True
                     continue
-                XML_new, this_is_vote = label_votations(XML_new, ind_p, ind_t, complete_text, list_votationterms, str_file_speakers, bln_print=False)
+                XML_new, this_is_vote = label_votations(XML_new, ind_p, ind_t, complete_text, list_votationterms, name_txt, bln_print=False)
                 if this_is_vote:
                     prev_is_speech = False
                     continue
@@ -249,7 +251,7 @@ def get_complete_text(textbox):
 # - bln_print: whether to print during execution, default False
 # output:
 # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
-def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, str_council, str_date, str_file_speakers, bln_print=False):
+def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, list_notnames, str_council, str_date, name_txt, bln_print=False):
 
     # lists of roles
     list_roles = ['Präsident', 'Präsidentin', 'Vizepräsident', 'Präsidium', 'Président', 'Présidente', 'président', 'présidente',
@@ -332,7 +334,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
                 print('get rid of Für den Antrag <Name> stimmen: Votent pour la proposition <Name>:', list_oi)
             str_name = ''
 
-        with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+        with open(name_txt, 'a') as f:
             f.write(' '.join(('page', str(ind_p + 1), str(list_oi), '\n')))
             f.write(' '.join(('name:', str_name, '\n')))
             f.write(' '.join(('role:', str_role, '\n')))
@@ -424,7 +426,7 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_
 # - bln_print: whether to print during execution, default False
 # output:
 # - XML_new: updated
-def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, str_file_speakers, bln_print=True):
+def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, name_txt, bln_print=True):
 
     # get first terms of that text
     list_oi = tokenizer.tokenize(text)[:15]
@@ -437,7 +439,7 @@ def label_votations(XML_new, ind_p, ind_t, text, list_votationterms, str_file_sp
         # set flag
         this_is_vote = True
 
-        with open('data/lists/speakers/' + str_file_speakers, 'a') as f:
+        with open(name_txt, 'a') as f:
             f.write(' '.join(('page', str(ind_p + 1), text, '\n')))
             f.write(' '.join(('is a vote', '\n\n')))
 
@@ -461,6 +463,17 @@ def label_speechcont(XML_new, ind_p, ind_t):
 
     return XML_new
 
+
+# function to label start and end of document
+# only adds label to corresponding textbox
+# type_ is either 'doc_start' or 'doc_end'
+def label_docstartend(XML_new, ind_p, ind_t, type_):
+
+    XML_new[ind_p][ind_t].attrib[type_] = 'here'
+
+    return XML_new
+
+
 # helper function to flatten nested irregular list
 def flatten(l):
     for el in l:
@@ -695,6 +708,7 @@ def find_names(list_oi, list_roles, list_roles_ext, df_names, list_notnames, str
 
             else:
+                print(canton_type, str_canton, str_name, df_temp)
                 list_temp = list(df_temp.loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[:, df_temp.columns.get_loc('uniqueIndex')])
                 str_completeName = df_temp['completeName'].loc[(df_temp['shortName']==str_name) & (df_temp[canton_type]==str_canton)].iloc[0]
 
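The new label_docstartend helper only sets a sentinel attribute, so downstream consumers can recover the document span with a simple scan. The following reader is hypothetical (not part of this change) and assumes the page/textbox nesting used throughout the diff:

```python
# Hypothetical consumer of the doc_start/doc_end markers written by
# label_docstartend(): each marker is an attribute set to 'here' on
# exactly one textbox element.
def find_doc_span(XML_root):
    start, end = None, None
    for ind_p, page in enumerate(XML_root):
        for ind_t, textbox in enumerate(page):
            if textbox.attrib.get('doc_start') == 'here':
                start = (ind_p, ind_t)
            if textbox.attrib.get('doc_end') == 'here':
                end = (ind_p, ind_t)
    return start, end
```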
diff --git a/src/sh/extract_discussions_yearly.sh b/src/sh/extract_discussions_yearly.sh
index 9495dc345a9e7425d8adfc2de7e9bd62206e128a..290554ff2d4182ddffd659afb3e34f39f3dd9be0 100755
--- a/src/sh/extract_discussions_yearly.sh
+++ b/src/sh/extract_discussions_yearly.sh
@@ -3,9 +3,17 @@
 year_start=1891
 year_end=1891
 
 for year in $(seq $year_start $year_end)
 do
     echo $year
+
+    input_lastnames=data/politicians/lastnames/${year}_lastnames.pickle
+    input_correctedxml=data/AB/${year}/04_correctedxml.tar.gz
+    input_correctedmeta=data/AB/${year}/03_correctedmeta.tar.gz
+    input_notnames=data/lists/not_names.txt
+    input_overlaps=data/lists/dict_overlaps.pickle
+    output_annotatedxml=data/AB/${year}/05_annotatedxml.tar.gz
+
     # renku run --isolation -
-    python src/python/run_extract_discussions.py data/politicians/lastnames/${year}_lastnames.pickle data/AB/${year}/04_correctedxml.tar.gz data/AB/${year}/03_correctedmeta.tar.gz data/lists/not_names.txt data/AB/${year}/05_annotatedxml.tar.gz
+    python src/python/run_extract_discussions.py $input_lastnames $input_correctedxml $input_correctedmeta $input_notnames $input_overlaps $output_annotatedxml
done
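A note on the new data dependency: the only structural fact this diff establishes about data/lists/dict_overlaps.pickle is that it is a pickled dict keyed by integer year (see dict_overlaps[int(year)] in run_extract_discussions.py); the shape of each per-year value is not visible here. A minimal loading sketch under that assumption:

```python
import pickle

# assumed: top-level keys are integer years; the per-year payload is
# whatever get_first_last_textbox() expects (not shown in this diff)
with open('data/lists/dict_overlaps.pickle', 'rb') as f:
    dict_overlaps = pickle.load(f)

dict_overlaps_year = dict_overlaps[1891]  # e.g. for the 1891 run
```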