diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index 32c85bf240e4bfce0b82ba578541110b8a1198fa..495b23ceb7f8b38e2849c5d16119e76cf177d48f 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -4,8 +4,8 @@ # Code to extract discussions from corrected XML files #%% # to work with atom -#%load_ext autoreload -#%autoreload 2 +%load_ext autoreload +%autoreload 2 import pickle import time @@ -26,11 +26,13 @@ from utils_proc import call_with_out # specify input and output files # needed for running in atom, can be ignored -input_lastnames = "data/politicians/lastnames/1893_lastnames.pickle" -input_correctedxml = "data/AB/1893/04_correctedxml.tar.gz" -input_correctedmeta = "data/AB/1893/03_correctedmeta.tar.gz" -output_annotatedxml = "data/AB/1893/05_annotatedxml.tar.gz" +year = '1891' +input_lastnames = "data/politicians/lastnames/" + year + "_lastnames.pickle" +input_correctedxml = "data/AB/" + year + "/04_correctedxml.tar.gz" +input_correctedmeta = "data/AB/" + year + "/03_correctedmeta.tar.gz" +output_annotatedxml = "data/AB/" + year + "/05_annotatedxml.tar.gz" +#%% # detect arguments input_lastnames = sys.argv[1] input_correctedxml = sys.argv[2] @@ -55,7 +57,7 @@ for lfsfile in [input_correctedxml, input_correctedmeta, input_rawmeta]: #print(command) call_with_out(command) - +#%% # TODO: exclude overlaps --> after annotation @@ -93,6 +95,7 @@ for file_tarpath in files_to_process: file_doc.annotate_xml() # Commands to get the compressed version of the file +# (compressed file is around 5 times smaller than uncompressed file) #data/AB/${year}/05_annotatedxml.tar.gz utils_proc.compress_tar(output_annotatedxml) diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index 8f8fc03144ac406791d549efaa2098c7a97cf472..e41d934ac7914234ab6fae1f425123999971c413 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -49,9 +49,17 @@ def check_if_discussion(path_meta_xml_file, return True - +# helper function to get text without font information +# example for font information: [font face="11.718" size="Times-Roman"] sometext [/font] +# input: +# - sometext: string +# output: +# - newtext: modified string def get_text(sometext): + # initialize newtext = '' + + # find text between font information for text in re.findall('\].*?\[',sometext): #print(text) if text.startswith(']') and text.endswith('['): @@ -59,6 +67,7 @@ def get_text(sometext): #print(newtext) return newtext + # function to annotated corrected XML def get_annotated_xml(XML_root, df_lastnames, bln_print=False): @@ -112,9 +121,6 @@ def get_annotated_xml(XML_root, df_lastnames, bln_print=False): if textbox_texttype in ['text_col1', 'text_col2']: - if bln_print: - print(complete_text) - XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False) if this_is_speech: prev_is_speech = True @@ -146,7 +152,6 @@ def get_textbox_type(textbox): # for every textline in that textbox for ind_tl, textline in enumerate(textbox): if textline.tag == 'textline': -# print(textline.tag, textline.attrib) # count types if textline.attrib['type'] not in dict_type.keys(): @@ -154,16 +159,17 @@ def get_textbox_type(textbox): else: dict_type[textline.attrib['type']] += 1 -# print(dict_type) # list of all types with maximum count list_types = [type for type, count in dict_type.items() if count == max(dict_type.values())] -# print(list_types) + # if only one with maximum value if len(list_types) == 1: textbox_type = list_types[0] + # if several with same maximum value else: textbox_type = 'notdistinct' + return textbox_type @@ -227,6 +233,9 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ list_uniqueID = [] str_canton = '' + # font text end + fontend = '[/font]' + # very consistently, a speaker can be identified by looking for a colon # at the beginning of a textbox and identifiying a name or a role in front # of that colon @@ -286,31 +295,69 @@ def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_ # add attribute speech_start to textbox XML_new[ind_p][ind_t].attrib['text_type'] = 'speech_start' - # add speaker to first textline + # add speaker as attribute to first textline XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, list_uniqueID, str_canton) - # TODO: split speaker from text (check on which line and split that line accordingly) - # TODO account for splitting of [font ...] ... [/font] + # update text of XML (speaker is on first line, actual speech start on second line of speech_start textbox) + # if colon is on first line if ind_tl_colon == 0: + # get text of that line and colon index thattext = XML_new[ind_p][ind_t][0].text colon_index = thattext.index(':') + + # get last font information of thattext + fontstart = re.findall('\[font.*?\]', thattext)[-1] + try: - XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] - XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:] + ' ' + XML_new[ind_p][ind_t][1].text + # write speaker to first line + XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + fontend + + # get start of speech with correct font start + if thattext[colon_index+1:].startswith('[font'): + startspeech = thattext[colon_index+1:] + elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]): + startspeech = '' + else: + startspeech = fontstart + thattext[colon_index+1:] + + # write beginning of speech to second line + # (create new ET element if necessary) + if len(list(XML_new[ind_p][ind_t])) > 1: + XML_new[ind_p][ind_t][1].text = startspeech + ' ' + XML_new[ind_p][ind_t][1].text + else: + XML_new[ind_p][ind_t].append(copy.deepcopy(XML_new[ind_p][ind_t][0])) + XML_new[ind_p][ind_t][1].attrib.pop('speaker') + XML_new[ind_p][ind_t][1].text = startspeech except: print('error in self.input_file when splitting speaker') + #print(thattext) + #print(len(list(XML_new[ind_p][ind_t]))) + #print(list(XML_new[ind_p][ind_t])) + #print(XML_new[ind_p][ind_t]) + #print('gefundener Name:', str_name, str_role) pass + + # if colon is on second line if ind_tl_colon == 1: + # get text of that line and colon index thattext = XML_new[ind_p][ind_t][1].text colon_index = thattext.index(':') - XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1] - XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:] - + # get last font information of thattext + fontstart = re.findall('\[font.*?\]', thattext)[-1] + # get start of speech with correct font start + if thattext[colon_index+1:].startswith('[font'): + startspeech = thattext[colon_index+1:] + elif re.match('^[ ]?\[/font\]', thattext[colon_index+1:]): + startspeech = '' + else: + startspeech = fontstart + thattext[colon_index+1:] -# dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text], -# text[colon_index_text+1:]) + # write speaker to first line + XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1] + fontend + # write beginning of speech to second line + XML_new[ind_p][ind_t][1].text = startspeech # set flag this_is_speech = True @@ -488,8 +535,8 @@ def find_names(term, str_name, str_role, list_uniqueID, df_names, str_canton, bl 'Bundesrat', 'Bundesrath', 'Bundesrätin', 'conseiller fédéral', 'Vizepräsident'] - list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'pas', 'Gallen', 'StGallen', - 'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter'] + list_notnames = ['Art', 'Rath', 'Alinea', 'Stimmen', 'Stimme', 'Hans', 'Walter', 'Werner', 'projet', 'stimmt', 'nicht', 'vote', 'Gallen', 'StGallen', + 'Kasse', 'fasse', 'Sitten', 'Herren', 'Herr', 'Alter', 'Biffer', 'biffer', 'Rédiger', 'rédiger', 'Wer', 'Fällen'] list_places = get_list_cantons(df_names)