diff --git a/src/python/run_extract_discussions.py b/src/python/run_extract_discussions.py index ba1cbd8c101db6dea6a6d967c1c5db84bbf07007..434c10d10f13f6f29be0d09073c44becd61b7bb0 100644 --- a/src/python/run_extract_discussions.py +++ b/src/python/run_extract_discussions.py @@ -86,7 +86,7 @@ for file_tarpath in files_to_process[66:]: # Commands to get the compressed version of the file -#data/AB/${year}/02_extractedxml.tar.gz +#data/AB/${year}/05_annotatedxml.tar.gz utils_proc.compress_tar(output_annotatedxml) diff --git a/src/python/utils_annot.py b/src/python/utils_annot.py index 2d9310c9929e64998b0c5835ed6eddd1bafb1de6..a330b3fcd9cb5c8fa09a0f6c8231d3b8c966b558 100644 --- a/src/python/utils_annot.py +++ b/src/python/utils_annot.py @@ -94,7 +94,7 @@ def get_annotated_xml(XML_root, df_lastnames): print(textbox.tag, textbox.attrib) # get complete text of that textbox - complete_text = get_complete_text(textbox) + complete_text, ind_tl_colon = get_complete_text(textbox) # identify and label language in XML dict_lang = identify_language(complete_text) @@ -109,7 +109,7 @@ def get_annotated_xml(XML_root, df_lastnames): print(complete_text) - XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, df_lastnames, list_stopwords, bln_print=False) + XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False) if this_is_speech: prev_is_speech = True # print('stopped after finding speech start') @@ -166,6 +166,7 @@ def get_textbox_type(textbox): # - textbox # output: # - complete_text: string +# - ind_tl_colon: index of textline with colon (needed for label speech start) def get_complete_text(textbox): # helper function to get text without font information @@ -177,16 +178,28 @@ def get_complete_text(textbox): newtext += text[1:-1] #print(newtext) return newtext + # initialize empty string complete_text = '' + # initialize index of textline colon to impossible value + 
ind_tl_colon = -1 + # for every textline in that textbox for ind_tl, textline in enumerate(textbox): if textline.tag == 'textline': - # append text to string - complete_text += get_text(textline.text) + # get that text + thattext = get_text(textline.text) + + # append that text to string + complete_text += thattext - return complete_text + # in the first three textlines of the textbox (ind_tl < 3), check for a colon + if ind_tl < 3: + if ':' in thattext: + ind_tl_colon = ind_tl + + return complete_text, ind_tl_colon # function to label speech starts @@ -197,7 +210,7 @@ def get_complete_text(textbox): # - bln_print: whether to print during execution, default False # output: # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID -def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln_print=False): +def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, bln_print=False): # initialize flag this_is_speech = False @@ -211,7 +224,8 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln # very consistently, a speaker can be identified by looking for a colon # at the beginning of a textbox and identifiying a name or a role in front # of that colon - if ':' in text[:100]: + if ind_tl_colon >= 0: +# if ':' in text[:100]: # extract the index of the colon in the text colon_index_text = text.index(':') @@ -264,7 +278,27 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln # add speaker to first textline XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, list_uniqueID, str_canton) + # TODO: split speaker from text (check on which line and split that line accordingly) + # TODO account for splitting of [font ...] ... 
[/font] + if ind_tl_colon == 0: + thattext = XML_new[ind_p][ind_t][0].text + colon_index = thattext.index(':') + try: + XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1] + XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:] + ' ' + XML_new[ind_p][ind_t][1].text + except: + print('error in self.input_file when splitting speaker') + pass + if ind_tl_colon == 1: + thattext = XML_new[ind_p][ind_t][1].text + colon_index = thattext.index(':') + XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1] + XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:] + + + + # dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text], # text[colon_index_text+1:])