Skip to content
Snippets Groups Projects
Commit 59b1071d authored by Lili Gasser's avatar Lili Gasser
Browse files

WIP trying to split speaker from text

parent ae40dd41
No related branches found
No related tags found
No related merge requests found
...@@ -86,7 +86,7 @@ for file_tarpath in files_to_process[66:]: ...@@ -86,7 +86,7 @@ for file_tarpath in files_to_process[66:]:
# Commands to get the compressed version of the file # Commands to get the compressed version of the file
#data/AB/${year}/02_extractedxml.tar.gz #data/AB/${year}/05_annotatedxml.tar.gz
utils_proc.compress_tar(output_annotatedxml) utils_proc.compress_tar(output_annotatedxml)
......
...@@ -94,7 +94,7 @@ def get_annotated_xml(XML_root, df_lastnames): ...@@ -94,7 +94,7 @@ def get_annotated_xml(XML_root, df_lastnames):
print(textbox.tag, textbox.attrib) print(textbox.tag, textbox.attrib)
# get complete text of that textbox # get complete text of that textbox
complete_text = get_complete_text(textbox) complete_text, ind_tl_colon = get_complete_text(textbox)
# identify and label language in XML # identify and label language in XML
dict_lang = identify_language(complete_text) dict_lang = identify_language(complete_text)
...@@ -109,7 +109,7 @@ def get_annotated_xml(XML_root, df_lastnames): ...@@ -109,7 +109,7 @@ def get_annotated_xml(XML_root, df_lastnames):
print(complete_text) print(complete_text)
XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, df_lastnames, list_stopwords, bln_print=False) XML_new, this_is_speech = label_speechstart(XML_new, ind_p, ind_t, complete_text, ind_tl_colon, df_lastnames, list_stopwords, bln_print=False)
if this_is_speech: if this_is_speech:
prev_is_speech = True prev_is_speech = True
# print('stopped after finding speech start') # print('stopped after finding speech start')
...@@ -166,6 +166,7 @@ def get_textbox_type(textbox): ...@@ -166,6 +166,7 @@ def get_textbox_type(textbox):
# - textbox # - textbox
# output: # output:
# - complete_text: string # - complete_text: string
# - ind_tl_colon: index of textline with colon (needed for label speech start)
def get_complete_text(textbox): def get_complete_text(textbox):
# helper function to get text without font information # helper function to get text without font information
...@@ -177,16 +178,28 @@ def get_complete_text(textbox): ...@@ -177,16 +178,28 @@ def get_complete_text(textbox):
newtext += text[1:-1] newtext += text[1:-1]
#print(newtext) #print(newtext)
return newtext return newtext
# initialize empty string # initialize empty string
complete_text = '' complete_text = ''
# initialize index of textline colon to impossible value
ind_tl_colon = -1
# for every textline in that textbox # for every textline in that textbox
for ind_tl, textline in enumerate(textbox): for ind_tl, textline in enumerate(textbox):
if textline.tag == 'textline': if textline.tag == 'textline':
# append text to string # get that text
complete_text += get_text(textline.text) thattext = get_text(textline.text)
# append that text to string
complete_text += thattext
return complete_text # in first three textlines of textbox (ind_tl < 3), check for colon
if ind_tl < 3:
if ':' in thattext:
ind_tl_colon = ind_tl
return complete_text, ind_tl_colon
# function to label speech starts # function to label speech starts
...@@ -197,7 +210,7 @@ def get_complete_text(textbox): ...@@ -197,7 +210,7 @@ def get_complete_text(textbox):
# - bln_print: whether to print during execution, default False # - bln_print: whether to print during execution, default False
# output: # output:
# - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID # - (str_name, str_role, list_uniqueID, str_canton): tuple with strings and ID
def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln_print=False): def label_speechstart(XML_new, ind_p, ind_t, text, ind_tl_colon, df_names, list_stopwords, bln_print=False):
# initialize flag # initialize flag
this_is_speech = False this_is_speech = False
...@@ -211,7 +224,8 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln ...@@ -211,7 +224,8 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln
# very consistently, a speaker can be identified by looking for a colon # very consistently, a speaker can be identified by looking for a colon
# at the beginning of a textbox and identifying a name or a role in front # at the beginning of a textbox and identifying a name or a role in front
# of that colon # of that colon
if ':' in text[:100]: if ind_tl_colon >= 0:
# if ':' in text[:100]:
# extract the index of the colon in the text # extract the index of the colon in the text
colon_index_text = text.index(':') colon_index_text = text.index(':')
...@@ -264,7 +278,27 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln ...@@ -264,7 +278,27 @@ def label_speechstart(XML_new, ind_p, ind_t, text, df_names, list_stopwords, bln
# add speaker to first textline # add speaker to first textline
XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, list_uniqueID, str_canton) XML_new[ind_p][ind_t][0].attrib['speaker'] = (str_name, str_role, list_uniqueID, str_canton)
# TODO: split speaker from text (check on which line and split that line accordingly) # TODO: split speaker from text (check on which line and split that line accordingly)
# TODO account for splitting of [font ...] ... [/font]
if ind_tl_colon == 0:
thattext = XML_new[ind_p][ind_t][0].text
colon_index = thattext.index(':')
try:
XML_new[ind_p][ind_t][0].text = thattext[:colon_index+1]
XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:] + ' ' + XML_new[ind_p][ind_t][1].text
except:
print('error in self.input_file when splitting speaker')
pass
if ind_tl_colon == 1:
thattext = XML_new[ind_p][ind_t][1].text
colon_index = thattext.index(':')
XML_new[ind_p][ind_t][0].text = XML_new[ind_p][ind_t][0].text + ' ' + thattext[:colon_index+1]
XML_new[ind_p][ind_t][1].text = thattext[colon_index+1:]
# dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text], # dict_discussionstarts[(key, str_name, str_role, int_uniqueID)] = (text[:colon_index_text],
# text[colon_index_text+1:]) # text[colon_index_text+1:])
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment