Skip to content
Snippets Groups Projects
Commit 67da49e8 authored by Lili Gasser
Browse files

read from txt can read w/ and w/o separator

parent c4f479e7
No related branches found
No related tags found
No related merge requests found
source diff could not be displayed: it is too large. Options to address this: view the blob.
......@@ -18,18 +18,18 @@ def get_spacyformat_for_sner_output(text):
elif isinstance(text, str):
title = None
sometext = text
# generate sner output as a list
list_tags_all = tagger.get_entities(sometext)
# initialize dictionary
somedict = {}
# specify text and title
somedict['text'] = sometext
if isinstance(title, str):
somedict['title'] = title
# generate list of entities with corresponding start and end indices
list_ents = []
remtext = sometext
......@@ -41,15 +41,15 @@ def get_spacyformat_for_sner_output(text):
overall_end_index = overall_start_index + len(ent[0])
#print(ent, remtext[start_index:end_index], sometext[overall_start_index:overall_end_index])
remtext = remtext[end_index+1:]
if ent[1] != 'O':
list_ents.append({'start': overall_start_index, 'end': overall_end_index, 'label': ent[1]})
#print(ent)
overall_start_index += len(ent[0]) + 1
# add that to dictionary
somedict['ents'] = list_ents
return somedict
def write_to_txt(alldicts, filename):
......@@ -61,12 +61,28 @@ def write_to_txt(alldicts, filename):
f.close()
def read_from_txt(filename):
f = open(filename,'r')
data=f.read()
f.close()
# read in data
with open(filename,'r') as f:
data = f.read()
# initialize dictionary
alldicts = {}
for sent in data.split('\n\n.\n\n'):
#print(sent)
# check if formatted with separator '\n\n.\n\n'
separator = '\n\n.\n\n'
bln_separator = True
if not separator in data:
separator = "},\n \'"
bln_separator = False
# for each sentence
for sent in data.split(separator):
# if there is no separator, some additional formatting is needed
if not bln_separator:
colon_index = sent.index(':')
sent = sent[colon_index + 1:] + '}'
# extract dictionary for that sentence and add it to alldicts
try:
somedict = eval(sent)
key = somedict['title'].split('/')[-1]
......@@ -74,6 +90,7 @@ def read_from_txt(filename):
except SyntaxError:
#print('SyntaxError')
pass
return alldicts
def render_dict(alldicts, language):
......@@ -86,7 +103,7 @@ def render_dict(alldicts, language):
def transform_to_training_format(alldicts):
def get_entitities_in_training_format(list_ents):
list_ents_train = []
......@@ -105,7 +122,7 @@ def transform_to_training_format(alldicts):
list_ents_train.append(tpl_ent)
return list_ents_train
train_data = []
for speaker, somedict in alldicts.items():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment