read_from_txt can read files with and without the separator

67da49e8 · Lili Gasser · c4f479e7 · 67da49e8 · 67da49e8
Commit 67da49e8 authored 6 years ago by Lili Gasser
--- a/notebooks/NER_train_first-attempt.ipynb
+++ b/notebooks/NER_train_first-attempt.ipynb
--- a/src/python/utils_ner.py
+++ b/src/python/utils_ner.py
@@ -18,18 +18,18 @@ def get_spacyformat_for_sner_output(text):
    elif isinstance(text, str):
        title = None
        sometext = text
-    
+
    # generate sner output as a list
    list_tags_all = tagger.get_entities(sometext)
-    
+
    # initialize dictionary
    somedict = {}
-    
+
    # specify text and title
    somedict['text'] = sometext
    if isinstance(title, str):
        somedict['title'] = title
-    
+
    # generate list of entities with corresponding start and end indices
    list_ents = []
    remtext = sometext
@@ -41,15 +41,15 @@ def get_spacyformat_for_sner_output(text):
        overall_end_index = overall_start_index + len(ent[0])
        #print(ent, remtext[start_index:end_index], sometext[overall_start_index:overall_end_index])
        remtext = remtext[end_index+1:]
-    
+
        if ent[1] != 'O':
            list_ents.append({'start': overall_start_index, 'end': overall_end_index, 'label': ent[1]})
            #print(ent)
        overall_start_index += len(ent[0]) + 1
-    
+
    # add that to dictionary
    somedict['ents'] = list_ents
-    
+
    return somedict

 def write_to_txt(alldicts, filename):
@@ -61,12 +61,28 @@ def write_to_txt(alldicts, filename):
    f.close()

 def read_from_txt(filename):
-    f = open(filename,'r')
-    data=f.read()
-    f.close()
+    # read in data
+    with open(filename,'r') as f:
+        data = f.read()
+
+    # initialize dictionary
    alldicts = {}
-    for sent in data.split('\n\n.\n\n'):
-        #print(sent)
+
+    # check if formatted with separator '\n\n.\n\n'
+    separator = '\n\n.\n\n'
+    bln_separator = True
+    if not separator in data:
+        separator = "},\n \'"
+        bln_separator = False
+
+    # for each sentence
+    for sent in data.split(separator):
+        # if there is no separator, some additional formatting is needed
+        if not bln_separator:
+            colon_index = sent.index(':')
+            sent = sent[colon_index + 1:] + '}'
+
+        # extract dictionary for that sentence and add it to alldicts
        try:
            somedict = eval(sent)
            key = somedict['title'].split('/')[-1]
@@ -74,6 +90,7 @@ def read_from_txt(filename):
        except SyntaxError:
            #print('SyntaxError')
            pass
+
    return alldicts

 def render_dict(alldicts, language):
@@ -86,7 +103,7 @@ def render_dict(alldicts, language):


 def transform_to_training_format(alldicts):
-    
+
    def get_entitities_in_training_format(list_ents):

        list_ents_train = []
@@ -105,7 +122,7 @@ def transform_to_training_format(alldicts):
            list_ents_train.append(tpl_ent)

        return list_ents_train
-    
+
    train_data = []

    for speaker, somedict in alldicts.items():