Skip to content
Snippets Groups Projects
Commit b73b5e2c authored by Lili Gasser
Browse files

training data not shuffled

parent 1ec85395
No related branches found
No related tags found
No related merge requests found
......@@ -19,6 +19,7 @@ from spacy.util import minibatch, compounding
import sys
sys.path.append("./src/python")
from utils_ner import read_from_txt, write_to_txt, transform_to_training_format, transform_to_reading_format
from utils_proc import call_with_out
......@@ -32,22 +33,22 @@ from utils_ner import read_from_txt, write_to_txt, transform_to_training_format,
def main(model=None, output_dir=None, n_iter=100, train_data=None, print_output=False):
"""Load training data and the model, set up the pipeline and train the entity recognizer."""
if train_data is not None:
call_with_out("git-lfs pull -I " + train_data.as_posix())
dict_onedoc = read_from_txt(train_data)
TRAIN_DATA = transform_to_training_format(dict_onedoc)[:50] # TODO: get rid of [:50]
TRAIN_DATA_orig = TRAIN_DATA
print(TRAIN_DATA[:10])
TRAIN_DATA_orig = TRAIN_DATA[:]
print(type(TRAIN_DATA), TRAIN_DATA[:10])
# TODO: format checks
else:
sys.exit("no training data")
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
else:
nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model")
nlp = spacy.blank("de") # create blank Language class
print("Created blank 'de' model")
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
......@@ -88,7 +89,6 @@ def main(model=None, output_dir=None, n_iter=100, train_data=None, print_output=
# test the trained model
TRAIN_DATA_tested = []
for text, dict_ents_train, title in TRAIN_DATA_orig:
print(title)
list_ents_train = dict_ents_train['entities']
doc = nlp(text)
list_ents_test = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
......@@ -107,7 +107,7 @@ def main(model=None, output_dir=None, n_iter=100, train_data=None, print_output=
alldicts_tested = transform_to_reading_format(TRAIN_DATA_tested)
filename_tested = str(train_data)[:-4] + '_tested.txt'
filename_tested = str(train_data)[:-4] + '_trained.txt'
write_to_txt(alldicts_tested, filename_tested)
# save model to output directory
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment