From b73b5e2c16be541cd140b075e1248f625ae5927b Mon Sep 17 00:00:00 2001 From: Lili Gasser <gasserli@ethz.ch> Date: Mon, 18 Feb 2019 09:33:23 +0000 Subject: [PATCH] training data not shuffled --- src/python/example_train-ner.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/python/example_train-ner.py b/src/python/example_train-ner.py index ce5ad60a..86ae826a 100644 --- a/src/python/example_train-ner.py +++ b/src/python/example_train-ner.py @@ -19,6 +19,7 @@ from spacy.util import minibatch, compounding import sys sys.path.append("./src/python") from utils_ner import read_from_txt, write_to_txt, transform_to_training_format, transform_to_reading_format +from utils_proc import call_with_out @@ -32,22 +33,22 @@ from utils_ner import read_from_txt, write_to_txt, transform_to_training_format, def main(model=None, output_dir=None, n_iter=100, train_data=None, print_output=False): """Load training data and the model, set up the pipeline and train the entity recognizer.""" if train_data is not None: + call_with_out("git-lfs pull -I " + train_data.as_posix()) dict_onedoc = read_from_txt(train_data) TRAIN_DATA = transform_to_training_format(dict_onedoc)[:50] # TODO: get rid of [:50] - TRAIN_DATA_orig = TRAIN_DATA - print(TRAIN_DATA[:10]) + TRAIN_DATA_orig = TRAIN_DATA[:] + print(type(TRAIN_DATA), TRAIN_DATA[:10]) # TODO: format checks else: sys.exit("no training data") - if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) else: - nlp = spacy.blank("en") # create blank Language class - print("Created blank 'en' model") + nlp = spacy.blank("de") # create blank Language class + print("Created blank 'de' model") # create the built-in pipeline components and add them to the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy @@ -88,7 +89,6 @@ def main(model=None, output_dir=None, n_iter=100, train_data=None, print_output= # test the trained model TRAIN_DATA_tested = [] for text, dict_ents_train, title in TRAIN_DATA_orig: - print(title) list_ents_train = dict_ents_train['entities'] doc = nlp(text) list_ents_test = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents] @@ -107,7 +107,7 @@ def main(model=None, output_dir=None, n_iter=100, train_data=None, print_output= alldicts_tested = transform_to_reading_format(TRAIN_DATA_tested) - filename_tested = str(train_data)[:-4] + '_tested.txt' + filename_tested = str(train_data)[:-4] + '_trained.txt' write_to_txt(alldicts_tested, filename_tested) # save model to output directory -- GitLab