From fa1923392f897905d2dd0cdd5d3194afec795f91 Mon Sep 17 00:00:00 2001 From: Lili Gasser <gasserli@ethz.ch> Date: Mon, 18 Feb 2019 10:32:51 +0000 Subject: [PATCH] clean file --- src/python/run_train_ner.py | 15 +++------------ src/sh/train_ner.sh | 2 +- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/python/run_train_ner.py b/src/python/run_train_ner.py index 15878724..2ebd5fe7 100644 --- a/src/python/run_train_ner.py +++ b/src/python/run_train_ner.py @@ -36,10 +36,9 @@ def main(model=None, output_dir=None, n_iter=100, training_data=None, trained_da if training_data is not None: call_with_out("git-lfs pull -I " + training_data.as_posix()) dict_onedoc = read_from_txt(training_data) - TRAIN_DATA = transform_to_training_format(dict_onedoc)[:50] # TODO: get rid of [:50] - TRAIN_DATA_orig = TRAIN_DATA[:] - print(type(TRAIN_DATA), TRAIN_DATA[:10]) - # TODO: format checks + TRAIN_DATA = transform_to_training_format(dict_onedoc) + TRAIN_DATA_orig = TRAIN_DATA[:] # save a copy to have an unshuffled version + print("Training data loaded") else: sys.exit("no training data") @@ -97,14 +96,6 @@ def main(model=None, output_dir=None, n_iter=100, training_data=None, trained_da dict_ents_test['entities'] = list_ents_test tpl = (text, dict_ents_test, title) TRAIN_DATA_tested.append(tpl) -# print('train', list_ents_train) -# print('test', list_ents_test) -# print(set(list_ents_train) == set(list_ents_test)) -# if print_output - #if not set(list_ents_train) == set(list_ents_test): - #print(text) - #print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) - #print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) alldicts_tested = transform_to_reading_format(TRAIN_DATA_tested) diff --git a/src/sh/train_ner.sh b/src/sh/train_ner.sh index e21cdfaf..ec5e1539 100755 --- a/src/sh/train_ner.sh +++ b/src/sh/train_ner.sh @@ -1,7 +1,7 @@ #!/bin/bash model=de_core_news_sm -n_iter=2 +n_iter=100 path_data=data/train_NER/
training_data=${path_data}1891_20026449_corrected_german.txt trained_data=${path_data}1891_20026449_corrected_german_trained.txt -- GitLab