diff --git a/src/python/run_train_ner.py b/src/python/run_train_ner.py
index 158787244b9fc5f96ab038db13014cc92462047d..485210a437769a7b051b5df7bc5fc60bc8dd8944 100644
--- a/src/python/run_train_ner.py
+++ b/src/python/run_train_ner.py
@@ -36,10 +36,9 @@ def main(model=None, output_dir=None, n_iter=100, training_data=None, trained_da
     if training_data is not None:
         call_with_out("git-lfs pull -I " + training_data.as_posix())
         dict_onedoc = read_from_txt(training_data)
-        TRAIN_DATA = transform_to_training_format(dict_onedoc)[:50] # TODO: get rid of [:50]
-        TRAIN_DATA_orig = TRAIN_DATA[:]
-        print(type(TRAIN_DATA), TRAIN_DATA[:10])
-        # TODO: format checks
+        TRAIN_DATA = transform_to_training_format(dict_onedoc)
+        TRAIN_DATA_orig = TRAIN_DATA[:]  # keep an unshuffled copy of the training data
+        print("Training data loaded")
     else:
         sys.exit("no training data")

@@ -97,14 +96,6 @@ def main(model=None, output_dir=None, n_iter=100, training_data=None, trained_da
            dict_ents_test['entities'] = list_ents_test
            tpl = (text, dict_ents_test, title)
            TRAIN_DATA_tested.append(tpl)
-# print('train', list_ents_train)
-# print('test', list_ents_test)
-# print(set(list_ents_train) == set(list_ents_test))
-# if print_output
-            #if not set(list_ents_train) == set(list_ents_test):
-                #print(text)
-                #print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
-                #print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    alldicts_tested = transform_to_reading_format(TRAIN_DATA_tested)
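
For context, here is a minimal sketch (not part of the patch) of the training-data shape this script appears to work with. The tuple layout `(text, {'entities': [(start, end, label)]}, title)` is inferred from the `tpl = (text, dict_ents_test, title)` construction in the second hunk; `check_training_format`, the sample sentence, the labels, and the document id are all hypothetical names used only for illustration, in the spirit of the removed `# TODO: format checks`.

```python
# Hypothetical example of the tuple format assumed for TRAIN_DATA:
# (text, {"entities": [(start_char, end_char, label), ...]}, title)
EXAMPLE_TRAIN_DATA = [
    (
        "Angela Merkel visited Paris in 2019.",
        {"entities": [(0, 13, "PER"), (22, 27, "LOC")]},
        "doc_0001",  # hypothetical document title/id
    ),
]


def check_training_format(train_data):
    """Basic sanity check over the assumed (text, annotations, title) tuples."""
    for text, annotations, _title in train_data:
        assert isinstance(text, str)
        for start, end, label in annotations["entities"]:
            # spans must be valid character offsets into the text
            assert 0 <= start < end <= len(text), (start, end, len(text))
            assert isinstance(label, str)


check_training_format(EXAMPLE_TRAIN_DATA)
```

Such a check could be run right after `transform_to_training_format` to fail fast on malformed annotations instead of surfacing errors later during training.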