From fa1923392f897905d2dd0cdd5d3194afec795f91 Mon Sep 17 00:00:00 2001
From: Lili Gasser <gasserli@ethz.ch>
Date: Mon, 18 Feb 2019 10:32:51 +0000
Subject: [PATCH] clean file

---
 src/python/run_train_ner.py | 15 +++------------
 src/sh/train_ner.sh         |  2 +-
 2 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/src/python/run_train_ner.py b/src/python/run_train_ner.py
index 15878724..2ebd5fe7 100644
--- a/src/python/run_train_ner.py
+++ b/src/python/run_train_ner.py
@@ -36,10 +36,9 @@ def main(model=None, output_dir=None, n_iter=100, training_data=None, trained_da
     if training_data is not None:
         call_with_out("git-lfs pull -I " + training_data.as_posix())
         dict_onedoc = read_from_txt(training_data)
-        TRAIN_DATA = transform_to_training_format(dict_onedoc)[:50]   # TODO: get rid of [:50]
-        TRAIN_DATA_orig = TRAIN_DATA[:]
-        print(type(TRAIN_DATA), TRAIN_DATA[:10])
-        # TODO: format checks
+        TRAIN_DATA = transform_to_training_format(dict_onedoc)
+        TRAIN_DATA_orig = TRAIN_DATA[:]      # save a copy to have an unshuffled version
+        print("Training data loaded")
     else:
         sys.exit("no training data")
 
@@ -97,14 +96,6 @@ def main(model=None, output_dir=None, n_iter=100, training_data=None, trained_da
         dict_ents_test['entities'] = list_ents_test
         tpl = (text, dict_ents_test, title)
         TRAIN_DATA_tested.append(tpl)
-#        print('train', list_ents_train)
-#        print('test', list_ents_test)
-#        print(set(list_ents_train) == set(list_ents_test))
-#       if print_output
-        #if not set(list_ents_train) == set(list_ents_test):
-            #print(text)
-            #print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
-	    #print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 
     alldicts_tested = transform_to_reading_format(TRAIN_DATA_tested)
 
diff --git a/src/sh/train_ner.sh b/src/sh/train_ner.sh
index e21cdfaf..ec5e1539 100755
--- a/src/sh/train_ner.sh
+++ b/src/sh/train_ner.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 model=de_core_news_sm
-n_iter=2
+n_iter=100
 path_data=data/train_NER/
 training_data=${path_data}1891_20026449_corrected_german.txt
 trained_data=${path_data}1891_20026449_corrected_german_trained.txt
-- 
GitLab