diff --git a/notebooks/NER_read-training-data.ipynb b/notebooks/NER_read-training-data.ipynb index 27ba2c847185d8421fb2ce2f2c328b032fd0a9b9..a84ff6670ca017d7d1e7f4bd7e8fbe8342209636 100644 --- a/notebooks/NER_read-training-data.ipynb +++ b/notebooks/NER_read-training-data.ipynb @@ -9,18 +9,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", @@ -30,95 +21,16 @@ "import datetime\n", "import numpy as np\n", "\n", - "import spacy\n", - "from spacy import displacy\n", - "colors = {'ORG': '#73c6b6', 'LOC': '#bb8fce', 'PER': '#e59866', 'MISC': '#a6acaf'}\n", - "options = {'ents': ['ORG', 'LOC', 'PER', 'MISC'], 'colors': colors}\n", - "colors_sner = {'ORGANIZATION': '#73c6b6', 'LOCATION': '#bb8fce', 'PERSON': '#e59866', 'MISC': '#a6acaf'}\n", - "options_sner = {'ents': ['ORGANIZATION', 'LOCATION', 'PERSON', 'MISC'], 'colors': colors_sner}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## define necessary functions" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def get_spacyformat_for_sner_output(text):\n", - " if isinstance(text, tuple):\n", - " title = text[0]\n", - " sometext = text[1]\n", - " elif isinstance(text, str):\n", - " title = None\n", - " sometext = text\n", - " \n", - " # generate sner output as a list\n", - " list_tags_all = tagger.get_entities(sometext)\n", - " \n", - " # initiliaze dictionary\n", - " somedict = {}\n", - " \n", - " # specify text and title\n", - " somedict['text'] = sometext\n", - " if isinstance(title, str):\n", - " somedict['title'] = title\n", - " \n", - " # generate list of entities with corresponding start und end indices\n", - " list_ents = []\n", - " remtext = sometext\n", - " overall_start_index = 0\n", - " for ent in list_tags_all:\n", - " start_index = remtext.index(ent[0])\n", - " overall_start_index += start_index\n", - " end_index = start_index + len(ent[0])\n", - " overall_end_index = overall_start_index + len(ent[0])\n", - " #print(ent, remtext[start_index:end_index], sometext[overall_start_index:overall_end_index])\n", - " remtext = remtext[end_index+1:]\n", - " \n", - " if ent[1] != 'O':\n", - " list_ents.append({'start': overall_start_index, 'end': overall_end_index, 'label': ent[1]})\n", - " #print(ent)\n", - " overall_start_index += len(ent[0]) + 1\n", - " \n", - " # add that to dictionary\n", - " somedict['ents'] = list_ents\n", - " \n", - " return somedict\n", - "\n", - "def write_to_txt(alldicts, filename):\n", - " f = open(filename, 'w')\n", - " for key, value in alldicts.items():\n", - " f.write( str(value) )\n", - " f.write( '\\n\\n.\\n\\n')\n", - " f.write('generated: ' + str(datetime.datetime.now()))\n", - " f.close()\n", + "#import spacy\n", + "#from spacy import displacy\n", + "#colors = {'ORG': '#73c6b6', 'LOC': '#bb8fce', 'PER': '#e59866', 'MISC': '#a6acaf'}\n", + "#options = {'ents': ['ORG', 'LOC', 'PER', 'MISC'], 'colors': colors}\n", + "#colors_sner = {'ORGANIZATION': '#73c6b6', 'LOCATION': '#bb8fce', 'PERSON': '#e59866', 'MISC': '#a6acaf'}\n", + "#options_sner = {'ents': ['ORGANIZATION', 'LOCATION', 'PERSON', 'MISC'], 'colors': colors_sner}\n", "\n", - "def read_from_txt(filename):\n", - " f = open(filename,'r')\n", - " data=f.read()\n", - " f.close()\n", - " alldicts = {}\n", - " for sent in data.split('\\n\\n.\\n\\n'):\n", - " #print(sent)\n", - " try:\n", - " somedict = eval(sent)\n", - " key = somedict['title'].split('/')[-1]\n", - " alldicts[key] = somedict\n", - " except SyntaxError:\n", - " #print('SyntaxError')\n", - " pass\n", - " return alldicts\n", - "\n", - "def render_dict(alldicts):\n", - " for sent_key, sent_dict in alldicts.items():\n", - " displacy.render(sent_dict, style='ent', jupyter=True, manual=True, options=options_sner)" + "import sys\n", + "sys.path.append(\"../src/python/\")\n", + "import utils_ner" ] }, { @@ -127,7 +39,8 @@ "source": [ "#### info on tags from sner\n", "\n", - "four tags besides O for Other: PERSON, LOCATION, ORGANIZATION, MISC" + "four tags: PERSON, LOCATION, ORGANIZATION, MISC\n", + "(besides O for Other)" ] }, { @@ -146,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -162,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -705,7 +618,7 @@ " '1995_20026399.txt']" ] }, - "execution_count": 18, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -725,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 4, "metadata": { "scrolled": true }, @@ -7917,13 +7830,13 @@ "source": [ "# either choose from the list of files\n", "filepath = path_data + list_files[0]\n", - "alldicts = read_from_txt(filepath)\n", - "render_dict(alldicts)" + "alldicts = utils_ner.read_from_txt(filepath)\n", + "utils_ner.render_dict(alldicts)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 5, "metadata": { "scrolled": true }, @@ -18787,8 +18700,8 @@ "# or copy and paste a filename from the list\n", "filename = '1994_20023826.txt'\n", "filepath = path_data + filename\n", - "alldicts = read_from_txt(filepath)\n", - "render_dict(alldicts)" + "alldicts = utils_ner.read_from_txt(filepath)\n", + "utils_ner.render_dict(alldicts)" ] }, { diff --git a/requirements.txt b/requirements.txt index f01a2581d31e59403c15fef871b95bd8b8972dd1..ae3be3df42108fd3054cad7466f082091af00f57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,6 @@ tables gitpython pdfminer.six nltk -pyxdameraulevenshtein==1.4.1 \ No newline at end of file +pyxdameraulevenshtein==1.4.1 +spacy +plac