From 06838c4e9fc6135b3680cbb66f49c32883c03c0f Mon Sep 17 00:00:00 2001 From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch> Date: Thu, 2 Apr 2020 19:25:59 +0200 Subject: [PATCH] Commiting the notebook before checkout --- notebooks/Section_TextB_Supervision.ipynb | 407 +++++++++++++++++++--- 1 file changed, 365 insertions(+), 42 deletions(-) diff --git a/notebooks/Section_TextB_Supervision.ipynb b/notebooks/Section_TextB_Supervision.ipynb index 04bafa27..44171854 100644 --- a/notebooks/Section_TextB_Supervision.ipynb +++ b/notebooks/Section_TextB_Supervision.ipynb @@ -2,15 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 75, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" + "/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/\n" ] } ], @@ -26,12 +25,13 @@ "os.environ['DEMOCRASCI_DATA'] = \"/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/\"\n", "import user_labeling\n", "import trained_annotation\n", - "import utils_feats as ut_f" + "import utils_feats as ut_f\n", + "import pickle" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -43,10 +43,11 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ + "'''\n", "# We need to open all the features files for the range of years of interest, and create\n", "# the vocabulary we would need for the CountVectorizer\n", "\n", @@ -56,12 +57,43 @@ " pd_feat = pd.read_pickle('%s/%s.pickle' % (folder_data,str(year)))\n", " text_col = np.concatenate([text_col,np.array(pd_feat['blocktext'])], axis = 0)\n", "\n", - "vocab_final, ocurrence = ut_f.get_vocab(text_col, min_ocurr = 10, flag_lower = 1, flag_stopw = 1, n_words = 100)" + "vocab_final, ocurrence = ut_f.get_vocab(text_col, min_ocurr = 10, flag_lower = 1, flag_stopw = 1, n_words = 200)\n", + "'''" ] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#### SAVE DICT WITH VOCAB\n", + "file_pickle = '../src/python/dict_in_suploop.pkl'\n", + "dict_save = dict()\n", + "dict_save['in_year'] = in_year\n", + "dict_save['end_year'] = end_year\n", + "dict_save['data_path'] = data_path\n", + "dict_save['vocab_final'] = vocab_final\n", + "pickle.dump(dict_save, open(file_pickle, 'wb'))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#### OPEN DICT\n", + "dict_in_sup_loop = pickle.load(open(file_pickle,'rb'))\n", + "# Copy values in dict into variables\n", + "for key_arg in list(dict_in_sup_loop.keys()):\n", + " str_exec = key_arg + ' = dict_in_sup_loop[\"' + key_arg + '\"]'\n", + " exec(str_exec)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -77,12 +109,12 @@ { "data": { "text/plain": [ - "{'title': <trained_annotation.TrainedAnnotator at 0x114f6c160>,\n", - " 'overview_page_type': <trained_annotation.TrainedAnnotator at 0x114f6ca20>,\n", - " 'section_textb': <trained_annotation.TrainedAnnotator at 0x1151e8828>}" + "{'title': <trained_annotation.TrainedAnnotator at 0x107add748>,\n", + " 'overview_page_type': <trained_annotation.TrainedAnnotator at 0x11fba3518>,\n", + " 'section_textb': <trained_annotation.TrainedAnnotator at 0x11fba3cc0>}" ] }, - "execution_count": 77, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -94,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -108,25 +140,38 @@ " \"qu'il\" 'loi' 'herrn' 'sagen' 'bundes' \"c'est\" 'confédération' 'ja'\n", " 'suisse' 'cantons' 'être' 'müssen' 'wäre' 'recht' 'gesetz' 'ändern'\n", " 'sache' 'fédérale' 'droit' 'commission' 'kantonen' 'faire' 'schweiz'\n", - " 'artikel' 'zeit' 'banque' 'comme' \"d'une\" '!' 'tout' 'weise' 'heute'\n", - " 'bank' '3' 'gesagt' 'fait' 'wurde' 'lassen' 'kanton' '2' 'bien' 'peut'\n", + " 'artikel' 'zeit' 'banque' 'comme' \"d'une\" '!' 'tout' 'weise' 'heute' '3'\n", + " 'bank' 'gesagt' 'fait' 'wurde' 'lassen' 'kanton' '2' 'bien' 'peut'\n", " 'jahre' 'möchte' 'allein' 'gegenüber' 'bezug' 'sagt' 'gemacht' 'sollen'\n", " 'question' \"n'est\" 'kommen' 'seite' 'darauf' 'allerdings' 'proposition'\n", " 'bundesrates' 'wohl' 'herren' '1' 'sans' '4' 'geben' 'contre' \"d'un\"\n", - " 'grossen' 'hätte']\n", - "/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/trained_annotation_data/section_textb/heuristic_label does not exist. No heuristic label were loaded.\n", - "The file /Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/trained_annotation_data/section_textb/human_label.csv already exists, newly labelled data will be added to it\n" + " 'grossen' 'hätte' 'dafür' 'gar' 'projet' 'r' 'mehrheit' 'tous'\n", + " 'angenommen' 'fer' 'fall' 'immer' 'u' \"l'on\" 'cas' 'zwei' 'grosse'\n", + " 'darüber' 'stimmen' 'donc' 'canton' 'bern' 'national' 'tessin'\n", + " \"l'article\" 'constitution' 'ganzen' 'gesetzes' 'nehmen' 'ganze' 'cet'\n", + " 'doit' 'deux' 'dire' 'millionen' 'aussi' 'point' 'bestimmungen' 'gut'\n", + " 'où' '5' 'kommt' 'dit' 'möglich' 'berichterstatter' 'stellen' 'deshalb'\n", + " 'vielleicht' 'steht' 'thun' 'dont' 'ansicht' 'chemins' 'ainsi' 'aktien'\n", + " 'seien' 'gerade' 'entre' \"l'art\" 'finden' 'dagegen' 'bulletin' 'pays'\n", + " 'bahnen' 'bereits' 'eintreten' 'eben' 'beim' 'jahren' 'vorlage'\n", + " 'verfassung' 'gehen' 'unserer' 'grund' 'hand' 'b.' 'verhältnisse' 'liegt'\n", + " 'überhaupt' 'stehen' 'teil' 'encore' 'wissen' 'staat' 'gestellt' \"''\"\n", + " 'toutes' 'geht' 'diejenigen' 'richtig' 'darum' \"qu'on\" 'dispositions'\n", + " 'très' 'lieu' 'faut' 'dürfen' 'ment' 'mission' 'frankreich' 'bestimmung'\n", + " '6']\n", + "/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/trained_annotation_data/section_textb/features\n", + "/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/trained_annotation_data/section_textb/heuristic_label does not exist. No heuristic label were loaded.\n" ] } ], "source": [ "#supervision_loop = user_labeling.make_standard_supervision_loop(\"section_textb\")\n", - "supervision_loop = user_labeling.make_standard_supervision_loop(\"section_textb\", list_stopw = None, max_feat = 100, vocab = vocab_final)\n" + "supervision_loop = user_labeling.make_standard_supervision_loop(\"section_textb\", list_stopw = None, max_feat = 200, vocab = vocab_final)\n" ] }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -151,7 +196,7 @@ " dtype='<U47')}" ] }, - "execution_count": 79, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -162,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -184,7 +229,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -197,57 +242,335 @@ } ], "source": [ - "supervision_loop.supervise(n_rounds=5, n_samples_per_round=20, n_samples_perId = 5)\n" + "supervision_loop.supervise(n_rounds=5, n_samples_per_round=20, n_samples_perId=10)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import json\n", + "df = pd.DataFrame(np.zeros([3,3]), columns=['1a','2b','3c'], index=['11','12','13'])\n", + "data = json.dumps({'df_andas': df.to_json()})" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 1a 3c 2b\n", + "0 0 0 0\n", + "1 0 0 0\n", + "2 0 0 0\n" + ] + } + ], + "source": [ + "print(pd.read_json(json.loads(data)[\"df_andas\"]))" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>1a</th>\n", + " <th>2b</th>\n", + " <th>3c</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 1a 2b 3c\n", + "12 0.0 0.0 0.0\n", + "13 0.0 0.0 0.0" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[1:3]" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json.loads(data).get(\"sup_loop\",{})" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "dict_a = {1: {'a':1,'b': 2}, 2: {'c':3,'d':4}}" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "'int' object is not subscriptable", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-50-dfb093393faa>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbox\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdict_a\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mbox\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mbox\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'b'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: 'int' object is not subscriptable" + ] + } + ], + "source": [ + "for i, box in enumerate(dict_a):\n", + " print(i,box['a'],box['b'])\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 51, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import def_classes as dc" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n" + ] + } + ], + "source": [ + "doc = dc.Document(input_file='./1893/20026572.pdf', folder_database='../data/AB/')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A Tarfile is Opened that is never closed!\n", + "Page 0: flag_2col 1\n", + "Page 1: flag_2col 1\n", + "Page 2: flag_2col 1\n", + "Page 3: flag_2col 1\n", + "Page 4: flag_2col 1\n", + "Page 5: flag_2col 1\n", + "Not saving to tar\n", + "Command output : b''\n", + "Command exit status/return code : 0\n", + "End of file ./1893/20026572.pdf - 33.064653158187866 seconds -\n" + ] + } + ], + "source": [ + "doc.correct_xml(flag_plots=1, flag_save=0, flag_save_figs=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running on http://127.0.0.1:8050/\n", + "Debugger PIN: 805-158-901\n", + " * Serving Flask app \"__main__\" (lazy loading)\n", + " * Environment: production\n", + " WARNING: Do not use the development server in a production environment.\n", + " Use a production WSGI server instead.\n", + " * Debug mode: on\n" + ] + }, + { + "ename": "SystemExit", + "evalue": "1", + "output_type": "error", + "traceback": [ + "An exception has occurred, use %tb to see the full traceback.\n", + "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m 1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/luissalamanca/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2969: UserWarning:\n", + "\n", + "To exit: use 'exit', 'quit', or Ctrl-D.\n", + "\n" + ] + } + ], + "source": [ + "import dash\n", + "import dash_core_components as dcc\n", + "import dash_html_components as html\n", + "from dash.dependencies import Input, Output, State\n", + "import numpy as np\n", + "\n", + "app = dash.Dash()\n", + "\n", + "app.layout = html.Div([\n", + " dcc.Input(id='my-id', value='initial value', type='text'),\n", + " html.Button(id='button', n_clicks=0, children='Submit'),\n", + " html.Button(id='button1', n_clicks=0, children='SubmitRand'),\n", + " html.Button(id='restart', n_clicks=0, children='RestartRand'),\n", + " html.Button(id=\"skip-button\",n_clicks=0, children='Skip',style={\"marginRight\":\"1rem\",\"display\":\"none\"}),\n", + " html.Pre(id=\"new-event\",style={\"display\":\"none\"}),\n", + " html.Div(id='my-div')\n", + "])\n", + "\n", + "class ModClicks:\n", + " \n", + " def __init__(self):\n", + " self.new_value = np.random.randint(100)\n", + " \n", + " def curr_val(self):\n", + " return self.new_value\n", + " \n", + " def restart_val(self):\n", + " self.new_value = np.random.randint(100)\n", + " \n", + "mod_cl = ModClicks()\n", + "\n", + "@app.callback(\n", + " Output(component_id='my-div', component_property='children'),\n", + " [Input(component_id='button', component_property='n_clicks'),\n", + " Input(component_id='my-id', component_property='value')]\n", + ")\n", + "def update_output_div(n_clicks, input_value):\n", + " return 'You\\'ve entered \"{}\" and n_clicks on Submit is {}'.format(input_value, n_clicks)\n", + "\n", + "@app.callback(\n", + " Output(component_id='my-id', component_property='value'),\n", + " [Input(\"skip-button\",\"n_clicks\"),\n", + " Input(\"new-event\",\"children\")]\n", + ")\n", + "def update_input_val(input_value, new_event):\n", + " input_value = mod_cl.curr_val()\n", + " return input_value\n", + "\n", + "@app.callback(\n", + " Output(\"skip-button\",\"n_clicks\"),\n", + " [Input(component_id='button1', component_property='n_clicks')]\n", + ")\n", + "def val_from_class(n_clicks):\n", + " return n_clicks + 1\n", + "\n", + "\n", + "@app.callback(\n", + " Output(\"new-event\",\"children\"),\n", + " [Input(component_id='restart', component_property='n_clicks')]\n", + ")\n", + "def restart_val_from_class(n_clicks):\n", + " mod_cl.restart_val()\n", + " return n_clicks + 1\n", + "\n", + "\n", + "if __name__ == '__main__':\n", + " app.run_server(debug=True)" + ] }, { "cell_type": "code", -- GitLab