From 06838c4e9fc6135b3680cbb66f49c32883c03c0f Mon Sep 17 00:00:00 2001
From: Luis Salamanca <luis.salamanca@sdsc.ethz.ch>
Date: Thu, 2 Apr 2020 19:25:59 +0200
Subject: [PATCH] Committing the notebook before checkout

---
 notebooks/Section_TextB_Supervision.ipynb | 407 +++++++++++++++++++---
 1 file changed, 365 insertions(+), 42 deletions(-)

diff --git a/notebooks/Section_TextB_Supervision.ipynb b/notebooks/Section_TextB_Supervision.ipynb
index 04bafa27..44171854 100644
--- a/notebooks/Section_TextB_Supervision.ipynb
+++ b/notebooks/Section_TextB_Supervision.ipynb
@@ -2,15 +2,14 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
+      "/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/\n"
      ]
     }
    ],
@@ -26,12 +25,13 @@
     "os.environ['DEMOCRASCI_DATA'] = \"/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/\"\n",
     "import user_labeling\n",
     "import trained_annotation\n",
-    "import utils_feats as ut_f"
+    "import utils_feats as ut_f\n",
+    "import pickle"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -43,10 +43,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 67,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
+    "'''\n",
     "# We need to open all the features files for the range of years of interest, and create\n",
     "# the vocabulary we would need for the CountVectorizer\n",
     "\n",
@@ -56,12 +57,43 @@
     "    pd_feat = pd.read_pickle('%s/%s.pickle' % (folder_data,str(year)))\n",
     "    text_col = np.concatenate([text_col,np.array(pd_feat['blocktext'])], axis = 0)\n",
     "\n",
-    "vocab_final, ocurrence = ut_f.get_vocab(text_col, min_ocurr = 10, flag_lower = 1, flag_stopw = 1, n_words = 100)"
+    "vocab_final, ocurrence = ut_f.get_vocab(text_col, min_ocurr = 10, flag_lower = 1, flag_stopw = 1, n_words = 200)\n",
+    "'''"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#### SAVE DICT WITH VOCAB\n",
+    "file_pickle = '../src/python/dict_in_suploop.pkl'\n",
+    "dict_save = dict()\n",
+    "dict_save['in_year'] = in_year\n",
+    "dict_save['end_year'] = end_year\n",
+    "dict_save['data_path'] = data_path\n",
+    "dict_save['vocab_final'] = vocab_final\n",
+    "pickle.dump(dict_save, open(file_pickle, 'wb'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#### OPEN DICT\n",
+    "dict_in_sup_loop = pickle.load(open(file_pickle,'rb'))\n",
+    "# Copy values in dict into variables\n",
+    "for key_arg in list(dict_in_sup_loop.keys()):\n",
+    "    str_exec = key_arg + ' = dict_in_sup_loop[\"' + key_arg + '\"]'\n",
+    "    exec(str_exec)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -77,12 +109,12 @@
     {
      "data": {
       "text/plain": [
-       "{'title': <trained_annotation.TrainedAnnotator at 0x114f6c160>,\n",
-       " 'overview_page_type': <trained_annotation.TrainedAnnotator at 0x114f6ca20>,\n",
-       " 'section_textb': <trained_annotation.TrainedAnnotator at 0x1151e8828>}"
+       "{'title': <trained_annotation.TrainedAnnotator at 0x107add748>,\n",
+       " 'overview_page_type': <trained_annotation.TrainedAnnotator at 0x11fba3518>,\n",
+       " 'section_textb': <trained_annotation.TrainedAnnotator at 0x11fba3cc0>}"
       ]
      },
-     "execution_count": 77,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -94,7 +126,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -108,25 +140,38 @@
       " \"qu'il\" 'loi' 'herrn' 'sagen' 'bundes' \"c'est\" 'confédération' 'ja'\n",
       " 'suisse' 'cantons' 'être' 'müssen' 'wäre' 'recht' 'gesetz' 'ändern'\n",
       " 'sache' 'fédérale' 'droit' 'commission' 'kantonen' 'faire' 'schweiz'\n",
-      " 'artikel' 'zeit' 'banque' 'comme' \"d'une\" '!' 'tout' 'weise' 'heute'\n",
-      " 'bank' '3' 'gesagt' 'fait' 'wurde' 'lassen' 'kanton' '2' 'bien' 'peut'\n",
+      " 'artikel' 'zeit' 'banque' 'comme' \"d'une\" '!' 'tout' 'weise' 'heute' '3'\n",
+      " 'bank' 'gesagt' 'fait' 'wurde' 'lassen' 'kanton' '2' 'bien' 'peut'\n",
       " 'jahre' 'möchte' 'allein' 'gegenüber' 'bezug' 'sagt' 'gemacht' 'sollen'\n",
       " 'question' \"n'est\" 'kommen' 'seite' 'darauf' 'allerdings' 'proposition'\n",
       " 'bundesrates' 'wohl' 'herren' '1' 'sans' '4' 'geben' 'contre' \"d'un\"\n",
-      " 'grossen' 'hätte']\n",
-      "/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/trained_annotation_data/section_textb/heuristic_label does not exist. No heuristic label were loaded.\n",
-      "The file /Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/trained_annotation_data/section_textb/human_label.csv already exists, newly labelled data will be added to it\n"
+      " 'grossen' 'hätte' 'dafür' 'gar' 'projet' 'r' 'mehrheit' 'tous'\n",
+      " 'angenommen' 'fer' 'fall' 'immer' 'u' \"l'on\" 'cas' 'zwei' 'grosse'\n",
+      " 'darüber' 'stimmen' 'donc' 'canton' 'bern' 'national' 'tessin'\n",
+      " \"l'article\" 'constitution' 'ganzen' 'gesetzes' 'nehmen' 'ganze' 'cet'\n",
+      " 'doit' 'deux' 'dire' 'millionen' 'aussi' 'point' 'bestimmungen' 'gut'\n",
+      " 'où' '5' 'kommt' 'dit' 'möglich' 'berichterstatter' 'stellen' 'deshalb'\n",
+      " 'vielleicht' 'steht' 'thun' 'dont' 'ansicht' 'chemins' 'ainsi' 'aktien'\n",
+      " 'seien' 'gerade' 'entre' \"l'art\" 'finden' 'dagegen' 'bulletin' 'pays'\n",
+      " 'bahnen' 'bereits' 'eintreten' 'eben' 'beim' 'jahren' 'vorlage'\n",
+      " 'verfassung' 'gehen' 'unserer' 'grund' 'hand' 'b.' 'verhältnisse' 'liegt'\n",
+      " 'überhaupt' 'stehen' 'teil' 'encore' 'wissen' 'staat' 'gestellt' \"''\"\n",
+      " 'toutes' 'geht' 'diejenigen' 'richtig' 'darum' \"qu'on\" 'dispositions'\n",
+      " 'très' 'lieu' 'faut' 'dürfen' 'ment' 'mission' 'frankreich' 'bestimmung'\n",
+      " '6']\n",
+      "/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/trained_annotation_data/section_textb/features\n",
+      "/Users/luissalamanca/My_stuff/05_SDSCresearch/02_NLP/Code/democrasci_preprocwp1/data/trained_annotation_data/section_textb/heuristic_label does not exist. No heuristic label were loaded.\n"
      ]
     }
    ],
    "source": [
     "#supervision_loop = user_labeling.make_standard_supervision_loop(\"section_textb\")\n",
-    "supervision_loop = user_labeling.make_standard_supervision_loop(\"section_textb\", list_stopw = None, max_feat = 100, vocab = vocab_final)\n"
+    "supervision_loop = user_labeling.make_standard_supervision_loop(\"section_textb\", list_stopw = None, max_feat = 200, vocab = vocab_final)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -151,7 +196,7 @@
        "       dtype='<U47')}"
       ]
      },
-     "execution_count": 79,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -162,7 +207,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -184,7 +229,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -197,57 +242,335 @@
     }
    ],
    "source": [
-    "supervision_loop.supervise(n_rounds=5, n_samples_per_round=20, n_samples_perId = 5)\n"
+    "supervision_loop.supervise(n_rounds=5, n_samples_per_round=20, n_samples_perId=10)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import json\n",
+    "df = pd.DataFrame(np.zeros([3,3]), columns=['1a','2b','3c'], index=['11','12','13'])\n",
+    "data = json.dumps({'df_andas': df.to_json()})"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   1a  3c  2b\n",
+      "0   0   0   0\n",
+      "1   0   0   0\n",
+      "2   0   0   0\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(pd.read_json(json.loads(data)[\"df_andas\"]))"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 34,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df)"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 33,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>1a</th>\n",
+       "      <th>2b</th>\n",
+       "      <th>3c</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     1a   2b   3c\n",
+       "12  0.0  0.0  0.0\n",
+       "13  0.0  0.0  0.0"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.iloc[1:3]"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{}"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "json.loads(data).get(\"sup_loop\",{})"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 47,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "dict_a = {1: {'a':1,'b': 2}, 2: {'c':3,'d':4}}"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "TypeError",
+     "evalue": "'int' object is not subscriptable",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-50-dfb093393faa>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbox\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdict_a\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mbox\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mbox\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'b'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mTypeError\u001b[0m: 'int' object is not subscriptable"
+     ]
+    }
+   ],
+   "source": [
+    "for i, box in enumerate(dict_a):\n",
+    "    print(i,box['a'],box['b'])\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import def_classes as dc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1\n"
+     ]
+    }
+   ],
+   "source": [
+    "doc = dc.Document(input_file='./1893/20026572.pdf', folder_database='../data/AB/')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "A Tarfile is Opened that is never closed!\n",
+      "Page 0: flag_2col 1\n",
+      "Page 1: flag_2col 1\n",
+      "Page 2: flag_2col 1\n",
+      "Page 3: flag_2col 1\n",
+      "Page 4: flag_2col 1\n",
+      "Page 5: flag_2col 1\n",
+      "Not saving to tar\n",
+      "Command output :  b''\n",
+      "Command exit status/return code :  0\n",
+      "End of file ./1893/20026572.pdf - 33.064653158187866 seconds -\n"
+     ]
+    }
+   ],
+   "source": [
+    "doc.correct_xml(flag_plots=1, flag_save=0, flag_save_figs=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on http://127.0.0.1:8050/\n",
+      "Debugger PIN: 805-158-901\n",
+      " * Serving Flask app \"__main__\" (lazy loading)\n",
+      " * Environment: production\n",
+      "   WARNING: Do not use the development server in a production environment.\n",
+      "   Use a production WSGI server instead.\n",
+      " * Debug mode: on\n"
+     ]
+    },
+    {
+     "ename": "SystemExit",
+     "evalue": "1",
+     "output_type": "error",
+     "traceback": [
+      "An exception has occurred, use %tb to see the full traceback.\n",
+      "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m 1\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/luissalamanca/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2969: UserWarning:\n",
+      "\n",
+      "To exit: use 'exit', 'quit', or Ctrl-D.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import dash\n",
+    "import dash_core_components as dcc\n",
+    "import dash_html_components as html\n",
+    "from dash.dependencies import Input, Output, State\n",
+    "import numpy as np\n",
+    "\n",
+    "app = dash.Dash()\n",
+    "\n",
+    "app.layout = html.Div([\n",
+    "    dcc.Input(id='my-id', value='initial value', type='text'),\n",
+    "    html.Button(id='button', n_clicks=0, children='Submit'),\n",
+    "    html.Button(id='button1', n_clicks=0, children='SubmitRand'),\n",
+    "    html.Button(id='restart', n_clicks=0, children='RestartRand'),\n",
+    "    html.Button(id=\"skip-button\",n_clicks=0, children='Skip',style={\"marginRight\":\"1rem\",\"display\":\"none\"}),\n",
+    "    html.Pre(id=\"new-event\",style={\"display\":\"none\"}),\n",
+    "    html.Div(id='my-div')\n",
+    "])\n",
+    "\n",
+    "class ModClicks:\n",
+    "    \n",
+    "    def __init__(self):\n",
+    "        self.new_value = np.random.randint(100)\n",
+    "    \n",
+    "    def curr_val(self):\n",
+    "        return self.new_value\n",
+    "    \n",
+    "    def restart_val(self):\n",
+    "        self.new_value = np.random.randint(100)\n",
+    "                \n",
+    "mod_cl = ModClicks()\n",
+    "\n",
+    "@app.callback(\n",
+    "    Output(component_id='my-div', component_property='children'),\n",
+    "    [Input(component_id='button', component_property='n_clicks'),\n",
+    "     Input(component_id='my-id', component_property='value')]\n",
+    ")\n",
+    "def update_output_div(n_clicks, input_value):\n",
+    "    return 'You\\'ve entered \"{}\" and n_clicks on Submit is {}'.format(input_value, n_clicks)\n",
+    "\n",
+    "@app.callback(\n",
+    "    Output(component_id='my-id', component_property='value'),\n",
+    "    [Input(\"skip-button\",\"n_clicks\"),\n",
+    "     Input(\"new-event\",\"children\")]\n",
+    ")\n",
+    "def update_input_val(input_value, new_event):\n",
+    "    input_value = mod_cl.curr_val()\n",
+    "    return input_value\n",
+    "\n",
+    "@app.callback(\n",
+    "    Output(\"skip-button\",\"n_clicks\"),\n",
+    "    [Input(component_id='button1', component_property='n_clicks')]\n",
+    ")\n",
+    "def val_from_class(n_clicks):\n",
+    "    return n_clicks + 1\n",
+    "\n",
+    "\n",
+    "@app.callback(\n",
+    "    Output(\"new-event\",\"children\"),\n",
+    "    [Input(component_id='restart', component_property='n_clicks')]\n",
+    ")\n",
+    "def restart_val_from_class(n_clicks):\n",
+    "    mod_cl.restart_val()\n",
+    "    return n_clicks + 1\n",
+    "\n",
+    "\n",
+    "if __name__ == '__main__':\n",
+    "    app.run_server(debug=True)"
+   ]
   },
   {
    "cell_type": "code",
-- 
GitLab