{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Neural Networks - SVM Loss Function and Gradient"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import and Visualize CIFAR-10 Data Set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 70 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train data shape:  (49000, 3072)\n",
      "Train labels shape:  (49000,)\n",
      "Validation data shape:  (1000, 3072)\n",
      "Validation labels shape:  (1000,)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "# function to import CIFAR-10 data set\n",
    "def unpickle(file):\n",
    "    \"\"\"Load one pickled CIFAR-10 batch file and return the unpickled object.\n",
    "\n",
    "    The file is opened in binary mode and decoded with encoding='bytes';\n",
    "    the notebook indexes the result with the bytes keys b'data' and\n",
    "    b'labels'.\n",
    "\n",
    "    NOTE(review): pickle.load can execute arbitrary code, so only use this\n",
    "    on batch files that come from a trusted source.\n",
    "    \"\"\"\n",
    "    import pickle\n",
    "    with open(file, 'rb') as fo:\n",
    "        # do not call the local 'dict': that would shadow the builtin\n",
    "        batch = pickle.load(fo, encoding='bytes')\n",
    "    return batch\n",
    "# Load the five CIFAR-10 training batches and the test batch.\n",
    "DATA_DIR = '../Introduction_to_Image_Classification/data'\n",
    "train_batches = [unpickle(DATA_DIR + '/data_batch_%d' % k) for k in range(1, 6)]\n",
    "# Keep the per-batch names bound, in case other cells refer to them.\n",
    "data_batch_1, data_batch_2, data_batch_3, data_batch_4, data_batch_5 = train_batches\n",
    "test_batch = unpickle(DATA_DIR + '/test_batch')\n",
    "\n",
    "# Concatenate the training images of all batches along the first axis.\n",
    "X_train = np.concatenate([batch[b'data'] for batch in train_batches], axis=0)\n",
    "\n",
    "# Concatenate the training labels in the same batch order.\n",
    "y_train = np.concatenate([batch[b'labels'] for batch in train_batches], axis=0)\n",
    "\n",
    "# Let us define the test data as X_test\n",
    "X_test = test_batch[b'data']\n",
    "\n",
    "# Let us cast the test labels as ndarray\n",
    "y_test = np.array(test_batch[b'labels'])\n",
    "\n",
    "\n",
    "# Show a grid of randomly picked training images:\n",
    "# one column per class, samples_per_class rows.\n",
    "\n",
    "classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']\n",
    "num_classes = len(classes)\n",
    "samples_per_class = 7\n",
    "\n",
    "for col, cls_name in enumerate(classes):\n",
    "    # indices of all training images labelled with this class\n",
    "    candidates = np.flatnonzero(y_train == col)\n",
    "    chosen = np.random.choice(candidates, samples_per_class, replace=False)\n",
    "    for row, img_idx in enumerate(chosen):\n",
    "        # subplot positions are 1-based and counted row by row\n",
    "        plt.subplot(samples_per_class, num_classes, row * num_classes + col + 1)\n",
    "        # each flat row is reshaped to (3, 32, 32) and moved to (32, 32, 3) for imshow\n",
    "        image = X_train[img_idx].reshape((3, 32, 32)).transpose((1, 2, 0))\n",
    "        plt.imshow(image.astype('uint8'))\n",
    "        plt.axis('off')\n",
    "        if row == 0:\n",
    "            # label each column once, above its first image\n",
    "            plt.title(cls_name)\n",
    "plt.show()\n",
    "\n",
    "# Split the data into train, val, and test sets. In addition we will\n",
    "# create a small development set as a subset of the training data;\n",
    "# we can use this for development so our code runs faster.\n",
    "# NOTE: the order of the statements matters, because X_train and y_train\n",
    "# are re-bound below and the validation split must be taken first.\n",
    "num_training = 49000\n",
    "num_validation = 1000\n",
    "num_test = 1000\n",
    "num_dev = 500\n",
    "\n",
    "# Our validation set will be num_validation points from the original\n",
    "# training set, taken right after the first num_training points.\n",
    "# Indexing with a range object selects rows by position; NumPy handles a\n",
    "# non-tuple sequence index as advanced indexing, which returns a copy.\n",
    "mask = range(num_training, num_training + num_validation)\n",
    "X_val = X_train[mask]\n",
    "y_val = y_train[mask]\n",
    "\n",
    "# Our training set will be the first num_training points from the original\n",
    "# training set.\n",
    "mask = range(num_training)\n",
    "X_train = X_train[mask]\n",
    "y_train = y_train[mask]\n",
    "\n",
    "# We will also make a development set, which is a small subset of\n",
    "# the training set: num_dev distinct indices drawn at random\n",
    "# (replace=False).\n",
    "mask = np.random.choice(num_training, num_dev, replace=False)\n",
    "X_dev = X_train[mask]\n",
    "y_dev = y_train[mask]\n",
    "\n",
    "# We use the first num_test points of the original test set as our\n",
    "# test set.\n",
    "mask = range(num_test)\n",
    "X_test = X_test[mask]\n",
    "y_test = y_test[mask]\n",
    "\n",
    "# Report the shapes of the training and validation splits.\n",
    "print('Train data shape: ', X_train.shape)\n",
    "print('Train labels shape: ', y_train.shape)\n",
    "print('Validation data shape: ', X_val.shape)\n",
    "print('Validation labels shape: ', y_val.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocessing the Data: Subtract the Mean Image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[130 130 130 131 132 132 133 133 134 134]\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAAD5CAYAAADhukOtAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAR/klEQVR4nO3db6hl5XXH8e+K0cR7FUdrOgyjVGOFIqEZ5TJYIsEmJFgJqFBEX4gvJJO2ESqkL8RCtdAXplRFaDGMdcikWP80Kg5F2pghIHljvFodR6dtjIzEYZwxqGjnhqbjrL7Ye+COnOc556yz9z5H1+8Dwz137/PsZ909Z919zl73eR5zd0Tkk+9T8w5ARIahZBdJQskukoSSXSQJJbtIEkp2kSQ+PUtjM7sCuBc4CfhHd7+z9vzl5WXfcOaGWbocgE3fYvomMmfxivNil6rfe/c9jhw5MvIVGU52MzsJ+Afga8CbwHNmtsvdXy212XDmBv7k5j8r7K2cxEI21XLMghkYaVdvUt4ZbLY4On7dxw83fctoskf/LqXWrrgn0Nf3/v6+4r5Z3sZvBV5z99fd/TfAw8BVMxxPRHo0S7JvBn657vs3220isoB6v0FnZtvMbNXMVo8cOdJ3dyJSMEuyHwDOXff9Oe22E7j7dndfcfeV5eXlGboTkVnMkuzPARea2flmdgpwHbCrm7BEpGvhu/HuftTMbgb+nab0tsPdX5mgZel4xRZWalO7ZV27k1m70+2VnaVd1TbR276xZp9UXVfKPHjE6t342K5yLB2/Bmaqs7v7U8BTHcUiIj3SX9CJJKFkF0lCyS6ShJJdJAklu0gSM92NjyiVLtyPVRoVSlvhslawVFbaVRkJUz1cL4NdivXBSiB9xDGcSPjhAS3B81jtLVQeHP3/XPuxdGUXSULJLpKEkl0kCSW7SBJKdpEkBr8bX771GBi4Erz7WRpYMzaMwECY6h336o8cvVUfmKKp0ma4KKKNooeM7KnvjIbf7UCYciNd2UWSULKLJKFkF0lCyS6ShJJdJAklu0gSw5be3Cu1rVo5bPS+PspC1UpZZEBOeCq8YM0u0ltktZIe9NFX1/O7xctrw/VVoiu7SBJKdpEklOwiSSjZRZJQsoskoWQXSWKm0puZ7Qc+AD4Ejrr7Su35Tm0OuulHXtVLEwMWjaKTyXVdXYvqo6/Yf1pR1yH2U+Ybst30rbqos/+hu/+qg+OISI/0Nl4kiVmT3YEfmdnzZrati4BEpB+zvo2/zN0PmNlvA0+b2X+6+zPrn9D+EtgGcMYZZ8zYnYhEzXRld/cD7dfDwBPA1hHP2e7uK+6+srS8NEt3IjKDcLKb2bKZnX78MfB1YG9XgYlIt2Z5G78ReMKa0VmfBv7Z3f9tfLPpJ5ysL4MzXTcQr3iVJqr0yhHrI9sqOxdFeIjgcHGEugqe+2HLa92+QMLJ7u6vA1/sMBYR6ZFKbyJJKNlFklCyiyShZBdJQskuksTga725H5tqe/1g5V3V9dym7ykeSA/NOrco5bUeFEOMxl6ZCLT7slxo4cFiE13ZRZJQsoskoWQXSULJLpKEkl0kiYHvxpeXf4rMQRdftqjSV9cDLgbW+diaIafy6+OggRNSG9hUe81Vm00fRniAVYmu7CJJKNlFklCyiyShZBdJQskukoSSXSSJwQfCFEsXkTnoggNhamqVlVKH1bEiwbnwokrdhfuqNuz6J+ih+FaY6K8+/19sZNCw89NNfzBd2UWSULKLJKFkF0lCyS6ShJJdJAklu0gSY0tvZrYD+AZw2N2/0G47C3gEOA/YD1zr7u9O0mF5KafacLPp28RLXpFhb7GhctGp32L6WO9o4Lri9FGM3TtarbwWLOmGTkitHD398Sa5sn8fuOIj224Fdrv7hcDu9nsRWWBjk71db/2dj2y+CtjZPt4JXN1tWCLStehn9o3ufrB9/BbNiq4issBmvkHnzRQzxQ8XZrbNzFbN
bHXtyNqs3YlIUDTZD5nZJoD26+HSE919u7uvuPvK0vJSsDsRmVU02XcBN7aPbwSe7CYcEenLJKW3h4DLgbPN7E3gduBO4FEzuwl4A7h2ot6cyoST5eWfypNARmeH7Ha5ptBkmYPrYzrHwAyL4RPScaGy9tKpzToanVWycszIq6e8ClX5aGOT3d2vL+z66ri2IrI49Bd0Ikko2UWSULKLJKFkF0lCyS6SxMdjwsn6LJAjWXAduNA8hIH4+jJsqa/rcljsPFq1rFWIozpLaK23Sl/leljsRwvHOJqu7CJJKNlFklCyiyShZBdJQskukoSSXSSJgUtvjlMY3VarTQw64WRFxyW22uCqShWn8wFs/ZTrCqMbg3HEBzF2Pvyu0lXsRVcqD3b9EtCVXSQJJbtIEkp2kSSU7CJJKNlFkliYgTD1wS6j99UGu9RjCO3CinHEwqgJVxMCscSXT1qMGfYip7/6egvcOR8XR/WlWngB1foym/46rSu7SBJKdpEklOwiSSjZRZJQsoskoWQXSWKS5Z92AN8ADrv7F9ptdwDfBN5un3abuz81WyjTD4SJLrtUr7pMX8iJHi9aXlucYli3db7pi6+twIpMtbJWdGmo+tmYvmRXL7FOf+4nubJ/H7hixPZ73H1L+2/GRBeRvo1Ndnd/BnhngFhEpEezfGa/2cz2mNkOMzuzs4hEpBfRZL8PuADYAhwE7io90cy2mdmqma2ura0FuxORWYWS3d0PufuH3iyqfj+wtfLc7e6+4u4rS0tL0ThFZEahZDezTeu+vQbY2004ItKXSUpvDwGXA2eb2ZvA7cDlZraFpmqwH/jWxD0Gln8KLRlVCSG6NFS5UbCeVD9oZV+gMNdHiF2LVcNCP1u19FaLo1qW67ZgGhmBWWs1Ntnd/foRmx8Y105EFov+gk4kCSW7SBJKdpEklOwiSSjZRZIYfsLJ4rJA3ZbewmW5rmtUPUyKWV32KnLAcIiB8mAPy1pFymi12OuTQ1ZGr1WHMU4/HrHWJJISurKLJKFkF0lCyS6ShJJdJAklu0gSSnaRJOZQeiuolcqKdYZjlePF+goJj76rHDJYhypVa+o/ch/j3gKj7wLlqXEHLf7ctRJaraeOy2tVXlnrLfB/piu7SBJKdpEklOwiSSjZRZJQsoskMfDdeA/dCS/fjY8NhIkPkilsjw5aCd4Er48l+RjPQRe80x0anxSc46+Pc1X+0brtTVd2kSSU7CJJKNlFklCyiyShZBdJQskuksQkyz+dC/wA2EhTC9ju7vea2VnAI8B5NEtAXevu70YDqQ4wKM1b10PpLSI8yKRWaYodsbx3QeprlbEdYxp23F/XxxtzzPp8cqN31k9VPwNhjgLfcfeLgEuBb5vZRcCtwG53vxDY3X4vIgtqbLK7+0F3f6F9/AGwD9gMXAXsbJ+2E7i6pxhFpANTfWY3s/OAi4FngY3ufrDd9RbN23wRWVATJ7uZnQY8Btzi7u+v3+fNh+eRHyLMbJuZrZrZ6tqRX88UrIjETZTsZnYyTaI/6O6Pt5sPmdmmdv8m4PCotu6+3d1X3H1lafnULmIWkYCxyW5mRrMe+z53v3vdrl3Aje3jG4Enuw9PRLoyyai3LwE3AC+b2YvtttuAO4FHzewm4A3g2l4iDApU8ibZ2XEgwSgCJbv68lqVvjqeVq3eV/drQ5VPf23JqO7PVX1AX+Rnm/4/Zmyyu/tPK0f+6tQ9ishc6C/oRJJQsoskoWQXSULJLpKEkl0kicVZ/qk6MWNh1Fv0eOEyzuh2XVen2s5izabeET1gULW6NtzSStEJJ6NiRwzXj0fSlV0kCSW7SBJKdpEklOwiSSjZRZJQsosksUClt3JxolR16XjeyONH7bjFgsz02IfawLHA4eoj/YKzc0YiCZcAhy3nTUtXdpEklOwiSSjZRZJQsoskoWQXSWJh7sZXl8epzKxWbDPwMkNlCxLIwDeDF+Y0DnW8cQet9VfcV6lQBbrRlV0kCSW7
SBJKdpEklOwiSSjZRZJQsoskMbb0ZmbnAj+gWZLZge3ufq+Z3QF8E3i7fept7v7U2B4jJY9Cm/qYhPLOcFkotExPRR9LIRV2LcpwnPhUch2PugkfrzZgq9t9HU+7N1Gd/SjwHXd/wcxOB543s6fbffe4+991G5KI9GGStd4OAgfbxx+Y2T5gc9+BiUi3pvrMbmbnARcDz7abbjazPWa2w8zO7Do4EenOxMluZqcBjwG3uPv7wH3ABcAWmiv/XYV228xs1cxW19Z+PXvEIhIyUbKb2ck0if6guz8O4O6H3P1Ddz8G3A9sHdXW3be7+4q7rywtndpV3CIypbHJbs2twgeAfe5+97rtm9Y97Rpgb/fhiUhXJrkb/yXgBuBlM3ux3XYbcL2ZbaGp6uwHvjVbKLURPtPX3rxSJqsXtYYcHhYsiNWG9BV3xc5HXaBlD6e3VtYKHjDYrnbISFmuesCpm0xyN/6nhUOMr6mLyMLQX9CJJKFkF0lCyS6ShJJdJAklu0gSH48JJyMT8vVQPimKDimr/tCVyTQDwRTLlzOZ/pjhKlmtdFVtF2oViyO6rxBL1xVFXdlFklCyiyShZBdJQskukoSSXSQJJbtIEoOX3iIFlEgZzT5V/j3mlbKWVSdznH6kUVWtvFYr1VTLct3Wazov2AXrSd2XUqNxhDobU5YLtKmFUaAru0gSSnaRJJTsIkko2UWSULKLJKFkF0li4NKbUSoaREoa9aXeYqWr0BC28EJqlRJaD8ccVmREXB8jFTsuRUb7CpTexkQydQtd2UWSULKLJKFkF0lCyS6ShJJdJImxd+PN7LPAM8Bn2uf/0N1vN7PzgYeB3wKeB25w99+MP16xn1oMI7fXB7TU1Aa7VBt2bFHiGFD4hntkyaseAonquGIQuYM/yZX9f4GvuPsXaZZnvsLMLgW+C9zj7r8LvAvcNH33IjKUscnujf9pvz25/efAV4Afttt3Alf3EaCIdGPS9dlPaldwPQw8DfwCeM/dj7ZPeRPY3EuEItKJiZLd3T909y3AOcBW4Pcm7cDMtpnZqpmtrq2txaIUkZlNdTfe3d8DfgL8AbDBzI7f4DsHOFBos93dV9x9ZWlpaZZYRWQGY5PdzD5nZhvax6cCXwP20ST9H7dPuxF4sqcYRaQDkwyE2QTsNLOTaH45POru/2pmrwIPm9nfAP8BPDBZl6WBMN0OnBi4sNKDfLW3Acez9HN2gweNNSudkPKJGpvs7r4HuHjE9tdpPr+LyMeA/oJOJAklu0gSSnaRJJTsIkko2UWSsNrIsc47M3sbeKP99mzgV4N1XqY4TqQ4TvRxi+N33P1zo3YMmuwndGy26u4rc+lccSiOhHHobbxIEkp2kSTmmezb59j3eorjRIrjRJ+YOOb2mV1EhqW38SJJzCXZzewKM/svM3vNzG6dRwxtHPvN7GUze9HMVgfsd4eZHTazveu2nWVmT5vZz9uvZ84pjjvM7EB7Tl40sysHiONcM/uJmb1qZq+Y2Z+32wc9J5U4Bj0nZvZZM/uZmb3UxvHX7fbzzezZNm8eMbNTpjqwuw/6DziJZlqrzwOnAC8BFw0dRxvLfuDsOfT7ZeASYO+6bX8L3No+vhX47pziuAP4i4HPxybgkvbx6cB/AxcNfU4qcQx6TmjGqZ7WPj4ZeBa4FHgUuK7d/j3gT6c57jyu7FuB19z9dW+mnn4YuGoOccyNuz8DvPORzVfRTNwJA03gWYhjcO5+0N1faB9/QDM5ymYGPieVOAbljc4neZ1Hsm8Gfrnu+3lOVunAj8zseTPbNqcYjtvo7gfbx28BG+cYy81mtqd9m9/7x4n1zOw8mvkTnmWO5+QjccDA56SPSV6z36C7zN0vAf4I+LaZfXneAUHzm535TVVzH3ABzRoBB4G7hurYzE4DHgNucff31+8b8pyMiGPwc+IzTPJaMo9kPwCcu+774mSVfXP3A+3Xw8ATzHfmnUNmtgmg/Xp4HkG4+6H2hXYMuJ+B
zomZnUyTYA+6++Pt5sHPyag45nVO2r7fY8pJXkvmkezPARe2dxZPAa4Ddg0dhJktm9npxx8DXwf21lv1ahfNxJ0wxwk8jydX6xoGOCfWTCb4ALDP3e9et2vQc1KKY+hz0tskr0PdYfzI3cYrae50/gL4yznF8HmaSsBLwCtDxgE8RPN28P9oPnvdRLNm3m7g58CPgbPmFMc/AS8De2iSbdMAcVxG8xZ9D/Bi++/Koc9JJY5Bzwnw+zSTuO6h+cXyV+tesz8DXgP+BfjMNMfVX9CJJJH9Bp1IGkp2kSSU7CJJKNlFklCyiyShZBdJQskukoSSXSSJ/wdks38E/WHkdgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 288x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(49000, 3073) (1000, 3073) (1000, 3073) (500, 3073)\n"
     ]
    }
   ],
   "source": [
    "# NOTE: this cell re-binds X_train, X_val, X_test and X_dev, so run it only\n",
    "# once per kernel session; a second run would subtract the mean again and\n",
    "# append a second bias column.\n",
    "\n",
    "# First: compute the image mean based on the training data.\n",
    "# The mean is kept in floating point. Truncating it to uint8 and subtracting\n",
    "# it in place from integer pixel arrays (CIFAR-10 batches store uint8) wraps\n",
    "# around modulo 256 wherever a pixel is darker than the mean.\n",
    "mean_image = np.mean(X_train, axis=0)\n",
    "print(mean_image[:10]) # print a few of the elements\n",
    "plt.figure(figsize=(4,4))\n",
    "# visualize the mean image (cast to uint8 for display only)\n",
    "plt.imshow(mean_image.reshape((3,32,32)).transpose((1,2,0)).astype('uint8'))\n",
    "plt.show()\n",
    "\n",
    "# Second: subtract the mean image from train, val, test and dev data.\n",
    "# Converting to float64 first makes negative centered values representable.\n",
    "X_train = X_train.astype(np.float64) - mean_image\n",
    "X_val = X_val.astype(np.float64) - mean_image\n",
    "X_test = X_test.astype(np.float64) - mean_image\n",
    "X_dev = X_dev.astype(np.float64) - mean_image\n",
    "\n",
    "\n",
    "# Third: append the bias dimension of ones (i.e. bias trick) so that our SVM\n",
    "# only has to worry about optimizing a single weight matrix W.\n",
    "X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])\n",
    "X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])\n",
    "X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])\n",
    "X_dev = np.hstack([X_dev, np.ones((X_dev.shape[0], 1))])\n",
    "\n",
    "print(X_train.shape, X_val.shape, X_test.shape, X_dev.shape)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SVM Loss Function and Gradient (Not Vectorized)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loss: 10.649423\n"
     ]
    }
   ],
   "source": [
    "from random import shuffle\n",
    "\n",
    "def svm_loss_naive(W, X, y, reg):\n",
    "  \"\"\"\n",
    "  Structured SVM loss function, naive implementation (with loops).\n",
    "\n",
    "  Inputs have dimension D, there are C classes, and we operate on minibatches\n",
    "  of N examples.\n",
    "\n",
    "  Inputs:\n",
    "  - W: A numpy array of shape (D, C) containing weights.\n",
    "  - X: A numpy array of shape (N, D) containing a minibatch of data.\n",
    "  - y: A numpy array of shape (N,) containing training labels; y[i] = c means\n",
    "    that X[i] has label c, where 0 <= c < C.\n",
    "  - reg: (float) regularization strength\n",
    "\n",
    "  Returns a tuple of:\n",
    "  - loss as single float: the hinge loss (margin 1) averaged over the\n",
    "    minibatch plus the L2 penalty 0.5 * reg * sum(W * W)\n",
    "  - gradient with respect to weights W; an array of same shape as W,\n",
    "    where dW[i, j] is the partial derivative of the loss with respect\n",
    "    to W[i, j]\n",
    "  \"\"\"\n",
    "  # initialize the gradient as zero\n",
    "  dW = np.zeros(W.shape)\n",
    "  num_classes = W.shape[1]\n",
    "  num_train = X.shape[0]\n",
    "  loss = 0.0\n",
    "  for i in range(num_train):\n",
    "    scores = X[i].dot(W)\n",
    "    correct_class_score = scores[y[i]]\n",
    "    # number of classes that violate the margin for this example\n",
    "    diff_count = 0\n",
    "    for j in range(num_classes):\n",
    "      if j == y[i]:\n",
    "        continue\n",
    "      margin = scores[j] - correct_class_score + 1\n",
    "      if margin > 0:\n",
    "        diff_count += 1\n",
    "        # each violating class column receives +X[i]\n",
    "        dW[:, j] += X[i]\n",
    "        loss += margin\n",
    "    # the correct class column receives -X[i] once per violating class\n",
    "    dW[:, y[i]] += -diff_count * X[i]\n",
    "\n",
    "  # Right now the loss and the gradient are sums over all training examples,\n",
    "  # but we want averages instead, so we divide by num_train.\n",
    "  loss /= num_train\n",
    "  dW /= num_train\n",
    "\n",
    "  # Add the L2 regularization exactly once. With the factor 0.5 its\n",
    "  # gradient is reg * W, which is the term added to dW.\n",
    "  loss += 0.5 * reg * np.sum(W * W)\n",
    "  dW += reg * W\n",
    "\n",
    "  return loss, dW\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# Start from small random weights: one row per input dimension (3073,\n",
    "# including the bias column) and one column per class (10).\n",
    "W = 0.0001 * np.random.randn(3073, 10)\n",
    "\n",
    "# Evaluate the naive loss on the development set with a tiny\n",
    "# regularization strength.\n",
    "loss, grad = svm_loss_naive(W, X_dev, y_dev, 0.00001)\n",
    "print('loss: %f' % loss)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#  Gradient Check"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We compute the gradient numerically along several randomly chosen\n",
    "dimensions and compare it with the analytically computed gradient.\n",
    "The two values should match almost exactly along every dimension."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "numerical: 27.222436 analytic: 27.132000, relative error: 1.663815e-03\n",
      "numerical: 1.851809 analytic: 1.744000, relative error: 2.998176e-02\n",
      "numerical: 72.183250 analytic: 72.012000, relative error: 1.187625e-03\n",
      "numerical: 84.852940 analytic: 85.228000, relative error: 2.205186e-03\n",
      "numerical: 78.516453 analytic: 78.556000, relative error: 2.517782e-04\n",
      "numerical: -7.425685 analytic: -7.556000, relative error: 8.698258e-03\n",
      "numerical: 65.614426 analytic: 65.438000, relative error: 1.346222e-03\n",
      "numerical: -70.012436 analytic: -69.910000, relative error: 7.320894e-04\n",
      "numerical: -19.432000 analytic: -19.432000, relative error: 3.065386e-12\n",
      "numerical: 72.136453 analytic: 72.214000, relative error: 5.372163e-04\n",
      "numerical: 83.964054 analytic: 84.352057, relative error: 2.305204e-03\n",
      "numerical: 77.114869 analytic: 77.281147, relative error: 1.076960e-03\n",
      "numerical: -40.186427 analytic: -40.202214, relative error: 1.963765e-04\n",
      "numerical: -46.241618 analytic: -46.248809, relative error: 7.774815e-05\n",
      "numerical: -7.937252 analytic: -8.083509, relative error: 9.129267e-03\n",
      "numerical: -87.263637 analytic: -87.255818, relative error: 4.479917e-05\n",
      "numerical: 43.166927 analytic: 43.066250, relative error: 1.167487e-03\n",
      "numerical: -12.293494 analytic: -12.292130, relative error: 5.544742e-05\n",
      "numerical: 73.631560 analytic: 73.687554, relative error: 3.800846e-04\n",
      "numerical: -85.680921 analytic: -85.620803, relative error: 3.509477e-04\n"
     ]
    }
   ],
   "source": [
    "def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5):\n",
    "  \"\"\"\n",
    "  Compare an analytic gradient with a centered finite difference at a few\n",
    "  randomly sampled elements of x, printing one line per sampled element.\n",
    "\n",
    "  - f : is the loss function which will be passed to grad_check_sparse\n",
    "    as a lambda function; it is called as f(x) and must return a scalar\n",
    "  - x : is the array containing the weight matrix; single elements are\n",
    "    perturbed in place during the check and restored afterwards\n",
    "  - analytic_grad : the gradient to verify, indexed at the same positions as x\n",
    "  - num_checks : how many elements of the array are randomly sampled\n",
    "  - h : step size of the finite difference\n",
    "\n",
    "  Returns a list with one (numerical, analytic, relative error) tuple per\n",
    "  sampled element.\n",
    "  \"\"\"\n",
    "  results = []\n",
    "  for i in range(num_checks):\n",
    "    # pick one random position in x\n",
    "    ix = tuple([np.random.randint(m) for m in x.shape])\n",
    "\n",
    "    oldval = x[ix]\n",
    "    try:\n",
    "      # increment by h and evaluate f(x + h)\n",
    "      x[ix] = oldval + h\n",
    "      fxph = f(x)\n",
    "      # decrement by h and evaluate f(x - h)\n",
    "      x[ix] = oldval - h\n",
    "      fxmh = f(x)\n",
    "    finally:\n",
    "      # reset, even if f raised, so that x is never left perturbed\n",
    "      x[ix] = oldval\n",
    "\n",
    "    grad_numerical = (fxph - fxmh) / (2 * h)\n",
    "    grad_analytic = analytic_grad[ix]\n",
    "    denominator = abs(grad_numerical) + abs(grad_analytic)\n",
    "    if denominator == 0:\n",
    "      # both gradients are exactly zero: they agree, and 0/0 is avoided\n",
    "      rel_error = 0.0\n",
    "    else:\n",
    "      rel_error = abs(grad_numerical - grad_analytic) / denominator\n",
    "    print('numerical: %f analytic: %f, relative error: %e' % (grad_numerical, grad_analytic, rel_error))\n",
    "    results.append((grad_numerical, grad_analytic, rel_error))\n",
    "\n",
    "  return results\n",
    "\n",
    "\n",
    "\n",
    "# Run the gradient check twice: first without regularization, then with\n",
    "# regularization turned on, so that the regularization part of the\n",
    "# gradient is checked as well.\n",
    "for reg_strength in (0.0, 1e2):\n",
    "    loss, grad = svm_loss_naive(W, X_dev, y_dev, reg_strength)\n",
    "    # f maps a weight matrix to the scalar loss only\n",
    "    f = lambda w: svm_loss_naive(w, X_dev, y_dev, reg_strength)[0]\n",
    "    grad_numerical = grad_check_sparse(f, W, grad)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SVM Loss Function and Gradient (Vectorized)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We implement the function `svm_loss_vectorized`, which computes\n",
    "the loss and the gradient by means of vectorized operations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def svm_loss_vectorized(W, X, y, reg):\n",
    "  \"\"\"\n",
    "  Structured SVM loss function, vectorized implementation.\n",
    "\n",
    "  Takes the same inputs and returns the same outputs as svm_loss_naive.\n",
    "  Inputs have dimension D, there are C classes, and we operate on\n",
    "  minibatches of N examples.\n",
    "\n",
    "  Inputs:\n",
    "  - W: A numpy array of shape (D, C) containing weights.\n",
    "  - X: A numpy array of shape (N, D) containing a minibatch of data.\n",
    "  - y: A numpy array of shape (N,) containing training labels; y[i] = c means\n",
    "    that X[i] has label c, where 0 <= c < C.\n",
    "  - reg: (float) regularization strength\n",
    "\n",
    "  Returns a tuple of:\n",
    "  - loss as single float\n",
    "  - gradient with respect to weights W; an array of same shape as W\n",
    "  \"\"\"\n",
    "  delta = 1\n",
    "  num_train = X.shape[0]\n",
    "  rows = np.arange(num_train)\n",
    "\n",
    "  # Hinge margins for every (sample, class) pair; the entry of the\n",
    "  # correct class is zeroed so that it never contributes.\n",
    "  scores = X.dot(W)\n",
    "  margins = scores - scores[rows, y][:, np.newaxis] + delta\n",
    "  margins[rows, y] = 0\n",
    "  margins = np.where(margins > 0, margins, 0)\n",
    "\n",
    "  # Average data loss plus L2 regularization.\n",
    "  data_loss = margins.sum() / num_train\n",
    "  loss = data_loss + 0.5 * reg * np.sum(W * W)\n",
    "\n",
    "  # Gradient: entry [i, j] of the coefficient matrix says how many times\n",
    "  # sample i is added to the column of class j. Violating classes get +1,\n",
    "  # the correct class gets minus the number of violating classes.\n",
    "  coefficients = (margins > 0).astype(float)\n",
    "  coefficients[rows, y] = -coefficients.sum(axis=1)\n",
    "\n",
    "  # average over the minibatch, then add the regularization gradient\n",
    "  dW = X.T.dot(coefficients) / num_train + reg * W\n",
    "\n",
    "  return loss, dW"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check Correctness and Performance of Vectorized Gradient Computation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Naive loss: 1.064942e+01 computed in 0.783425s\n",
      "Vectorized loss: 1.064942e+01 computed in 0.006955s\n",
      "difference: 0.000000\n",
      "Naive loss and gradient: computed in 0.802595s\n",
      "Vectorized loss and gradient: computed in 0.004300s\n",
      "22.7 ms ± 4.24 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
      "difference: 0.000000\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "def _timed_loss(loss_fn):\n",
    "  \"\"\"Call loss_fn on the development set and time the call.\n",
    "\n",
    "  Returns (loss, gradient, elapsed wall-clock seconds).\n",
    "  \"\"\"\n",
    "  start = time.time()\n",
    "  loss_value, grad_value = loss_fn(W, X_dev, y_dev, 0.00001)\n",
    "  return loss_value, grad_value, time.time() - start\n",
    "\n",
    "\n",
    "loss_naive, _, elapsed = _timed_loss(svm_loss_naive)\n",
    "print('Naive loss: %e computed in %fs' % (loss_naive, elapsed))\n",
    "\n",
    "loss_vectorized, _, elapsed = _timed_loss(svm_loss_vectorized)\n",
    "print('Vectorized loss: %e computed in %fs' % (loss_vectorized, elapsed))\n",
    "\n",
    "# The losses should match but your vectorized implementation should be much faster.\n",
    "print('difference: %f' % (loss_naive - loss_vectorized))\n",
    "\n",
    "# The gradients of both implementations should match as well, and the\n",
    "# vectorized version should still be much faster.\n",
    "_, grad_naive, elapsed = _timed_loss(svm_loss_naive)\n",
    "print('Naive loss and gradient: computed in %fs' % elapsed)\n",
    "\n",
    "_, grad_vectorized, elapsed = _timed_loss(svm_loss_vectorized)\n",
    "print('Vectorized loss and gradient: computed in %fs' % elapsed)\n",
    "\n",
    "# Alternative time measurement with ipython : use %timeit\n",
    "%timeit svm_loss_vectorized(W, X_dev, y_dev, 0.00001)\n",
    "\n",
    "# The loss is a single number, so it is easy to compare the values computed\n",
    "# by the two implementations. The gradient on the other hand is a matrix, so\n",
    "# we use the Frobenius norm to compare them.\n",
    "difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')\n",
    "print('difference: %f' % difference)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Linear Classifier with Stochastic Gradient Descent (SGD)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration 0 / 1500: loss 818.604090\n",
      "iteration 100 / 1500: loss 292.089905\n",
      "iteration 200 / 1500: loss 110.877384\n",
      "iteration 300 / 1500: loss 46.280911\n",
      "iteration 400 / 1500: loss 23.123541\n",
      "iteration 500 / 1500: loss 15.756814\n",
      "iteration 600 / 1500: loss 11.933461\n",
      "iteration 700 / 1500: loss 11.412081\n",
      "iteration 800 / 1500: loss 11.247968\n",
      "iteration 900 / 1500: loss 12.900388\n",
      "iteration 1000 / 1500: loss 13.957223\n",
      "iteration 1100 / 1500: loss 10.316672\n",
      "iteration 1200 / 1500: loss 9.824412\n",
      "iteration 1300 / 1500: loss 11.212222\n",
      "iteration 1400 / 1500: loss 11.595881\n",
      "That took 30.091362s\n"
     ]
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training accuracy: 0.157102\n",
      "validation accuracy: 0.145000\n"
     ]
    }
   ],
   "source": [
    "\n",
    "class LinearClassifier():\n",
    "  \"\"\"Linear classifier trained with minibatch stochastic gradient descent.\n",
    "\n",
    "  The weight matrix self.W is created lazily on the first call to train.\n",
    "  Subclasses supply the loss function by overriding loss.\n",
    "  \"\"\"\n",
    "\n",
    "  def __init__(self):\n",
    "    self.W = None\n",
    "\n",
    "  def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100,\n",
    "            batch_size=200, verbose=False):\n",
    "    \"\"\"\n",
    "    Train this linear classifier using stochastic gradient descent.\n",
    "    Inputs:\n",
    "    - X: A numpy array of shape (N, D) containing training data; there are N\n",
    "      training samples each of dimension D.\n",
    "    - y: A numpy array of shape (N,) containing training labels; y[i] = c\n",
    "      means that X[i] has label 0 <= c < C for C classes.\n",
    "    - learning_rate: (float) learning rate for optimization.\n",
    "    - reg: (float) regularization strength.\n",
    "    - num_iters: (integer) number of steps to take when optimizing\n",
    "    - batch_size: (integer) number of training examples to use at each step.\n",
    "    - verbose: (boolean) If true, print progress during optimization.\n",
    "    Outputs:\n",
    "    A list containing the value of the loss function at each training iteration.\n",
    "    \"\"\"\n",
    "    num_train, dim = X.shape\n",
    "    # assume y takes values 0...K-1 where K is number of classes\n",
    "    num_classes = np.max(y) + 1\n",
    "    if self.W is None:\n",
    "      # lazily initialize W\n",
    "      self.W = 0.001 * np.random.randn(dim, num_classes)\n",
    "\n",
    "    # Run stochastic gradient descent to optimize W\n",
    "    loss_history = []\n",
    "    for it in range(num_iters):\n",
    "      # Sample batch_size training examples and their labels for this\n",
    "      # step. np.random.choice samples with replacement by default.\n",
    "      # X_batch has shape (batch_size, dim), y_batch has shape (batch_size,).\n",
    "      sample_indices = np.random.choice(np.arange(num_train), batch_size)\n",
    "      X_batch = X[sample_indices]\n",
    "      y_batch = y[sample_indices]\n",
    "\n",
    "      # evaluate loss and gradient\n",
    "      loss, grad = self.loss(X_batch, y_batch, reg)\n",
    "      loss_history.append(loss)\n",
    "\n",
    "      # Update the weights using the gradient and the learning rate.\n",
    "      self.W += -learning_rate * grad\n",
    "\n",
    "      if verbose and it % 100 == 0:\n",
    "        print('iteration %d / %d: loss %f' % (it, num_iters, loss))\n",
    "\n",
    "    return loss_history\n",
    "\n",
    "  def predict(self, X):\n",
    "    \"\"\"\n",
    "    Use the trained weights of this linear classifier to predict labels for\n",
    "    data points.\n",
    "    Inputs:\n",
    "    - X: A numpy array of shape (N, D); each row is a D-dimensional point.\n",
    "    Returns:\n",
    "    - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional\n",
    "      array of length N, and each element is an integer giving the predicted\n",
    "      class.\n",
    "    \"\"\"\n",
    "    # the predicted class is the column with the highest score in each row\n",
    "    return np.argmax(X.dot(self.W), axis=1)\n",
    "\n",
    "  def loss(self, X_batch, y_batch, reg):\n",
    "    \"\"\"\n",
    "    Compute the loss function and its derivative.\n",
    "    Subclasses (child classes) must override this.\n",
    "    Inputs:\n",
    "    - X_batch: A numpy array of shape (N, D) containing a minibatch of N\n",
    "      data points; each point has dimension D.\n",
    "    - y_batch: A numpy array of shape (N,) containing labels for the minibatch.\n",
    "    - reg: (float) regularization strength.\n",
    "    Returns: A tuple containing:\n",
    "    - loss as a single float\n",
    "    - gradient with respect to self.W; an array of the same shape as W\n",
    "    Raises:\n",
    "    - NotImplementedError: always, in this base class.\n",
    "    \"\"\"\n",
    "    # fail loudly here instead of returning None, which train cannot unpack\n",
    "    raise NotImplementedError('subclasses of LinearClassifier must override loss')\n",
    "\n",
    "class LinearSVM(LinearClassifier):\n",
    "  \"\"\" A subclass (child class) that uses the Multiclass SVM loss function.\n",
    "      The loss method of the parent class LinearClassifier is overridden\n",
    "      by the loss method below.\n",
    "  \"\"\"\n",
    "\n",
    "  def loss(self, X_batch, y_batch, reg):\n",
    "    \"\"\" Return the result of svm_loss_vectorized for the current weights\n",
    "        self.W on the given minibatch and regularization strength.\n",
    "    \"\"\"\n",
    "    return svm_loss_vectorized(self.W, X_batch, y_batch, reg)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "svm = LinearSVM()\n",
    "tic = time.time()\n",
    "loss_hist = svm.train(X_train, y_train, learning_rate=1e-7, reg=5e4,\n",
    "                      num_iters=1500, verbose=True)\n",
    "toc = time.time()\n",
    "print('That took %fs' % (toc - tic))\n",
    "\n",
    "\n",
    "# A useful debugging strategy is to plot the loss as a function of\n",
    "# iteration number:\n",
    "plt.plot(loss_hist)\n",
    "plt.xlabel('Iteration number')\n",
    "plt.ylabel('Loss value')\n",
    "plt.show()\n",
    "\n",
    "\n",
    "# Evaluate the performance on both the\n",
    "# training and validation set\n",
    "y_train_pred = svm.predict(X_train)\n",
    "print('training accuracy: %f' % (np.mean(y_train == y_train_pred), ))\n",
    "y_val_pred = svm.predict(X_val)\n",
    "print('validation accuracy: %f' % (np.mean(y_val == y_val_pred), ))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Tune the Hyperparameters Learning Rate and Regularization Strength"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use the validation set to tune hyperparameters (regularization strength and\n",
    "learning rate). You should experiment with different ranges for the learning\n",
    "rates and regularization strengths; if you are careful you should be able to\n",
    "get a classification accuracy of about 0.4 on the validation set.\n",
    "learning_rates = [1e-7, 5e-5]\n",
    "regularization_strengths = [5e4, 1e5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "learning_rates = np.logspace(-5, 0, 5) \n",
    "# causes numeric issues: np.logspace(-5, 5, 8) #[-4, -3, -2, -1, 1, 2, 3, 4, 5, 6]\n",
    "regularization_strengths = np.logspace(-5, 2, 5) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`results` is dictionary mapping tuples of the form\n",
    "(`learning_rate`, `regularization_strength`) to tuples of the form\n",
    "(training_accuracy, validation_accuracy). The accuracy is simply the fraction\n",
    "of data points that are correctly classified."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = {}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The highest validation accuracy that we have seen so far."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_val = -1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The LinearSVM object that achieved the highest validation rate. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_svm = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The corresponding learning rates and regularization strengths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_l = np.min(learning_rates)\n",
    "best_r = np.min(regularization_strengths)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Code that chooses the best hyperparameters by tuning on the validation \n",
    "set. For each combination of hyperparameters, we train a linear SVM on the      \n",
    "training set, compute its accuracy on the training and validation sets, and  \n",
    "store these numbers in the results dictionary. In addition, we store the best   \n",
    "validation accuracy in `best_val` and the LinearSVM object that achieves this  \n",
    "accuracy in `best_svm`.                                                        \n",
    "                                                                             \n",
    "Hint: You should use a small value for `num_iters` as you develop your         \n",
    "validation code so that the SVMs don't take much time to train; once you are \n",
    "confident that your validation code works, you should rerun the validation   \n",
    "code with a larger value for `num_iters`.                         "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:33: RuntimeWarning: overflow encountered in double_scalars\n",
      "/opt/conda/lib/python3.7/site-packages/numpy/core/fromnumeric.py:87: RuntimeWarning: overflow encountered in reduce\n",
      "  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)\n",
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:33: RuntimeWarning: overflow encountered in multiply\n",
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:28: RuntimeWarning: overflow encountered in subtract\n",
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:28: RuntimeWarning: invalid value encountered in subtract\n",
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:46: RuntimeWarning: overflow encountered in multiply\n",
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:59: RuntimeWarning: invalid value encountered in add\n"
     ]
    }
   ],
   "source": [
    "for l in learning_rates:\n",
    "    for r in regularization_strengths:\n",
    "        svm = LinearSVM()\n",
    "        svm.train(X_train, y_train, learning_rate=l, reg=r, num_iters=1500, batch_size=200)\n",
    "        y_train_pred = svm.predict(X_train)\n",
    "        y_val_pred = svm.predict(X_val)\n",
    "        training_accuracy = np.mean(y_train == y_train_pred)\n",
    "        validation_accuracy = np.mean(y_val == y_val_pred)\n",
    "        results[(l, r)] = (training_accuracy, validation_accuracy)\n",
    "        if validation_accuracy > best_val:\n",
    "            best_val = validation_accuracy\n",
    "            best_svm = svm\n",
    "            best_l = l\n",
    "            best_r = r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "lr 1.000000e-05 reg 1.000000e-05 train accuracy: 0.216061 val accuracy: 0.213000\n",
      "lr 1.000000e-05 reg 5.623413e-04 train accuracy: 0.188796 val accuracy: 0.196000\n",
      "lr 1.000000e-05 reg 3.162278e-02 train accuracy: 0.199061 val accuracy: 0.195000\n",
      "lr 1.000000e-05 reg 1.778279e+00 train accuracy: 0.165388 val accuracy: 0.171000\n",
      "lr 1.000000e-05 reg 1.000000e+02 train accuracy: 0.142673 val accuracy: 0.127000\n",
      "lr 1.778279e-04 reg 1.000000e-05 train accuracy: 0.198490 val accuracy: 0.199000\n",
      "lr 1.778279e-04 reg 5.623413e-04 train accuracy: 0.183449 val accuracy: 0.194000\n",
      "lr 1.778279e-04 reg 3.162278e-02 train accuracy: 0.221286 val accuracy: 0.192000\n",
      "lr 1.778279e-04 reg 1.778279e+00 train accuracy: 0.219224 val accuracy: 0.179000\n",
      "lr 1.778279e-04 reg 1.000000e+02 train accuracy: 0.122224 val accuracy: 0.146000\n",
      "lr 3.162278e-03 reg 1.000000e-05 train accuracy: 0.196265 val accuracy: 0.182000\n",
      "lr 3.162278e-03 reg 5.623413e-04 train accuracy: 0.175286 val accuracy: 0.167000\n",
      "lr 3.162278e-03 reg 3.162278e-02 train accuracy: 0.166531 val accuracy: 0.166000\n",
      "lr 3.162278e-03 reg 1.778279e+00 train accuracy: 0.159265 val accuracy: 0.169000\n",
      "lr 3.162278e-03 reg 1.000000e+02 train accuracy: 0.105714 val accuracy: 0.122000\n",
      "lr 5.623413e-02 reg 1.000000e-05 train accuracy: 0.210959 val accuracy: 0.197000\n",
      "lr 5.623413e-02 reg 5.623413e-04 train accuracy: 0.236245 val accuracy: 0.227000\n",
      "lr 5.623413e-02 reg 3.162278e-02 train accuracy: 0.170571 val accuracy: 0.154000\n",
      "lr 5.623413e-02 reg 1.778279e+00 train accuracy: 0.147673 val accuracy: 0.157000\n",
      "lr 5.623413e-02 reg 1.000000e+02 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+00 reg 1.000000e-05 train accuracy: 0.179714 val accuracy: 0.167000\n",
      "lr 1.000000e+00 reg 5.623413e-04 train accuracy: 0.225020 val accuracy: 0.211000\n",
      "lr 1.000000e+00 reg 3.162278e-02 train accuracy: 0.134224 val accuracy: 0.110000\n",
      "lr 1.000000e+00 reg 1.778279e+00 train accuracy: 0.099959 val accuracy: 0.102000\n",
      "lr 1.000000e+00 reg 1.000000e+02 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "best validation accuracy achieved during cross-validation: 0.227000\n"
     ]
    }
   ],
   "source": [
    "# Print out results.\n",
    "for lr, reg in sorted(results):\n",
    "    train_accuracy, val_accuracy = results[(lr, reg)]\n",
    "    print('lr %e reg %e train accuracy: %f val accuracy: %f' % (lr, reg, train_accuracy, val_accuracy))\n",
    "    \n",
    "print('best validation accuracy achieved during cross-validation: %f' % best_val)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualize the cross-validation results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Plot training accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0.5, 1.0, 'CIFAR-10 training accuracy')"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import math\n",
    "x_scatter = [math.log10(x[0]) for x in results]\n",
    "y_scatter = [math.log10(x[1]) for x in results]\n",
    "\n",
    "marker_size = 100 # default size of markers is 20\n",
    "colors = [results[x][0] for x in results]\n",
    "plt.subplot(2, 1, 1)\n",
    "plt.scatter(x_scatter, y_scatter, marker_size, c=colors)\n",
    "plt.colorbar()\n",
    "plt.xlabel('log learning rate')\n",
    "plt.ylabel('log regularization strength')\n",
    "plt.title('CIFAR-10 training accuracy')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Plot validation accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "colors = [results[x][1] for x in results] \n",
    "plt.subplot(2, 1, 2)\n",
    "plt.scatter(x_scatter, y_scatter, marker_size, c=colors)\n",
    "plt.colorbar()\n",
    "plt.xlabel('log learning rate')\n",
    "plt.ylabel('log regularization strength')\n",
    "plt.title('CIFAR-10 validation accuracy')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluate the best svm on test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "linear SVM on raw pixels final test set accuracy: 0.219000\n"
     ]
    }
   ],
   "source": [
    "y_test_pred = best_svm.predict(X_test)\n",
    "test_accuracy = np.mean(y_test == y_test_pred)\n",
    "print('linear SVM on raw pixels final test set accuracy: %f' % test_accuracy)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Rerun the training with larger value of num_iters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "svm = LinearSVM()\n",
    "svm.train(X_train, y_train, learning_rate=best_l, reg=best_r, num_iters=3000, batch_size=200)\n",
    "y_train_pred = svm.predict(X_train)\n",
    "y_val_pred = svm.predict(X_val)\n",
    "training_accuracy = np.mean(y_train == y_train_pred)\n",
    "validation_accuracy = np.mean(y_val == y_val_pred)\n",
    "if validation_accuracy > best_val:\n",
    "    best_svm = svm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluate the new best svm on test set\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "linear SVM on raw pixels final test set accuracy: 0.219000\n"
     ]
    }
   ],
   "source": [
    "y_test_pred = best_svm.predict(X_test)\n",
    "test_accuracy = np.mean(y_test == y_test_pred)\n",
    "print('linear SVM on raw pixels final test set accuracy: %f' % test_accuracy)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Visualize the learned weights for each class.\n",
    "Depending on your choice of learning rate and regularization strength, these may\n",
    "or may not be nice to look at."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 10 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "w = best_svm.W[:-1,:] # strip out the bias\n",
    "w = w.T.reshape(10,3,32,32).transpose(2,3,1,0)\n",
    "w_min, w_max = np.min(w), np.max(w)\n",
    "classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']\n",
    "for i in range(10):\n",
    "  plt.subplot(2, 5, i + 1)\n",
    "    \n",
    "  # Rescale the weights to be between 0 and 255\n",
    "  wimg = 255.0 * (w[:, :, :, i].squeeze() - w_min) / (w_max - w_min)\n",
    "  plt.imshow(wimg.astype('uint8'))\n",
    "  plt.axis('off')\n",
    "  plt.title(classes[i])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Softmax Linear Classifier : Not Vectorized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loss: 2.462197\n",
      "sanity check: 2.302585\n",
      "numerical: 3.211194 analytic: 3.211189, relative error: 8.060887e-07\n",
      "numerical: -0.539907 analytic: -0.539912, relative error: 4.825723e-06\n",
      "numerical: 3.701203 analytic: 3.701196, relative error: 8.716703e-07\n",
      "numerical: 14.517206 analytic: 14.517198, relative error: 2.613311e-07\n",
      "numerical: -9.800780 analytic: -9.800784, relative error: 1.827418e-07\n",
      "numerical: 15.496042 analytic: 15.496034, relative error: 2.499581e-07\n",
      "numerical: 0.374921 analytic: 0.374916, relative error: 7.011827e-06\n",
      "numerical: 17.383515 analytic: 17.383508, relative error: 2.126607e-07\n",
      "numerical: -7.557805 analytic: -7.557808, relative error: 2.332168e-07\n",
      "numerical: 4.524942 analytic: 4.524937, relative error: 6.025743e-07\n",
      "numerical: -9.262434 analytic: -9.262438, relative error: 2.185417e-07\n",
      "numerical: -6.892215 analytic: -6.892219, relative error: 2.433884e-07\n",
      "numerical: -2.417390 analytic: -2.417395, relative error: 1.076355e-06\n",
      "numerical: 0.174921 analytic: 0.174916, relative error: 1.541490e-05\n",
      "numerical: -0.754994 analytic: -0.755000, relative error: 4.155306e-06\n",
      "numerical: -0.795812 analytic: -0.795817, relative error: 3.283622e-06\n",
      "numerical: 0.789450 analytic: 0.789444, relative error: 3.708386e-06\n",
      "numerical: 1.307015 analytic: 1.307010, relative error: 2.032914e-06\n",
      "numerical: -7.209426 analytic: -7.209430, relative error: 3.010362e-07\n",
      "numerical: 0.347822 analytic: 0.347818, relative error: 7.047705e-06\n"
     ]
    }
   ],
   "source": [
    "def softmax_loss_naive(W, X, y, reg):\n",
    "  \"\"\"\n",
    "  Softmax loss function, naive implementation (with loops)\n",
    "  Inputs have dimension D, there are C classes, and we operate on minibatches\n",
    "  of N examples.\n",
    "  Inputs:\n",
    "  - W: A numpy array of shape (D, C) containing weights.\n",
    "  - X: A numpy array of shape (N, D) containing a minibatch of data.\n",
    "  - y: A numpy array of shape (N,) containing training labels; y[i] = c means\n",
    "    that X[i] has label c, where 0 <= c < C.\n",
    "  - reg: (float) regularization strength\n",
    "  Returns a tuple of:\n",
    "  - loss as single float\n",
    "  - gradient with respect to weights W; an array of same shape as W\n",
    "  \"\"\"\n",
    "  # Initialize the loss and gradient to zero.\n",
    "  loss = 0.0\n",
    "  dW = np.zeros_like(W)\n",
    "\n",
    "  \n",
    "  # Compute the softmax loss and its gradient using explicit loops.     \n",
    "  # Store the loss in loss and the gradient in dW. If you are not careful     \n",
    "  # here, it is easy to run into numeric instability. Don't forget the        \n",
    "  # regularization!                                                           \n",
    "  \n",
    "  num_train = X.shape[0]\n",
    "  num_classes = W.shape[1]\n",
    "  loss = 0.0\n",
    "  for i in range(num_train):\n",
    "    # Compute vector of scores\n",
    "    f_i = X[i].dot(W)\n",
    "\n",
    "    # Normalization trick to avoid numerical instability\n",
    "    f_i -= np.max(f_i)\n",
    "\n",
    "    # Compute loss (and add to it, divided later)\n",
    "    sum_j = np.sum(np.exp(f_i))\n",
    "    p = lambda k: np.exp(f_i[k]) / sum_j\n",
    "    loss += -np.log(p(y[i]))\n",
    "\n",
    "    # Compute gradient\n",
    "    # Here we are computing the contribution to the inner sum for a given i.\n",
    "    for k in range(num_classes):\n",
    "      p_k = p(k)\n",
    "      dW[:, k] += (p_k - (k == y[i])) * X[i]\n",
    "\n",
    "  loss /= num_train\n",
    "  loss += 0.5 * reg * np.sum(W * W)\n",
    "  dW /= num_train\n",
    "  dW += reg*W\n",
    "\n",
    "\n",
    "  return loss, dW\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "import time\n",
    "\n",
    "# Generate a random softmax weight matrix and use it \n",
    "# to compute the loss.\n",
    "W = np.random.randn(3073, 10) * 0.0001\n",
    "loss, grad = softmax_loss_naive(W, X_dev, y_dev, 0.0)\n",
    "\n",
    "# As a rough sanity check, our loss should be something close to -log(0.1).\n",
    "print('loss: %f' % loss)\n",
    "print('sanity check: %f' % (-np.log(0.1)))\n",
    "\n",
    "\n",
    "# Loss and gradient computed in none vectorized fashion\n",
    "loss, grad = softmax_loss_naive(W, X_dev, y_dev, 0.0)\n",
    "\n",
    "# As we did for the SVM, use numeric gradient checking as a debugging tool.\n",
    "# The numeric gradient should be close to the analytic gradient.\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "f = lambda w: softmax_loss_naive(w, X_dev, y_dev, 0.0)[0]\n",
    "grad_numerical = grad_check_sparse(f, W, grad, 10)\n",
    "\n",
    "# similar to SVM case, do another gradient check with regularization\n",
    "loss, grad = softmax_loss_naive(W, X_dev, y_dev, 1e2)\n",
    "f = lambda w: softmax_loss_naive(w, X_dev, y_dev, 1e2)[0]\n",
    "grad_numerical = grad_check_sparse(f, W, grad, 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Softmax Loss and Gradient : Vectorized Operations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now that we have a naive implementation of the softmax loss function and its gradient,\n",
    "we implement a vectorized version in `softmax_loss_vectorized`.\n",
    "The two versions should compute the same results, but the vectorized version should be\n",
    "much faster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def softmax_loss_vectorized(W, X, y, reg):\n",
    "  \"\"\"\n",
    "  Softmax loss function, vectorized version.\n",
    "  Inputs and outputs are the same as softmax_loss_naive.\n",
    "  \"\"\"\n",
    "  # Initialize the loss and gradient to zero.\n",
    "  loss = 0.0\n",
    "  dW = np.zeros_like(W)\n",
    "\n",
    "  \n",
    "  # Compute the softmax loss and its gradient using no explicit loops.  \n",
    "  # Store the loss in loss and the gradient in dW. If you are not careful     \n",
    "  # here, it is easy to run into numeric instability. Don't forget the        \n",
    "  # regularization!                                                           \n",
    " \n",
    "  num_train = X.shape[0]\n",
    "  f = X.dot(W)\n",
    "  f -= np.max(f, axis=1, keepdims=True) # max of every sample\n",
    "  sum_f = np.sum(np.exp(f), axis=1, keepdims=True)\n",
    "  p = np.exp(f)/sum_f\n",
    "\n",
    "  loss = np.sum(-np.log(p[np.arange(num_train), y]))\n",
    "\n",
    "  ind = np.zeros_like(p)\n",
    "  ind[np.arange(num_train), y] = 1\n",
    "  dW = X.T.dot(p - ind)\n",
    "\n",
    "  loss /= num_train\n",
    "  loss += 0.5 * reg * np.sum(W * W)\n",
    "  dW /= num_train\n",
    "  dW += reg*W\n",
    "\n",
    "\n",
    "  return loss, dW"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Linear Classifier : Softmax"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "naive loss: 2.462197e+00 computed in 1.393167s\n",
      "vectorized loss: 2.462197e+00 computed in 0.006959s\n",
      "Loss difference: 0.000000\n",
      "Gradient difference: 0.000000\n"
     ]
    }
   ],
   "source": [
    "class Softmax(LinearClassifier):\n",
    "  \"\"\" Softmax is a\n",
    "  \n",
    "  subclass (child class) that uses the \n",
    "      Softmax + Cross-entropy loss function and \n",
    "      overrides the loss function of the superclass \n",
    "      (parent class) LinearClassifier.\n",
    "   \"\"\"\n",
    "\n",
    "  def loss(self, X_batch, y_batch, reg):\n",
    "      return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "tic = time.time()\n",
    "loss_naive, grad_naive = softmax_loss_naive(W, X_dev, y_dev, 0.00001)\n",
    "toc = time.time()\n",
    "print('naive loss: %e computed in %fs' % (loss_naive, toc - tic))\n",
    "\n",
    "\n",
    "tic = time.time()\n",
    "loss_vectorized, grad_vectorized = softmax_loss_vectorized(W, X_dev, y_dev, 0.00001)\n",
    "toc = time.time()\n",
    "print('vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))\n",
    "\n",
    "# As we did for the SVM, we use the Frobenius norm to compare the two versions\n",
    "# of the gradient.\n",
    "grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')\n",
    "print('Loss difference: %f' % np.abs(loss_naive - loss_vectorized))\n",
    "print('Gradient difference: %f' % grad_difference)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tune the Hyperparameters : Learning Rate and Regularization Strength"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use the validation set to tune hyperparameters (regularization strength and\n",
    "learning rate). You should experiment with different ranges for the learning\n",
    "rates and regularization strengths; if you are careful you should be able to\n",
    "get a classification accuracy of over 0.35 on the validation set.\n",
    "\n",
    "Use the validation set to set the learning rate and regularization strength. \n",
    "This should be identical to the validation that you did for the SVM; save    \n",
    "the best trained softmax classifer in best_softmax.                          \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:22: RuntimeWarning: divide by zero encountered in log\n",
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:29: RuntimeWarning: overflow encountered in double_scalars\n",
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:29: RuntimeWarning: overflow encountered in multiply\n",
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:31: RuntimeWarning: overflow encountered in multiply\n",
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:18: RuntimeWarning: invalid value encountered in subtract\n",
      "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:59: RuntimeWarning: overflow encountered in multiply\n"
     ]
    }
   ],
   "source": [
    "results = {}\n",
    "best_val = -1\n",
    "best_softmax = None\n",
    "# np.logspace(-10, 10, 8) #-10, -9, -8, -7, -6, -5, -4\n",
    "learning_rates = np.logspace(-10, 10, 5) \n",
    "# causes numeric issues: np.logspace(-5, 5, 8) #[-4, -3, -2, -1, 1, 2, 3, 4, 5, 6]\n",
    "regularization_strengths = np.logspace(-3, 6, 5) \n",
    "\n",
    "iters = 1500\n",
    "for lr in learning_rates:\n",
    "    for rs in regularization_strengths:\n",
    "        softmax = Softmax()\n",
    "        softmax.train(X_train, y_train, learning_rate=lr, reg=rs, num_iters=iters)\n",
    "        y_train_pred = softmax.predict(X_train)\n",
    "        acc_train = np.mean(y_train == y_train_pred)\n",
    "        y_val_pred = softmax.predict(X_val)\n",
    "        acc_val = np.mean(y_val == y_val_pred)\n",
    "        results[(lr, rs)] = (acc_train, acc_val)\n",
    "        if best_val < acc_val:\n",
    "            best_val = acc_val\n",
    "            best_softmax = softmax"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Print out results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "lr 1.000000e-10 reg 1.000000e-03 train accuracy: 0.099449 val accuracy: 0.096000\n",
      "lr 1.000000e-10 reg 1.778279e-01 train accuracy: 0.095551 val accuracy: 0.097000\n",
      "lr 1.000000e-10 reg 3.162278e+01 train accuracy: 0.110163 val accuracy: 0.135000\n",
      "lr 1.000000e-10 reg 5.623413e+03 train accuracy: 0.098367 val accuracy: 0.079000\n",
      "lr 1.000000e-10 reg 1.000000e+06 train accuracy: 0.091612 val accuracy: 0.105000\n",
      "lr 1.000000e-05 reg 1.000000e-03 train accuracy: 0.170224 val accuracy: 0.161000\n",
      "lr 1.000000e-05 reg 1.778279e-01 train accuracy: 0.245673 val accuracy: 0.266000\n",
      "lr 1.000000e-05 reg 3.162278e+01 train accuracy: 0.195918 val accuracy: 0.180000\n",
      "lr 1.000000e-05 reg 5.623413e+03 train accuracy: 0.099612 val accuracy: 0.119000\n",
      "lr 1.000000e-05 reg 1.000000e+06 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+00 reg 1.000000e-03 train accuracy: 0.135204 val accuracy: 0.135000\n",
      "lr 1.000000e+00 reg 1.778279e-01 train accuracy: 0.099857 val accuracy: 0.107000\n",
      "lr 1.000000e+00 reg 3.162278e+01 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+00 reg 5.623413e+03 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+00 reg 1.000000e+06 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+05 reg 1.000000e-03 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+05 reg 1.778279e-01 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+05 reg 3.162278e+01 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+05 reg 5.623413e+03 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+05 reg 1.000000e+06 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+10 reg 1.000000e-03 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+10 reg 1.778279e-01 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+10 reg 3.162278e+01 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+10 reg 5.623413e+03 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "lr 1.000000e+10 reg 1.000000e+06 train accuracy: 0.100265 val accuracy: 0.087000\n",
      "best validation accuracy achieved during cross-validation: 0.266000\n"
     ]
    }
   ],
   "source": [
    "for lr, reg in sorted(results):\n",
    "    train_accuracy, val_accuracy = results[(lr, reg)]\n",
    "    print('lr %e reg %e train accuracy: %f val accuracy: %f' % (lr, reg, train_accuracy, val_accuracy))\n",
    "    \n",
    "print('best validation accuracy achieved during cross-validation: %f' % best_val)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate on test set\n",
    "Evaluate the best softmax on test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "softmax on raw pixels final test set accuracy: 0.221000\n"
     ]
    }
   ],
   "source": [
    "y_test_pred = best_softmax.predict(X_test)\n",
    "test_accuracy = np.mean(y_test == y_test_pred)\n",
    "print('softmax on raw pixels final test set accuracy: %f' % (test_accuracy, ))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualize the learned weights for each class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 10 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "w = best_softmax.W[:-1,:] # strip out the bias\n",
    "w = w.T.reshape(10, 3, 32, 32).transpose(2,3,1,0)\n",
    "\n",
    "w_min, w_max = np.min(w), np.max(w)\n",
    "\n",
    "classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']\n",
    "for i in range(10):\n",
    "  plt.subplot(2, 5, i + 1)\n",
    "  \n",
    "  # Rescale the weights to be between 0 and 255\n",
    "  wimg = 255.0 * (w[:, :, :, i].squeeze() - w_min) / (w_max - w_min)\n",
    "  plt.imshow(wimg.astype('uint8'))\n",
    "  plt.axis('off')\n",
    "  plt.title(classes[i])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}