Skip to content
Snippets Groups Projects
Jupyter Notebook Block 5 - Object Detection and Segmentation.ipynb 809 KiB
Newer Older
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "dWyPGNkCGhIX"
   },
   "source": [
    "# Part I : Create Your Own Dataset and Train it with ConvNets\n",
    "\n",
    "In this part of the notebook, you will set up your own dataset for image classification. Please specify \n",
    "under `queries` the image categories you are interested in. Under `limit` specify the number of images \n",
    "you want to download for each image category. \n",
    "\n",
    "You do not need to understand the class `Image_crawling`; just execute the cell after you have specified \n",
    "the download folder.\n"
   ]
  },
  {
   "cell_type": "code",
Simon van Hemert's avatar
Simon van Hemert committed
   "execution_count": 1,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "8rckz3ZuGhIc",
    "outputId": "6f615f06-759a-4eea-839e-658155df8d36"
   },
Simon van Hemert's avatar
Simon van Hemert committed
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 2 image links\n",
      "Saved 2 images\n",
      "Found 2 image links\n",
Mirko Birbaumer's avatar
Mirko Birbaumer committed
      "Saved 2 images\n",
      "Found 2 image links\n",
      "Saved 2 images\n",
      "Found 2 image links\n",
      "Saved 2 images\n",
      "Found 2 image links\n",
      "Saved 2 images\n",
      "Found 2 image links\n",
      "Saved 2 images\n",
      "Found 2 image links\n",
      "Saved 2 images\n",
      "Found 2 image links\n",
      "ERROR - Could not save https://upload.wikimedia.org/wikipedia/commons/5/59/Marion_Cotillard_at_2019_Cannes.jpg - cannot identify image file <_io.BytesIO object at 0x7f1a0b4d6d70>\n",
      "Saved 1 images\n"
Simon van Hemert's avatar
Simon van Hemert committed
     ]
   "source": [
Simon van Hemert's avatar
Simon van Hemert committed
    "from selenium import webdriver\n",
    "from selenium.webdriver.firefox.options import Options\n",
    "from Image_crawling import Image_crawling\n",
    "\n",
    "# Specify the queries\n",
    "queries = [\"brad pitt\",\"johnny depp\", \"leonardo dicaprio\", \"robert de niro\", \"angelina jolie\", \"sandra bullock\", \"catherine deneuve\", \"marion cotillard\"]\n",
    "#queries = [\"Bart Simpson\",\"Homer Simpson\"]\n",
    "limit = 2\n",
Simon van Hemert's avatar
Simon van Hemert committed
    "download_folder = \"./brandnew_images/train/\"\n",
Simon van Hemert's avatar
Simon van Hemert committed
    "waittime = 0.1  # Time to wait between actions, depends on the number of pictures you want to crawl. More pictures means you need to wait longer for them to load. \n",
Simon van Hemert's avatar
Simon van Hemert committed
    "# Set options\n",
    "options = webdriver.FirefoxOptions()\n",
    "options.add_argument('--headless')\n",
Simon van Hemert's avatar
Simon van Hemert committed
    "# Create Driver\n",
    "driver = webdriver.Firefox(options=options, executable_path=\"/usr/bin/geckodriver\")\n",
Simon van Hemert's avatar
Simon van Hemert committed
    "# create instance of crawler\n",
    "image_crawling = Image_crawling(driver, waittime=waittime)\n",
Simon van Hemert's avatar
Simon van Hemert committed
    "# Find urls and download images\n",
Simon van Hemert's avatar
Simon van Hemert committed
    "for query in queries:\n",
    "    # Crawl image URLs:\n",
    "    image_urls = image_crawling.fetch_image_urls(query, limit)\n",
    "      \n",
    "    # download images\n",
    "    image_crawling.download_image(download_folder + query)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "CRHl9UX6GhIs"
   },
   "source": [
    "Please check carefully the downloaded images, there may be a lot of garbage! You definitely need to \n",
    "clean the data.\n",
    "\n",
    "In the following, you will apply data augmentation to your data set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "3SX21FtcGhIu"
   },
   "outputs": [],
   "source": [
    "# General imports\n",
    "import tensorflow as tf\n",
    "tf.compat.v1.enable_eager_execution(\n",
    "    config=None, device_policy=None, execution_mode=None\n",
    ")\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
Mirko Birbaumer's avatar
Mirko Birbaumer committed
    "import os, datetime\n",
    "\n",
    "# Shortcuts to Keras (the version bundled with TensorFlow)\n",
    "from tensorflow.keras.preprocessing.image import ImageDataGenerator\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Conv2D, MaxPooling2D\n",
    "from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense\n",
    "from tensorflow.keras.callbacks import TensorBoard \n",
    "\n",
    "# Shortcut for displaying images\n",
    "def plot_img(img):\n",
    "    plt.imshow(img, cmap='gray')\n",
    "    plt.axis(\"off\")\n",
    "    plt.show()\n",
    "    \n",
    "# The target image size can be fixed here (quadratic)\n",
    "# the ImageDataGenerator() automatically scales the images accordingly (aspect ratio is changed)\n",
    "image_size = 150"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "rN_Mp1rmGhI1",
    "outputId": "6417b1f9-e7d4-4d56-a213-191f9d17524a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 480 images belonging to 8 classes.\n"
Mirko Birbaumer's avatar
Mirko Birbaumer committed
     "data": {
      "image/png": "\n",
Mirko Birbaumer's avatar
Mirko Birbaumer committed
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
       "       0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)"
     "execution_count": 96,
Mirko Birbaumer's avatar
Mirko Birbaumer committed
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# These are the class names; this defines the ordering of the classes\n",
Simon van Hemert's avatar
Simon van Hemert committed
    "class_names = [\"brad pitt\", \"johnny depp\", \"leonardo dicaprio\", \"robert de niro\",\n",
    "           \"angelina jolie\", \"sandra bullock\", \"catherine deneuve\", \"marion cotillard\"]\n",
    "\n",
    "\n",
    "# Class ImageDataGenerator() returns an iterator holding one batch of images\n",
    "# the constructor takes arguments defining the different image transformations\n",
    "# for augmentation purposes (rotation, x-/y-shift, intensity scaling - here 1./255 \n",
    "# to scale range to [0, 1], shear, zoom, flip, ... )\n",
    "train_datagen = ImageDataGenerator(\n",
    "        rotation_range=10,\n",
    "        width_shift_range=0.2,\n",
    "        height_shift_range=0.2,\n",
    "        rescale=1./255,\n",
    "        shear_range=0.2,\n",
    "        zoom_range=0.2,\n",
    "        horizontal_flip=True,\n",
    "        fill_mode='nearest')\n",
    "\n",
    "\n",
    "dir_iter = train_datagen.flow_from_directory('./train/', \n",
    "                                         target_size=(image_size, image_size),\n",
    "                                         classes=class_names,\n",
    "                                         batch_size=25, class_mode='sparse', shuffle=False)\n",
    "\n",
Simon van Hemert's avatar
Simon van Hemert committed
    "plot_img(dir_iter[0][0][1,...])\n",
    "dir_iter[0][1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "V2fYccc8GhJF"
   },
   "source": [
    "Before you continue, you need to split the downloaded images into a `train` folder and into a `validation` folder."
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {
    "colab_type": "raw",
    "id": "VamXG4FoGhJH"
   },
   "source": [
    "./\n",
    "├── train\n",
    "│   ├── brad pitt\n",
    "│   ├── johnny depp\n",
    "│   ├── leonardo dicaprio\n",
    "│   └── ...\n",
    "│       \n",
    "└── validation\n",
    "    ├── brad pitt\n",
    "    ├── johnny depp\n",
    "    ├── leonardo dicaprio\n",
    "    └── ..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "9322su6vGhJJ"
   },
   "source": [
Simon van Hemert's avatar
Simon van Hemert committed
    "If you want to follow the example in this Jupyter notebook, you can use the images provided in the `./train` and `./validation` folders."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "xPqJWgeAGhJL"
   },
   "source": [
    "## Define a ConvNet Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "UuJV4JBKGhJO"
   },
   "outputs": [],
   "source": [
    "batch_size = 20\n",
    "num_train_images = 480\n",
    "num_valid_images = 80\n",
    "num_classes = 8\n",
    "\n",
    "model_scratch = Sequential()\n",
    "model_scratch.add(Conv2D(32, (3, 3), input_shape=(image_size, image_size, 3)))\n",
    "model_scratch.add(Activation('relu'))\n",
    "model_scratch.add(MaxPooling2D(pool_size=(2, 2)))\n",
    "\n",
    "model_scratch.add(Conv2D(32, (3, 3)))\n",
    "model_scratch.add(Activation('relu'))\n",
    "model_scratch.add(MaxPooling2D(pool_size=(2, 2)))\n",
    "\n",
    "model_scratch.add(Conv2D(64, (3, 3)))\n",
    "model_scratch.add(Activation('relu'))\n",
    "model_scratch.add(MaxPooling2D(pool_size=(2, 2)))\n",
    "\n",
    "# this converts our 3D feature maps to 1D feature vectors\n",
    "model_scratch.add(Flatten())  \n",
    "model_scratch.add(Dense(64))\n",
    "model_scratch.add(Activation('relu'))\n",
    "model_scratch.add(Dropout(0.5))\n",
    "model_scratch.add(Dense(num_classes))\n",
    "model_scratch.add(Activation('softmax'))\n",
    "\n",
    "model_scratch.compile(loss='categorical_crossentropy',\n",
    "              optimizer='adam',\n",
    "              metrics=['accuracy'])\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "JFdkIokMGhJT",
    "outputId": "63e7d032-4083-4fe0-d970-c10bf0c39a94"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 480 images belonging to 8 classes.\n",
      "Found 80 images belonging to 8 classes.\n"
     ]
    }
   ],
   "source": [
    "# This is the augmentation configuration we will use for training\n",
    "train_datagen = ImageDataGenerator(\n",
    "        rescale=1./255,\n",
    "        shear_range=0.2,\n",
    "        zoom_range=0.2,\n",
    "        horizontal_flip=True)\n",
    "\n",
    "# This is the augmentation configuration we will use for validation:\n",
    "# only rescaling\n",
    "validation_datagen = ImageDataGenerator(rescale=1./255)\n",
    "\n",
    "# This is a generator that will read pictures found in\n",
    "# subfolders of './train', and indefinitely generate\n",
    "# batches of augmented image data\n",
    "train_generator = train_datagen.flow_from_directory(\n",
    "        './train',  # this is the target directory\n",
    "        target_size=(image_size, image_size),  # all images will be resized to 150x150\n",
    "        classes=class_names,\n",
    "        batch_size=batch_size)  \n",
    "\n",
    "# This is a similar generator, for validation data\n",
    "validation_generator = validation_datagen.flow_from_directory(\n",
    "        './validation',\n",
    "        target_size = (image_size, image_size),\n",
    "        classes = class_names,\n",
    "        batch_size = batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "cytHiQUTGhJb"
   },
   "outputs": [],
   "source": [
Mirko Birbaumer's avatar
Mirko Birbaumer committed
    "logdir = os.path.join(\"logs\", datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n",
    "tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "C7dCbyXPGhJg",
    "outputId": "98b4085e-ed6d-43e2-831f-aec32161583f"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/20\n",
      " 4/24 [====>.........................] - ETA: 23s - loss: 2.2787 - accuracy: 0.0750"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:952: UserWarning: Palette images with Transparency expressed in bytes should be converted to RGBA images\n",
      "  \"Palette images with Transparency expressed in bytes should be \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "24/24 [==============================] - 31s 1s/step - loss: 2.1227 - accuracy: 0.0979 - val_loss: 2.0783 - val_accuracy: 0.1375\n",
      "Epoch 2/20\n",
      "24/24 [==============================] - 29s 1s/step - loss: 2.0774 - accuracy: 0.1479 - val_loss: 2.0719 - val_accuracy: 0.1625\n",
      "Epoch 3/20\n",
      "24/24 [==============================] - 29s 1s/step - loss: 2.0655 - accuracy: 0.1417 - val_loss: 2.0479 - val_accuracy: 0.1875\n",
      "Epoch 4/20\n",
      "24/24 [==============================] - 30s 1s/step - loss: 2.0295 - accuracy: 0.2104 - val_loss: 2.0195 - val_accuracy: 0.2625\n",
      "Epoch 5/20\n",
      "24/24 [==============================] - 30s 1s/step - loss: 1.9806 - accuracy: 0.2104 - val_loss: 1.9734 - val_accuracy: 0.2625\n",
      "Epoch 6/20\n",
      "24/24 [==============================] - 29s 1s/step - loss: 1.9266 - accuracy: 0.2688 - val_loss: 1.9223 - val_accuracy: 0.2625\n",
      "Epoch 7/20\n",
      "24/24 [==============================] - 29s 1s/step - loss: 1.8778 - accuracy: 0.2438 - val_loss: 1.8354 - val_accuracy: 0.3375\n",
      "Epoch 8/20\n",
      "24/24 [==============================] - 29s 1s/step - loss: 1.8005 - accuracy: 0.2562 - val_loss: 1.7621 - val_accuracy: 0.3625\n",
      "Epoch 9/20\n",
      "24/24 [==============================] - 30s 1s/step - loss: 1.7497 - accuracy: 0.3333 - val_loss: 1.6562 - val_accuracy: 0.4000\n",
      "Epoch 10/20\n",
      "24/24 [==============================] - 29s 1s/step - loss: 1.6707 - accuracy: 0.3333 - val_loss: 1.5198 - val_accuracy: 0.4625\n",
      "Epoch 11/20\n",
      "24/24 [==============================] - 29s 1s/step - loss: 1.6633 - accuracy: 0.3958 - val_loss: 1.5632 - val_accuracy: 0.4750\n",
      "Epoch 12/20\n",
      "24/24 [==============================] - 31s 1s/step - loss: 1.6404 - accuracy: 0.3729 - val_loss: 1.5778 - val_accuracy: 0.4125\n",
      "Epoch 13/20\n",
      "24/24 [==============================] - 30s 1s/step - loss: 1.5924 - accuracy: 0.4021 - val_loss: 1.5459 - val_accuracy: 0.4125\n",
      "Epoch 14/20\n",
      "24/24 [==============================] - 28s 1s/step - loss: 1.5209 - accuracy: 0.4292 - val_loss: 1.5800 - val_accuracy: 0.3750\n",
      "Epoch 15/20\n",
      "24/24 [==============================] - 29s 1s/step - loss: 1.4475 - accuracy: 0.4417 - val_loss: 1.5742 - val_accuracy: 0.4000\n",
      "Epoch 16/20\n",
      "24/24 [==============================] - 28s 1s/step - loss: 1.4813 - accuracy: 0.4187 - val_loss: 1.5788 - val_accuracy: 0.3875\n",
      "Epoch 17/20\n",
      "24/24 [==============================] - 29s 1s/step - loss: 1.4735 - accuracy: 0.4437 - val_loss: 1.4948 - val_accuracy: 0.4375\n",
      "Epoch 18/20\n",
      "24/24 [==============================] - 27s 1s/step - loss: 1.4049 - accuracy: 0.4563 - val_loss: 1.4764 - val_accuracy: 0.5000\n",
      "Epoch 19/20\n",
      "24/24 [==============================] - 29s 1s/step - loss: 1.3805 - accuracy: 0.4917 - val_loss: 1.4958 - val_accuracy: 0.4750\n",
      "Epoch 20/20\n",
      "24/24 [==============================] - 29s 1s/step - loss: 1.3101 - accuracy: 0.4667 - val_loss: 1.5749 - val_accuracy: 0.4375\n"
     ]
    }
   ],
   "source": [
    "history = model_scratch.fit(\n",
Simon van Hemert's avatar
Simon van Hemert committed
    "    train_generator,\n",
    "    steps_per_epoch = num_train_images // batch_size,\n",
    "    epochs = 20,\n",
    "    validation_data = validation_generator,\n",
    "    validation_steps = num_valid_images // batch_size,\n",
    "    callbacks = [tensorboard_callback])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "wt_ONw5PGhJm",
    "outputId": "e75d8a73-da49-4dbe-ffcf-7cb316be39a2"
   },
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.plot(history.history['accuracy'])\n",
    "plt.plot(history.history['val_accuracy'])\n",
    "plt.title('model accuracy')\n",
    "plt.ylabel('accuracy')\n",
    "plt.xlabel('epoch')\n",
    "plt.legend(['train', 'valid'], loc='lower right')\n",
    "plt.show()\n",
    "plt.plot(history.history['loss'])\n",
    "plt.plot(history.history['val_loss'])\n",
    "plt.title('model loss')\n",
    "plt.ylabel('loss')\n",
    "plt.xlabel('epoch')\n",
    "plt.legend(['train', 'valid'], loc='upper right')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tensorboard"
   ]
  },
  {
   "cell_type": "code",
Mirko Birbaumer's avatar
Mirko Birbaumer committed
   "execution_count": null,
   "metadata": {},
Mirko Birbaumer's avatar
Mirko Birbaumer committed
   "outputs": [],
   "source": [
    "# Load the TensorBoard notebook extension\n",
    "%load_ext tensorboard\n",
    "\n",
    "os.makedirs(logdir, exist_ok=True)\n",
    "%tensorboard --logdir logs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "Y8oAT4oUGhJs"
   },
   "source": [
    "# Part II : Transfer Learning\n",
    "\n",
    "\n",
    "Having to train an image-classification model using very little data is a common situation,\n",
    "which you’ll likely encounter in practice if you ever do computer vision in a\n",
    "professional context. A “few” samples can mean anywhere from a few hundred to a\n",
    "few tens of thousands of images. As a practical example, we’ll focus on classifying\n",
    "560 images belonging to 8 actors. We’ll use 480 pictures for training and 80 for validation.\n",
    "\n",
    "## 2.1 Feature Extraction with a Pretrained Model\n",
    "Feature extraction consists of using the representations learned by a previously\n",
    "trained model to extract interesting features from new samples. These features are\n",
    "then run through a new classifier, which is trained from scratch.\n",
    "As you saw previously, ConvNets used for image classification comprise two parts:\n",
    "they start with a series of pooling and convolution layers, and they end with a densely\n",
    "connected classifier. The first part is called the _convolutional base_ of the model. In the\n",
    "case of convnets, feature extraction consists of taking the convolutional base of a previously\n",
    "trained network, running the new data through it, and training a new classifier\n",
    "on top of the output.\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
Mirko Birbaumer's avatar
Mirko Birbaumer committed
   "outputs": [],
   "source": [
    "# General imports\n",
    "import tensorflow as tf\n",
    "tf.compat.v1.enable_eager_execution(\n",
    "    config=None, device_policy=None, execution_mode=None\n",
    ")\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os, datetime\n",
    "\n",
    "# Shortcuts to Keras (the version bundled with TensorFlow)\n",
    "from tensorflow import keras\n",
    "from tensorflow.keras.preprocessing.image import ImageDataGenerator\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Conv2D, MaxPooling2D\n",
    "from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense\n",
    "from tensorflow.keras.callbacks import TensorBoard \n",
    "from tensorflow.keras import layers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<IPython.core.display.Image object>"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.display import Image\n",
    "Image(\"./Images/feature_extraction.png\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Why only reuse the convolutional base? Could we reuse the densely connected\n",
    "classifier as well? In general, doing so should be avoided. The reason is that the representations\n",
    "learned by the convolutional base are likely to be more generic and, therefore,\n",
    "more reusable: the feature maps of a ConvNet are presence maps of generic\n",
    "concepts over a picture, which are likely to be useful regardless of the computer vision\n",
    "problem at hand. But the representations learned by the classifier will necessarily be\n",
    "specific to the set of classes on which the model was trained—they will only contain\n",
    "information about the presence probability of this or that class in the entire picture.\n",
    "Additionally, representations found in densely connected layers no longer contain any information about where objects are located in the input image; these layers get rid of\n",
    "the notion of space, whereas the object location is still described by convolutional feature\n",
    "maps. For problems where object location matters, densely connected features\n",
    "are largely useless.\n",
    "\n",
    "\n",
    "Note that the level of generality (and therefore reusability) of the representations\n",
    "extracted by specific convolution layers depends on the depth of the layer in the\n",
    "model. Layers that come earlier in the model extract local, highly generic feature\n",
    "maps (such as visual edges, colors, and textures), whereas layers that are higher up\n",
    "extract more-abstract concepts (such as “cat ear” or “dog eye”). So if your new dataset\n",
    "differs a lot from the dataset on which the original model was trained, you may be better\n",
    "off using only the first few layers of the model to do feature extraction, rather than\n",
    "using the entire convolutional base.\n",
    "\n",
    "\n",
    "\n",
    "In this case, because the ImageNet class set does not contain images of actors, we’ll \n",
    "choose not to use the densely connected layers, in order to cover\n",
    "the more general case where the class set of the new problem doesn’t overlap the\n",
    "class set of the original model. Let’s put this into practice by using the convolutional\n",
    "base of the VGG16 network, trained on ImageNet, to extract interesting features\n",
    "from actors, and then train a classifier for the 8 actors on top of\n",
    "these features.\n",
    "\n",
    "The VGG16 model, among others, comes prepackaged with Keras. You can import\n",
    "it from the `keras.applications` module. Many other image-classification models (all\n",
    "pretrained on the ImageNet dataset) are available as part of `keras.applications`:\n",
    "\n",
    "\n",
    "-  Xception\n",
    "-  ResNet\n",
    "-  MobileNet\n",
    "-  EfficientNet\n",
    "-  DenseNet\n",
    "-  etc.\n",
    "\n",
    "Let's instantiate the VGG16 model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "4Luec7pbGhJv",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# The target image size can be fixed here (quadratic)\n",
    "# The ImageDataGenerator() automatically scales the images accordingly (aspect ratio is changed)\n",
Mirko Birbaumer's avatar
Mirko Birbaumer committed
    "image_size = 150"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "eRes_n9BGhJ0"
   },
   "outputs": [],
   "source": [
    "conv_base = keras.applications.vgg16.VGG16(weights=\"imagenet\",\n",
    "                                           include_top=False,\n",
    "                                           input_shape=(image_size, image_size, 3))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "vEIWLeqSGhJ5"
   },
   "source": [
    "You pass three arguments to the constructor:\n",
    "\n",
    "- `weights` specifies the weight checkpoint from which to initialize the model.\n",
    "\n",
    "- `include_top` refers to including (or not) the densely connected classifier on\n",
    "top of the network. By default, this densely connected classifier corresponds to\n",
    "the 1'000 classes from ImageNet. Because we intend to use our own densely\n",
    "connected classifier (with 8 classes of actors), we don’t need to\n",
    "include it.\n",
    "- `input_shape` is the shape of the image tensors that we’ll feed to the network.\n",
    "This argument is purely optional: if we don’t pass it, the network will be able to\n",
    "process inputs of any size. Here we pass it so that we can visualize (in the following\n",
    "summary) how the size of the feature maps shrinks with each new convolution\n",
    "and pooling layer."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here’s the detail of the architecture of the VGG16 convolutional base. It’s similar to\n",
    "the simple convnets you’re already familiar with:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "M7Bk7t1MGhJ6"
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"vgg16\"\n",
      "_________________________________________________________________\n",
      " Layer (type)                Output Shape              Param #   \n",
      "=================================================================\n",
      " input_1 (InputLayer)        [(None, 150, 150, 3)]     0         \n",
      "                                                                 \n",
      " block1_conv1 (Conv2D)       (None, 150, 150, 64)      1792      \n",
      "                                                                 \n",
      " block1_conv2 (Conv2D)       (None, 150, 150, 64)      36928     \n",
      "                                                                 \n",
      " block1_pool (MaxPooling2D)  (None, 75, 75, 64)        0         \n",
      "                                                                 \n",
      " block2_conv1 (Conv2D)       (None, 75, 75, 128)       73856     \n",
      "                                                                 \n",
      " block2_conv2 (Conv2D)       (None, 75, 75, 128)       147584    \n",
      "                                                                 \n",
      " block2_pool (MaxPooling2D)  (None, 37, 37, 128)       0         \n",
      "                                                                 \n",
      " block3_conv1 (Conv2D)       (None, 37, 37, 256)       295168    \n",
      "                                                                 \n",
      " block3_conv2 (Conv2D)       (None, 37, 37, 256)       590080    \n",
      "                                                                 \n",
      " block3_conv3 (Conv2D)       (None, 37, 37, 256)       590080    \n",
      "                                                                 \n",
      " block3_pool (MaxPooling2D)  (None, 18, 18, 256)       0         \n",
      "                                                                 \n",
      " block4_conv1 (Conv2D)       (None, 18, 18, 512)       1180160   \n",
      "                                                                 \n",
      " block4_conv2 (Conv2D)       (None, 18, 18, 512)       2359808   \n",
      "                                                                 \n",
      " block4_conv3 (Conv2D)       (None, 18, 18, 512)       2359808   \n",
      "                                                                 \n",
      " block4_pool (MaxPooling2D)  (None, 9, 9, 512)         0         \n",
      "                                                                 \n",
      " block5_conv1 (Conv2D)       (None, 9, 9, 512)         2359808   \n",
      "                                                                 \n",
      " block5_conv2 (Conv2D)       (None, 9, 9, 512)         2359808   \n",
      "                                                                 \n",
      " block5_conv3 (Conv2D)       (None, 9, 9, 512)         2359808   \n",
      "                                                                 \n",
      " block5_pool (MaxPooling2D)  (None, 4, 4, 512)         0         \n",
      "                                                                 \n",
      "=================================================================\n",
      "Total params: 14,714,688\n",
      "Trainable params: 14,714,688\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "conv_base.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "DBSrhVORGhKH"
   },
   "source": [
    "\n",
    "The final feature map (output volume) has shape $(4, 4, 512)$. That's the feature map on top of which we will stick a densely connected classifier.\n",
    "\n",
    "At this point, there are two ways we could proceed:\n",
    "\n",
    "- __Approach 1__: Run the convolutional base over our dataset, record its output to a NumPy array\n",
    "on disk, and then use this data as input to a standalone, densely connected classifier\n",
    "similar to those you saw in Block 4 of this course. This solution is fast and\n",
    "cheap to run, because it only requires running the convolutional base once for\n",
    "every input image, and the convolutional base is by far the most expensive part\n",
    "of the pipeline. But for the same reason, this technique won’t allow us to use\n",
    "data augmentation.\n",
    "\n",
    "- __Approach 2__: Extend the model we have (`conv_base`) by adding `Dense` layers on top, and run\n",
    "the whole thing from end to end on the input data. This will allow us to use\n",
    "data augmentation, because every input image goes through the convolutional\n",
    "base every time it’s seen by the model. But for the same reason, this technique is\n",
    "far more expensive than the first.\n",
    "\n",
    "We’ll cover both techniques. Let’s walk through the code required to set up the first\n",
    "one: recording the output of `conv_base` on our data and using these outputs as inputs\n",
    "to a new model."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "mlpIDmSCGhKI"
   },
   "source": [
    "### 1. Approach : Fast feature extraction without data augmentation\n",
    "We’ll start by extracting features as NumPy arrays by calling the `predict()` method of\n",
    "the `conv_base` model on our training, and validation datasets.\n",
    "Let’s iterate over our datasets to extract the VGG16 features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 480 files belonging to 8 classes.\n",
      "Found 80 files belonging to 8 classes.\n"
     ]
    }
   ],
   "source": [
    "from tensorflow.keras.utils import image_dataset_from_directory\n",
    "train_dataset = image_dataset_from_directory(\n",
    "    './train',\n",
Mirko Birbaumer's avatar
Mirko Birbaumer committed
    "    image_size=(150, 150),\n",
    "    batch_size=32,\n",
    "    shuffle=False,\n",
    "    label_mode=\"categorical\")\n",
    "validation_dataset = image_dataset_from_directory(\n",
    "    './validation',\n",
Mirko Birbaumer's avatar
Mirko Birbaumer committed
    "    image_size=(150, 150),\n",
    "    batch_size=32,\n",
    "    shuffle=False,\n",
    "    label_mode=\"categorical\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "def get_features_and_labels(dataset):\n",
    "    all_features = []\n",
    "    all_labels = []\n",
    "    for images, labels in dataset:\n",
    "        preprocessed_images = keras.applications.vgg16.preprocess_input(images)\n",
    "        features = conv_base.predict(preprocessed_images)\n",
    "        all_features.append(features)\n",
    "        all_labels.append(labels)\n",
    "    return np.concatenate(all_features), np.concatenate(all_labels)\n",
    "train_features, train_labels = get_features_and_labels(train_dataset)\n",
    "val_features, val_labels = get_features_and_labels(validation_dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Importantly, `predict()` only expects images, not labels, but our current dataset yields\n",
    "batches that contain both images and their labels. Moreover, the VGG16 model expects\n",
    "inputs that are preprocessed with the function `keras.applications.vgg16.preprocess_input`, which scales pixel values to an appropriate range.\n",
    "The extracted features are currently of shape `(samples, 5, 5, 512)`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(480, 4, 4, 512)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_features.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And the labels are now referring to the order of the folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(480, 8)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_labels.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(80, 4, 4, 512)\n",
      "(80, 8)\n"
     ]
    }
   ],
   "source": [
    "print(val_features.shape)\n",
    "print(val_labels.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
Mirko Birbaumer's avatar
Mirko Birbaumer committed
    "inputs = keras.Input(shape=(4, 4, 512))\n",
    "# Note the use of the Flatten\n",
    "# layer before passing the\n",
    "# features to a Dense layer\n",
    "x = layers.Flatten()(inputs)\n",
    "x = layers.Dense(256)(x)\n",
    "x = layers.Dropout(0.7)(x)\n",
    "outputs = layers.Dense(8, activation=\"softmax\")(x)\n",
    "model = keras.Model(inputs, outputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"model\"\n",
      "_________________________________________________________________\n",
      " Layer (type)                Output Shape              Param #   \n",
      "=================================================================\n",
      " input_3 (InputLayer)        [(None, 4, 4, 512)]       0         \n",
      "                                                                 \n",
      " flatten (Flatten)           (None, 8192)              0         \n",
      "                                                                 \n",
      " dense (Dense)               (None, 256)               2097408   \n",
      "                                                                 \n",
      " dropout (Dropout)           (None, 256)               0         \n",
      "                                                                 \n",
      " dense_1 (Dense)             (None, 8)                 2056      \n",
      "                                                                 \n",
      "=================================================================\n",
      "Total params: 2,099,464\n",
      "Trainable params: 2,099,464\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]