Jupyter Notebook Block 5 - Object Detection and Segmentation.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "dWyPGNkCGhIX"
   },
   "source": [
    "# Part I : Create Your Own Dataset and Train it with ConvNets\n",
    "\n",
    "In this part of the notebook, you will set up your own dataset for image classification. Please specify \n",
    "under `queries` the image categories you are interested in. Under `limit` specify the number of images \n",
    "you want to download for each image category. \n",
    "\n",
    "You do not need to understand the class `simple_image_download`, just execute the cell after you have specified \n",
    "the download folder.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "8rckz3ZuGhIc",
    "outputId": "6f615f06-759a-4eea-839e-658155df8d36"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "import urllib\n",
    "import requests\n",
    "from urllib.parse import quote\n",
    "import array as arr\n",
    "\n",
    "\n",
    "# Specifiy the queries\n",
    "queries = \"brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard\"\n",
    "limit = 1\n",
    "download_folder = \"./brandnew_images/\"\n",
    "\n",
    "\n",
    "class simple_image_download:\n",
    "    def __init__(self):\n",
    "        pass\n",
    "\n",
    "    def urls(self, keywords, limit, download_folder):\n",
    "        keyword_to_search = [str(item).strip() for item in keywords.split(',')]\n",
    "        i = 0\n",
    "        links = []\n",
    "        while i < len(keyword_to_search):\n",
    "            url = 'https://www.google.com/search?q=' + quote(\n",
    "                keyword_to_search[i].encode(\n",
    "                    'utf-8')) + '&biw=1536&bih=674&tbm=isch&sxsrf=ACYBGNSXXpS6YmAKUiLKKBs6xWb4uUY5gA:1581168823770&source=lnms&sa=X&ved=0ahUKEwioj8jwiMLnAhW9AhAIHbXTBMMQ_AUI3QUoAQ'\n",
    "            raw_html = self._download_page(url)\n",
    "\n",
    "            end_object = -1;\n",
    "\n",
    "            j = 0\n",
    "            while j < limit:\n",
    "                while (True):\n",
    "                    try:\n",
    "                        new_line = raw_html.find('\"https://', end_object + 1)\n",
    "                        end_object = raw_html.find('\"', new_line + 1)\n",
    "\n",
    "                        buffor = raw_html.find('\\\\', new_line + 1, end_object)\n",
    "                        if buffor != -1:\n",
    "                            object_raw = (raw_html[new_line + 1:buffor])\n",
    "                        else:\n",
    "                            object_raw = (raw_html[new_line + 1:end_object])\n",
    "\n",
    "                        if '.jpg' in object_raw or 'png' in object_raw or '.ico' in object_raw or '.gif' in object_raw or '.jpeg' in object_raw:\n",
    "                            break\n",
    "\n",
    "                    except Exception as e:\n",
    "                        print(e)\n",
    "                        break\n",
    "\n",
    "                links.append(object_raw)\n",
    "                j += 1\n",
    "\n",
    "            i += 1\n",
    "        return(links)\n",
    "\n",
    "\n",
    "    def download(self, keywords, limit, download_folder):\n",
    "        keyword_to_search = [str(item).strip() for item in keywords.split(',')]\n",
    "        main_directory = download_folder\n",
    "        i = 0\n",
    "\n",
    "        while i < len(keyword_to_search):\n",
    "            self._create_directories(main_directory, keyword_to_search[i])\n",
    "            url = 'https://www.google.com/search?q=' + quote(\n",
    "                keyword_to_search[i].encode('utf-8')) + '&biw=1536&bih=674&tbm=isch&sxsrf=ACYBGNSXXpS6YmAKUiLKKBs6xWb4uUY5gA:1581168823770&source=lnms&sa=X&ved=0ahUKEwioj8jwiMLnAhW9AhAIHbXTBMMQ_AUI3QUoAQ'\n",
    "            raw_html = self._download_page(url)\n",
    "\n",
    "            end_object = -1;\n",
    "\n",
    "            j = 0\n",
    "            while j < limit:\n",
    "                while (True):\n",
    "                    try:\n",
    "                        new_line = raw_html.find('\"https://', end_object + 1)\n",
    "                        end_object = raw_html.find('\"', new_line + 1)\n",
    "\n",
    "                        buffor = raw_html.find('\\\\', new_line + 1, end_object)\n",
    "                        if buffor != -1:\n",
    "                            object_raw = (raw_html[new_line+1:buffor])\n",
    "                        else:\n",
    "                            object_raw = (raw_html[new_line+1:end_object])\n",
    "\n",
    "                        if '.jpg' in object_raw or 'png' in object_raw or '.ico' in object_raw or '.gif' in object_raw or '.jpeg' in object_raw:\n",
    "                            break\n",
    "\n",
    "                    except Exception as e:\n",
    "                        print(e)\n",
    "                        break\n",
    "\n",
    "                path = main_directory + keyword_to_search[i]\n",
    "\n",
    "                #print(object_raw)\n",
    "\n",
    "                if not os.path.exists(path):\n",
    "                    os.makedirs(path)\n",
    "\n",
    "                filename = str(keyword_to_search[i]) + \"_\" + str(j + 1) + \".jpg\"\n",
    "\n",
    "                try:\n",
    "                    r = requests.get(object_raw, allow_redirects=True)\n",
    "                    open(os.path.join(path, filename), 'wb').write(r.content)\n",
    "                except Exception as e:\n",
    "                    print(e)\n",
    "                    j -= 1\n",
    "                j += 1\n",
    "\n",
    "            i += 1\n",
    "\n",
    "\n",
    "    def _create_directories(self, main_directory, name):\n",
    "        try:\n",
    "            if not os.path.exists(main_directory):\n",
    "                os.makedirs(main_directory)\n",
    "                time.sleep(0.2)\n",
    "                path = (name)\n",
    "                sub_directory = os.path.join(main_directory, path)\n",
    "                if not os.path.exists(sub_directory):\n",
    "                    os.makedirs(sub_directory)\n",
    "            else:\n",
    "                path = (name)\n",
    "                sub_directory = os.path.join(main_directory, path)\n",
    "                if not os.path.exists(sub_directory):\n",
    "                    os.makedirs(sub_directory)\n",
    "\n",
    "        except OSError as e:\n",
    "            if e.errno != 17:\n",
    "                raise\n",
    "            pass\n",
    "        return\n",
    "\n",
    "    def _download_page(self,url):\n",
    "\n",
    "        try:\n",
    "            headers = {}\n",
    "            headers['User-Agent'] = \"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36\"\n",
    "            req = urllib.request.Request(url, headers=headers)\n",
    "            resp = urllib.request.urlopen(req)\n",
    "            respData = str(resp.read())\n",
    "            return respData\n",
    "\n",
    "        except Exception as e:\n",
    "            print(e)\n",
    "            exit(0)\n",
    "            \n",
    "response = simple_image_download\n",
    "response().download(queries, limit, download_folder)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "CRHl9UX6GhIs"
   },
   "source": [
    "Please check carefully the downloaded images, there may be a lot of garbage! You definitely need to \n",
    "clean the data.\n",
    "\n",
    "In the following, you will apply data augmentation to your data set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "3SX21FtcGhIu"
   },