Selenium

b8850cbd · Simon van Hemert · 9c8315ae · b8850cbd · b8850cbd · b8850cbd
Commit b8850cbd authored 4 years ago by Simon van Hemert
--- a/notebooks/Block 5/GoogleImageExtractor.py
+++ b/notebooks/Block 5/GoogleImageExtractor.py
@@ -13,7 +13,7 @@ from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 
-from pattern.web import URL, extension, cache, plaintext, Newsfeed, DOM
+# from pattern.web import URL, extension, cache, plaintext, Newsfeed, DOM

 class GoogleImageExtractor(object):
 

--- a/notebooks/Block 5/Selenium test/Selenium_test.ipynb
+++ b/notebooks/Block 5/Selenium test/Selenium_test.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'pattern'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-1-0cfc1ae18607>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0;34m\"\"\"test the downloading of files\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mqueries\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mqueries\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m#leave blanks if get the search list from file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/work/hslu-deep-learning/notebooks/Block 5/GoogleImageExtractor.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     14\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mselenium\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msupport\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mexpected_conditions\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mEC\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     15\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mpattern\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweb\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mURL\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mextension\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplaintext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNewsfeed\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDOM\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     18\u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobject\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pattern'"
+     ]
+    }
+   ],
+   "source": [
+    "# Pattern does not exist for python 3.6\n",
+    "from GoogleImageExtractor import GoogleImageExtractor\n",
+    "\"\"\"test the downloading of files\"\"\"\n",
+    "queries = \"brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard\"\n",
+    "\n",
+    "w = GoogleImageExtractor(queries)#leave blanks if get the search list from file\n",
+    "\n",
+    "w.set_num_image_to_dl(200)\n",
+    "w.get_searchlist_fr_file(searchlist_filename)#replace the searchlist\n",
+    "w.multi_search_download()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'crawler'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-14-e46c54ee3ec7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;31m# from processor.DownloadProcessor import DownloadProcessor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;31m# from processor.ElasticSearchProcessor import ElasticSearchProcessor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mcrawler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGoogleCrawler\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mGoogleCrawler\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'__main__'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'crawler'"
+     ]
+    }
+   ],
+   "source": [
+    "# Could try to avoid the processor parts.\n",
+    "\n",
+    "# from processor.LogProcessor import LogProcessor\n",
+    "# from processor.DownloadProcessor import DownloadProcessor\n",
+    "# from processor.ElasticSearchProcessor import ElasticSearchProcessor\n",
+    "from crawler.GoogleCrawler import GoogleCrawler\n",
+    "\n",
+    "if __name__ == '__main__':\n",
+    "\n",
+    "    options = {\n",
+    "        'output_directory':  \"./images\"\n",
+    "    }\n",
+    "    # PKK is a European Union supported terrorist organization against Turkish Gofrom icrawler.builtin import GoogleImageCrawler\n",
+    "\n",
+    "google_crawler = GoogleImageCrawler(storage={'root_dir': 'your_image_dir'})\n",
+    "google_crawler.crawl(keyword='cat', max_num=100)\n",
+    "w = GoogleCrawler(max_image_count = 10)\n",
+    "# w.append_processor(LogProcessor())\n",
+    "# # w.append_processor(DownloadProcessor(output_directory=options['output_directory']))\n",
+    "# w.append_processor(ElasticSearchProcessor())\n",
+    "w.run('PKK')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# icrawler can't be installed on Python 3.6\n",
+    "from icrawler.builtin import GoogleImageCrawler\n",
+    "\n",
+    "google_crawler = GoogleImageCrawler(storage={'root_dir': 'your_image_dir'})\n",
+    "google_crawler.crawl(keyword='cat', max_num=100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'webdriver_manager'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-4-334de132c807>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mwebdriver_manager\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchrome\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mChromeDriverManager\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      9\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mselenium\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommon\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexceptions\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mElementClickInterceptedException\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'webdriver_manager'"
+     ]
+    }
+   ],
+   "source": [
+    "# webdriver_manager cannot be found (the module is not installed)\n",
+    "import os\n",
+    "import selenium\n",
+    "from selenium import webdriver\n",
+    "import time\n",
+    "from PIL import Image\n",
+    "import io\n",
+    "import requests\n",
+    "from webdriver_manager.chrome import ChromeDriverManager\n",
+    "from selenium.common.exceptions import ElementClickInterceptedException\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "WebDriverException",
+     "evalue": "Message: 'geckodriver.exe' executable needs to be in PATH. \n",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     75\u001b[0m                                             \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_file\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 76\u001b[0;31m                                             stdin=PIPE)\n\u001b[0m\u001b[1;32m     77\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/opt/conda/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)\u001b[0m\n\u001b[1;32m    799\u001b[0m                                 \u001b[0merrread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 800\u001b[0;31m                                 restore_signals, start_new_session)\n\u001b[0m\u001b[1;32m    801\u001b[0m         \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/opt/conda/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)\u001b[0m\n\u001b[1;32m   1550\u001b[0m                             \u001b[0merr_msg\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m': '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mrepr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1551\u001b[0;31m                     \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno_num\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_msg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1552\u001b[0m                 \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_msg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './geckodriver.exe': './geckodriver.exe'",
+      "\nDuring handling of the above exception, another exception occurred:\n",
+      "\u001b[0;31mWebDriverException\u001b[0m                        Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-21-0a010110e778>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0munittest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mdriver\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFirefox\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexecutable_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'./geckodriver.exe'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/firefox/webdriver.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, firefox_profile, firefox_binary, timeout, capabilities, proxy, executable_path, options, service_log_path, firefox_options, service_args, desired_capabilities, log_path, keep_alive)\u001b[0m\n\u001b[1;32m    162\u001b[0m                 \u001b[0mservice_args\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mservice_args\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    163\u001b[0m                 log_path=service_log_path)\n\u001b[0;32m--> 164\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    165\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    166\u001b[0m             \u001b[0mcapabilities\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_capabilities\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     81\u001b[0m                 raise WebDriverException(\n\u001b[1;32m     82\u001b[0m                     \"'%s' executable needs to be in PATH. %s\" % (\n\u001b[0;32m---> 83\u001b[0;31m                         os.path.basename(self.path), self.start_error_message)\n\u001b[0m\u001b[1;32m     84\u001b[0m                 )\n\u001b[1;32m     85\u001b[0m             \u001b[0;32melif\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEACCES\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mWebDriverException\u001b[0m: Message: 'geckodriver.exe' executable needs to be in PATH. \n"
+     ]
+    }
+   ],
+   "source": [
+    "# Problem: the webdriver is not on the current path (or installed at all); could try copying the driver file into the GitLab repo\n",
+    "# https://medium.com/cs-note/web-crawling-by-using-selenium-python-3-4fff0bdb4c65\n",
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.common.keys import Keys\n",
+    "from selenium.webdriver.support.ui import Select\n",
+    "from selenium.common.exceptions import NoSuchElementException\n",
+    "from selenium.common.exceptions import NoAlertPresentException\n",
+    "import unittest, time, re\n",
+    "\n",
+    "driver = webdriver.Firefox(executable_path='./geckodriver.exe')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "WebDriverException",
+     "evalue": "Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home\n",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     75\u001b[0m                                             \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_file\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 76\u001b[0;31m                                             stdin=PIPE)\n\u001b[0m\u001b[1;32m     77\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/opt/conda/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)\u001b[0m\n\u001b[1;32m    799\u001b[0m                                 \u001b[0merrread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 800\u001b[0;31m                                 restore_signals, start_new_session)\n\u001b[0m\u001b[1;32m    801\u001b[0m         \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/opt/conda/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)\u001b[0m\n\u001b[1;32m   1550\u001b[0m                             \u001b[0merr_msg\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m': '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mrepr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1551\u001b[0;31m                     \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno_num\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_msg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1552\u001b[0m                 \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_msg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/../../../../../../chromedriver': '/../../../../../../chromedriver'",
+      "\nDuring handling of the above exception, another exception occurred:\n",
+      "\u001b[0;31mWebDriverException\u001b[0m                        Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-16-aa049f59d4d0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0mservice\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mService\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDRIVER_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      9\u001b[0m \u001b[0mwd\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRemote\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mservice_url\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[0mwd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     81\u001b[0m                 raise WebDriverException(\n\u001b[1;32m     82\u001b[0m                     \"'%s' executable needs to be in PATH. %s\" % (\n\u001b[0;32m---> 83\u001b[0;31m                         os.path.basename(self.path), self.start_error_message)\n\u001b[0m\u001b[1;32m     84\u001b[0m                 )\n\u001b[1;32m     85\u001b[0m             \u001b[0;32melif\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEACCES\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mWebDriverException\u001b[0m: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Same problem, driver needs to be in the path somehow. \n",
+    "# https://medium.com/swlh/web-scraping-stock-images-using-google-selenium-and-python-8b825ba649b9\n",
+    "import selenium\n",
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.chrome.service import Service\n",
+    "DRIVER_PATH = '/../../../../../../chromedriver'\n",
+    "\n",
+    "service = Service(DRIVER_PATH)\n",
+    "service.start()\n",
+    "wd = webdriver.Remote(service.service_url)\n",
+    "wd.quit()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
+%% Cell type:code id: tags:
+
+``` python
+# Pattern does not exist for python 3.6
+from GoogleImageExtractor import GoogleImageExtractor
+"""test the downloading of files"""
+queries = "brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard"
+
+w = GoogleImageExtractor(queries)#leave blanks if get the search list from file
+
+w.set_num_image_to_dl(200)
+w.get_searchlist_fr_file(searchlist_filename)#replace the searchlist
+w.multi_search_download()
+```
+
+%% Output
+
+    ---------------------------------------------------------------------------
+    ModuleNotFoundError                       Traceback (most recent call last)
+    <ipython-input-1-0cfc1ae18607> in <module>
+    ----> 1 from GoogleImageExtractor import GoogleImageExtractor
+          2 """test the downloading of files"""
+          3 queries = "brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard"
+          4
+          5 w = GoogleImageExtractor(queries)#leave blanks if get the search list from file
+    /work/hslu-deep-learning/notebooks/Block 5/GoogleImageExtractor.py in <module>
+         14 from selenium.webdriver.support import expected_conditions as EC
+         15
+    ---> 16 from pattern.web import URL, extension, cache, plaintext, Newsfeed, DOM
+         17
+         18 class GoogleImageExtractor(object):
+    ModuleNotFoundError: No module named 'pattern'
+
+%% Cell type:code id: tags:
+
+``` python
+# Could try to avoid the processor parts.
+
+# from processor.LogProcessor import LogProcessor
+# from processor.DownloadProcessor import DownloadProcessor
+# from processor.ElasticSearchProcessor import ElasticSearchProcessor
+from crawler.GoogleCrawler import GoogleCrawler
+
+if __name__ == '__main__':
+
+    options = {
+        'output_directory':  "./images"
+    }
+    # PKK is a European Union supported terrorist organization against Turkish Gofrom icrawler.builtin import GoogleImageCrawler
+
+google_crawler = GoogleImageCrawler(storage={'root_dir': 'your_image_dir'})
+google_crawler.crawl(keyword='cat', max_num=100)
+w = GoogleCrawler(max_image_count = 10)
+# w.append_processor(LogProcessor())
+# # w.append_processor(DownloadProcessor(output_directory=options['output_directory']))
+# w.append_processor(ElasticSearchProcessor())
+w.run('PKK')
+```
+
+%% Output
+
+    ---------------------------------------------------------------------------
+    ModuleNotFoundError                       Traceback (most recent call last)
+    <ipython-input-14-e46c54ee3ec7> in <module>
+          2 # from processor.DownloadProcessor import DownloadProcessor
+          3 # from processor.ElasticSearchProcessor import ElasticSearchProcessor
+    ----> 4 from crawler.GoogleCrawler import GoogleCrawler
+          5
+          6 if __name__ == '__main__':
+    ModuleNotFoundError: No module named 'crawler'
+
+%% Cell type:code id: tags:
+
+``` python
+# icrawler can't be installed on Python 3.6
+from icrawler.builtin import GoogleImageCrawler
+
+google_crawler = GoogleImageCrawler(storage={'root_dir': 'your_image_dir'})
+google_crawler.crawl(keyword='cat', max_num=100)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# webdriver_manager cannot be found (the module is not installed)
+import os
+import selenium
+from selenium import webdriver
+import time
+from PIL import Image
+import io
+import requests
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.common.exceptions import ElementClickInterceptedException
+```
+
+%% Output
+
+    ---------------------------------------------------------------------------
+    ModuleNotFoundError                       Traceback (most recent call last)
+    <ipython-input-4-334de132c807> in <module>
+          6 import io
+          7 import requests
+    ----> 8 from webdriver_manager.chrome import ChromeDriverManager
+          9 from selenium.common.exceptions import ElementClickInterceptedException
+    ModuleNotFoundError: No module named 'webdriver_manager'
+
+%% Cell type:code id: tags:
+
+``` python
+# Problem: the webdriver is not on the current path (or installed at all); could try copying the driver file into the GitLab repo
+# https://medium.com/cs-note/web-crawling-by-using-selenium-python-3-4fff0bdb4c65
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support.ui import Select
+from selenium.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import NoAlertPresentException
+import unittest, time, re
+
+driver = webdriver.Firefox(executable_path='./geckodriver.exe')
+```
+
+%% Output
+
+    ---------------------------------------------------------------------------
+    FileNotFoundError                         Traceback (most recent call last)
+    /opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py in start(self)
+         75                                             stderr=self.log_file,
+    ---> 76                                             stdin=PIPE)
+         77         except TypeError:
+    /opt/conda/lib/python3.7/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
+        799                                 errread, errwrite,
+    --> 800                                 restore_signals, start_new_session)
+        801         except:
+    /opt/conda/lib/python3.7/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
+       1550                             err_msg += ': ' + repr(err_filename)
+    -> 1551                     raise child_exception_type(errno_num, err_msg, err_filename)
+       1552                 raise child_exception_type(err_msg)
+    FileNotFoundError: [Errno 2] No such file or directory: './geckodriver.exe': './geckodriver.exe'
+
+During handling of the above exception, another exception occurred:
+    WebDriverException                        Traceback (most recent call last)
+    <ipython-input-21-0a010110e778> in <module>
+          9 import unittest, time, re
+         10
+    ---> 11 driver = webdriver.Firefox(executable_path='./geckodriver.exe')
+
+    /opt/conda/lib/python3.7/site-packages/selenium/webdriver/firefox/webdriver.py in __init__(self, firefox_profile, firefox_binary, timeout, capabilities, proxy, executable_path, options, service_log_path, firefox_options, service_args, desired_capabilities, log_path, keep_alive)
+        162                 service_args=service_args,
+        163                 log_path=service_log_path)
+    --> 164             self.service.start()
+        165
+        166             capabilities.update(options.to_capabilities())
+    /opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py in start(self)
+         81                 raise WebDriverException(
+         82                     "'%s' executable needs to be in PATH. %s" % (
+    ---> 83                         os.path.basename(self.path), self.start_error_message)
+         84                 )
+         85             elif err.errno == errno.EACCES:
+    WebDriverException: Message: 'geckodriver.exe' executable needs to be in PATH.
+
+%% Cell type:code id: tags:
+
+``` python
+# Same problem, driver needs to be in the path somehow.
+# https://medium.com/swlh/web-scraping-stock-images-using-google-selenium-and-python-8b825ba649b9
+import selenium
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+DRIVER_PATH = '/../../../../../../chromedriver'
+
+service = Service(DRIVER_PATH)
+service.start()
+wd = webdriver.Remote(service.service_url)
+wd.quit()
+```
+
+%% Output
+
+    ---------------------------------------------------------------------------
+    FileNotFoundError                         Traceback (most recent call last)
+    /opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py in start(self)
+         75                                             stderr=self.log_file,
+    ---> 76                                             stdin=PIPE)
+         77         except TypeError:
+    /opt/conda/lib/python3.7/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
+        799                                 errread, errwrite,
+    --> 800                                 restore_signals, start_new_session)
+        801         except:
+    /opt/conda/lib/python3.7/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
+       1550                             err_msg += ': ' + repr(err_filename)
+    -> 1551                     raise child_exception_type(errno_num, err_msg, err_filename)
+       1552                 raise child_exception_type(err_msg)
+    FileNotFoundError: [Errno 2] No such file or directory: '/../../../../../../chromedriver': '/../../../../../../chromedriver'
+
+During handling of the above exception, another exception occurred:
+    WebDriverException                        Traceback (most recent call last)
+    <ipython-input-16-aa049f59d4d0> in <module>
+          6
+          7 service = Service(DRIVER_PATH)
+    ----> 8 service.start()
+          9 wd = webdriver.Remote(service.service_url)
+         10 wd.quit()
+    /opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py in start(self)
+         81                 raise WebDriverException(
+         82                     "'%s' executable needs to be in PATH. %s" % (
+    ---> 83                         os.path.basename(self.path), self.start_error_message)
+         84                 )
+         85             elif err.errno == errno.EACCES:
+    WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home
--- a/notebooks/Block 5/Selenium test/geckodriver.exe
+++ b/notebooks/Block 5/Selenium test/geckodriver.exe
--- a/notebooks/Block 5/Selenium_test.ipynb
+++ b/notebooks/Block 5/Selenium_test.ipynb
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'selenium'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-4-0cfc1ae18607>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0;34m\"\"\"test the downloading of files\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mqueries\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mqueries\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m#leave blanks if get the search list from file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/work/hslu-deep-learning/notebooks/Block 5/GoogleImageExtractor.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mselenium\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     10\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mcontextlib\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mclosing\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     11\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mselenium\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwebdriver\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mFirefox\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'selenium'"
-     ]
-    }
-   ],
-   "source": [
-    "from GoogleImageExtractor import GoogleImageExtractor\n",
-    "\"\"\"test the downloading of files\"\"\"\n",
-    "queries = \"brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard\"\n",
-    "\n",
-    "w = GoogleImageExtractor(queries)#leave blanks if get the search list from file\n",
-    "\n",
-    "w.set_num_image_to_dl(200)\n",
-    "w.get_searchlist_fr_file(searchlist_filename)#replace the searclist\n",
-    "w.multi_search_download()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
-%% Cell type:code id: tags:
-
-``` python
-from GoogleImageExtractor import GoogleImageExtractor
-"""test the downloading of files"""
-queries = "brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard"
-
-w = GoogleImageExtractor(queries)#leave blanks if get the search list from file
-
-w.set_num_image_to_dl(200)
-w.get_searchlist_fr_file(searchlist_filename)#replace the searclist
-w.multi_search_download()
-```
-
-%% Output
-
-    ---------------------------------------------------------------------------
-    ModuleNotFoundError                       Traceback (most recent call last)
-    <ipython-input-4-0cfc1ae18607> in <module>
-    ----> 1 from GoogleImageExtractor import GoogleImageExtractor
-          2 """test the downloading of files"""
-          3 queries = "brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard"
-          4
-          5 w = GoogleImageExtractor(queries)#leave blanks if get the search list from file
-    /work/hslu-deep-learning/notebooks/Block 5/GoogleImageExtractor.py in <module>
-          7 import re, os, sys, datetime, time
-          8 import pandas
-    ----> 9 from selenium import webdriver
-         10 from contextlib import closing
-         11 from selenium.webdriver import Firefox
-    ModuleNotFoundError: No module named 'selenium'
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,5 @@ scikit-learn==0.23.2
 vega_datasets==0.8.0
 mrcnn==0.2
 altair==4.1.0
-selenium==3.141.0
\ No newline at end of file
+selenium==3.141.0
+crawler==0.0.2
\ No newline at end of file