diff --git a/notebooks/Block 5/GoogleImageExtractor.py b/notebooks/Block 5/Selenium test/GoogleImageExtractor.py similarity index 98% rename from notebooks/Block 5/GoogleImageExtractor.py rename to notebooks/Block 5/Selenium test/GoogleImageExtractor.py index 6df9966a27a914d2712c1a425809bdd2401d8fe4..87e94c91b733174f75b110f8d82b22186fa37e18 100644 --- a/notebooks/Block 5/GoogleImageExtractor.py +++ b/notebooks/Block 5/Selenium test/GoogleImageExtractor.py @@ -13,7 +13,7 @@ from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC -from pattern.web import URL, extension, cache, plaintext, Newsfeed, DOM +# from pattern.web import URL, extension, cache, plaintext, Newsfeed, DOM class GoogleImageExtractor(object): diff --git a/notebooks/Block 5/Selenium test/Selenium_test.ipynb b/notebooks/Block 5/Selenium test/Selenium_test.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..4e522eb3300320437e1c4e4f29118084a5639cc9 --- /dev/null +++ b/notebooks/Block 5/Selenium test/Selenium_test.ipynb @@ -0,0 +1,217 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'pattern'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-1-0cfc1ae18607>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\"\"\"test the downloading of files\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mqueries\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mqueries\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m#leave blanks if get the search list from file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/work/hslu-deep-learning/notebooks/Block 5/GoogleImageExtractor.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mselenium\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msupport\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mexpected_conditions\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mEC\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mpattern\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweb\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mURL\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mextension\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplaintext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNewsfeed\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDOM\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobject\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pattern'" + ] + } + ], + "source": [ + "# Pattern does not exist for python 3.6\n", + "from GoogleImageExtractor import GoogleImageExtractor\n", + "\"\"\"test the downloading of files\"\"\"\n", + "queries = \"brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard\"\n", + "\n", + "w = GoogleImageExtractor(queries)#leave blanks if get the search list from file\n", + "\n", + "w.set_num_image_to_dl(200)\n", + "w.get_searchlist_fr_file(searchlist_filename)#replace the searclist\n", + "w.multi_search_download()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'crawler'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-14-e46c54ee3ec7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m# from processor.DownloadProcessor import DownloadProcessor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# from processor.ElasticSearchProcessor import ElasticSearchProcessor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mcrawler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGoogleCrawler\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mGoogleCrawler\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m__name__\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'__main__'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'crawler'" + ] + } + ], + "source": [ + "# Could try to avoid the processor parts.\n", + "\n", + "# from processor.LogProcessor import LogProcessor\n", + "# from processor.DownloadProcessor import DownloadProcessor\n", + "# from processor.ElasticSearchProcessor import ElasticSearchProcessor\n", + "from crawler.GoogleCrawler import GoogleCrawler\n", + "\n", + "if __name__ == '__main__':\n", + "\n", + " options = {\n", + " 'output_directory': \"./images\"\n", + " }\n", + " # PKK is a European Union supported terrorist organization against Turkish Gofrom icrawler.builtin import GoogleImageCrawler\n", + "\n", + "google_crawler = GoogleImageCrawler(storage={'root_dir': 'your_image_dir'})\n", + "google_crawler.crawl(keyword='cat', max_num=100)\n", + "w = GoogleCrawler(max_image_count = 10)\n", + "# w.append_processor(LogProcessor())\n", + "# # w.append_processor(DownloadProcessor(output_directory=options['output_directory']))\n", + "# w.append_processor(ElasticSearchProcessor())\n", + "w.run('PKK')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# icecrawler cant be installed on python 3.6\n", + "from icrawler.builtin import GoogleImageCrawler\n", + "\n", + "google_crawler = GoogleImageCrawler(storage={'root_dir': 'your_image_dir'})\n", + "google_crawler.crawl(keyword='cat', max_num=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'webdriver_manager'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-4-334de132c807>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mwebdriver_manager\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchrome\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mChromeDriverManager\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mselenium\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommon\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexceptions\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mElementClickInterceptedException\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'webdriver_manager'" + ] + } + ], + "source": [ + "# Webdriver manager cant find\n", + "import os\n", + "import selenium\n", + "from selenium import webdriver\n", + "import time\n", + "from PIL import Image\n", + "import io\n", + "import requests\n", + "from webdriver_manager.chrome import ChromeDriverManager\n", + "from selenium.common.exceptions import ElementClickInterceptedException\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "ename": "WebDriverException", + "evalue": "Message: 'geckodriver.exe' executable needs to be in PATH. \n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_file\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 76\u001b[0;31m stdin=PIPE)\n\u001b[0m\u001b[1;32m 77\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)\u001b[0m\n\u001b[1;32m 799\u001b[0m \u001b[0merrread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 800\u001b[0;31m restore_signals, start_new_session)\n\u001b[0m\u001b[1;32m 801\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)\u001b[0m\n\u001b[1;32m 1550\u001b[0m \u001b[0merr_msg\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m': '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mrepr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1551\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno_num\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_msg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1552\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_msg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './geckodriver.exe': './geckodriver.exe'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mWebDriverException\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-21-0a010110e778>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0munittest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mdriver\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFirefox\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexecutable_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'./geckodriver.exe'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/firefox/webdriver.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, firefox_profile, firefox_binary, timeout, capabilities, proxy, executable_path, options, service_log_path, firefox_options, service_args, desired_capabilities, log_path, keep_alive)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0mservice_args\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mservice_args\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m log_path=service_log_path)\n\u001b[0;32m--> 164\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 165\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[0mcapabilities\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_capabilities\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 81\u001b[0m raise WebDriverException(\n\u001b[1;32m 82\u001b[0m \"'%s' executable needs to be in PATH. %s\" % (\n\u001b[0;32m---> 83\u001b[0;31m os.path.basename(self.path), self.start_error_message)\n\u001b[0m\u001b[1;32m 84\u001b[0m )\n\u001b[1;32m 85\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEACCES\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mWebDriverException\u001b[0m: Message: 'geckodriver.exe' executable needs to be in PATH. \n" + ] + } + ], + "source": [ + "# Problem with getting this webdriver in current path or installed at all for that matter, could try to just copy the file into gitlab\n", + "# https://medium.com/cs-note/web-crawling-by-using-selenium-python-3-4fff0bdb4c65\n", + "from selenium import webdriver\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.common.keys import Keys\n", + "from selenium.webdriver.support.ui import Select\n", + "from selenium.common.exceptions import NoSuchElementException\n", + "from selenium.common.exceptions import NoAlertPresentException\n", + "import unittest, time, re\n", + "\n", + "driver = webdriver.Firefox(executable_path='./geckodriver.exe')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "ename": "WebDriverException", + "evalue": "Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_file\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 76\u001b[0;31m stdin=PIPE)\n\u001b[0m\u001b[1;32m 77\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)\u001b[0m\n\u001b[1;32m 799\u001b[0m \u001b[0merrread\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 800\u001b[0;31m restore_signals, start_new_session)\n\u001b[0m\u001b[1;32m 801\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)\u001b[0m\n\u001b[1;32m 1550\u001b[0m \u001b[0merr_msg\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m': '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mrepr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1551\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno_num\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_msg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1552\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_msg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/../../../../../../chromedriver': '/../../../../../../chromedriver'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mWebDriverException\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-16-aa049f59d4d0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mservice\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mService\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDRIVER_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mwd\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRemote\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mservice_url\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mwd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 81\u001b[0m raise WebDriverException(\n\u001b[1;32m 82\u001b[0m \"'%s' executable needs to be in PATH. %s\" % (\n\u001b[0;32m---> 83\u001b[0;31m os.path.basename(self.path), self.start_error_message)\n\u001b[0m\u001b[1;32m 84\u001b[0m )\n\u001b[1;32m 85\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEACCES\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mWebDriverException\u001b[0m: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home\n" + ] + } + ], + "source": [ + "# Same problem, driver needs to be in the path somehow. \n", + "# https://medium.com/swlh/web-scraping-stock-images-using-google-selenium-and-python-8b825ba649b9\n", + "import selenium\n", + "from selenium import webdriver\n", + "from selenium.webdriver.chrome.service import Service\n", + "DRIVER_PATH = '/../../../../../../chromedriver'\n", + "\n", + "service = Service(DRIVER_PATH)\n", + "service.start()\n", + "wd = webdriver.Remote(service.service_url)\n", + "wd.quit()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/Block 5/Selenium test/geckodriver.exe b/notebooks/Block 5/Selenium test/geckodriver.exe new file mode 100755 index 0000000000000000000000000000000000000000..bac836bb55f7bec9306c70d7be0e223cef468e3e Binary files /dev/null and b/notebooks/Block 5/Selenium test/geckodriver.exe differ diff --git a/notebooks/Block 5/Selenium_test.ipynb b/notebooks/Block 5/Selenium_test.ipynb deleted file mode 100644 index 73bbe2364fa9d0ecf19ef650ff7be0bd70688c47..0000000000000000000000000000000000000000 --- a/notebooks/Block 5/Selenium_test.ipynb +++ /dev/null @@ -1,55 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'selenium'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m<ipython-input-4-0cfc1ae18607>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\"\"\"test the downloading of files\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mqueries\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGoogleImageExtractor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mqueries\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m#leave blanks if get the search list from file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/work/hslu-deep-learning/notebooks/Block 5/GoogleImageExtractor.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mselenium\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mcontextlib\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mclosing\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mselenium\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwebdriver\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mFirefox\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'selenium'" - ] - } - ], - "source": [ - "from GoogleImageExtractor import GoogleImageExtractor\n", - "\"\"\"test the downloading of files\"\"\"\n", - "queries = \"brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard\"\n", - "\n", - "w = GoogleImageExtractor(queries)#leave blanks if get the search list from file\n", - "\n", - "w.set_num_image_to_dl(200)\n", - "w.get_searchlist_fr_file(searchlist_filename)#replace the searclist\n", - "w.multi_search_download()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/requirements.txt b/requirements.txt index 822f330f5b32ecac0c5d8ad358f85a93d268b991..d676692df34d0288515d9b9dffeb92dd6ba7c2fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ scikit-learn==0.23.2 vega_datasets==0.8.0 mrcnn==0.2 altair==4.1.0 -selenium==3.141.0 \ No newline at end of file +selenium==3.141.0 +crawler==0.0.2 \ No newline at end of file