Skip to content
Snippets Groups Projects
Commit b8850cbd authored by Simon van Hemert's avatar Simon van Hemert
Browse files

Selenium

parent 9c8315ae
No related branches found
No related tags found
No related merge requests found
......@@ -13,7 +13,7 @@ from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from pattern.web import URL, extension, cache, plaintext, Newsfeed, DOM
# from pattern.web import URL, extension, cache, plaintext, Newsfeed, DOM
class GoogleImageExtractor(object):
......
%% Cell type:code id: tags:
``` python
# The `pattern` package is not available for Python 3.6, so this import fails.
from GoogleImageExtractor import GoogleImageExtractor
"""test the downloading of files"""
queries = "brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard"
w = GoogleImageExtractor(queries)#leave blanks if get the search list from file
w.set_num_image_to_dl(200)
w.get_searchlist_fr_file(searchlist_filename)#replace the searclist
w.multi_search_download()
```
%% Output
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-1-0cfc1ae18607> in <module>
----> 1 from GoogleImageExtractor import GoogleImageExtractor
2 """test the downloading of files"""
3 queries = "brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard"
4
5 w = GoogleImageExtractor(queries)#leave blanks if get the search list from file
/work/hslu-deep-learning/notebooks/Block 5/GoogleImageExtractor.py in <module>
14 from selenium.webdriver.support import expected_conditions as EC
15
---> 16 from pattern.web import URL, extension, cache, plaintext, Newsfeed, DOM
17
18 class GoogleImageExtractor(object):
ModuleNotFoundError: No module named 'pattern'
%% Cell type:code id: tags:
``` python
# Could try to avoid the processor parts.
# from processor.LogProcessor import LogProcessor
# from processor.DownloadProcessor import DownloadProcessor
# from processor.ElasticSearchProcessor import ElasticSearchProcessor
from crawler.GoogleCrawler import GoogleCrawler
if __name__ == '__main__':
options = {
'output_directory': "./images"
}
# PKK is a European Union supported terrorist organization against Turkish Gofrom icrawler.builtin import GoogleImageCrawler
google_crawler = GoogleImageCrawler(storage={'root_dir': 'your_image_dir'})
google_crawler.crawl(keyword='cat', max_num=100)
w = GoogleCrawler(max_image_count = 10)
# w.append_processor(LogProcessor())
# # w.append_processor(DownloadProcessor(output_directory=options['output_directory']))
# w.append_processor(ElasticSearchProcessor())
w.run('PKK')
```
%% Output
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-14-e46c54ee3ec7> in <module>
2 # from processor.DownloadProcessor import DownloadProcessor
3 # from processor.ElasticSearchProcessor import ElasticSearchProcessor
----> 4 from crawler.GoogleCrawler import GoogleCrawler
5
6 if __name__ == '__main__':
ModuleNotFoundError: No module named 'crawler'
%% Cell type:code id: tags:
``` python
# icrawler can't be installed on Python 3.6
from icrawler.builtin import GoogleImageCrawler
google_crawler = GoogleImageCrawler(storage={'root_dir': 'your_image_dir'})
google_crawler.crawl(keyword='cat', max_num=100)
```
%% Cell type:code id: tags:
``` python
# The webdriver_manager module can't be found.
import os
import selenium
from selenium import webdriver
import time
from PIL import Image
import io
import requests
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import ElementClickInterceptedException
```
%% Output
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-4-334de132c807> in <module>
6 import io
7 import requests
----> 8 from webdriver_manager.chrome import ChromeDriverManager
9 from selenium.common.exceptions import ElementClickInterceptedException
ModuleNotFoundError: No module named 'webdriver_manager'
%% Cell type:code id: tags:
``` python
# Problem: the webdriver is not in the current path (or installed at all, for that matter); could try simply copying the driver file into GitLab.
# https://medium.com/cs-note/web-crawling-by-using-selenium-python-3-4fff0bdb4c65
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re
driver = webdriver.Firefox(executable_path='./geckodriver.exe')
```
%% Output
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py in start(self)
75 stderr=self.log_file,
---> 76 stdin=PIPE)
77 except TypeError:
/opt/conda/lib/python3.7/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
799 errread, errwrite,
--> 800 restore_signals, start_new_session)
801 except:
/opt/conda/lib/python3.7/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
1550 err_msg += ': ' + repr(err_filename)
-> 1551 raise child_exception_type(errno_num, err_msg, err_filename)
1552 raise child_exception_type(err_msg)
FileNotFoundError: [Errno 2] No such file or directory: './geckodriver.exe': './geckodriver.exe'
During handling of the above exception, another exception occurred:
WebDriverException Traceback (most recent call last)
<ipython-input-21-0a010110e778> in <module>
9 import unittest, time, re
10
---> 11 driver = webdriver.Firefox(executable_path='./geckodriver.exe')
/opt/conda/lib/python3.7/site-packages/selenium/webdriver/firefox/webdriver.py in __init__(self, firefox_profile, firefox_binary, timeout, capabilities, proxy, executable_path, options, service_log_path, firefox_options, service_args, desired_capabilities, log_path, keep_alive)
162 service_args=service_args,
163 log_path=service_log_path)
--> 164 self.service.start()
165
166 capabilities.update(options.to_capabilities())
/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py in start(self)
81 raise WebDriverException(
82 "'%s' executable needs to be in PATH. %s" % (
---> 83 os.path.basename(self.path), self.start_error_message)
84 )
85 elif err.errno == errno.EACCES:
WebDriverException: Message: 'geckodriver.exe' executable needs to be in PATH.
%% Cell type:code id: tags:
``` python
# Same problem: the driver needs to be on the PATH somehow.
# https://medium.com/swlh/web-scraping-stock-images-using-google-selenium-and-python-8b825ba649b9
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
DRIVER_PATH = '/../../../../../../chromedriver'
service = Service(DRIVER_PATH)
service.start()
wd = webdriver.Remote(service.service_url)
wd.quit()
```
%% Output
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py in start(self)
75 stderr=self.log_file,
---> 76 stdin=PIPE)
77 except TypeError:
/opt/conda/lib/python3.7/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
799 errread, errwrite,
--> 800 restore_signals, start_new_session)
801 except:
/opt/conda/lib/python3.7/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
1550 err_msg += ': ' + repr(err_filename)
-> 1551 raise child_exception_type(errno_num, err_msg, err_filename)
1552 raise child_exception_type(err_msg)
FileNotFoundError: [Errno 2] No such file or directory: '/../../../../../../chromedriver': '/../../../../../../chromedriver'
During handling of the above exception, another exception occurred:
WebDriverException Traceback (most recent call last)
<ipython-input-16-aa049f59d4d0> in <module>
6
7 service = Service(DRIVER_PATH)
----> 8 service.start()
9 wd = webdriver.Remote(service.service_url)
10 wd.quit()
/opt/conda/lib/python3.7/site-packages/selenium/webdriver/common/service.py in start(self)
81 raise WebDriverException(
82 "'%s' executable needs to be in PATH. %s" % (
---> 83 os.path.basename(self.path), self.start_error_message)
84 )
85 elif err.errno == errno.EACCES:
WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home
File added
%% Cell type:code id: tags:
``` python
from GoogleImageExtractor import GoogleImageExtractor
"""test the downloading of files"""
queries = "brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard"
w = GoogleImageExtractor(queries)#leave blanks if get the search list from file
w.set_num_image_to_dl(200)
w.get_searchlist_fr_file(searchlist_filename)#replace the searclist
w.multi_search_download()
```
%% Output
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-4-0cfc1ae18607> in <module>
----> 1 from GoogleImageExtractor import GoogleImageExtractor
2 """test the downloading of files"""
3 queries = "brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard"
4
5 w = GoogleImageExtractor(queries)#leave blanks if get the search list from file
/work/hslu-deep-learning/notebooks/Block 5/GoogleImageExtractor.py in <module>
7 import re, os, sys, datetime, time
8 import pandas
----> 9 from selenium import webdriver
10 from contextlib import closing
11 from selenium.webdriver import Firefox
ModuleNotFoundError: No module named 'selenium'
......@@ -8,4 +8,5 @@ scikit-learn==0.23.2
vega_datasets==0.8.0
mrcnn==0.2
altair==4.1.0
selenium==3.141.0
\ No newline at end of file
selenium==3.141.0
crawler==0.0.2
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment