Skip to content
Snippets Groups Projects
Commit b00b3676 authored by Simon van Hemert's avatar Simon van Hemert
Browse files

Added Image crawling

parent 8ea90ff4
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from Image_crawling import Image_crawling

# Set options (BUG FIX: the options object was previously created twice;
# the redundant duplicate assignment has been removed).
options = webdriver.FirefoxOptions()
options.add_argument('--headless')
# Create driver (headless Firefox via the system geckodriver)
driver = webdriver.Firefox(options=options, executable_path="/usr/bin/geckodriver")
# Create instance of crawler
image_crawling = Image_crawling(driver)
# Crawl image urls:
image_urls = image_crawling.fetch_image_urls("sailing", 10)
print(image_urls)
# Download images
image_crawling.download_image("./images")
```
%% Output
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-2-84df93c71169> in <module>
1 from selenium import webdriver
2 from selenium.webdriver.firefox.options import Options
----> 3 from Image_crawling import Image_crawling
4
5 # Set options
/work/hslu-deep-learning/notebooks/Block_5/Seleniumtest/Image_crawling.py in <module>
7 import os
8 import io
----> 9 import Image
10 import requests
11
ModuleNotFoundError: No module named 'Image'
%% Cell type:code id: tags:
``` python
```
""" Class containing all functions needed to download images from Google
Following example set by: https://towardsdatascience.com/image-scraping-with-python-a96feda8af2d
Adepted by: Simon van Hemert
Date edited:2021.01.05 """
import hashlib
import io
import os
import time

import requests
from PIL import Image
class Image_crawling:
    """Crawl Google Images search results with a Selenium webdriver.

    Following example set by:
    https://towardsdatascience.com/image-scraping-with-python-a96feda8af2d
    """

    def __init__(self, drive):
        # Short pause between UI interactions so the page can keep up.
        self.sleep_between_interactions = 0.1
        # Selenium webdriver instance used for all page interaction.
        self.drive = drive

    def fetch_image_urls(self, query: str, max_links_to_fetch: int):
        """Search Google Images for `query` and collect up to
        `max_links_to_fetch` full-size image URLs.

        The URLs are returned as a set and also stored on
        `self.image_urls` for a subsequent `download_image()` call.
        """
        # build the google query
        search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
        # load the page
        self.drive.get(search_url.format(q=query))

        image_urls = set()
        image_count = 0
        results_start = 0
        while image_count < max_links_to_fetch:
            self.scroll_to_end()
            # get all image thumbnail results
            thumbnail_results = self.drive.find_elements_by_css_selector("img.Q4LuWd")
            number_results = len(thumbnail_results)
            print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")
            for img in thumbnail_results[results_start:number_results]:
                # try to click every thumbnail such that we can get the real image behind it
                try:
                    img.click()
                    time.sleep(self.sleep_between_interactions)
                except Exception as e:
                    print("Exception", e, "occured in clicking on thumbnails ")
                    continue
                # extract image urls from the enlarged preview
                actual_images = self.drive.find_elements_by_css_selector('img.n3VNCb')
                for actual_image in actual_images:
                    src = actual_image.get_attribute('src')
                    if src and 'http' in src:
                        image_urls.add(src)
                image_count = len(image_urls)
                if image_count >= max_links_to_fetch:
                    print(f"Found: {len(image_urls)} image links, done!")
                    break
            else:
                # Not enough links yet: wait, then try to click the
                # "load more" button.  (BUG FIX: the original hit a bare
                # `return` here, which returned None, never set
                # self.image_urls, and left the load-more code dead.)
                print("Found:", len(image_urls), "image links, looking for more ...")
                time.sleep(5)
                # find_elements (plural) returns [] instead of raising
                # NoSuchElementException when the button is absent.
                load_more_button = self.drive.find_elements_by_css_selector(".mye4qd")
                if load_more_button:
                    self.drive.execute_script("document.querySelector('.mye4qd').click();")
                elif number_results == results_start:
                    # No new thumbnails and no load-more button:
                    # stop instead of looping forever.
                    break
            # move the result startpoint further down
            results_start = len(thumbnail_results)

        self.image_urls = image_urls
        return image_urls

    def scroll_to_end(self):
        """Scroll to the bottom of the page to trigger lazy loading of results."""
        self.drive.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(self.sleep_between_interactions)

    def download_image(self, folder_path: str):
        """Download every URL in `self.image_urls` into `folder_path`,
        saving each as a JPEG named by the SHA-1 hash of its content.

        Requires a prior successful `fetch_image_urls()` call.
        """
        # Create the target folder if it does not exist yet.
        os.makedirs(folder_path, exist_ok=True)
        for url in self.image_urls:
            try:
                image_content = requests.get(url).content
            except Exception as e:
                print(f"ERROR - Could not download {url} - {e}")
                # BUG FIX: skip this URL instead of falling through and
                # reusing an undefined / stale `image_content`.
                continue
            try:
                image_file = io.BytesIO(image_content)
                image = Image.open(image_file).convert('RGB')
                # NOTE: requires `import hashlib` at module level (the
                # original file used hashlib without importing it).
                file_path = os.path.join(folder_path, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
                with open(file_path, 'wb') as f:
                    image.save(f, "JPEG", quality=85)
                print(f"SUCCESS - saved {url} - as {file_path}")
            except Exception as e:
                print(f"ERROR - Could not save {url} - {e}")
# def search_and_download(search_term:str,driver_path:str,target_path='./images',number_images=5):
# target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))
# if not os.path.exists(target_folder):
# os.makedirs(target_folder)
# with webdriver.Chrome(executable_path=driver_path) as wd:
# res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)
# for elem in res:
# persist_image(target_folder,elem)
This diff is collapsed.
......@@ -9,3 +9,4 @@ vega_datasets==0.8.0
mrcnn==0.2
altair==4.1.0
selenium==3.141.0
pillow==8.0.0
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment