Skip to content
Snippets Groups Projects
Commit 9c8315ae authored by Simon van Hemert's avatar Simon van Hemert
Browse files

Merge branch 'renku/autosave/simon.vanhemert/master/41726926/84bec821' into 'master'

Auto-saving for simon.vanhemert on branch master from commit 41726926

See merge request simon.vanhemert/hslu-deep-learning!4
parents 84bec821 ce728fca
No related branches found
No related tags found
No related merge requests found
Showing
with 260 additions and 0 deletions
""" Google image extractor based on Selenium.
Adapted from:
https://simply-python.com/2015/05/18/saving-images-from-google-search-using-selenium-and-python/
See also:
https://github.com/scirag/selenium-image-crawler
"""
import re, os, sys, datetime, time
import pandas
from selenium import webdriver
from contextlib import closing
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from pattern.web import URL, extension, cache, plaintext, Newsfeed, DOM
class GoogleImageExtractor(object):
    """Download Google image-search results for one or more search phrases.

    Typical use: construct with a phrase (or list of phrases), optionally call
    set_num_image_to_dl() / get_searchlist_fr_file(), then call
    multi_search_download().
    """

    # Extensions accepted as downloadable images (not comprehensive).
    VALID_IMAGE_EXT = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')

    # Pulls the raw picture url out of a result link's href.
    _IMG_URL_RE = re.compile(r'imgurl=(.*)&imgrefurl')

    def __init__(self, search_key='', folder_main_dir_prefix=r'C:\data\temp\gimage_pic'):
        """ Google image search class
            Args:
                search_key (str or list of str): phrase(s) to search for.
                folder_main_dir_prefix (str): directory under which the dated
                    download folder is created. Defaults to the historical
                    hard-coded location.
            Raises:
                TypeError: if search_key is neither a str nor a list.
        """
        if isinstance(search_key, str):
            ## convert to list even for one search keyword to standardize the pulling.
            self.g_search_key_list = [search_key]
        elif isinstance(search_key, list):
            self.g_search_key_list = search_key
        else:
            # The original used a bare `raise` here, which itself fails with
            # "No active exception to reraise"; raise a meaningful error.
            raise TypeError('google_search_keyword not of type str or list')
        self.g_search_key = ''

        ## user options
        self.image_dl_per_search = 200

        ## url construct string text
        self.prefix_of_search_url = "https://www.google.com.sg/search?q="
        self.postfix_of_search_url = '&source=lnms&tbm=isch&sa=X&ei=0eZEVbj3IJG5uATalICQAQ&ved=0CAcQ_AUoAQ&biw=939&bih=591'# non changable text
        self.target_url_str = ''

        ## storage
        self.pic_url_list = []
        self.pic_info_list = []
        self.page_source = ''
        self.temp_page_source = ''
        self.download_fault = 0

        ## file and folder path
        self.folder_main_dir_prefix = folder_main_dir_prefix
        self.gs_raw_dirpath = None  # set by create_folder()

    def reformat_search_for_spaces(self):
        """ Rewrite self.g_search_key into its url form.
            Surrounding whitespace (including the newline left by a file
            read) is dropped and every run of inner whitespace becomes a
            single "+". Eg: "Cookie fast" to "Cookie+fast"
        """
        self.g_search_key = '+'.join(self.g_search_key.split())

    def set_num_image_to_dl(self, num_image):
        """ Set the number of image to download. Set to self.image_dl_per_search.
            Args:
                num_image (int): num of image to download.
        """
        self.image_dl_per_search = num_image

    def get_searchlist_fr_file(self, filename):
        """ Get search list from filename, one phrase per line.
            Replaces self.g_search_key_list. Blank lines are skipped so they
            do not turn into empty searches.
            Args:
                filename (str): full file path
        """
        with open(filename, 'r') as f:
            self.g_search_key_list = [line.strip() for line in f if line.strip()]

    def formed_search_url(self):
        """ Form the search url for the current self.g_search_key.
            The key is first put into url form (see
            reformat_search_for_spaces); the result is stored in
            self.target_url_str.
        """
        self.reformat_search_for_spaces()
        self.target_url_str = (self.prefix_of_search_url + self.g_search_key
                               + self.postfix_of_search_url)

    def retrieve_source_fr_html(self):
        """ Load self.target_url_str in Firefox via selenium and store the html.
            The page is scrolled and the element with id 'smb' is clicked so
            that more results get loaded. Failure of that step is reported
            but not fatal: whatever has been rendered is still stored in
            self.page_source.
        """
        driver = webdriver.Firefox()
        try:
            driver.get(self.target_url_str)
            try:
                driver.execute_script("window.scrollTo(0, 30000)")
                time.sleep(2)
                self.temp_page_source = driver.page_source
                # find_element(By.ID, ...) works across selenium versions;
                # find_element_by_id was removed in selenium 4.
                driver.find_element(By.ID, 'smb').click()
                time.sleep(2)
                driver.execute_script("window.scrollTo(0, 60000)")
                time.sleep(2)
                driver.execute_script("window.scrollTo(0, 60000)")
            except Exception:
                print('not able to find')
            # Read the source while the browser is still alive (the original
            # read it after quit() on the failure path, which raises).
            self.page_source = driver.page_source
        finally:
            driver.quit()

    def extract_pic_url(self):
        """ Extract the raw pic urls from self.page_source into self.pic_url_list.
            Only the first self.image_dl_per_search result links are examined.
            Links without a parsable image url are reported and skipped.
        """
        dom = DOM(self.page_source)
        tag_list = dom('a.rg_l')
        for tag in tag_list[:self.image_dl_per_search]:
            found = self._IMG_URL_RE.search(tag.attributes.get('href', ''))
            if found is None:
                print('error parsing', tag)
                continue
            self.pic_url_list.append(found.group(1))

    def multi_search_download(self):
        """ Run search, url extraction, download and info-file writing for
            every phrase in self.g_search_key_list.
        """
        for indiv_search in self.g_search_key_list:
            # start each phrase with empty result storage
            self.pic_url_list = []
            self.pic_info_list = []
            self.g_search_key = indiv_search

            self.formed_search_url()
            self.retrieve_source_fr_html()
            self.extract_pic_url()
            self.downloading_all_photos() #some download might not be jpg?? use selnium to download??
            self.save_infolist_to_file()

    def downloading_all_photos(self):
        """ Download every url in self.pic_url_list into the dated folder.
            Files are labelled with the search key plus a running number
            starting at 1.
        """
        self.create_folder()
        for pic_counter, url_link in enumerate(self.pic_url_list, start=1):
            print(pic_counter)
            pic_prefix_str = self.g_search_key + str(pic_counter)
            # Pass the url as text: encoding it to bytes makes the later
            # str + bytes filename concatenation fail on Python 3.
            self.download_single_image(url_link, pic_prefix_str)

    def download_single_image(self, url_link, pic_prefix_str):
        """ Download data according to the url link given.
            Nothing is saved when the url has no recognised image extension,
            when it redirects, or when the download fails; in the last case
            self.download_fault is set to 1. A successful download is
            recorded in self.pic_info_list.
            Args:
                url_link (str): url str (bytes are decoded for compatibility).
                pic_prefix_str (str): pic_prefix_str for unique label the pic
        """
        self.download_fault = 0
        if isinstance(url_link, bytes):
            url_link = url_link.decode()

        # Judge the extension on the path only, ignoring query/fragment and case.
        url_path = url_link.split('?', 1)[0].split('#', 1)[0]
        file_ext = os.path.splitext(url_path)[1].lower()
        if file_ext not in self.VALID_IMAGE_EXT:
            return #return if not valid image extension

        if not self.gs_raw_dirpath:
            self.create_folder()
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath, pic_prefix_str + file_ext)

        url = URL(url_link)
        print(url_link)
        try:
            if url.redirect:
                return # if there is re-direct, return
            data = url.download()
        except Exception:
            print('Problem with processing this data: ', url_link)
            self.download_fault = 1
            return

        # Open the file only once data is in hand so a failed download does
        # not leave an empty file behind.
        with open(temp_filename_full_path, 'wb') as f:
            f.write(data)
        self.pic_info_list.append(pic_prefix_str + ': ' + url_link)

    def create_folder(self):
        """ Create (if needed) the date-stamped download folder and store its
            path in self.gs_raw_dirpath.
        """
        self.gs_raw_dirpath = os.path.join(self.folder_main_dir_prefix,
                                           time.strftime("_%d_%b%y", time.localtime()))
        os.makedirs(self.gs_raw_dirpath, exist_ok=True)

    def save_infolist_to_file(self):
        """ Save self.pic_info_list, one entry per line, to
            '<search key>_info.txt' inside the download folder.
        """
        # Make sure the folder exists even when no download step ran first.
        self.create_folder()
        temp_filename_full_path = os.path.join(self.gs_raw_dirpath, self.g_search_key + '_info.txt')
        with open(temp_filename_full_path, 'w') as f:
            for n in self.pic_info_list:
                f.write(n)
                f.write('\n')
\ No newline at end of file
%% Cell type:code id: tags:
``` python
from GoogleImageExtractor import GoogleImageExtractor

# Test the downloading of files: one Google image search per person.
queries = "brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard"
# The extractor takes a list of search phrases; passing the comma-separated
# string directly would submit all the names as one single query.
search_terms = [name.strip() for name in queries.split(",") if name.strip()]
w = GoogleImageExtractor(search_terms)
w.set_num_image_to_dl(200)
# NOTE: the earlier get_searchlist_fr_file(searchlist_filename) call was
# dropped: searchlist_filename is never defined in this cell (NameError) and
# the call would have replaced the phrases given above.
w.multi_search_download()
```
%% Output
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-4-0cfc1ae18607> in <module>
----> 1 from GoogleImageExtractor import GoogleImageExtractor
2 """test the downloading of files"""
3 queries = "brad pitt, johnny depp, leonardo dicaprio, robert de niro, angelina jolie, sandra bullock, catherine deneuve, marion cotillard"
4
5 w = GoogleImageExtractor(queries)#leave blanks if get the search list from file
/work/hslu-deep-learning/notebooks/Block 5/GoogleImageExtractor.py in <module>
7 import re, os, sys, datetime, time
8 import pandas
----> 9 from selenium import webdriver
10 from contextlib import closing
11 from selenium.webdriver import Firefox
ModuleNotFoundError: No module named 'selenium'
notebooks/Block 5/train/johnny depp/1. 416x416.jpg

19.3 KiB

notebooks/Block 5/train/johnny depp/10. johnny-depp.jpg

422 KiB

notebooks/Block 5/train/johnny depp/11. johnny-depp-5186.jpg

34.1 KiB

notebooks/Block 5/train/johnny depp/13. johnny-depp-glasses-round.jpg

19.1 KiB

notebooks/Block 5/train/johnny depp/14. johnny-depp.jpg

15.5 KiB

notebooks/Block 5/train/johnny depp/16. johnny_depp.jpg

23.1 KiB

notebooks/Block 5/train/johnny depp/17. 349b93899916ebfa19e17a05726490bb_400x400.jpeg

25.5 KiB

notebooks/Block 5/train/johnny depp/19. 18-johnny-depp.w700.h700.jpg

165 KiB

notebooks/Block 5/train/johnny depp/2. 20-johnny-depp.w330.h330.jpg

33.2 KiB

notebooks/Block 5/train/johnny depp/20. johnny-depp.jpg

480 KiB

0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment