use*_*355 25 python web-scraping
我想使用python下载所有谷歌图像搜索图像.我使用的代码似乎有些问题.我的代码是
import os
import sys
import time
from urllib import FancyURLopener
import urllib2
import simplejson
# Define search term
searchTerm = "parrot"
# Replace spaces ' ' in search term for '%20' in order to comply with request
searchTerm = searchTerm.replace(' ','%20')
# Start FancyURLopener with defined version
class MyOpener(FancyURLopener):
version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
myopener = MyOpener()
# Set count to 0
count= 0
for i in range(0,10):
# Notice that the start changes for each iteration in order to request a new set of images for each loop
url = ('https://ajax.googleapis.com/ajax/services/search/images?' + 'v=1.0& q='+searchTerm+'&start='+str(i*10)+'&userip=MyIP')
print url
request = urllib2.Request(url, None, {'Referer': 'testing'})
response = urllib2.urlopen(request)
# Get results using JSON
results = simplejson.load(response)
data = results['responseData']
dataInfo = data['results']
# Iterate for each result and get unescaped url
for myUrl in dataInfo:
count = count + 1
my_url = myUrl['unescapedUrl']
myopener.retrieve(myUrl['unescapedUrl'],str(count)+'.jpg')
Run Code Online (Sandbox Code Playgroud)
下载几页后,我收到如下错误:
Traceback(最近一次调用最后一次):
File "C:\Python27\img_google3.py", line 37, in <module>
dataInfo = data['results']
TypeError: 'NoneType' object has no attribute '__getitem__'
Run Code Online (Sandbox Code Playgroud)
该怎么办？
ris*_*r0y 41
我修改了我的代码.现在代码可以为给定查询下载100个图像,并且图像是全高分辨率,即正在下载原始图像.
我正在使用 urllib2 和 Beautiful Soup 下载图像
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os
import cookielib
import json
def get_soup(url,header):
return BeautifulSoup(urllib2.urlopen(urllib2.Request(url,headers=header)),'html.parser')
query = raw_input("query image")# you can change the query for the image here
image_type="ActiOn"
query= query.split()
query='+'.join(query)
url="https://www.google.co.in/search?q="+query+"&source=lnms&tbm=isch"
print url
#add the directory for your image here
DIR="Pictures"
header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
}
soup = get_soup(url,header)
ActualImages=[]# contains the link for Large original images, type of image
for a in soup.find_all("div",{"class":"rg_meta"}):
link , Type =json.loads(a.text)["ou"] ,json.loads(a.text)["ity"]
ActualImages.append((link,Type))
print "there are total" , len(ActualImages),"images"
if not os.path.exists(DIR):
os.mkdir(DIR)
DIR = os.path.join(DIR, query.split()[0])
if not os.path.exists(DIR):
os.mkdir(DIR)
###print images
for i , (img , Type) in enumerate( ActualImages):
try:
req = urllib2.Request(img, headers={'User-Agent' : header})
raw_img = urllib2.urlopen(req).read()
cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1
print cntr
if len(Type)==0:
f = open(os.path.join(DIR , image_type + "_"+ str(cntr)+".jpg"), 'wb')
else :
f = open(os.path.join(DIR , image_type + "_"+ str(cntr)+"."+Type), 'wb')
f.write(raw_img)
f.close()
except Exception as e:
print "could not load : "+img
print e
Run Code Online (Sandbox Code Playgroud)
我希望这可以帮助你
job*_*bin 21
谷歌图片搜索 API 已被弃用；要实现你的目标，需要改用谷歌自定义搜索（Google Custom Search）。要获取图像，您需要执行以下操作：
# NOTE: Python 2 snippet; queries the long-deprecated Google AJAX image API.
import urllib2
import simplejson
import cStringIO

fetcher = urllib2.build_opener()
searchTerm = 'parrot'
startIndex = 0
# BUG FIX: startIndex is an int; the original concatenated it directly to a
# str ("&start=" + startIndex), which raises TypeError. Convert with str().
searchUrl = ("http://ajax.googleapis.com/ajax/services/search/images?v=1.0&q="
             + searchTerm + "&start=" + str(startIndex))
f = fetcher.open(searchUrl)
# Deserialize the JSON response (4 results per page for this API).
deserialized_output = simplejson.load(f)
Run Code Online (Sandbox Code Playgroud)
这将以 JSON 形式返回 4 个结果；要获得更多结果，您需要在 API 请求中递增 startIndex 进行迭代。
要获取图像,您需要使用像cStringIO这样的库.
例如,要访问第一个图像,您需要执行以下操作:
# Pull the first result's full-resolution ("unescaped") URL out of the JSON.
imageUrl = deserialized_output['responseData']['results'][0]['unescapedUrl']
# BUG FIX: this answer never imported urllib (only urllib2), so
# urllib.urlopen raised NameError; reuse the opener built above instead.
file = cStringIO.StringIO(fetcher.open(imageUrl).read())
from PIL import Image  # PIL/Pillow is required here but was never imported
img = Image.open(file)
Run Code Online (Sandbox Code Playgroud)
这是我最新的 google image snarfer,用 Python 编写,使用 Selenium 和无头 Chrome。
它需要 python-selenium、chromium-driver，以及一个从 pip 安装的 retry 模块。
链接：http://sam.aiki.info/b/google-images.py
示例用法:
google-images.py tiger 10 --opts isz:lt,islt:svga,itp:photo > urls.txt
parallel=5
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
(i=0; while read url; do wget -e robots=off -T10 --tries 10 -U"$user_agent" "$url" -O`printf %04d $i`.jpg & i=$(($i+1)) ; [ $(($i % $parallel)) = 0 ] && wait; done < urls.txt; wait)
Run Code Online (Sandbox Code Playgroud)
帮助用法:
$ google-images.py --help
用法:google-images.py [-h] [--safe SAFE] [--opts OPTS] 查询 n
从 Google 图片搜索中获取图片网址。
位置参数:
查询图片搜索查询
n 图像数量(大约)
可选参数:
-h, --help 显示此帮助信息并退出
--safe SAFE 安全搜索 [off|active|images]
--opts OPTS 搜索选项,例如
isz:lt,islt:svga,itp:photo,ic:color,ift:jpg
代码:
#!/usr/bin/env python3
# requires: selenium, chromium-driver, retry
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import selenium.common.exceptions as sel_ex
import sys
import time
import urllib.parse
from retry import retry
import argparse
import logging
# Log to stderr at INFO so scraped URLs printed on stdout stay clean.
logging.basicConfig(stream=sys.stderr, level=logging.INFO)
logger = logging.getLogger()
# Pass a logger here to make the `retry` decorator log each retry attempt.
retry_logger = None

# CSS selectors for the Google Images results page. NOTE(review): these class
# names are auto-generated by Google and will break when the page changes.
css_thumbnail = "img.Q4LuWd"  # thumbnails in the result grid
css_large = "img.n3VNCb"      # full-size image in the preview pane
css_load_more = ".mye4qd"     # "Show more results" button

# Transient Selenium click failures that are worth retrying.
selenium_exceptions = (sel_ex.ElementClickInterceptedException, sel_ex.ElementNotInteractableException, sel_ex.StaleElementReferenceException)
def scroll_to_end(wd):
    """Scroll the browser window to the very bottom of the current page."""
    js_scroll = "window.scrollTo(0, document.body.scrollHeight);"
    wd.execute_script(js_scroll)
@retry(exceptions=KeyError, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def get_thumbnails(wd, want_more_than=0):
    """Click "show more" and return the thumbnail elements on the page.

    Raises KeyError (which the decorator retries with backoff) until the page
    holds strictly more than *want_more_than* thumbnails, giving lazy-loaded
    results time to appear.
    """
    # NOTE(review): find_elements_by_css_selector was removed in Selenium 4;
    # this script targets Selenium 3.
    wd.execute_script("document.querySelector('{}').click();".format(css_load_more))
    found = wd.find_elements_by_css_selector(css_thumbnail)
    if len(found) <= want_more_than:
        raise KeyError("no new thumbnails")
    return found
@retry(exceptions=KeyError, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def get_image_src(wd):
    """Return http(s) src URLs of the large image(s) in the preview pane.

    Raises KeyError (retried by the decorator) while only gstatic thumbnail
    or data: URLs are present, i.e. before the real image has loaded.
    """
    candidates = wd.find_elements_by_css_selector(css_large)
    srcs = (img.get_attribute("src") for img in candidates)
    good = [s for s in srcs
            if s.startswith("http") and not s.startswith("https://encrypted-tbn0.gstatic.com/")]
    if not good:
        raise KeyError("no large image")
    return good
@retry(exceptions=selenium_exceptions, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def retry_click(el):
    """Click *el*, retrying up to 6 times with backoff on transient Selenium click errors."""
    el.click()
def get_images(wd, start=0, n=20, out=None):
    """Collect up to *n* full-resolution image URLs from an open results page.

    wd    -- Selenium WebDriver already showing a Google Images results page
    start -- unused; kept for backward compatibility with existing callers
    n     -- approximate number of URLs wanted
    out   -- optional file-like object; each newly found URL is printed to it

    Returns the list of unique URLs harvested.
    """
    # Phase 1: scroll (and click "show more") until enough thumbnails exist
    # or the page refuses to produce new ones.
    thumbnails = []
    count = 0
    while count < n:
        scroll_to_end(wd)
        try:
            thumbnails = get_thumbnails(wd, want_more_than=count)
        except KeyError:
            logger.warning("cannot load enough thumbnails")
            break
        count = len(thumbnails)
    # Phase 2: click each thumbnail and harvest the large image's src,
    # falling back to the thumbnail URL when no large image is found.
    sources = []
    for tn in thumbnails:
        try:
            retry_click(tn)
        except selenium_exceptions:
            logger.warning("main image click failed")
            continue
        sources1 = []
        try:
            sources1 = get_image_src(wd)
        except KeyError:
            pass  # fall through to the thumbnail fallback below
        if not sources1:
            tn_src = tn.get_attribute("src")
            if not tn_src.startswith("data"):
                logger.warning("no src found for main image, using thumbnail")
                sources1 = [tn_src]
            else:
                logger.warning("no src found for main image, thumbnail is a data URL")
        for src in sources1:
            if src not in sources:
                sources.append(src)
                # BUG FIX: originally duplicates were printed to *out* even
                # though they were excluded from the returned list; print
                # only URLs that were actually added.
                if out:
                    print(src, file=out)
                    out.flush()
        if len(sources) >= n:
            break
    return sources
def google_image_search(wd, query, safe="off", n=20, opts='', out=None):
    """Search Google Images for *query* and return up to *n* image URLs.

    safe -- SafeSearch mode [off|active|images]
    opts -- raw 'tbs' search options string, e.g. "isz:lt,itp:photo"
    out  -- optional stream that receives each URL as it is found
    """
    base = "https://www.google.com/search?safe={safe}&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img&tbs={opts}"
    url = base.format(q=urllib.parse.quote(query),
                      opts=urllib.parse.quote(opts),
                      safe=safe)
    wd.get(url)
    return get_images(wd, n=n, out=out)
def main():
    """Parse CLI arguments, launch headless Chrome, print image URLs to stdout."""
    parser = argparse.ArgumentParser(description='Fetch image URLs from Google Image Search.')
    parser.add_argument('--safe', type=str, default="off", help='safe search [off|active|images]')
    parser.add_argument('--opts', type=str, default="", help='search options, e.g. isz:lt,islt:svga,itp:photo,ic:color,ift:jpg')
    parser.add_argument('query', type=str, help='image search query')
    parser.add_argument('n', type=int, default=20, help='number of images (approx)')
    args = parser.parse_args()
    opts = Options()
    opts.add_argument("--headless")
    # opts.add_argument("--blink-settings=imagesEnabled=false")
    # 'with' guarantees the browser is quit even if the scrape raises.
    with webdriver.Chrome(options=opts) as wd:
        google_image_search(wd, args.query, safe=args.safe, n=args.n, opts=args.opts, out=sys.stdout)


# BUG FIX: guard the entry point so importing this module (e.g. for reuse of
# google_image_search) does not immediately launch a browser and scrape.
if __name__ == "__main__":
    main()
Run Code Online (Sandbox Code Playgroud)
Google弃用他们的API,抓取Google很复杂,所以我建议使用Bing API:
https://datamarket.azure.com/dataset/5BA839F1-12CE-4CCE-BF57-A49D98D29A44
谷歌并不是那么好,微软也不是那么邪恶
| 归档时间: |
|
| 查看次数: |
70459 次 |
| 最近记录: |