What I want: to scrape the contents from one page, store them, and let the spider continue crawling so it can scrape and store items on the subsequent pages.
Currently my spider looks like this:
class mySpider(scrapy.Spider):
    ...
    def parse(self, response):
        for url in someurls:
            yield scrapy.Request(url=url, callback=self.parse_next)

    def parse_next(self, response):
        for selector in someselectors:
            yield {'contents': ...,
                   ...}
        nextPage = obtainNextPage()
        if nextPage:
            yield scrapy.Request(url=nextPage, callback=self.parse_next)
The problem is that, for the set of links the spider processes, it only ever reaches the "next page" of the last link in that set; I verified this with selenium + chromedriver. For example, if I have 10 links (No.1 to No.10), my spider only gets the next pages of the No.10 link. I don't know whether this happens because of some structural problem in my spider. Here is the full code:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://tieba.baidu.com']
    main_url = 'http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8'
    username = ""
    password = ""

    def __init__(self, username=username, password=password):
        #options = webdriver.ChromeOptions()
        #options.add_argument('headless')
        #options.add_argument('window-size=1200x600')
        self.driver = webdriver.Chrome()  #chrome_options=options)
        self.username = username
        self.password = password

    # checked
    def logIn(self):
        elem = self.driver.find_element_by_css_selector('#com_userbar > ul > li.u_login > div > a')
        elem.click()
        wait = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#TANGRAM__PSP_10__footerULoginBtn')))
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__footerULoginBtn')
        elem.click()
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__userName')
        elem.send_keys(self.username)
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__password')
        elem.send_keys(self.password)
        self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit').click()

    # basic checked
    def parse(self, response):
        self.driver.get(response.url)
        self.logIn()
        # wait for hand input verify code
        time.sleep(15)
        self.driver.get('http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8')
        for url in self.driver.find_elements_by_css_selector('a.j_th_tit')[:2]:
            #new_url = response.urljoin(url)
            new_url = url.get_attribute("href")
            yield scrapy.Request(url=new_url, callback=self.parse_next)

    # checked
    def pageScroll(self, url):
        self.driver.get(url)
        SCROLL_PAUSE_TIME = 0.5
        SCROLL_LENGTH = 1200
        page_height = int(self.driver.execute_script("return document.body.scrollHeight"))
        scrollPosition = 0
        while scrollPosition < page_height:
            scrollPosition = scrollPosition + SCROLL_LENGTH
            self.driver.execute_script("window.scrollTo(0, " + str(scrollPosition) + ");")
            time.sleep(SCROLL_PAUSE_TIME)
        time.sleep(1.2)

    def parse_next(self, response):
        self.log('I visited ' + response.url)
        self.pageScroll(response.url)
        for sel in self.driver.find_elements_by_css_selector('div.l_post.j_l_post.l_post_bright'):
            name = sel.find_element_by_css_selector('.d_name').text
            try:
                content = sel.find_element_by_css_selector('.j_d_post_content').text
            except:
                content = ''
            try:
                reply = sel.find_element_by_css_selector('ul.j_lzl_m_w').text
            except:
                reply = ''
            yield {'name': name, 'content': content, 'reply': reply}

        # follow to next page
        next_sel = self.driver.find_element_by_link_text("下一页")
        next_url_name = next_sel.text
        if next_sel and next_url_name == '下一页':
            next_url = next_sel.get_attribute('href')
            yield scrapy.Request(url=next_url, callback=self.parse_next)
Thanks for your help, and any suggestions on my code above are welcome.
Regarding scraping the contents from one page, storing them, and letting the spider continue crawling to scrape and store items on subsequent pages: you should set up your items.py file with the item names, and pass the item through each scrapy.Request using meta.
You should check out https://github.com/scrapy/scrapy/issues/1138
To illustrate how this works, it goes something like this... 1. First, we set up the items.py file with all the items to be scraped across the pages.
#items.py
import scrapy


class ScrapyProjectItem(scrapy.Item):
    page_one_item = scrapy.Field()
    page_two_item = scrapy.Field()
    page_three_item = scrapy.Field()
Then import the items.py item class into your scrapy spider.
from scrapyproject.items import ScrapyProjectItem
In your scraper, as it iterates through each page that holds the content you want, initialize the items.py class and pass the item along to the next request with 'meta'.
#spider.py
def parse(self, response):
    # Initializing the item class
    item = ScrapyProjectItem()
    # Itemizing the... item lol
    item['page_one_item'] = response.css("etcetc::").extract()  # set desired attribute
    # Here we pass the items to the next concurrent request
    for url in someurls:  # There's a million ways to skin a cat, don't know your exact use case.
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_next, meta={'item': item})

def parse_next(self, response):
    # We load the meta from the previous request
    item = response.meta['item']
    # We itemize
    item['page_two_item'] = response.css("etcetc::").extract()
    # We pass meta again to next request
    for url in someurls:
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_again, meta={'item': item})

def parse_again(self, response):
    # We load the meta from the previous request
    item = response.meta['item']
    # We itemize
    item['page_three_item'] = response.css("etcetc::").extract()
    # We pass meta again to next request
    for url in someurls:
        yield scrapy.Request(response.urljoin(url),
                             callback=self.parse_again, meta={'item': item})

    # At the end of each iteration of the crawl loop we can yield the result
    yield item
As for the problem of the crawler only reaching the last link, I would like more information instead of guessing what the problem could be. In your "parse_next" you should add a "print(response.url)" to check whether each page is actually being visited. I'm sorry if I didn't understand your problem and wasted everyone's time, haha.
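A minimal sketch of that check, assuming the parse_next from the question above (only the print/logging line is new):

def parse_next(self, response):
    # debug check: confirm that every requested url is actually reached
    print(response.url)  # or: self.logger.info(response.url)
    self.pageScroll(response.url)
    ...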
I think I understand your problem better now... you have a list of urls, and each of those urls has its own set of urls, yes?
In your code, "obtainNextPage()" might be the problem? In the past, when I've run into this type of situation, I've had to use some xpath and/or regex magic to obtain the next pages properly. I'm not sure what "obtainNextPage" is doing, but... have you thought of parsing the content and using a selector to find the next page? For example:
class mySpider(scrapy.Spider):
    ...
    def parse(self, response):
        for url in someurls:
            yield scrapy.Request(url=url, callback=self.parse_next)

    def parse_next(self, response):
        for selector in someselectors:
            yield {'contents': ...,
                   ...}
        #nextPage = obtainNextPage()
        next_page = response.xpath('//path/to/nextbutton/orPage').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse_next)
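Since your Selenium code looks the next-page link up by its visible text "下一页", one hedged way to fill in that placeholder XPath is to match on the same text. This is only a sketch: it assumes the pager link is a plain <a> element whose text is exactly "下一页" and whose href is present in the HTML that Scrapy receives (if the href is only injected by JavaScript, the plain response won't contain it):

# Assumption: the next-page anchor carries the visible text "下一页"
next_page = response.xpath('//a[text()="下一页"]/@href').extract_first()
if next_page is not None:
    yield scrapy.Request(response.urljoin(next_page),
                         callback=self.parse_next)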
You should still add the "print(response.url)" to see whether the requested url is being called correctly; it could also be a urljoin problem.
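If you want to see what the joining step actually produces, the behaviour can be reproduced with the standard library; the urls below are made up, and Scrapy's response.urljoin resolves relative urls in essentially the same way (it additionally honours a <base> tag in the page):

from urllib.parse import urljoin

page = 'http://tieba.baidu.com/f?kw=example&ie=utf-8'        # made-up page url
print(urljoin(page, '/p/123456'))                            # -> http://tieba.baidu.com/p/123456
print(urljoin(page, 'http://tieba.baidu.com/p/123456'))      # absolute hrefs come back unchanged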