如何使用 selenium 和 scrapy 来自动化该过程?

Joh*_*ene 2 python ajax selenium scrapy

我一度知道你需要使用像 selenium 这样的网络工具包来自动化抓取。

我将如何能够点击谷歌游戏商店上的下一个按钮,以便仅出于我的大学目的抓取评论!

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from selenium import webdriver
import time


class Product(scrapy.Item):
    title = scrapy.Field()


class FooSpider(CrawlSpider):
    name = 'foo'

    start_urls = ["https://play.google.com/store/apps/details?id=com.gaana&hl=en"]

    def __init__(self, *args, **kwargs):
        super(FooSpider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Chrome(executable_path="C:\chrm\chromedriver.exe")
        self.browser.implicitly_wait(60) # 

    def parse(self,response):
        self.browser.get(response.url)
        sites = response.xpath('//div[@class="single-review"]/div[@class="review-header"]')
        items = []
        for i in range(0,200):
            time.sleep(20)
            button = self.browser.find_element_by_xpath("/html/body/div[4]/div[6]/div[1]/div[2]/div[2]/div[1]/div[2]/button[1]/div[2]/div/div")
            button.click()
            self.browser.implicitly_wait(30)    
            for site in sites:
                item = Product()

                item['title'] = site.xpath('.//div[@class="review-info"]/span[@class="author-name"]/a/text()').extract()
                yield item
Run Code Online (Sandbox Code Playgroud)

我已经更新了我的代码,但它只一次又一次地给我重复的 40 个项目。我的 for 循环有什么问题吗?

似乎正在更新的源代码没有传递到 xpath,这就是为什么它返回相同的 40 个项目

Tim*_*fey 5

我会做类似的事情:

from scrapy import CrawlSpider
from selenium import webdriver
import time

class FooSpider(CrawlSpider):
    name = 'foo'
    allow_domains = 'foo.com'
    start_urls = ['foo.com']

    def __init__(self, *args, **kwargs):
        super(FooSpider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(60)

    def parse_foo(self.response):
        self.browser.get(response.url)  # load response to the browser
        button = self.browser.find_element_by_xpath("path") # find 
        # the element to click to
        button.click() # click
        time.sleep(1) # wait until the page is fully loaded
        source = self.browser.page_source # get source of the loaded page
        sel = Selector(text=source) # create a Selector object
        data = sel.xpath('path/to/the/data') # select data
        ...
Run Code Online (Sandbox Code Playgroud)

不过,最好不要等待固定的时间。因此 time.sleep(1),您可以使用此处描述的方法之一来代替http://www.obeythetestinggoat.com/how-to-get-selenium-to-wait-for-page-load-after-a-click.html

  • 它仍然没有在浏览器中加载 url (3认同)