如何告诉 python scrapy 移动到下一个起始 URL

mad*_*med 5 python scrapy web-scraping

我编写了一个 scrapy 蜘蛛,它有许多 start_url 并在这些 url 中提取电子邮件地址。该脚本需要很长时间才能执行,因此我想告诉 Scrapy 在发现电子邮件时停止抓取特定网站并移至下一个网站。

编辑:添加代码

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
import csv
from urlparse import urlparse

from entreprise.items import MailItem

class MailSpider(CrawlSpider):
    """Crawl every start URL and extract e-mail addresses from page text.

    NOTE(review): this is the problematic version from the question -- it
    has no mechanism to stop crawling a site once an e-mail is found.
    """
    name = "mail"
    start_urls = []
    allowed_domains = []
    # Runs at class-definition (import) time: populate start_urls and
    # allowed_domains from column 5 of the CSV file.
    with open('scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)  # skip the CSV header row
        for row in reader:
            url = row[5].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                fragments = urlparse(url).hostname.split(".")
                # Keep the last 3 labels when the second-level label is
                # short (e.g. "example.co.uk"), otherwise the last 2
                # (e.g. "example.com").
                hostname = ".".join(len(fragments[-2]) < 4 and fragments[-3:] or fragments[-2:])
                allowed_domains.append(hostname)

    # Two rules with identical extractors; the second one (follow is
    # False by default when a callback is given) is redundant, since the
    # first already follows links and calls parse_item.
    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def parse_item(self, response):
        """Return a MailItem for each e-mail-like string in the body text."""
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)
        return items
Run Code Online (Sandbox Code Playgroud)

ale*_*cxe 2

这个想法是使用 start_requests 方法来决定接下来要抓取哪些 url。此外,我们会在类级别的 parsed_hostnames 集合中记录那些已经解析出电子邮件的主机名。

另外,我还更改了从 url 获取主机名的方式——现在直接使用 urlparse 来提取。

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse


class MailItem(Item):
    """A scraped result: one e-mail address and the page it was found on."""
    mail = Field()
    url = Field()


class MailSpider(CrawlSpider):
    """Crawl each start URL, stopping work on a site once an e-mail is found.

    start_requests() is a generator, so it is consumed lazily while the
    crawl runs: hostnames recorded in parsed_hostnames by parse_item()
    cause later CSV rows for the same host to be skipped, and the host is
    removed from the allowed/extractable domain sets so pending links to
    it are dropped.
    """
    name = "mail"

    # Hostnames for which at least one e-mail has already been scraped.
    parsed_hostnames = set()
    allowed_domains = []

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def start_requests(self):
        """Yield one request per CSV row, skipping already-parsed hosts."""
        with open('scraped_data.csv', 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)  # skip the CSV header row

            for row in reader:
                url = row[5].strip()
                if url:
                    hostname = urlparse(url).hostname
                    if hostname not in self.parsed_hostnames:
                        if hostname not in self.allowed_domains:
                            self.allowed_domains.append(hostname)
                            self.rules[0].link_extractor.allow_domains.add(hostname)
                            self.rules[1].link_extractor.allow_domains.add(hostname)

                        yield self.make_requests_from_url(url)
                    else:
                        # BUGFIX: the original removed unconditionally,
                        # which raises ValueError/KeyError when the same
                        # hostname occurs in several CSV rows and was
                        # already removed on an earlier iteration.
                        if hostname in self.allowed_domains:
                            self.allowed_domains.remove(hostname)
                        self.rules[0].link_extractor.allow_domains.discard(hostname)
                        self.rules[1].link_extractor.allow_domains.discard(hostname)

    def parse_item(self, response):
        """Collect e-mail addresses from body text and mark the host parsed."""
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)

        # Record the host so start_requests() skips its remaining rows
        # and strips it from the link extractors.
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)

        return items
Run Code Online (Sandbox Code Playgroud)

理论上应该有效。希望有帮助。