我正在使用Scrapy中的XMLFeedSpider抓取一个房地产网站.
我的蜘蛛生成的每个网址请求(通过start_urls)都会返回一个XML格式的网页,其中包含大量广告和指向下一页的链接(搜索结果仅限于50个广告).
因此,我想知道如何在蜘蛛中添加这个额外的页面作为新请求?
我在stackoverflow上搜索了一段时间,但没能找到针对我的问题的简单答案!
下面是我蜘蛛中的代码.我已经使用保罗提到的parse_nodes()方法更新了它,但由于某些原因未找到下一个网址.
我可以在adapt_response方法中产生额外的请求吗?
from scrapy.spider import log
from scrapy.selector import XmlXPathSelector
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import RefItem, PicItem
from crawler.seloger_helper import urlbuilder
from scrapy.http import Request
class Seloger_spider_XML(XMLFeedSpider):
    """Spider for the seloger.com XML search feed.

    Every ``<annonce>`` node of a response becomes one ``RefItem`` followed by
    one ``PicItem`` per photo thumbnail. After the nodes of a page have been
    processed, each ``<pageSuivante>`` link found in the response is requested
    so that the following result page is crawled as well.
    """

    name = 'Seloger_spider_XML'
    allowed_domains = ['seloger.com']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'annonce'

    def __init__(self, departement=None, *args, **kwargs):
        """Initialize the spider with a department as argument.

        NOTE(review): ``departement`` is accepted but currently unused; the
        start URL is hard-coded while the ``urlbuilder`` call below stays
        disabled.
        """
        super(Seloger_spider_XML, self).__init__(*args, **kwargs)
        # self.start_urls = urlbuilder(departement)  # helper function which generate start_urls
        self.start_urls = ['http://ws.seloger.com/search.xml?cp=72&idtt=2&tri=d_dt_crea&SEARCHpg=1']

    @staticmethod
    def _child_text(node, tag):
        """Return the concatenated text of the ``tag`` children of ``node``.

        The XPath is relative to ``node``, so the lookup stays scoped to the
        current ``<annonce>`` instead of searching the whole document.
        Returns an empty string when the child is absent.
        """
        return ''.join(node.select('%s/text()' % tag).extract())

    def parse_node(self, response, node):
        """Build the items for a single ``<annonce>`` node.

        Returns a list whose first element is the ``RefItem`` describing the
        listing, followed by one ``PicItem`` per thumbnail URL found under
        ``photos/photo/thbUrl``.

        Raises ``ValueError`` if ``idAnnonce`` is missing or not an integer,
        because the reference is converted with ``int()``.
        """
        text = self._child_text

        item = RefItem()
        item['ref'] = int(text(node, 'idAnnonce'))

        # Free-text fields are stored as UTF-8 encoded byte strings.
        item['desc'] = text(node, 'descriptif').encode('utf-8')
        item['libelle'] = text(node, 'libelle').encode('utf-8')
        item['titre'] = text(node, 'titre').encode('utf-8')
        item['ville'] = text(node, 'ville').encode('utf-8')
        item['url'] = text(node, 'permaLien').encode('utf-8')

        item['prix'] = text(node, 'prix')
        item['prixunite'] = text(node, 'prixUnite')

        # Only the first 10 characters of each timestamp are kept.
        item['datemaj'] = text(node, 'dtFraicheur')[:10]
        item['datecrea'] = text(node, 'dtCreation')[:10]

        item['lati'] = text(node, 'latitude')
        item['longi'] = text(node, 'longitude')
        item['surface'] = text(node, 'surface')
        item['surfaceunite'] = text(node, 'surfaceUnite')
        item['piece'] = text(node, 'nbPiece').encode('utf-8')
        # NOTE(review): tag name kept exactly as in the original code;
        # confirm 'dbilanEmissionGES' against the feed.
        item['ce'] = text(node, 'dbilanEmissionGES').encode('utf-8')

        items = [item]
        for link in node.select('photos/photo/thbUrl/text()').extract():
            pic = PicItem()
            pic['pic'] = link.encode('utf-8')
            pic['refpic'] = item['ref']
            items.append(pic)
        return items

    def parse_nodes(self, response, nodes):
        """Yield the parsed items, then requests for the next result page(s).

        The built-in ``parse_nodes`` (which calls ``parse_node``) is consumed
        first; afterwards the response is searched for ``<pageSuivante>``
        links and a ``Request`` is yielded for every non-blank one.
        """
        for n in super(Seloger_spider_XML, self).parse_nodes(response, nodes):
            yield n

        # once you're done with item/nodes
        # look for the next page link using XPath
        # these lines are borrowed from
        # https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#L73
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        for link_url in selector.select('//pageSuivante/text()').extract():
            # Surrounding whitespace would make the URL invalid, and a blank
            # element means there is no further page to request.
            link_url = link_url.strip()
            if link_url:
                yield Request(link_url)
Run Code Online (Sandbox Code Playgroud)
谢谢,吉尔斯
您可以覆盖parse_nodes()方法以挂钩"下一页"URL提取.
以下示例基于Scrapy文档中的XMLFeedSpider示例:
from scrapy import log
from scrapy.contrib.spiders import XMLFeedSpider
from myproject.items import TestItem
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request
class MySpider(XMLFeedSpider):
    """Example feed spider that also follows "next page" links.

    ``parse_node`` turns each ``<item>`` node into a ``TestItem``;
    ``parse_nodes`` is overridden to additionally request every URL found in
    a ``<pageSuivante>`` element of the response.
    """

    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.xml']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'item'

    def parse_node(self, response, node):
        """Log the node and return a ``TestItem`` filled from it."""
        log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))

        # The item class imported by this file is TestItem; the bare name
        # ``Item`` is not imported and would raise NameError.
        item = TestItem()
        item['id'] = node.select('@id').extract()
        item['name'] = node.select('name').extract()
        item['description'] = node.select('description').extract()
        return item

    def parse_nodes(self, response, nodes):
        """Yield the parsed items, then a request per next-page link."""
        # call built-in method that itself calls parse_node()
        # and yield whatever it returns
        for n in super(MySpider, self).parse_nodes(response, nodes):
            yield n

        # once you're done with item/nodes
        # look for the next page link using XPath
        # these lines are borrowed from
        # https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/spiders/feed.py#L73
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        for link_url in selector.select('//pageSuivante/text()').extract():
            # Single-argument call form prints the same text on Python 2 and
            # also parses on Python 3.
            print("link_url %s" % link_url)
            yield Request(link_url)
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
2803 次 |
| 最近记录: |