Tags: python, unit-testing, scrapy, web-scraping, scrapy-spider
# Scrapy imports (paths as used by the older Scrapy release this code targets)
from scrapy.contrib.spiders import XMLFeedSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

# NewsItem, MailNotify, escape and remove_html_tags are project-specific
# helpers and are assumed to be importable from the project.


class AljazeeraSpider(XMLFeedSpider):
    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        'http://www.aljazeera.com/',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)  # the XPath selector
        titles = hxs.select('//div[contains(@class,"SkyScrapperBoxes")]/div[contains(@class,"skyscLines")]')
        if not titles:
            MailNotify().send_mail("Aljazeera", "Scraper Report")

        items = []
        for title in titles:
            item = NewsItem()
            item['title'] = escape(''.join(title.select('a/text()').extract()))
            item['link'] = "http://www.aljazeera.com" + escape(''.join(title.select('a/@href').extract()))
            item['description'] = ''
            request = Request(item['link'], meta={'item': item}, callback=self.parse_detail)
            items.append(request)
        return items

    def parse_detail(self, response):
        item = response.meta['item']
        sel = HtmlXPathSelector(response)
        detail = sel.select('//td[@class = "DetailedSummary"]')
        item['details'] = remove_html_tags(escape(''.join(detail.select('p').extract())))
        item['location'] = ''
        published_date = sel.select('//span[@id = "ctl00_cphBody_lblDate"]')
        item['published_date'] = escape(''.join(published_date.select('text()').extract()))
        return item
I am currently working with Scrapy to crawl websites, and I have some knowledge of unittest in Python. But how can I write unit tests that check whether the links are working and whether item['location'] and item['details'] actually get values? I have looked at Scrapy contracts but couldn't make sense of them. So how do I write unit tests in this case?
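For context, Scrapy contracts are simple annotations placed in a callback's docstring and evaluated by running scrapy check aljazeera against a live request; a minimal, purely illustrative sketch for the parse() callback above (the URL and counts are placeholders) would be:

    def parse(self, response):
        """Extract headline links from the front page and queue detail requests.

        @url http://www.aljazeera.com/
        @returns requests 1
        """
        ...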
Answer by alecxe:
If we are talking specifically about how to test the spiders (not pipelines or loaders), what we do is feed the callbacks a "fake response" built from a local HTML file. Sample code:
import os

from scrapy.http import Request, TextResponse


def fake_response(file_name=None, url=None):
    """Create a fake Scrapy HTTP response from an HTML file."""
    if not url:
        url = 'http://www.example.com'
    request = Request(url=url)
    if file_name:
        if not file_name[0] == '/':
            # resolve relative paths against the directory of this test module
            responses_dir = os.path.dirname(os.path.realpath(__file__))
            file_path = os.path.join(responses_dir, file_name)
        else:
            file_path = file_name
        with open(file_path, 'r') as f:
            file_content = f.read()
    else:
        file_content = ''
    response = TextResponse(url=url, request=request, body=file_content,
                            encoding='utf-8')
    return response
Then, in your TestCase class, call the fake_response() function and feed the response to the parse() callback:
from unittest.case import TestCase


class MyTestCase(TestCase):
    def setUp(self):
        self.spider = MySpider()

    def test_parse(self):
        response = fake_response('input.html')
        item = self.spider.parse(response)
        self.assertEqual(item['title'], 'My Title')
        # ...
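For the parse_detail() callback in the question, which expects a partially built item in response.meta, the same helper can be reused by attaching the meta before calling the callback. A rough sketch of another test method in the same TestCase, assuming setUp() instantiates the AljazeeraSpider above and that detail.html is a saved article page (file name and asserted values are placeholders):

    def test_parse_detail(self):
        response = fake_response('detail.html',
                                 url='http://www.aljazeera.com/news/example.html')
        # parse_detail() reads the partially filled item from response.meta,
        # so mimic what parse() would have passed along with the Request
        response.meta['item'] = NewsItem(title='My Title',
                                         link=response.url,
                                         description='')
        item = self.spider.parse_detail(response)
        self.assertTrue(item['details'])
        self.assertEqual(item['location'], '')
        self.assertTrue(item['published_date'])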
Apart from that, you should definitely start using Item Loaders with input and output processors. That gives you better modularity and, therefore, isolation: the spider only yields item instances, while data preparation and cleanup are encapsulated inside the loader, which you can then test separately.
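A minimal sketch of what that separation could look like, using the contrib-era loader API that matches the code above; the loader class, XPaths and processors here are illustrative, not part of the original answer:

from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst


class NewsLoader(XPathItemLoader):
    # strip whitespace on the way in, keep a single joined value on the way out
    default_input_processor = MapCompose(lambda value: value.strip())
    default_output_processor = TakeFirst()
    details_out = Join()


# in the spider, parse_detail() then shrinks to wiring XPaths to fields:
def parse_detail(self, response):
    loader = NewsLoader(item=response.meta['item'], response=response)
    loader.add_xpath('details', '//td[@class="DetailedSummary"]/p//text()')
    loader.add_xpath('published_date', '//span[@id="ctl00_cphBody_lblDate"]/text()')
    loader.add_value('location', '')
    return loader.load_item()

The processor functions themselves are then plain callables that you can unit-test directly, without building any fake response at all.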