In the Scrapy documentation, there is the following example of how to use an authenticated session in Scrapy:
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest
from scrapy import log

class LoginSpider(BaseSpider):
    name = 'example.com'
    start_urls = ['http://www.example.com/users/login.php']

    def parse(self, response):
        return [FormRequest.from_response(response,
                    formdata={'username': 'john', 'password': 'secret'},
                    callback=self.after_login)]

    def after_login(self, response):
        # check login succeeded before going on
        if "authentication failed" in response.body:
            self.log("Login failed", level=log.ERROR)
            return
        # continue scraping with authenticated session...
I have that working, and that's fine. But my question is: what do you actually need to do to "continue scraping with authenticated session", as the comment on the last line puts it?
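For illustration, a minimal sketch of what that continuation can look like: once the login response sets the session cookie, Scrapy's cookie middleware attaches it to every subsequent request automatically, so after_login only has to return (or yield) further Requests. The members URL and the parse_members_home callback here are made-up placeholders, not anything from the documentation:

# inside LoginSpider (requires: from scrapy.http import Request)
def after_login(self, response):
    if "authentication failed" in response.body:
        self.log("Login failed", level=log.ERROR)
        return
    # The session cookie is already held by the cookie middleware, so
    # any request issued from here on is made as the logged-in user.
    return Request("http://www.example.com/members/home",  # hypothetical URL
                   callback=self.parse_members_home)

def parse_members_home(self, response):
    # Parse pages that are only visible after logging in.
    pass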
So, I've read through Crawling with an authenticated session in Scrapy and I'm stuck: I'm 99% sure my parsing code is correct, I just don't believe that the login is redirecting and succeeding.
I'm also having trouble with check_login_response(), in that I don't know which page it is actually checking. Although checking for "Sign Out" would make sense.
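For reference, a hedged sketch of what check_login_response() could look like, assuming (and it is only an assumption) that the page returned after the login POST contains a "Sign Out" link exactly when authentication succeeded:

def check_login_response(self, response):
    # Assumption: a logged-in page shows a "Sign Out" link, a failed
    # login does not.
    if "Sign Out" in response.body:
        self.log("Successfully logged in; starting the real crawl.")
        # On an InitSpider (as in the question linked above), returning
        # self.initialized() hands control back so that the start_urls
        # are finally scheduled.
        return self.initialized()
    else:
        self.log("Login failed.")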
====== UPDATE ======
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

from linkedpy.items import LinkedPyItem

class LinkedPySpider(InitSpider):
    name = 'LinkedPy'
    allowed_domains = ['linkedin.com']
    login_page = 'https://www.linkedin.com/uas/login'
    start_urls = ["http://www.linkedin.com/csearch/results?type=companies&keywords=&pplSearchOrigin=GLHD&pageKey=member-home&search=Search#facets=pplSearchOrigin%3DFCTD%26keywords%3D%26search%3DSubmit%26facet_CS%3DC%26facet_I%3D80%26openFacets%3DJO%252CN%252CCS%252CNFR%252CF%252CCCR%252CI"]

    def init_request(self):
        #"""This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        #"""Generate a login request."""
        return FormRequest.from_response(response,
                    formdata={'session_key': 'user@email.com', 'session_password': 'somepassword'},
                    callback=self.check_login_response)

    def check_login_response(self, response):
        #"""Check …

I'm trying to use Scrapy to log in to a site in init, and then, once the login is confirmed, initialize and start the standard crawl through start_urls. I don't know what's going wrong, because I clearly log in and everything is confirmed, but parse_item never starts. Any help would be greatly appreciated.
I can get it to print "================ Successfully logged in =================", but I can never get to "========================== PARSE ITEM ==========================".
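One hedged observation before the spider code below: the rules attribute is processed by CrawlSpider, and InitSpider derives from BaseSpider, so as far as I can tell a Rule callback such as parse_item never fires on an InitSpider, which would match the symptom. Under that assumption, a minimal sketch of following links explicitly instead:

# Sketch, not the poster's code. Assumes the imports already used by
# the spider below (SgmlLinkExtractor, Request); parse() is a method
# of the spider class.
def parse(self, response):
    # Follow every in-page link by hand and route it to parse_item.
    for link in SgmlLinkExtractor(allow=()).extract_links(response):
        yield Request(link.url, callback=self.parse_item)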
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule
from selenium import webdriver

class ProductDetailsSpider(InitSpider):
    name = 'product_details_spider'
    allowed_domains = ['my_domain.com']
    login_page = 'http://www.my_domain.com/'
    start_urls = ['http://www.my_domain.com/nextpage1/',
                  'http://www.my_domain.com/nextpage2/',
                  'http://www.my_domain.com/nextpage3/']

    rules = (
        Rule(SgmlLinkExtractor(allow=()),
             callback='parse_item',
             follow=True),
    )

    def get_cookies(self):
        driver = webdriver.Firefox()
        driver.implicitly_wait(30)
        base_url = "http://www.my_domain.com"
        driver.get(base_url + "/")
        driver.find_element_by_name("USR").clear()
        driver.find_element_by_name("USR").send_keys("my_user")
        driver.find_element_by_name("PASSWRD").clear()
        driver.find_element_by_name("PASSWRD").send_keys("my_pass")
        driver.find_element_by_name("submit").click()
        cookies = driver.get_cookies()
        driver.close()
        cookie_dic = {}
        for c in cookies: …
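The loop body is truncated in the original post at this point. For completeness, a hedged sketch of the usual pattern for handing Selenium cookies to Scrapy; the helper name and the cookies= argument on the initial request are assumptions of mine, not the poster's code:

# Hedged sketch: Selenium's get_cookies() returns a list of dicts with
# 'name' and 'value' keys, which map onto a plain cookie dict.
def cookies_to_dict(selenium_cookies):  # hypothetical helper
    return dict((c['name'], c['value']) for c in selenium_cookies)

# Attaching the dict to the very first request lets Scrapy's cookie
# middleware carry the authenticated session from then on:
def init_request(self):  # inside the spider class
    return Request(url=self.login_page,
                   cookies=self.get_cookies(),
                   callback=self.initialized)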
So I'm building this spider, and it crawls just fine, in that I can log in through the shell, browse the HTML pages, and test my XPath queries. I don't know what I'm doing wrong; any help would be appreciated. I've already reinstalled Twisted, to no avail.
My spider looks like this:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

from spider_scrap.items import spiderItem

class spider(BaseSpider):
    name = "spider1"
    #allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com"
    ]

    def parse(self, response):
        items = []
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="search_results"]/div[1]/div')
        for site in sites:
            item = spiderItem()
            # Note: .extract() needs parentheses; without them the bound
            # method itself is stored instead of the extracted text.
            item['title'] = site.select('div[2]/h2/a/text()').extract()
            item['author'] = site.select('div[2]/span/a/text()').extract()
            item['price'] = site.select('div[3]/div[1]/div[1]/div/b/text()').extract()
            items.append(item)
        return items
When I run the spider with scrapy crawl Spider1, I get the following error:
2012-09-25 17:56:12-0400 [scrapy] DEBUG: Enabled item pipelines:
2012-09-25 17:56:12-0400 [Spider1] INFO: Spider opened
2012-09-25 17:56:12-0400 [Spider1] INFO: Crawled …
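One small thing worth checking, offered as an observation rather than a fix: the code registers the spider as name = "spider1", while the command shown is scrapy crawl Spider1. Spider names are matched case-sensitively, so the command has to spell the name attribute exactly, i.e. scrapy crawl spider1.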