DrL*_*zer 7 python authentication mechanize beautifulsoup scrapy
我想从 Ubuntu 服务器上运行的脚本登录我的 Yahoo 帐户。我曾尝试用 Python 的 mechanize 库来实现,但我的方案行不通。
这是我目前的代码.
# Log in to Yahoo through a mechanize browser and print the page that comes
# back after the login form is submitted.
loginurl = "https://login.yahoo.com/config/login"

# Browser instance with an LWPCookieJar attached to it.
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Turn on the equiv, gzip, redirect and referer handlers (in that order),
# then turn off robots.txt handling.
for enable_handler in (br.set_handle_equiv, br.set_handle_gzip,
                       br.set_handle_redirect, br.set_handle_referer):
    enable_handler(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# Send a Firefox User-agent header instead of the library default.
user_agent = ('Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
              'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')
br.addheaders = [('User-agent', user_agent)]

# Open the login page, fill in the first form on it and submit.
r = br.open(loginurl)
html = r.read()
br.select_form(nr=0)
for field, value in (('login', '[mylogin]'), ('passwd', '[mypassword]')):
    br.form[field] = value
br.submit()

# Single-argument call form prints the same text as the print statement.
print(br.response().read())
Run Code Online (Sandbox Code Playgroud)
我得到的响应是雅虎登录页面,上面有一行醒目的红色文字:"必须在您的浏览器中启用 JavaScript"或类似的内容。mechanize 文档中有一节提到了用 JS 创建 cookie 的页面,但该帮助页面返回 HTTP 400(我的运气真差)。
弄清楚javascript的功能,然后手动执行它听起来像是一项非常困难的任务.我愿意切换到任何工具/语言,只要它可以在ubuntu服务器上运行.即使这意味着使用不同的工具进行登录,然后将登录cookie传递回我的python脚本.任何帮助/建议表示赞赏.
更新:
我不想使用Yahoo API
我也尝试过scrapy,但我认为同样的问题也会发生
我的scrapy脚本
class YahooSpider(BaseSpider):
    """Spider that submits the Yahoo login form and then requests one page.

    ``parse`` handles the login page from ``start_urls``, ``after_login``
    checks where the login POST ended up, and ``next_page`` prints the
    title of the page requested after a successful login.
    """

    name = "yahoo"
    start_urls = [
        "https://login.yahoo.com/config/login?.intl=us&.lang=en-US&.partner=&.last=&.src=&.pd=_ver%3D0%26c%3D%26ivt%3D%26sg%3D&pkg=&stepid=&.done=http%3a//my.yahoo.com"
    ]

    def parse(self, response):
        """Print the value of every ``<input>`` and submit the login form.

        Returns a one-element list holding the ``FormRequest`` built from
        the form found in ``response``; its callback is ``after_login``.
        """
        x = HtmlXPathSelector(response)
        print(x.select("//input/@value").extract())
        return [FormRequest.from_response(
            response,
            formdata={'login': '[my username]', 'passwd': '[mypassword]'},
            callback=self.after_login)]

    def after_login(self, response):
        """Follow up on the login POST.

        Returns a ``Request`` for the next page when the response URL is
        exactly ``http://my.yahoo.com``; otherwise prints the URL, logs a
        critical "Login failed." message and returns ``None``.
        """
        # check login succeed before going on
        if response.url == 'http://my.yahoo.com':
            # NOTE(review): self.error is not defined in this class -- confirm
            # that the base class (or code outside this snippet) provides it.
            return Request("[where i want to go next]",
                           callback=self.next_page, errback=self.error,
                           dont_filter=True)
        else:
            print(response.url)
            self.log("Login failed.", level=log.CRITICAL)

    def next_page(self, response):
        """Print the text of the ``<title>`` element(s) of ``response``."""
        # The receiver was previously misspelled ``sekf``; it is only ever
        # passed positionally, so the rename does not affect callers.
        x = HtmlXPathSelector(response)
        print(x.select("//title/text()").extract())
Run Code Online (Sandbox Code Playgroud)
scrapy脚本只输出"https://login.yahoo.com/config/login"...... boo
我很惊讶这有效:
Python 2.6.6 (r266:84292, Dec 26 2010, 22:31:48)
[GCC 4.4.5] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> from BeautifulSoup import BeautifulSoup as BS
>>> import requests
>>> r = requests.get('https://login.yahoo.com/')
>>> soup = BS(r.text)
>>> login_form = soup.find('form', attrs={'name':'login_form'})
>>> hiddens = login_form.findAll('input', attrs={'type':'hidden'})
>>> payload = {}
>>> for h in hiddens:
... payload[str(h.get('name'))] = str(h.get('value'))
...
>>> payload['login'] = 'testtest481@yahoo.com'
>>> payload['passwd'] = '********'
>>> post_url = str(login_form.get('action'))
>>> r2 = requests.post(post_url, cookies=r.cookies, data=payload)
>>> r3 = requests.get('http://my.yahoo.com', cookies=r2.cookies)
>>> page = r3.text
>>> pos = page.find('testtest481')
>>> print page[ pos - 50 : pos + 300 ]
You are signed in as: <span class="yuhead-yid">testtest481</span> </li> </ul></li><li id="yuhead-me-signout" class="yuhead-me"><a href="
http://login.yahoo.com/config/login?logout=1&.direct=2&.done=http://www.yahoo.com&.src=my&.intl=us&.lang=en-US" target="_top" rel="nofoll
ow"> Sign Out </a><img width='0' h
>>>
Run Code Online (Sandbox Code Playgroud)
请尝试一下:
"""
ylogin.py - how-to-login-to-yahoo-programatically-from-an-ubuntu-server
http://stackoverflow.com/questions/11974478/
Test my.yahoo.com login using requests and BeautifulSoup.
"""
from BeautifulSoup import BeautifulSoup as BS
import requests
# Form fields carrying the account credentials; replace both placeholders.
CREDS = dict(
    login='CHANGE ME',
    passwd='CHANGE ME',
)

# Endpoints used by the script: the page holding the login form, the URL the
# form data is posted to, and the page requested once logged in.
URLS = dict(
    login='https://login.yahoo.com/',
    post='https://login.yahoo.com/config/login?',
    home='http://my.yahoo.com/',
)
def test():
    """Check that the cookies from get_logged_in_cookies() give a signed-in page.

    Requests URLS['home'] with those cookies, asserts that the page text
    contains 'You are signed in', then prints a confirmation message.
    """
    home_page = requests.get(URLS['home'], cookies=get_logged_in_cookies())
    assert 'You are signed in' in home_page.text
    print("If you can see this message you must be logged in.")
def get_logged_in_cookies():
    """Submit the Yahoo login form and return the cookies of the POST response.

    Fetches URLS['login'], collects the hidden inputs of the form named
    ``login_form``, merges them with CREDS and posts the result to
    URLS['post'] together with the cookies from the first response.

    Returns:
        The ``cookies`` attribute of the POST response.

    Raises:
        RuntimeError: if the fetched page has no form named ``login_form``.
    """
    req = requests.get(URLS['login'])

    login_form = BS(req.text).find('form', attrs={'name': 'login_form'})
    if login_form is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError(
            "no form named 'login_form' found at %s" % URLS['login'])

    # Start from the credentials; hidden fields are applied afterwards so a
    # hidden field with the same name still takes precedence, as before.
    data = dict(CREDS)
    for hidden in login_form.findAll('input', attrs={'type': 'hidden'}):
        field_name = hidden.get('name')
        if field_name is None:
            # An input without a name cannot be submitted as a form field.
            continue
        data[field_name] = hidden.get('value')

    post_req = requests.post(URLS['post'], cookies=req.cookies, data=data)
    return post_req.cookies
# Runs the login check as soon as this module is loaded (no __main__ guard).
test()
Run Code Online (Sandbox Code Playgroud)
根据需要添加错误处理。
| 归档时间: |
|
| 查看次数: |
3503 次 |
| 最近记录: |