您好,我已经安装了 ScrapyJS + Splash,并使用以下代码:
import json
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spider import Spider
from scrapy.selector import Selector
import urlparse, random
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["whoscored.com"]
start_urls = ['http://www.whoscored.com/Regions/81/Tournaments/3/Seasons/4336/Stages/9192/Fixtures/Germany-Bundesliga-2014-2015']
def start_requests(self):
    """Yield one Splash-rendered request for every URL in ``start_urls``.

    Each request is handled by ``self.parse`` and carries a ``splash``
    entry in its meta that selects the ``render.html`` endpoint and passes
    a ``wait`` argument of 0.5.
    """
    for start_url in self.start_urls:
        # A new meta dict is created for each request, so requests never
        # share one.
        render_args = {'wait': 0.5}
        splash_options = {'endpoint': 'render.html', 'args': render_args}
        yield scrapy.Request(start_url, self.parse, meta={'splash': splash_options})
def parse(self, response):
cnt = 0
with open('links2.txt', 'a') as f:
while True:
try:
data = ''.join(Selector(text=response.body).xpath('//a[@class="match-link match-report rc"]/@href')[cnt].extract())
data = "https://www.whoscored.com"+data
except:
break
f.write(data+'\n')
cnt += 1 …
Run Code Online (Sandbox Code Playgroud)
我正在使用 ScrapyJS 和 Splash 模拟点击表单的提交按钮
def start_requests(self):
    """Yield a Splash ``execute`` request for every URL in ``start_urls``.

    The Lua script sent as ``lua_source`` autoloads jQuery, opens the
    requested URL, fills in the ``#USER`` and ``#password`` fields, clicks
    the button link, waits for the in-page script to call
    ``splash.resume()`` and returns the page HTML. Responses are passed to
    ``self.after_login``.
    """
    # The script text is kept flush-left on purpose: it is a runtime string,
    # and indenting it would change the text sent to Splash.
    lua_source = """
function main(splash)
assert(splash:autoload("https://ajax.googleapis.com/ajax/libs/jquery/2.1.3/jquery.min.js"))
assert(splash:go(splash.args.url))
local js = [[
var $j = jQuery.noConflict();
$j('#USER').val('frankcastle');
$j('#password').val('punisher');
$j('.button-oblong-orange.button-orange a').click();
]]
assert(splash:runjs(js))
local resumeJs = [[
function main(splash) {
var $j = jQuery.noConflict();
$j(document).ready(function(){
splash.resume();
})
}
]]
assert(splash:wait_for_resume(resumeJs))
return {
html = splash:html()
}
end
"""
    execute_args = {'wait': 0.5, 'lua_source': lua_source}
    # A single meta dict is built up front and shared by every request.
    shared_meta = {'splash': {'endpoint': 'execute', 'args': execute_args}}
    for start_url in self.start_urls:
        yield scrapy.Request(start_url, self.after_login, meta=shared_meta)
def after_login(self, response):
    """Print the body of the response produced by the login script.

    Args:
        response: The response object; only its ``body`` attribute is read.

    Returns:
        None.
    """
    # print(...) with a single argument behaves exactly like the old print
    # statement on Python 2 and is also valid syntax on Python 3.
    print(response.body)
Run Code Online (Sandbox Code Playgroud)
在执行 splash:runjs(js) 之后,我尝试了 splash:wait(5)、splash:wait_for_resume …
我正在尝试从一个使用如下 JavaScript 的页面中获取网址:
<!-- Clicking this span runs go1(), which is defined in the script below. -->
<span onclick="go1()">click here </span>
<script>function go1(){
// Navigate to a relative URL built from `myname`; `myname` is not defined
// in this snippet, so it presumably comes from elsewhere on the page.
window.location = "../innerpages/" + myname + ".php";
}
</script>
Run Code Online (Sandbox Code Playgroud)
这是我使用 ScrapyJS 和 Splash 的代码:
def start_requests(self):
    """Yield a Splash ``render.html`` request for every URL in ``start_urls``.

    The Splash arguments pass a ``wait`` of 4, set ``html``, ``png`` and
    ``render_all`` to 1, and supply a ``js_source`` snippet that clicks the
    first ``<span>`` element of the page. Responses go to ``self.parse``.
    """
    for start_url in self.start_urls:
        # A new argument dict is created for each request.
        render_args = {
            'wait': 4,
            'html': 1,
            'png': 1,
            'render_all': 1,
            'js_source': 'document.getElementsByTagName("span")[0].click()',
        }
        splash_options = {'endpoint': 'render.html', 'args': render_args}
        yield Request(start_url, self.parse, meta={'splash': splash_options})
Run Code Online (Sandbox Code Playgroud)
如果我写
'js_source': 'document.title="hello world"'
Run Code Online (Sandbox Code Playgroud)
它就能正常工作。
似乎我可以处理页面内的文本,但无法获取 go1() 中的 URL。
如果我想获取 go1() 中的网址,该怎么办?
谢谢!
我想在 requests 库中使用 Splash,就像这样:
# NOTE: `meta` is not a keyword argument accepted by requests.post(); this
# call fails with the TypeError quoted below the snippet. The `splash` dict
# has the shape used with scrapy.Request, not with the requests library.
requests.post(myUrl,headers=myHeaders, data=payload, meta={
'splash': {
'endpoint': 'render.html',
'args': {'wait': 1}
}
})
Run Code Online (Sandbox Code Playgroud)
但我遇到了这个错误:
TypeError: request() got an unexpected keyword argument 'meta'
Run Code Online (Sandbox Code Playgroud)
我知道这种写法是针对 scrapy.Request 的,但我想在 requests 库中使用它