adr*_*ian 4 python scrapy web-scraping
我正在使用 Scrapy 的 FormRequest 抓取此网页：https://researchgrant.gov.sg/eservices/advanced-search/?keyword=&source=sharepoint&type=project&status=open&_pp_projectstatus=&_pp_hiname=&_pp_piname=&_pp_source=sharepoint&_pp_details=#project 。我的代码如下。当参数 `_pp_hiname` 为 `ab`、`_pp_piname` 为 `pua` 时，应该只返回 1 个结果，但 `response.text` 却返回了包含所有结果的 HTML 代码。参数显然没有生效，但我看不出问题出在哪里。
def start_requests(self):
    """Yield one XHR-style POST request to the ``mvcgrid`` endpoint.

    Every search field, together with the grid ``name``, is sent as form
    data in the request body. The response is handed to ``parse_item``.
    """
    form_fields = dict(
        keyword='',
        source='sharepoint',
        type='project',
        status='open',
        page='1',
        _pp_projectstatus='',
        _pp_hiname='ab',
        _pp_piname='pua',
        _pp_source='',
        _pp_details='',
        name='advancesearchawardedprojectsp',
    )
    # The header marks the request as an AJAX call.
    ajax_headers = {'X-Requested-With': 'XMLHttpRequest'}
    request = scrapy.FormRequest(
        'https://researchgrant.gov.sg/eservices/mvcgrid',
        callback=self.parse_item,
        method='POST',
        formdata=form_fields,
        headers=ajax_headers,
    )
    yield request
def parse_item(self, response):
    """Print the decoded body of ``response`` for inspection."""
    body = response.text
    print(body)
Run Code Online (Sandbox Code Playgroud)
最新更新:
class ToScrapeCSSSpiderSG(scrapy.Spider):
    """Walk the paginated ``mvcgrid`` result grid and print project titles.

    Each grid page is requested with a POST whose body carries ``params``
    and whose query string carries ``args``. Every project link found in
    the grid is followed and its title is printed by ``parse_product``.
    """

    name = "toscrapesg-css"

    # Form data sent in the POST body of every grid request.
    params = {
        'name': 'advancesearchawardedprojectsp'
    }
    # Search filters sent in the URL query string; ``page`` is only the
    # number of the first page and is never modified at runtime.
    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '',
        '_pp_source': '',
        '_pp_details': '',
        'name': 'advancesearchawardedprojectsp'
    }

    def _grid_request(self, page):
        """Return the POST request that fetches grid page number ``page``.

        The page number is also stored in ``meta`` so that ``parse_item``
        can compute the following page without mutating shared state.
        """
        query = urllib.parse.urlencode({**self.args, 'page': page})
        url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + query
        return scrapy.FormRequest(
            url,
            callback=self.parse_item,
            method='POST',
            formdata=self.params,
            headers={'X-Requested-With': 'XMLHttpRequest'},
            meta={'page': page},
        )

    def start_requests(self):
        """Yield the request for the first grid page."""
        yield self._grid_request(self.args['page'])

    def parse_item(self, response):
        """Follow every project link on a grid page, then request the next page."""
        for wrapper in response.xpath('//div[contains(@style,"overflow-x:auto")]'):
            for row in wrapper.xpath('./table[contains(@class,"table-striped")]/tbody/tr'):
                link = row.xpath('td[1]/a/@href').extract_first()
                if not link:
                    # A row without a link cannot be followed.
                    continue
                # urljoin leaves absolute URLs unchanged and resolves relative ones.
                yield scrapy.Request(response.urljoin(link), callback=self.parse_product)

        # A "Next page" anchor with an onclick handler means more pages exist.
        onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()
        if onclick:
            current_page = response.meta.get('page', self.args['page'])
            yield self._grid_request(current_page + 1)

    def parse_product(self, response):
        """Print the list of text nodes found in the project-title span."""
        text = response.xpath('//span[contains(@id,"ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjectTitle")]/text()').extract()
        print(text)
Run Code Online (Sandbox Code Playgroud)
该网页只在 POST 正文中发送 `Name=advancesearchawardedprojectsp`，其他参数应该作为查询字符串放在 URL 中。
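所以 URL 应该是：`https://researchgrant.gov.sg/eservices/mvcgrid?keyword=&source=sharepoint&type=project&status=open&page=1&_pp_projectstatus=&_pp_hiname=ab&_pp_piname=pua&_pp_source=&_pp_details=`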
您可以使用 `urllib.parse.urlencode(args)` 来生成这个查询字符串。
这样请求之后，我得到了 1 个结果。
import urllib.parse
def start_requests(self):
    """Yield one XHR-style POST request for the filtered result grid.

    Only the grid ``name`` is sent in the POST body. The search filters
    are URL-encoded and appended to the endpoint as a query string.
    """
    body_fields = {'name': 'advancesearchawardedprojectsp'}
    filters = dict(
        keyword='',
        source='sharepoint',
        type='project',
        status='open',
        page='1',
        _pp_projectstatus='',
        _pp_hiname='ab',
        _pp_piname='pua',
        _pp_source='',
        _pp_details='',
    )
    query_string = urllib.parse.urlencode(filters)
    endpoint = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + query_string
    yield scrapy.FormRequest(
        endpoint,
        callback=self.parse_item,
        method='POST',
        formdata=body_fields,
        headers={'X-Requested-With': 'XMLHttpRequest'},
    )
Run Code Online (Sandbox Code Playgroud)
编辑：下面的示例会继续加载下一页，并通过检查 `Next Page` 按钮是否存在来决定何时停止。
编辑：现在它还可以把结果保存到 csv 文件中。
import scrapy
import urllib.parse
class MySpider(scrapy.Spider):
    """Scrape project details from the paginated ``mvcgrid`` result grid.

    Each grid page is requested with a POST whose body carries ``params``
    and whose query string carries ``args``. Every project link in the
    grid is followed, and ``parse_product`` yields one item per project.
    """

    name = 'myspider'

    # Form data sent in the POST body of every grid request.
    params = {
        'name': 'advancesearchawardedprojectsp'
    }
    # Search filters sent in the URL query string; ``page`` is only the
    # number of the first page and is never modified at runtime.
    # Alternative filters to try: '_pp_hiname': 'tan' or '_pp_piname': 'pua'.
    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '',
        '_pp_source': '',
        '_pp_details': '',
    }

    # XPath template for the detail-page spans; ``%s`` is the control suffix.
    _SPAN_XPATH = '//span[@id="ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_%s"]/text()'
    # Item key -> span id suffix, in the order the item fields are emitted.
    _FIELD_SUFFIXES = {
        'id': 'lblProjIdExt',
        'title': 'lblProjectTitle',
        'pi': 'lblLeadPIName',
        'hi': 'lblHostInstName',
        'date': 'dtPickerStartDate',
    }

    def _grid_request(self, page):
        """Return the POST request that fetches grid page number ``page``.

        The page number is also stored in ``meta`` so that ``parse_item``
        can compute the following page without mutating shared state.
        """
        query = urllib.parse.urlencode({**self.args, 'page': page})
        url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + query
        return scrapy.FormRequest(
            url,
            callback=self.parse_item,
            method='POST',
            formdata=self.params,
            headers={'X-Requested-With': 'XMLHttpRequest'},
            meta={'page': page},
        )

    def start_requests(self):
        """Yield the request for the first grid page."""
        yield self._grid_request(self.args['page'])

    def parse_item(self, response):
        """Follow every project link on a grid page, then request the next page."""
        rows = response.xpath('//table[@name="MVCGridTable_advancesearchawardedprojectsp"]/tbody/tr')
        for row in rows:
            link = row.xpath('.//a/@href').get()
            if not link:
                # A row without a link cannot be followed.
                continue
            # urljoin leaves absolute URLs unchanged and resolves relative ones.
            yield scrapy.Request(response.urljoin(link), callback=self.parse_product)

        # A "Next page" anchor with an onclick handler means more pages exist.
        onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()
        if onclick:
            current_page = response.meta.get('page', self.args['page'])
            yield self._grid_request(current_page + 1)

    def parse_product(self, response):
        """Yield one item dict with the fields read from a project detail page.

        Each value is the first matching text node, or ``None`` when the
        span is absent or empty.
        """
        yield {
            key: response.xpath(self._SPAN_XPATH % suffix).get()
            for key, suffix in self._FIELD_SUFFIXES.items()
        }
# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess
# Settings handed to the crawler process when running without a project.
crawler_settings = {
    'USER_AGENT': 'Mozilla/5.0',
    # Save the scraped items to a file; the format may be csv, json or xml.
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'output.csv',
}

c = CrawlerProcess(crawler_settings)
c.crawl(MySpider)
c.start()
Run Code Online (Sandbox Code Playgroud)