SMT*_*MTH -2 python web-scraping python-3.x python-requests
我正在尝试使用 requests 模块从此网页中抓取搜索词 Oxford, Oxfordshire 对应的各个房源列表。这是我点击搜索按钮之前输入框的样子。
我已经定义了一个准确的选择器来定位列表,但脚本无法获取任何数据。
import requests
from pprint import pprint
from bs4 import BeautifulSoup
# Search endpoint queried with the form values the site's search box submits.
link = 'https://www.zoopla.co.uk/search/'

# Browser-like request headers copied from a real session.
# NOTE(review): advertising 'br' in Accept-Encoding presumably requires a
# brotli decoder to be installed for requests to decode the body - confirm.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,bn;q=0.8',
    'Referer': 'https://www.zoopla.co.uk/for-sale/',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

# Query-string parameters describing the "Oxford, Oxfordshire" for-sale search.
params = {
    'view_type': 'list',
    'section': 'for-sale',
    'q': 'Oxford, Oxfordshire',
    'geo_autocomplete_identifier': 'oxford',
    'search_source': 'home'
}

res = requests.get(link, params=params, headers=headers)
soup = BeautifulSoup(res.text, "html5lib")

# Print the detail-page href of every listing anchor that wraps a listing title.
# The loop body must be indented; the flattened original was a syntax error.
for item in soup.select("[id^='listing'] a[href^='/for-sale/details/']:has(h2[data-testid='listing-title'])"):
    print(item.get("href"))
Run Code Online (Sandbox Code Playgroud)
编辑:
如果我尝试类似以下的操作,脚本似乎可以完美运行。唯一的主要问题是我必须在标头中使用硬编码的 cookie,这些 cookie 在几分钟内就会过期。
import json
from pprint import pprint
from bs4 import BeautifulSoup
import cloudscraper
# Template used to turn a site-relative listing URI into an absolute URL.
base = 'https://www.zoopla.co.uk{}'
link = 'https://www.zoopla.co.uk/for-sale/'
url = 'https://www.zoopla.co.uk/for-sale/property/oxford/'

# NOTE(review): the cookie below is a hard-coded browser session value; per the
# surrounding question it expires within minutes, so this script only works
# while that session is still valid. Do not commit real session cookies.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'cookie': 'ajs_anonymous_id=caa7072ed7f64911a51dda2b525a3ca3; zooplapsid=cafe5156dd8f4cdda14e748c9270f623; base_device_id=68f7b6b7-27b8-429e-af66-366a4b64bac4; g_state={"i_p":1675619616576,"i_l":2}; zid=31173482e60549da9ccc1632e52a264c; zooplasid=31173482e60549da9ccc1632e52a264c; base_session_start_page=https://www.zoopla.co.uk/; base_request=https://www.zoopla.co.uk/; base_session_id=2315eaf2-6d59-4075-aeaa-6288af3efef7; base_session_count=8; forced_features={}; forced_experiments={}; active_session=anon; _gid=GA1.3.821027055.1675853830; __cf_bm=6.bEGFdT2vYz3G3iO7swuTFwSfhyzA0DvGoCjB6KvVg-1675853990-0-AQqWHydhL+/hqq8KRqOpCKDNtd6E96qjLgyOF77S8f7DpqCbMFoxAycD8ahQd7FOShSq0oHD//gpDj095eQPdtccDyZ0qu6GvxiSpjNP0+D7sblJP1e3Mlmxw5YroG3O4OuJHgBco3zThrx2SRyVDfx7M1zNlwi/1OVfww/u2wfb5DCW+gGz1b18zEvpNRszYQ==; cookie_consents={"schemaVersion":4,"content":{"brand":1,"consents":[{"apiVersion":1,"stored":false,"date":"Wed, 08 Feb 2023 10:59:02 GMT","categories":[{"id":1,"consentGiven":true},{"id":3,"consentGiven":false},{"id":4,"consentGiven":false}]}]}}; _ga=GA1.3.1980576228.1675275335; _ga_HMGEC3FKSZ=GS1.1.1675853830.7.1.1675853977.0.0.0'
}

# Query-string parameters: search text, search origin and page number.
params = {
    'q': 'Oxford, Oxfordshire',
    'search_source': 'home',
    'pn': 1
}

scraper = cloudscraper.create_scraper()
res = scraper.get(url, params=params, headers=headers)
print(res.status_code)

soup = BeautifulSoup(res.text, "lxml")

# The listings are read from the JSON embedded in the page's __NEXT_DATA__
# script tag. select_one() returns None when the tag is absent, so fail with
# an explicit message instead of an AttributeError on None.
next_data_tag = soup.select_one("script[id='__NEXT_DATA__']")
if next_data_tag is None:
    raise RuntimeError(
        "__NEXT_DATA__ script tag not found in the response "
        f"(HTTP status {res.status_code})"
    )
container = next_data_tag.contents[0]
items = json.loads(container)['props']['pageProps']['initialProps']['regularListingsFormatted']

# Print each listing's address followed by its absolute detail-page URL.
for item in items:
    print(item['address'], base.format(item['listingUris']['detail']))
Run Code Online (Sandbox Code Playgroud)
如何在不使用标头中硬编码 cookie 的情况下从该网站获取内容?
以下代码示例无需添加 headers 和 params 参数即可顺利运行。该网站的数据不是动态加载的,这意味着您可以从静态 HTML DOM 中获取所需的数据,但主要障碍是该网站使用了 Cloudflare 保护。因此,要绕过这一限制,您可以使用 cloudscraper(代替 requests 模块)或 selenium。这里我使用 cloudscraper,效果很好。
脚本:
import pandas as pd
from bs4 import BeautifulSoup
import cloudscraper
# cloudscraper session used in place of plain requests for every page fetch.
scraper = cloudscraper.create_scraper()

# Search keywords; each one is queried for result pages 1 and 2.
kw = ['Oxford', 'Oxfordshire']
data = []

for k in kw:
    for page in range(1, 3):
        url = f"https://www.zoopla.co.uk/for-sale/property/oxford/?search_source=home&q={k}&pn={page}"
        # Keep the response in its own name so it does not shadow the
        # `page` loop counter (the original rebound `page` here).
        response = scraper.get(url)
        #print(response)
        soup = BeautifulSoup(response.content, "html.parser")
        # Each listing card's first anchor holds the site-relative detail URL.
        for card in soup.select('[data-testid="regular-listings"] [id^="listing"]'):
            link = "https://www.zoopla.co.uk" + card.a.get("href")
            print(link)
            #data.append({'link':link})

# df = pd.DataFrame(data)
# print(df)
Run Code Online (Sandbox Code Playgroud)
输出:
https://www.zoopla.co.uk/for-sale/details/63903233/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63898182/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63898168/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63898177/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63897930/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63897571/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63896910/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63896858/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63896815/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63893187/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/47501048/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63891727/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63890876/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63889459/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63888298/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63887586/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63887525/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/59469692/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63882084/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63878480/?search_identifier=cbe92a4f0868061e26dff87f97442c6a
https://www.zoopla.co.uk/for-sale/details/63877980/?search_identifier=cbe92a4f0868061e26dff87f97442c6a
Run Code Online (Sandbox Code Playgroud)
……等等
| 归档时间: |
|
| 查看次数: |
399 次 |
| 最近记录: |