Fre*_*ang 4 python beautifulsoup web-crawler
我尝试使用 Python 获取产品信息,例如名称和价格。但这一次行不通:即使我在浏览器的开发者工具中检查 HTML 代码找到了类名,并用这个类名去查找我想要的内容,也得不到任何结果。
我得到的结果如下:找不到任何 class_="col-xs-2-4 shopee-search-item-result__item" 的项目。我是否应该添加更多的请求头(headers)信息?
打印结果
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import json
# Shop collection page whose product cards the script tries to scrape.
url = 'https://shopee.tw/shop/1819984/search?shopCollection=9271157'

# Browser-like identification sent with the request.
firefox_user_agent = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) '
    'Gecko/20100101 Firefox/71.0'
)

# Hard-coded cookie string, kept verbatim on a single line.
session_cookie = 'SPC_IA=-1; SPC_EC=-; SPC_F=L07IMDECRHjifEKyg7XuNCJ00GNdJGTA; REC_T_ID=246cfcdc-18fa-11ea-b254-f8f21e2be0b8; SPC_T_ID="Fyr1skVDq7FDiJOuTYHBmMfMr2Cw1eZyPbYJhBYoRmf/gvfvkOf5zgjIVXLrYYlg32aSx1PfmhWq7QsQzwM86mdeXG8VU7ERK4N+gfPFd14="; SPC_U=-; SPC_T_IV="/oJN8EB7iQwg7+n5mXd6cw=="; _gcl_au=1.1.788704691.1575727322; _fbp=fb.1.1575727322914.443117835; _ga=GA1.2.1422761069.1575727324; __BWfp=c1575727332595xf5a099d8b; cto_lwid=7ea874b3-f31f-47d7-aef9-60eed0156d33; cto_bundle=0tgQ7V9rU3JlRTU4aWlTc09JNXRaN014Y3ZXa1BtVVcwT2RhOU1UZ0tweUFvWUo2WHRPQjd0JTJCM1duaG5iWXFFRWxpbHZkTFluWUZLSEFudTFreGJueFoxU0EyanhnMWN6ZEVIUVV6cFlhd050emhFMWQ4bmhVelZwVSUyRmwwQUp5c29lOEhPT2ZobE10S1dvT09HYWNhVXV1YWx5R3dSOGw0MHcwZWpiZ2pXU2VHSzdrJTNE; _med=refer; G_ENABLED_IDPS=google; fbm_382498665271383=base_domain=.shopee.tw; SPC_SI=jq6hwq6ju6hig9hfulumcagdqaiopatc; _gid=GA1.2.143857303.1577796150; csrftoken=3Pya3o5WYEvhLOj9FqCqbV3angfwBlko; AMP_TOKEN=%24NOT_FOUND; _dc_gtm_UA-61915057-6=1'

headers = {
    'Host': 'shopee.tw',
    'User-Agent': firefox_user_agent,
    'Cookie': session_cookie,
}

# Fetch the page (following redirects) and report, one per line, the status
# code, the redirect history and the final URL.
response = requests.get(url, headers=headers, allow_redirects=True)
print(response.status_code, response.history, response.url, sep='\n')

# Parse the returned HTML and count the <div> elements carrying the
# product-card class string.
# NOTE: the answer further down this page explains that the items are
# rendered by JavaScript, which requests/BeautifulSoup do not execute, so no
# matching elements are found in the raw HTML.
page = BeautifulSoup(response.text, 'html.parser')
item_class = "col-xs-2-4 shopee-search-item-result__item"
product_cards = page.find_all("div", class_=item_class)
print(len(product_cards))
```**strong text**
Run Code Online (Sandbox Code Playgroud)
该页面使用 JavaScript 来显示商品,但 BeautifulSoup/requests 无法运行 JavaScript。
使用 Firefox/Chrome 中的 DevTools("Network" 选项卡),我找到了 JavaScript 用来从服务器获取 JSON 数据的 URL,因此甚至不需要 BeautifulSoup。
要让请求正常工作,必须带上下面代码中的所有请求头。
\n如果没有 User-Agent 和 X-Requested-With,服务器会返回空数据。
\n如果没有 Referer,服务器就不会返回价格。
import requests\n\nurl = 'https://shopee.tw/api/v2/search_items/?by=pop&limit=30&match_id=1819984&newest=0&order=desc&page_type=shop&shop_categoryids=9271157&version=2'\n\nheaders = {\n 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0',\n 'X-Requested-With': 'XMLHttpRequest',\n 'Referer': 'https://shopee.tw/shop/1819984/search?shopCollection=9271157',\n} \n\nr = requests.get(url, headers=headers)\n\ndata = r.json()\n\n#print(data['items'][0].keys())\n\nfor item in data['items']:\n print('name:', item['name'])\n print('prince:', item['price'])\n print('sold:', item['historical_sold'])\n print('---')\n\n#print(data['items'][0]) # for test only \nRun Code Online (Sandbox Code Playgroud)\n结果:
\nname: 『現貨+預購』 Balea 精華膠囊 7 入\nprince: 4900000\nsold: 5104\n---\nname: 現貨供應 德國 Invisibobble 神奇魔髮圈流線魔髮圈\nprince: 7500000\nsold: 26\n---\nRun Code Online (Sandbox Code Playgroud)\n顺便说一句:测试时可以用 json 的缩进格式化输出来查看所有的值
import json\n\nprint(json.dumps(data['items'][0], indent=4))\nRun Code Online (Sandbox Code Playgroud)\n结果:
\n{\n "itemid": 1212735748,\n "welcome_package_info": null,\n "liked": false,\n "recommendation_info": null,\n "bundle_deal_info": null,\n "price_max_before_discount": -1,\n "image": "338673ff6f2b23d63514e5af85269d46",\n "is_cc_installment_payment_eligible": false,\n "shopid": 1819984,\n "can_use_wholesale": true,\n "group_buy_info": null,\n "reference_item_id": "",\n "currency": "TWD",\n "raw_discount": null,\n "show_free_shipping": false,\n "video_info_list": [],\n "ads_keyword": null,\n "collection_id": null,\n "images": [\n "338673ff6f2b23d63514e5af85269d46"\n ],\n "match_type": null,\n "price_before_discount": 0,\n "is_category_failed": false,\n "show_discount": 0,\n "cmt_count": 306,\n "view_count": 93,\n "display_name": null,\n "catid": 67,\n "json_data": null,\n "upcoming_flash_sale": null,\n "is_official_shop": false,\n "brand": "Dm Ebelin",\n "price_min": 4900000,\n "liked_count": 136,\n "can_use_bundle_deal": false,\n "show_official_shop_label": false,\n "coin_earn_label": null,\n "price_min_before_discount": -1,\n "cb_option": 0,\n "sold": 0,\n "deduction_info": null,\n "stock": 3647,\n "status": 1,\n "price_max": 4900000,\n "add_on_deal_info": null,\n "is_group_buy_item": null,\n "flash_sale": null,\n "price": 4900000,\n "shop_location": "\\u53f0\\u4e2d\\u5e02\\u6f6d\\u5b50\\u5340",\n "item_rating": {\n "rating_star": 4.996732,\n "rating_count": [\n 306,\n 0,\n 0,\n 0,\n 1,\n 305\n ],\n "rcount_with_image": 11,\n "rcount_with_context": 139\n },\n "show_official_shop_label_in_title": false,\n "tier_variations": [],\n "is_adult": null,\n "discount": null,\n "flag": 65536,\n "is_non_cc_installment_payment_eligible": false,\n "has_lowest_price_guarantee": false,\n "has_group_buy_stock": false,\n "preview_info": null,\n "welcome_package_type": 0,\n "name": "\\u300e\\u73fe\\u8ca8+\\u9810\\u8cfc\\u300f Balea \\u7cbe\\u83ef\\u81a0\\u56ca 7 \\u5165",\n "distance": null,\n "adsid": null,\n "ctime": 1527866201,\n "wholesale_tier_list": [\n {\n "min_count": 150,\n 
"price": 4700000,\n "max_count": 300\n },\n {\n "min_count": 301,\n "price": 4600000,\n "max_count": 1000\n },\n {\n "min_count": 1001,\n "price": 4500000,\n "max_count": null\n }\n ],\n "show_shopee_verified_label": false,\n "campaignid": null,\n "show_official_shop_label_in_normal_position": null,\n "item_status": "normal",\n "shopee_verified": false,\n "hidden_price_display": null,\n "size_chart": null,\n "item_type": 0,\n "shipping_icon_type": null,\n "campaign_stock": null,\n "label_ids": [],\n "service_by_shopee_flag": 0,\n "badge_icon_type": 0,\n "historical_sold": 5104,\n "transparent_background_image": ""\n}\nRun Code Online (Sandbox Code Playgroud)\n
| 归档时间: |
|
| 查看次数: |
13325 次 |
| 最近记录: |