如何从shopee网站抓取商品?

Fre*_*ang 4 python beautifulsoup web-crawler

我尝试使用 Python 获取商品信息,例如名称和价格。但这一次行不通:即使我通过浏览器的开发者工具检查 HTML 代码找到类名,并尝试用这个类名来获取我想要的内容,也没有成功。

但我得到的结果如下:我找不到任何 class_="col-xs-2-4 shopee-search-item-result__item" 的元素。我应该添加更多的请求头(headers)信息吗?

打印结果

import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import json

# Shop search page whose product listing the script tries to scrape.
url = 'https://shopee.tw/shop/1819984/search?shopCollection=9271157'
# Request headers sent along with the page request: the target host, a
# Firefox User-Agent string, and a Cookie header.
# NOTE(review): the cookie values are presumably copied from a browser
# session and may be stale or session-specific — confirm before reuse.
headers = {
'Host': 'shopee.tw',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
'Cookie':'SPC_IA=-1; SPC_EC=-; SPC_F=L07IMDECRHjifEKyg7XuNCJ00GNdJGTA; REC_T_ID=246cfcdc-18fa-11ea-b254-f8f21e2be0b8; SPC_T_ID="Fyr1skVDq7FDiJOuTYHBmMfMr2Cw1eZyPbYJhBYoRmf/gvfvkOf5zgjIVXLrYYlg32aSx1PfmhWq7QsQzwM86mdeXG8VU7ERK4N+gfPFd14="; SPC_U=-; SPC_T_IV="/oJN8EB7iQwg7+n5mXd6cw=="; _gcl_au=1.1.788704691.1575727322; _fbp=fb.1.1575727322914.443117835; _ga=GA1.2.1422761069.1575727324; __BWfp=c1575727332595xf5a099d8b; cto_lwid=7ea874b3-f31f-47d7-aef9-60eed0156d33; cto_bundle=0tgQ7V9rU3JlRTU4aWlTc09JNXRaN014Y3ZXa1BtVVcwT2RhOU1UZ0tweUFvWUo2WHRPQjd0JTJCM1duaG5iWXFFRWxpbHZkTFluWUZLSEFudTFreGJueFoxU0EyanhnMWN6ZEVIUVV6cFlhd050emhFMWQ4bmhVelZwVSUyRmwwQUp5c29lOEhPT2ZobE10S1dvT09HYWNhVXV1YWx5R3dSOGw0MHcwZWpiZ2pXU2VHSzdrJTNE; _med=refer; G_ENABLED_IDPS=google; fbm_382498665271383=base_domain=.shopee.tw; SPC_SI=jq6hwq6ju6hig9hfulumcagdqaiopatc; _gid=GA1.2.143857303.1577796150; csrftoken=3Pya3o5WYEvhLOj9FqCqbV3angfwBlko; AMP_TOKEN=%24NOT_FOUND; _dc_gtm_UA-61915057-6=1'
}

# Request the shop page (following redirects) and print how the request
# resolved: status code, redirect history, and final URL.
response = requests.get(url, headers=headers, allow_redirects=True)
for detail in (response.status_code, response.history, response.url):
    print(detail)

# Parse the returned HTML and count the <div> elements carrying the
# product-tile class string seen in the browser's inspector.
item_class = "col-xs-2-4 shopee-search-item-result__item"
matches = BeautifulSoup(response.text, 'html.parser').find_all("div", class_=item_class)
print(len(matches))
```
Run Code Online (Sandbox Code Playgroud)

fur*_*ras 7

该页面使用 JavaScript 显示商品,但 BeautifulSoup/requests 无法运行 JavaScript。

\n

使用 Firefox/Chrome 中的 DevTools(“Network” 选项卡),我找到了 JavaScript 用来从服务器获取 JSON 数据的 URL,因此甚至不需要 BeautifulSoup。

\n

为了正确工作,它需要所有这些标头。

\n

如果没有 User-Agent 和 X-Requested-With,服务器会返回空数据。
没有 Referer,它就不会返回价格。

\n
import requests

# JSON endpoint that the shop page's JavaScript calls to load its items;
# querying it directly makes HTML parsing unnecessary.
url = 'https://shopee.tw/api/v2/search_items/?by=pop&limit=30&match_id=1819984&newest=0&order=desc&page_type=shop&shop_categoryids=9271157&version=2'

# All three headers are required: without User-Agent / X-Requested-With the
# server sends empty data, and without Referer it does not send prices.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://shopee.tw/shop/1819984/search?shopCollection=9271157',
}

# requests applies no timeout by default, so set one explicitly to avoid
# blocking forever on an unresponsive server.
r = requests.get(url, headers=headers, timeout=10)

# Fail with a clear HTTPError on a 4xx/5xx response instead of trying to
# decode an error page as JSON.
r.raise_for_status()

data = r.json()

# Debugging aid: uncomment to list the fields available on each item.
# print(data['items'][0].keys())

for item in data['items']:
    print('name:', item['name'])
    print('prince:', item['price'])
    print('sold:', item['historical_sold'])
    print('---')

# Debugging aid: uncomment to dump the first item in full.
# print(data['items'][0])
Run Code Online (Sandbox Code Playgroud)\n

结果:

\n
name: 『現貨+預購』 Balea 精華膠囊 7 入
prince: 4900000
sold: 5104
---
name: 現貨供應 德國 Invisibobble 神奇魔髮圈流線魔髮圈
prince: 7500000
sold: 26
---
Run Code Online (Sandbox Code Playgroud)\n
\n

顺便说一句:测试时,可以使用 json 模块以缩进格式输出,以便查看所有的值。

\n
import json

# Pretty-print the first item with 4-space indentation so that every field
# it carries can be inspected.
first_item = data['items'][0]
print(json.dumps(first_item, indent=4))
Run Code Online (Sandbox Code Playgroud)\n

结果:

\n
{\n    "itemid": 1212735748,\n    "welcome_package_info": null,\n    "liked": false,\n    "recommendation_info": null,\n    "bundle_deal_info": null,\n    "price_max_before_discount": -1,\n    "image": "338673ff6f2b23d63514e5af85269d46",\n    "is_cc_installment_payment_eligible": false,\n    "shopid": 1819984,\n    "can_use_wholesale": true,\n    "group_buy_info": null,\n    "reference_item_id": "",\n    "currency": "TWD",\n    "raw_discount": null,\n    "show_free_shipping": false,\n    "video_info_list": [],\n    "ads_keyword": null,\n    "collection_id": null,\n    "images": [\n        "338673ff6f2b23d63514e5af85269d46"\n    ],\n    "match_type": null,\n    "price_before_discount": 0,\n    "is_category_failed": false,\n    "show_discount": 0,\n    "cmt_count": 306,\n    "view_count": 93,\n    "display_name": null,\n    "catid": 67,\n    "json_data": null,\n    "upcoming_flash_sale": null,\n    "is_official_shop": false,\n    "brand": "Dm Ebelin",\n    "price_min": 4900000,\n    "liked_count": 136,\n    "can_use_bundle_deal": false,\n    "show_official_shop_label": false,\n    "coin_earn_label": null,\n    "price_min_before_discount": -1,\n    "cb_option": 0,\n    "sold": 0,\n    "deduction_info": null,\n    "stock": 3647,\n    "status": 1,\n    "price_max": 4900000,\n    "add_on_deal_info": null,\n    "is_group_buy_item": null,\n    "flash_sale": null,\n    "price": 4900000,\n    "shop_location": "\\u53f0\\u4e2d\\u5e02\\u6f6d\\u5b50\\u5340",\n    "item_rating": {\n        "rating_star": 4.996732,\n        "rating_count": [\n            306,\n            0,\n            0,\n            0,\n            1,\n            305\n        ],\n        "rcount_with_image": 11,\n        "rcount_with_context": 139\n    },\n    "show_official_shop_label_in_title": false,\n    "tier_variations": [],\n    "is_adult": null,\n    "discount": null,\n    "flag": 65536,\n    "is_non_cc_installment_payment_eligible": false,\n    "has_lowest_price_guarantee": false,\n    
"has_group_buy_stock": false,\n    "preview_info": null,\n    "welcome_package_type": 0,\n    "name": "\\u300e\\u73fe\\u8ca8+\\u9810\\u8cfc\\u300f Balea \\u7cbe\\u83ef\\u81a0\\u56ca 7 \\u5165",\n    "distance": null,\n    "adsid": null,\n    "ctime": 1527866201,\n    "wholesale_tier_list": [\n        {\n            "min_count": 150,\n            "price": 4700000,\n            "max_count": 300\n        },\n        {\n            "min_count": 301,\n            "price": 4600000,\n            "max_count": 1000\n        },\n        {\n            "min_count": 1001,\n            "price": 4500000,\n            "max_count": null\n        }\n    ],\n    "show_shopee_verified_label": false,\n    "campaignid": null,\n    "show_official_shop_label_in_normal_position": null,\n    "item_status": "normal",\n    "shopee_verified": false,\n    "hidden_price_display": null,\n    "size_chart": null,\n    "item_type": 0,\n    "shipping_icon_type": null,\n    "campaign_stock": null,\n    "label_ids": [],\n    "service_by_shopee_flag": 0,\n    "badge_icon_type": 0,\n    "historical_sold": 5104,\n    "transparent_background_image": ""\n}\n
Run Code Online (Sandbox Code Playgroud)\n