无法在 Instagram 公共帐户上抓取超过 12 个帖子

Question

无法在 Instagram 公共帐户上抓取超过 12 个帖子

kar*_*lle 6 python selenium beautifulsoup web-scraping instagram

我想使用 Python 抓取某个公共 Instagram 帐户的所有帖子，用于我在大学进行的一项研究。然而，我开始感到沮丧，因为我无法从 Instagram 中提取超过 12 条帖子。

Selenium 负责滚动页面，BeautifulSoup 也能正确解析出我想要的数据，但只对前十二个帖子有效。到目前为止，我已经尝试了几种不同的方法，但感觉陷入了困境。我查看了这里的一些教程和讨论帖，例如：

如何用 python 抓取完整的 Instagram 页面？

使用 Selenium Python 进行网页抓取 [Twitter + Instagram]

https://michaeljsanders.com/2017/05/12/scrapin-and-scrollin.html

https://edmundmartin.com/scraping-instagram-with-python/

感谢所有人和任何回应！

最好的问候，卡勒。

我试过的代码。示例1：

from bs4 import BeautifulSoup
import ssl
import json
import time

from selenium import webdriver
from datetime import datetime


class Insta_Image_Links_Scraper:
    """Print basic metadata for the posts embedded in Instagram profile pages.

    ``main`` reads account names from ``accounts.txt`` (one per line) and
    calls ``getlinks`` for each of them.
    """

    # JavaScript that scrolls to the bottom of the page and returns the
    # resulting document height.
    _SCROLL_SCRIPT = ("window.scrollTo(0, document.body.scrollHeight);"
                      "var lenOfPage=document.body.scrollHeight;return lenOfPage;")

    def getlinks(self, user, url):
        """Open *url* in Chrome, scroll to the bottom and print its posts.

        Args:
            user: Account name; used only in the progress message.
            url: Profile page URL to load.

        NOTE(review): the posts are read from the first ``<script>`` tag in
        ``<body>`` (the ``window._sharedData`` JSON). Presumably scrolling
        does not add entries to that tag, so only the initial batch of posts
        is printed -- confirm against the live page.
        """
        print('[+] Downloading:\n')
        c = webdriver.Chrome()
        try:
            # Load the profile that was asked for; the URL used to be
            # hard-coded, so every account printed the same page.
            c.get(url)

            # Scroll until the document height stops changing.
            lenOfPage = c.execute_script(self._SCROLL_SCRIPT)
            while True:
                lastCount = lenOfPage
                time.sleep(2)
                lenOfPage = c.execute_script(self._SCROLL_SCRIPT)
                if lastCount == lenOfPage:
                    break

            page_source = c.page_source
        finally:
            # Always close the browser, even when loading or scrolling fails.
            c.quit()

        soup = BeautifulSoup(page_source, 'lxml')
        body = soup.find('body')
        script = body.find('script')

        # Drop the JavaScript assignment prefix and the trailing semicolon
        # only; semicolons inside JSON string values must be left untouched.
        page_json = script.text.strip().replace('window._sharedData =', '', 1)
        page_json = page_json.strip().rstrip(';')

        data = json.loads(page_json)
        print('Scraping posts for user ' + user+"...........")
        for post in data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']:
            node = post['node']
            timestamp = node['taken_at_timestamp']
            likedby = node['edge_liked_by']['count']
            comments = node['edge_media_to_comment']['count']
            caption = node['edge_media_to_caption']

            print('Post on :',datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S'))
            print('Liked by :',likedby)
            print('comments :',comments)
            print('caption :',caption)

    def main(self):
        """Read account names from ``accounts.txt`` and scrape each profile."""
        # SSL context with certificate checks disabled. It is stored on the
        # instance but nothing else in this class reads it.
        self.ctx = ssl.create_default_context()
        self.ctx.check_hostname = False
        self.ctx.verify_mode = ssl.CERT_NONE

        with open("accounts.txt") as f:
            # Skip blank lines so they do not turn into empty profile URLs.
            self.content = [line.strip() for line in f if line.strip()]
        for user in self.content:
            self.getlinks(user,
                          'https://www.instagram.com/'
                          + user + '/')


if __name__ == '__main__':
    obj = Insta_Image_Links_Scraper()
    obj.main()

Run Code Online (Sandbox Code Playgroud)

示例2：

import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import json
from datetime import datetime

# Open the profile page in Chrome, press PAGE_DOWN a fixed number of times,
# then print the posts found in the page's embedded JSON.
c = webdriver.Chrome()
try:
    c.get("https://www.instagram.com/frank_the_carden/")
    time.sleep(1)

    # find_element("tag name", ...) is used instead of
    # find_element_by_tag_name, which recent Selenium releases no longer
    # provide; "tag name" is the locator string behind By.TAG_NAME.
    elem = c.find_element("tag name", "body")

    no_of_pagedowns = 20

    while no_of_pagedowns:
        elem.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)
        no_of_pagedowns -= 1

    page_source = c.page_source
finally:
    # Close the browser even if loading or scrolling raised.
    c.quit()

soup = BeautifulSoup(page_source, 'html.parser')
body = soup.find('body')
script = body.find('script')

# Remove only the assignment prefix and the trailing semicolon; semicolons
# inside JSON string values (e.g. captions) must survive.
page_json = script.text.strip().replace('window._sharedData =', '', 1)
page_json = page_json.strip().rstrip(';')

data = json.loads(page_json)
for post in data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']:
    node = post['node']
    timestamp = node['taken_at_timestamp']
    likedby = node['edge_liked_by']['count']
    comments = node['edge_media_to_comment']['count']
    caption = node['edge_media_to_caption']

    print('Post on :',datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S'))
    print('Liked by :',likedby)
    print('comments :',comments)
    print('caption :',caption)

Run Code Online (Sandbox Code Playgroud)

示例3：

import time
import json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import urllib3


# GraphQL query URL template; the two %s slots fill the "id" and "after"
# variables of the query.
media_url = 'https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={"id":"%s","first":50,"after":"%s"}'

# Start a single Chrome session. Creating the driver twice opened two
# browsers and left the first one running unused.
browser = webdriver.Chrome()
try:
    # first load the profile page to obtain cookies
    browser.get('https://www.instagram.com/frank_the_carden/')
    browser_cookies = browser.get_cookies()

    # set a session with cookies
    session = requests.Session()
    for cookie in browser_cookies:
        session.cookies.update({cookie['name']: cookie['value']})

    # get response as JSON
    # NOTE(review): verify=False turns off TLS certificate verification;
    # kept as in the original, but confirm it is really needed.
    # NOTE(review): `response` is not read again below -- the loop at the
    # end still takes its posts from the page source.
    response = session.get(media_url % ('5719699176', ''), verify=False).json()
    time.sleep(1)

    # "tag name" is the locator string behind By.TAG_NAME; this form is used
    # because recent Selenium releases no longer provide
    # find_element_by_tag_name.
    elem = browser.find_element("tag name", "body")

    no_of_pagedowns = 20

    while no_of_pagedowns:
        elem.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)
        no_of_pagedowns -= 1

    page_source = browser.page_source
finally:
    # Close the browser even if any step above raised.
    browser.quit()

soup = BeautifulSoup(page_source, 'html.parser')
body = soup.find('body')
script = body.find('script')

# Remove only the assignment prefix and the trailing semicolon; semicolons
# inside JSON string values must survive.
page_json = script.text.strip().replace('window._sharedData =', '', 1)
page_json = page_json.strip().rstrip(';')
data = json.loads(page_json)
for post in data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']:
    node = post['node']
    timestamp = node['taken_at_timestamp']
    likedby = node['edge_liked_by']['count']
    comments = node['edge_media_to_comment']['count']
    caption = node['edge_media_to_caption']

    print('Post on :',datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S'))
    print('Liked by :',likedby)
    print('comments :',comments)
    print('caption :',caption)

Run Code Online (Sandbox Code Playgroud)

示例4：

from random import choice
import json
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Launch Chrome and open the profile page.
browser = webdriver.Chrome()
browser.get("https://www.instagram.com/frank_the_carden/")

# Scroll to the bottom repeatedly until the document height stops changing.
lenOfPage = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
while True:
    previous_height = lenOfPage
    time.sleep(1)
    lenOfPage = browser.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    if previous_height == lenOfPage:
        break

# Default pool of User-Agent header values.
_user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
]

class InstagramScraper:
    """Fetch an Instagram profile page over HTTP and read its embedded JSON.

    The page is downloaded with ``requests`` and the JSON assigned to
    ``window._sharedData`` in the first ``<script>`` tag of ``<body>`` is
    parsed.

    Args:
        user_agents: Optional list of User-Agent strings to pick from at
            random; when it is empty or not a list, the module-level
            ``_user_agents`` list is used instead.
        proxy: Optional proxy URL used for both http and https requests.
    """

    def __init__(self, user_agents=None, proxy=None):
        self.user_agents = user_agents
        self.proxy = proxy

    def __random_agent(self):
        """Return a randomly chosen User-Agent string."""
        if self.user_agents and isinstance(self.user_agents, list):
            return choice(self.user_agents)
        return choice(_user_agents)

    def __request_url(self, url):
        """Download *url* and return the response body as text.

        Raises:
            requests.HTTPError: The server answered with an error status.
            requests.RequestException: Any other request failure; the
                original exception propagates unchanged so its details are
                not lost.
        """
        try:
            response = requests.get(url, headers={'User-Agent': self.__random_agent()}, proxies={'http': self.proxy,
                                                                                                 'https': self.proxy})
            response.raise_for_status()
        except requests.HTTPError as err:
            # Keep the underlying error attached as the cause.
            raise requests.HTTPError('Received non 200 status code from Instagram') from err
        return response.text

    @staticmethod
    def _parse_shared_data(script_text):
        """Parse the JSON out of a ``window._sharedData = {...};`` script body.

        Only the leading assignment and the trailing semicolon are removed;
        semicolons inside JSON string values are preserved.
        """
        raw = script_text.strip()
        prefix = 'window._sharedData ='
        if raw.startswith(prefix):
            raw = raw[len(prefix):]
        return json.loads(raw.strip().rstrip(';'))

    @staticmethod
    def extract_json_data(html):
        """Return the JSON held in the first ``<script>`` tag of ``<body>``."""
        soup = BeautifulSoup(html, 'html.parser')
        body = soup.find('body')
        script_tag = body.find('script')
        return InstagramScraper._parse_shared_data(script_tag.text)

    def profile_page_metrics(self, profile_url):
        """Return the truthy user-level fields of a profile as a dict.

        Dict values are replaced by their ``'count'`` entry. Falsy values
        and the ``edge_owner_to_timeline_media`` key are left out.
        """
        response = self.__request_url(profile_url)
        json_data = self.extract_json_data(response)
        metrics = json_data['entry_data']['ProfilePage'][0]['graphql']['user']

        results = {}
        for key, value in metrics.items():
            if key == 'edge_owner_to_timeline_media' or not value:
                continue
            if isinstance(value, dict):
                value = value['count']
            results[key] = value
        return results

    def profile_page_recent_posts(self, profile_url):
        """Return the non-empty dict ``node`` entries of the timeline edges."""
        response = self.__request_url(profile_url)
        json_data = self.extract_json_data(response)
        edges = json_data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']["edges"]

        results = []
        for edge in edges:
            node = edge.get('node')
            if node and isinstance(node, dict):
                results.append(node)
        return results

from pprint import pprint

# Collect the posts returned for the profile page and pretty-print them.
scraper = InstagramScraper()
results = scraper.profile_page_recent_posts(
    'https://www.instagram.com/frank_the_carden/'
)
pprint(results)

Run Code Online (Sandbox Code Playgroud)

Answer 1

Bor*_*lov 1

我会直接调用 Instagram 的 GraphQL API，就像您在“示例 3”中所做的那样。我曾有一份可以运行的代码，但后来他们改变了 query_hash 的生成方式，我没能让它继续工作；您可能也遇到了同样的问题。

除此之外，我目前正在使用这个 Python 客户端抓取 Instagram 数据。但您需要提供 Instagram 凭据，它才能正常工作。

归档时间：	6 年前
查看次数：	2530 次
最近记录：	3 年，4 月前