我一直在看 Bucky Roberts 面向初学者的 Python 教学视频,并且正在尝试使用与视频中类似的代码,为 Wikipedia 页面构建一个基本的网络爬虫。
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def main_page_spider(max_pages):
    """Crawl Wikipedia's "Portal:Contents" pages and visit every link found.

    The first page is the root portal page; each following page is one of
    its named sub-pages ("Overview", "Outlines", ...). For every page, all
    anchors inside the main content ``div`` are resolved to absolute URLs,
    handed to ``get_link_data`` (defined elsewhere in this file) and printed.

    Args:
        max_pages: Number of portal pages to crawl, counted from the root
            page. Values larger than the number of known pages are capped;
            zero or negative values crawl nothing.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched
            (connection failure or timeout).
    """
    base_url = "https://en.wikipedia.org/wiki/Portal:Contents"
    # Seconds to wait for the server; without a timeout requests.get() can
    # block forever on a stalled connection.
    request_timeout = 10
    # Index 0 is the root portal page itself; the rest are sub-pages of it.
    page_names = (
        "Contents",
        "Overview",
        "Outlines",
        "Lists",
        "Portals",
        "Glossaries",
        "Categories",
        "Indices",
        "Reference",
        "Culture",
        "Geography",
        "Health",
        "History",
        "Mathematics",
        "Nature",
        "People",
        "Philosophy",
        "Religion",
        "Society",
        "Technology",
    )

    # Slicing caps max_pages at the number of known pages instead of failing
    # on a missing entry; max(..., 0) keeps negative counts from slicing
    # relative to the end of the tuple.
    for index, name in enumerate(page_names[:max(max_pages, 0)]):
        url = base_url if index == 0 else base_url + "/" + name

        response = requests.get(url, timeout=request_timeout)
        soup = BeautifulSoup(response.text, "html.parser")
        content = soup.find('div', {'class': "mw-body-content", 'id': "bodyContent"})
        if content is None:
            # Page layout differs (or the fetch returned an error page):
            # nothing to extract here, move on to the next page.
            continue

        for anchor in content.find_all('a'):
            href = anchor.get("href")
            if not href:
                # Anchors without an href (e.g. named anchors) are not links.
                continue
            # urljoin handles site-relative, protocol-relative, absolute and
            # fragment-only hrefs; plain concatenation mangles all but the first.
            link_url = urljoin(url, href)
            get_link_data(link_url)
            print(link_url)
def get_link_data(link_url):
source_code = …Run Code Online (Sandbox Code Playgroud)