python beautifulsoup web-scraping python-requests
Hi, I want to build a mini crawler, but without using Scrapy.
I put together something like this:
import requests
from bs4 import BeautifulSoup

url = 'http://example.com'  # placeholder starting page

# Collect every link found on the starting page.
response = requests.get(url)
homepage_link_list = []
soup = BeautifulSoup(response.content, 'lxml')
for link in soup.find_all("a"):
    if link.get("href"):
        homepage_link_list.append(link.get("href"))

# Follow each homepage link and collect the links found there.
link_list = []
for item in homepage_link_list:
    response = requests.get(item)
    soup = BeautifulSoup(response.content, 'lxml')
    for link in soup.find_all("a"):
        if link.get("href"):
            link_list.append(link.get("href"))
The problem I'm running into is that it only collects the links found on pages linked from the homepage. How can I make it collect all the links from every page on the site?
You need a recursive crawl. I've written class-oriented code below; the key points are:
- Fragments are ignored: http://example.com#item1 is treated as http://example.com, so the item1 target is dropped.
- If https://example.com has already been scraped, then http://example.com is ignored (and vice versa).
- If http://example.com has already been scraped, then http://example.com/ (trailing slash) is ignored.
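Before the full code, here is a quick standalone illustration of those rules (a minimal sketch; the example URLs are made up, and the normalize helper below simply mirrors the steps of the preprocess_url method in the class that follows):

import re
from urllib.parse import urljoin, urlsplit, SplitResult

def normalize(referrer, url):
    # Resolve the link against its referrer, then strip the trailing
    # slash and the #fragment, the same steps preprocess_url performs.
    fields = urlsplit(urljoin(referrer, url))._asdict()
    fields['path'] = re.sub(r'/$', '', fields['path'])
    fields['fragment'] = ''
    return SplitResult(**fields).geturl()

print(normalize('http://example.com', '#item1'))    # http://example.com
print(normalize('http://example.com', '/'))         # http://example.com
print(normalize('http://example.com', '/about/'))   # http://example.com/about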
''' Scraper.
'''
import re
from urllib.parse import urljoin, urlsplit, SplitResult

import requests
from bs4 import BeautifulSoup


class RecursiveScraper:
    ''' Scrape URLs in a recursive manner.
    '''
    def __init__(self, url):
        ''' Constructor to initialize domain name and main URL.
        '''
        self.domain = urlsplit(url).netloc
        self.mainurl = url
        self.urls = set()

    def preprocess_url(self, referrer, url):
        ''' Clean and filter URLs before scraping.
        '''
        if not url:
            return None

        fields = urlsplit(urljoin(referrer, url))._asdict()  # convert to absolute URLs and split
        fields['path'] = re.sub(r'/$', '', fields['path'])   # remove trailing /
        fields['fragment'] = ''                              # remove targets within a page
        fields = SplitResult(**fields)

        if fields.netloc == self.domain:
            # Scrape pages of current domain only
            if fields.scheme == 'http':
                httpurl = cleanurl = fields.geturl()
                httpsurl = httpurl.replace('http:', 'https:', 1)
            else:
                httpsurl = cleanurl = fields.geturl()
                httpurl = httpsurl.replace('https:', 'http:', 1)
            if httpurl not in self.urls and httpsurl not in self.urls:
                # Return URL only if it's not already in list
                return cleanurl

        return None

    def scrape(self, url=None):
        ''' Scrape the URL and its outward links in a depth-first order.
            If URL argument is None, start from main page.
        '''
        if url is None:
            url = self.mainurl

        print("Scraping {:s} ...".format(url))
        self.urls.add(url)

        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'lxml')
        for link in soup.find_all("a"):
            childurl = self.preprocess_url(url, link.get("href"))
            if childurl:
                self.scrape(childurl)


if __name__ == '__main__':
    rscraper = RecursiveScraper("http://bbc.com")
    rscraper.scrape()
    print(rscraper.urls)
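One caveat: scrape() recurses once per page, so on a large site like bbc.com it can hit Python's default recursion limit (roughly 1000 frames). The same depth-first traversal can be done with an explicit stack instead. Here is a minimal sketch of such a variant (the method name scrape_iterative is my own; it assumes the class above is otherwise unchanged):

    def scrape_iterative(self, url=None):
        ''' Same traversal as scrape(), but with an explicit stack
            instead of recursion, so deep sites cannot overflow the
            call stack.
        '''
        stack = [url or self.mainurl]
        while stack:
            url = stack.pop()
            # A URL may be pushed twice before it is visited, so
            # re-check it here; preprocess_url only filters out URLs
            # already recorded in self.urls at push time.
            if url in self.urls:
                continue
            print("Scraping {:s} ...".format(url))
            self.urls.add(url)
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'lxml')
            for link in soup.find_all("a"):
                childurl = self.preprocess_url(url, link.get("href"))
                if childurl:
                    stack.append(childurl)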