python optimization selenium web-scraping
I am trying to scrape https://arxiv.org/search/?query=healthcare&searchtype=all with Selenium and Python. The for loop takes far too long to execute. I also tried scraping with a headless browser and PhantomJS, but it does not capture the abstract field (the "More" button has to be clicked to expand the abstract).
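For reference, a headless Firefox session can be set up roughly like this; a minimal sketch, assuming geckodriver is on the PATH and the Options-based API of Selenium 3.8+ (element clicks generally still work in headless mode):

# Sketch only: start Firefox without a visible window and load the search page.
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

opts = Options()
opts.add_argument('-headless')   # run Firefox headless
browser = Firefox(options=opts)
browser.get('https://arxiv.org/search/?query=healthcare&searchtype=all')
print(browser.title)
browser.quit()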
import pandas as pd
import selenium
import re
import time
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Firefox

browser = Firefox()
url_healthcare = 'https://arxiv.org/search/?query=healthcare&searchtype=all'
browser.get(url_healthcare)

dfs = []

for i in range(1, 39):
    articles = browser.find_elements_by_tag_name('li[class="arxiv-result"]')

    for article in articles:
        title = article.find_element_by_tag_name('p[class="title is-5 mathjax"]').text
        arxiv_id = article.find_element_by_tag_name('a').text.replace('arXiv:', '')
        arxiv_link = article.find_elements_by_tag_name('a')[0].get_attribute('href')
        pdf_link = article.find_elements_by_tag_name('a')[1].get_attribute('href')
        authors = article.find_element_by_tag_name('p[class="authors"]').text.replace('Authors:', '')

        # Expand the abstract by clicking the "▽ More" link if it is present
        try:
            link1 = browser.find_element_by_link_text('▽ More')
            link1.click()
        except:
            time.sleep(0.1)

        abstract = article.find_element_by_tag_name('p[class="abstract mathjax"]').text
        date = article.find_element_by_tag_name('p[class="is-size-7"]').text
        date = re.split(r"Submitted|;", date)[1]
        tag = article.find_element_by_tag_name('div[class="tags is-inline-block"]').text.replace('\n', ',')

        try:
            doi = article.find_element_by_tag_name('div[class="tags has-addons"]').text
            doi = re.split(r'\s', doi)[1]
        except NoSuchElementException:
            doi = 'None'

        all_combined = [title, arxiv_id, arxiv_link, pdf_link, authors, abstract, date, tag, doi]
        dfs.append(all_combined)

    print('Finished Extracting Page:', i)

    # Go to the next results page; close the browser when there is none
    try:
        link2 = browser.find_element_by_class_name('pagination-next')
        link2.click()
    except:
        browser.close()

    time.sleep(0.1)
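For reference, the rows collected above can then be assembled into a DataFrame; a minimal sketch (the column names are illustrative and not part of the original code):

# Sketch only: uses the pandas import and the dfs list built by the loop above.
columns = ['title', 'arxiv_id', 'arxiv_link', 'pdf_link', 'authors',
           'abstract', 'date', 'tag', 'doi']
df = pd.DataFrame(dfs, columns=columns)
print(df.shape)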
The following implementation does the whole scrape in 16 seconds.

To speed up the process, I did the following:

- Removed Selenium completely (no clicking is needed to get the abstract)
- Used the output of BeautifulSoup and processed it afterwards
- Used multiprocessing to significantly speed up the scraping

from multiprocessing import Process, Manager
import requests
from bs4 import BeautifulSoup
import re
import time

start_time = time.time()


def get_no_of_pages(showing_text):
    # parse the total result count from the "N results for all" header
    no_of_results = int(re.findall(r"(\d+,*\d+) results for all", showing_text)[0].replace(',', ''))
    pages = no_of_results // 200 + 1
    print("total pages:", pages)
    return pages


def clean(text):
    # strip newlines and the two-space indentation used in arXiv's HTML
    return text.replace("\n", '').replace("  ", '')


def get_data_from_page(url, page_number, data):
    print("getting page", page_number)
    response = requests.get(url + "start=" + str(page_number * 200))
    soup = BeautifulSoup(response.content, "lxml")
    arxiv_results = soup.find_all("li", {"class": "arxiv-result"})

    for arxiv_result in arxiv_results:
        paper = {}
        paper["titles"] = clean(arxiv_result.find("p", {"class": "title is-5 mathjax"}).text)
        links = arxiv_result.find_all("a")
        paper["arxiv_ids"] = links[0].text.replace('arXiv:', '')
        paper["arxiv_links"] = links[0].get('href')
        paper["pdf_link"] = links[1].get('href')
        paper["authors"] = clean(arxiv_result.find("p", {"class": "authors"}).text.replace('Authors:', ''))

        # the full abstract sits after the "▽ More" toggle text; fall back to the short form
        split_abstract = arxiv_result.find("p", {"class": "abstract mathjax"}).text.split("▽ More\n\n\n", 1)
        if len(split_abstract) == 2:
            paper["abstract"] = clean(split_abstract[1].replace("△ Less", ''))
        else:
            paper["abstract"] = clean(split_abstract[0].replace("△ Less", ''))

        paper["date"] = re.split(r"Submitted|;", arxiv_result.find("p", {"class": "is-size-7"}).text)[1]
        paper["tag"] = clean(arxiv_result.find("div", {"class": "tags is-inline-block"}).text)

        doi = arxiv_result.find("div", {"class": "tags has-addons"})
        if doi is None:
            paper["doi"] = "None"
        else:
            paper["doi"] = re.split(r'\s', doi.text)[1]

        data.append(paper)

    print(f"page {page_number} done")


if __name__ == "__main__":
    url = 'https://arxiv.org/search/?searchtype=all&query=healthcare&abstracts=show&size=200&order=-announced_date_first&'

    response = requests.get(url + "start=0")
    soup = BeautifulSoup(response.content, "lxml")

    with Manager() as manager:
        data = manager.list()
        processes = []

        # scrape page 0 in the main process, then use the header of the initial request to get the page count
        get_data_from_page(url, 0, data)
        showing_text = soup.find("h1", {"class": "title is-clearfix"}).text

        for i in range(1, get_no_of_pages(showing_text)):
            p = Process(target=get_data_from_page, args=(url, i, data))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

        print("Number of entries scraped:", len(data))

        stop_time = time.time()
        print("Time taken:", stop_time - start_time, "seconds")
Output:
>>> python test.py
getting page 0
page 0 done
total pages: 10
getting page 1
getting page 4
getting page 2
getting page 6
getting page 5
getting page 3
getting page 7
getting page 9
getting page 8
page 9 done
page 4 done
page 1 done
page 6 done
page 2 done
page 7 done
page 3 done
page 5 done
page 8 done
Number of entries scraped: 1890
Time taken: 15.911492586135864 seconds
Notes:

- The multiprocessing part has to be run from a .py file; for Jupyter notebooks, see this.
- The entries in the data list are not in the same order as on the website, because the Manager appends the dictionaries as they come in.
- The script gets page 0 first, computes the number of pages from it, and only then multiprocesses the remaining pages. The drawback is that nothing else runs while the 0th page is being processed, so if you drop that step and simply run the loop over all 10 pages (see the sketch below these notes), the time taken should come down to around 8 seconds.
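As a rough illustration of the last note, the page count can be read from the header alone and a process started for every page, including page 0. A minimal sketch that reuses get_no_of_pages() and get_data_from_page() exactly as defined in the answer above (timings will vary):

# Sketch only: parallelise every page instead of scraping page 0 in the main process.
from multiprocessing import Process, Manager

import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    url = ('https://arxiv.org/search/?searchtype=all&query=healthcare'
           '&abstracts=show&size=200&order=-announced_date_first&')

    # one request up front just to read the "N results" header
    soup = BeautifulSoup(requests.get(url + "start=0").content, "lxml")
    showing_text = soup.find("h1", {"class": "title is-clearfix"}).text

    with Manager() as manager:
        data = manager.list()
        processes = []
        for i in range(get_no_of_pages(showing_text)):  # page 0 included this time
            p = Process(target=get_data_from_page, args=(url, i, data))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        print("Number of entries scraped:", len(data))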