标签: python web-scraping python-newspaper newspaper3k
我已经用 `sudo pip3 install newspaper3k` 在我的 Mac 上安装了 Newspaper3k 库。我使用 Python 3。我想返回 Article 对象支持的数据,即 url、日期、标题、文本、摘要和关键字,但我没有得到任何数据:
import newspaper
from newspaper import Article

# Build a news source and collect its article URLs.
# memoize_articles=False forces a fresh crawl instead of skipping
# articles seen on a previous run.
cnn_paper = newspaper.build('https://www.euronews.com/', memoize_articles=False)

for article in cnn_paper.articles:
    article_url = article.url
    news_article = Article(article_url)
    # An Article's fields (title, text, publish_date, ...) are empty
    # strings until the page is downloaded and parsed. nlp() must run
    # after parse() to populate summary and keywords.
    news_article.download()
    news_article.parse()
    news_article.nlp()
    print("OBJECT:", news_article, '\n')
    print("URL:", article_url, '\n')
    print("DATE:", news_article.publish_date, '\n')
    print("TITLE:", news_article.title, '\n')
    print("TEXT:", news_article.text, '\n')
    print("SUMMARY:", news_article.summary, '\n')
    print("KEYWORDS:", news_article.keywords, '\n')
    print()
    input()
Run Code Online (Sandbox Code Playground)
我得到了 Article 对象和 URL,但其他所有字段都是空字符串 ''。我在不同的网站上尝试过,结果都是一样的。
然后我尝试添加:
# Fetch the raw HTML for the article URL.
news_article.download()
# Extract title, text, authors, publish_date, etc. from the HTML.
news_article.parse()
# NLP pass (requires parse() first): fills in summary and keywords.
news_article.nlp()
Run Code Online (Sandbox Code Playground)
我还尝试设置配置并设置标头和超时,但结果是相同的。
当我这样做时,对于每个网站,我只获得 16 篇带有日期、标题和正文值的文章。这对我来说很奇怪,对于每个网站,我都获得相同数量的数据,但对于超过 95% 的新闻文章,我没有获得任何数据。
Beautiful Soup(BeautifulSoup)能帮助我吗?
有人可以帮助我理解问题是什么,为什么我得到这么多 Null/Nan/"" 值,以及如何解决这个问题?
这是 lib 的文档:
我建议您查看我在GitHub上发布的报纸概述文档。该文档有多个提取示例和其他可能有用的技术。
关于你的问题...
Newspaper3K几乎可以完美地解析某些网站。但是有很多网站需要检查页面的导航结构以确定如何正确解析文章元素。
例如,https://www.marketwatch.com具有单独的文章元素,例如标题、发布日期和存储在页面元标记部分中的其他项目。
下面的报纸示例将正确解析元素。我注意到您可能需要对关键字或标签输出进行一些数据清理。
import newspaper
from newspaper import Config
from newspaper import Article

# A real browser User-Agent: some sites reject the default
# python-requests agent string.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

base_url = 'https://www.marketwatch.com'
article_urls = set()
marketwatch = newspaper.build(base_url, config=config, memoize_articles=False, language='en')

for sub_article in marketwatch.articles:
    # Skip duplicate URLs *before* downloading, so we don't waste a
    # network request and a parse on a page we've already processed.
    if sub_article.url in article_urls:
        continue
    article_urls.add(sub_article.url)

    article = Article(sub_article.url, config=config, memoize_articles=False, language='en')
    article.download()
    article.parse()

    # The majority of the article elements are located within the
    # meta-data section of the page's navigational structure, so we
    # read them from article.meta_data rather than the parsed body.
    article_meta_data = article.meta_data

    published_date = {value for (key, value) in article_meta_data.items() if key == 'parsely-pub-date'}
    article_published_date = " ".join(str(x) for x in published_date)

    authors = sorted({value for (key, value) in article_meta_data.items() if key == 'parsely-author'})
    article_author = ', '.join(authors)

    title = {value for (key, value) in article_meta_data.items() if key == 'parsely-title'}
    article_title = " ".join(str(x) for x in title)

    # Keyword/tag values arrive as one comma-separated string; split,
    # lowercase and sort them for stable output.
    keywords = ''.join({value for (key, value) in article_meta_data.items() if key == 'keywords'})
    keywords_list = sorted(keywords.lower().split(','))
    article_keywords = ', '.join(keywords_list)

    tags = ''.join({value for (key, value) in article_meta_data.items() if key == 'parsely-tags'})
    tag_list = sorted(tags.lower().split(','))
    article_tags = ', '.join(tag_list)

    summary = {value for (key, value) in article_meta_data.items() if key == 'description'}
    article_summary = " ".join(str(x) for x in summary)

    # The replace() removes embedded newlines from the article body.
    article_text = article.text.replace('\n', '')
    print(article_text)
Run Code Online (Sandbox Code Playground)
https://www.euronews.com与https://www.marketwatch.com类似,不同之处在于一些文章元素位于主体中,其他项目位于元标记部分内。
import newspaper
from newspaper import Config
from newspaper import Article

# A real browser User-Agent: some sites reject the default
# python-requests agent string.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

base_url = 'https://www.euronews.com'
article_urls = set()
euronews = newspaper.build(base_url, config=config, memoize_articles=False, language='en')

for sub_article in euronews.articles:
    # De-duplicate before downloading so each page is fetched once.
    if sub_article.url in article_urls:
        continue
    article_urls.add(sub_article.url)

    article = Article(sub_article.url, config=config, memoize_articles=False, language='en')
    article.download()
    article.parse()

    # On euronews.com some elements (e.g. the title) parse from the
    # body, while date, description and keywords live in the page's
    # meta tags, exposed via article.meta_data.
    article_meta_data = article.meta_data

    published_date = {value for (key, value) in article_meta_data.items() if key == 'date.created'}
    article_published_date = " ".join(str(x) for x in published_date)

    article_title = article.title

    summary = {value for (key, value) in article_meta_data.items() if key == 'description'}
    article_summary = " ".join(str(x) for x in summary)

    # Keywords arrive as one comma-separated string; split, lowercase
    # and sort them for stable output.
    keywords = ''.join({value for (key, value) in article_meta_data.items() if key == 'keywords'})
    keywords_list = sorted(keywords.lower().split(','))
    article_keywords = ', '.join(keywords_list).strip()

    # The replace() removes embedded newlines from the article body.
    article_text = article.text.replace('\n', '')
Run Code Online (Sandbox Code Playground)
归档时间:(未注明) | 查看次数:4236 次 | 最近记录:(未注明)