Dal*_*len 26
下面的代码演示如何读取电子邮件(即 *.eml 文件)的内容。它在 Python 2.5 - 2.7 上运行良好;您也可以在 Python 3 上试试,应该同样可用。
from email import message_from_file
import os
# Directory where extracted attachments are stored. file_exists() and
# save_file() join every file name onto this path; save_file() opens the
# target with open() and does not create the directory.
path = "./msgfiles"
# To keep attachments in memory instead of on disk, change the behaviour of
# the two functions that follow (file_exists and save_file):
def file_exists(f):
    """Tells whether a file named f is already present in the output directory.

    f -- File name, relative to the module-level path directory.
    Returns True when the joined path exists on disk, False otherwise.
    """
    target = os.path.join(path, f)
    return os.path.exists(target)
def save_file(fn, cont):
    """Writes the binary content cont to the file fn in the output directory.

    fn -- File name, relative to the module-level path directory.
    cont -- Binary data to write (the file is opened in "wb" mode).
    An existing file with the same name is overwritten.
    """
    out = open(os.path.join(path, fn), "wb")
    # try/finally (rather than "with") keeps this usable on Python 2.5 while
    # still closing the handle when write() raises.
    try:
        out.write(cont)
    finally:
        out.close()
def construct_name(id, fn):
    """Constructs a file name out of a message ID and a packed file name.

    id -- Message ID / key string, e.g. the name of the .eml file.
    fn -- File name of the packed file as found in the message.
    Returns the first two dot-separated pieces of id glued together,
    followed by "." and fn ("message.eml", "a.txt" -> "messageeml.a.txt").
    An id without any dot is used unchanged instead of raising IndexError.
    """
    # Slicing instead of indexing [0] and [1] keeps the old result for ids
    # that contain a dot and makes dot-less ids safe.
    prefix = "".join(id.split(".")[:2])
    return prefix + "." + fn
def disqo(s):
    """Strips surrounding whitespace and one pair of enclosing quotes.

    s -- String that may be wrapped in single or double quotation marks.
    Returns the stripped string without the enclosing pair, or just the
    stripped string when it is not enclosed in a matching pair.
    """
    stripped = s.strip()
    # Single quotes are checked first, then double quotes.
    for quote in ("'", '"'):
        if stripped.startswith(quote) and stripped.endswith(quote):
            return stripped[1:-1]
    return stripped
def disgra(s):
    """Strips surrounding whitespace and one enclosing < > pair.

    s -- HTML-like tag, e-mail address or e-mail ID, possibly in angle brackets.
    Returns the stripped string without the brackets when both are present,
    otherwise just the stripped string.
    """
    trimmed = s.strip()
    bracketed = trimmed.startswith("<") and trimmed.endswith(">")
    return trimmed[1:-1] if bracketed else trimmed
def _part_text(part):
    """Returns the decoded payload of a text part as a string."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    if isinstance(payload, str):
        # Python 2: a decoded payload already is a str.
        return payload
    # Python 3: a decoded payload is bytes and cannot be added to a str, so
    # decode it with the charset declared by the part (UTF-8 when absent).
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, "replace")
    except LookupError:
        # The part names a charset Python does not know.
        return payload.decode("utf-8", "replace")


def _store_file(part, key, fn, cid, files):
    """Records fn in files and saves the part's payload unless already saved."""
    cfn = construct_name(key, fn)
    files[fn] = (cfn, cid)
    if not file_exists(cfn):
        save_file(cfn, part.get_payload(decode=True))


def pullout(m, key):
    """Extracts content from an e-mail message.

    This works for multipart and nested multipart messages too.
    m -- email.Message() or mailbox.Message()
    key -- Initial message ID (some string)
    Returns tuple(Text, Html, Files, Parts)
    Text -- All text from all text/plain parts, concatenated.
    Html -- All HTML from all text/html parts, concatenated.
    Files -- Dictionary mapping each packed file name to a tuple
             (name used on disk, content ID or None).
    Parts -- Number of non-multipart parts in the original message.
    """
    Text = ""
    Html = ""
    Files = {}
    if m.is_multipart():
        # The payload of a multipart message is a list of sub-messages;
        # each one goes back through pullout() and the results are merged.
        Parts = 0
        for part in m.get_payload():
            t, h, f, p = pullout(part, key)
            Text += t
            Html += h
            Files.update(f)
            Parts += p
        return Text, Html, Files, Parts

    fn = m.get_filename()
    if fn:  # It's an attachment
        _store_file(m, key, fn, None, Files)
        return Text, Html, Files, 1

    # Not an attachment: see where this belongs. Text, Html or other data.
    ctype = m.get_content_type()
    if ctype == "text/plain":
        Text += _part_text(m)
    elif ctype == "text/html":
        Html += _part_text(m)
    else:
        # Some packed file whose name is given in the content-type header
        # instead of a content-disposition header.
        header = m.get("content-type")
        try:
            cid = disgra(m.get("content-id"))
        except AttributeError:
            # No content-id header (None) or a value without strip().
            cid = None
        start = header.find("name=")
        if start != -1:
            end = header.find(";", start)
            if end == -1:
                end = None
            # Skip the five characters of "name=" itself.
            fn = disqo(header[start + 5:end])
            _store_file(m, key, fn, cid, Files)
    return Text, Html, Files, 1
def extract(msgfile, key):
    """Parses an e-mail and returns everything extracted from it as a dictionary.

    msgfile -- A file-like readable object
    key -- Some ID string for that particular message. Can be a file name or anything.
    Returns dict()
    Keys: from, to, subject, date, text, html, parts[, files]
    The key files is present only when pullout() reported packed files.
    For more see __doc__ for pullout() and caption() functions.
    """
    parsed = message_from_file(msgfile)
    sender, recipient, subject, sent = caption(parsed)
    body_text, body_html, attached, part_count = pullout(parsed, key)
    result = {
        "subject": subject, "from": sender, "to": recipient, "date": sent,
        "text": body_text.strip(), "html": body_html.strip(),
        "parts": part_count,
    }
    if attached:
        result["files"] = attached
    return result
def caption(origin):
    """Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()

    origin -- Message() object
    Returns tuple(From, To, Subject, Date)
    If the message doesn't contain one or more of them, empty strings are
    returned in their place.
    """
    def header(name):
        # The "in" operator works on Python 2 and 3 alike, whereas
        # Message.has_key() is not available on Python 3.
        if name in origin:
            return origin[name].strip()
        return ""

    return header("from"), header("to"), header("subject"), header("date")
Run Code Online (Sandbox Code Playgroud)
# Usage:
# NOTE: "rb" is what the Python 2 version of this code expects. On Python 3,
# email.message_from_file() needs a file opened in text mode ("r").
f = open("message.eml", "rb")
try:
    # print(...) with a single argument runs on Python 2 and Python 3.
    print(extract(f, f.name))
finally:
    # Close the file even when extract() raises.
    f.close()
Run Code Online (Sandbox Code Playgroud)
这段代码是我用 mailbox 模块为自己的邮件组编写的,所以才显得这么复杂。它从未让我失望,也从未产生过垃圾数据。如果消息是多部分(multipart)的,输出字典中会包含一个键 "files"(子字典),其中列出了所有被提取出来的、既不是纯文本也不是 HTML 的文件的文件名。这就是提取附件和其他二进制数据的方式。您可以在 pullout() 中修改这一行为,或者只修改 file_exists() 和 save_file() 的实现。
construct_name() 用消息 ID 和多部分消息中的文件名(如果有的话)拼出一个文件名。
在pullout()中,Text和Html变量是字符串.对于在线邮件组,可以将任何文本或HTML打包到多部分中,而不是一次性附件.
如果您需要更复杂的处理,可以把 Text 和 Html 改成列表,将内容追加进去,再按需要进行拼接。这样做没有任何问题。
这里也许存在一些错误,因为这段代码原本是针对 mailbox.Message() 编写的,而不是 email.Message()。不过我在 email.Message() 上试过,运行正常。
你说你"希望把它们全部列出来"。从哪里列出?如果你指的是 POP3 邮箱,或者某个不错的开源邮件客户端的邮箱,那么可以使用 mailbox 模块。如果你想从其他邮件客户端中列出邮件,那就比较麻烦了。例如,要从 MS Outlook 获取邮件,你必须知道如何读取 OLE2 复合文件。其他邮件客户端很少把邮件存成 *.eml 文件,所以我认为这正是你想做的事情。那么可以在 PyPI 上搜索 olefile 或 compoundfiles 模块,并在 Google 上查找如何从 MS Outlook 收件箱文件中提取电子邮件。或者干脆省去这些麻烦,直接在 Outlook 里把邮件导出到某个目录。得到 eml 文件之后,再使用上面的代码。
小智 15
我发现这个代码更简单
import email
import os
# Saves every named attachment of every .eml file found in `path` into `path`.
path = './'
listing = os.listdir(path)
for fle in listing:
    # Compare the real extension (with its dot), ignoring case.
    if not fle.lower().endswith(".eml"):
        continue
    # Open relative to `path` and close the file once it has been parsed.
    with open(os.path.join(path, fle)) as eml:
        msg = email.message_from_file(eml)
    # walk() visits the message itself and all nested parts, so a
    # non-multipart message is handled too.
    for part in msg.walk():
        fnam = part.get_filename()
        if not fnam:
            # Part without a file name: not an attachment.
            continue
        # The name comes from the e-mail; keep only its last path component
        # so the file is written inside `path`.
        fnam = os.path.basename(fnam)
        data = part.get_payload(decode=True)
        if not fnam or data is None:
            continue
        target = os.path.join(path, fnam)
        try:
            with open(target, 'wb') as out:
                out.write(data)
        except (IOError, OSError) as detail:
            # Still best-effort: report the failure and go on with the
            # next attachment instead of silently ignoring it.
            print("could not save %s: %s" % (target, detail))
Run Code Online (Sandbox Code Playgroud)
Pat*_*Old 13
在这里发布此内容是为了任何想要从电子邮件中提取文本并获取 .eml 文件列表的人 - 我花了很长时间才在网上找到了一个很好的答案。注意:这不会获取电子邮件的附件,而只会获取电子邮件中的文本。
import email
from email import policy
from email.parser import BytesParser
import glob
import os
# Prints the name and the plain-text body (as a list of lines) of every
# .eml file found in `path`.
path = '/path/to/data/'  # set this to "./" if in current directory
# os.path.join() also works when `path` has no trailing separator.
eml_files = glob.glob(os.path.join(path, '*.eml'))  # get all .eml files in a list
for eml_file in eml_files:
    # The with-block closes the file; no explicit close() is needed.
    with open(eml_file, 'rb') as fp:
        name = fp.name  # Get file name
        msg = BytesParser(policy=policy.default).parse(fp)
    # preferencelist must be a sequence of subtypes; ('plain') without the
    # trailing comma is just the string 'plain'.
    body = msg.get_body(preferencelist=('plain',))
    # get_body() returns None when the message has no text/plain body.
    text = body.get_content() if body is not None else ""
    text = text.split("\n")
    print(name)  # Get name of eml file
    print(text)  # Get list of all text in email
Run Code Online (Sandbox Code Playgroud)
归功于这篇文章中的一些代码:Reading .eml files with Python 3.6 using emaildata 0.3.4
Dalen 答案的 Python 3 版本。基本上修复了语法问题。(由于缺乏声誉,无法发表评论,作为答案也更清晰)。
# To have attachments extracted into memory, change behaviour of 2 following functions:
def file_exists(f):
    """Checks whether f was already extracted into the output directory.

    f -- File name, relative to the module-level path directory.
    Returns the result of os.path.exists() for the joined path.
    """
    full_path = os.path.join(path, f)
    already_there = os.path.exists(full_path)
    return already_there
def save_file(fn, cont):
    """Saves the binary content cont to the file fn in the output directory.

    fn -- File name, relative to the module-level path directory.
    cont -- Bytes to write (the file is opened in "wb" mode).
    An existing file with the same name is overwritten.
    """
    # The with-block closes the handle even when write() raises.
    with open(os.path.join(path, fn), "wb") as out:
        out.write(cont)
def construct_name(id, fn):
    """Constructs a file name out of a message ID and a packed file name.

    id -- Message ID / key string, e.g. the name of the .eml file.
    fn -- File name of the packed file as found in the message.
    Returns the first two dot-separated pieces of id joined together,
    then "." and fn ("message.eml", "a.txt" -> "messageeml.a.txt").
    An id that contains no dot is kept as is instead of raising IndexError.
    """
    pieces = id.split(".")
    # At most two pieces are used, as before; the slice tolerates ids that
    # split into a single piece.
    return "".join(pieces[:2]) + "." + fn
def disqo(s):
    """Removes surrounding whitespace and one pair of enclosing quotations.

    s -- String that may be enclosed in single or double quotes.
    Returns the stripped string, minus its first and last character when
    it starts and ends with the same kind of quote.
    """
    text = s.strip()
    first = text[:1]  # empty for an empty string, so no quote matches
    if first in ("'", '"') and text.endswith(first):
        return text[1:-1]
    return text
def disgra(s):
    """Removes surrounding whitespace and an enclosing < > pair.

    s -- HTML-like tag, e-mail address or e-mail ID.
    Returns the stripped string; the first and last character are dropped
    only when they are "<" and ">" respectively.
    """
    core = s.strip()
    if not (core.startswith("<") and core.endswith(">")):
        return core
    return core[1:-1]
def _text_of(part):
    """Returns the decoded payload of a text part as a str."""
    raw = part.get_payload(decode=True)
    if raw is None:
        return ""
    # str(raw) would produce the repr "b'...'"; decode the bytes with the
    # charset declared by the part instead (UTF-8 when none is declared).
    charset = part.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The part names a charset Python does not know.
        return raw.decode("utf-8", errors="replace")


def _remember_file(part, key, fn, cid, files):
    """Records fn in files and saves the part's payload unless already saved."""
    cfn = construct_name(key, fn)
    files[fn] = (cfn, cid)
    if not file_exists(cfn):
        save_file(cfn, part.get_payload(decode=True))


def pullout(m, key):
    """Extracts content from an e-mail message.

    This works for multipart and nested multipart messages too.
    m -- email.Message() or mailbox.Message()
    key -- Initial message ID (some string)
    Returns tuple(Text, Html, Files, Parts)
    Text -- All text from all text/plain parts, concatenated.
    Html -- All HTML from all text/html parts, concatenated.
    Files -- Dictionary mapping each packed file name to a tuple
             (name used on disk, content ID or None).
    Parts -- Number of non-multipart parts in the original message.
    """
    Text = ""
    Html = ""
    Files = {}
    if m.is_multipart():
        # A multipart payload is a list of sub-messages; recurse into each
        # one and merge what it yields.
        Parts = 0
        for part in m.get_payload():
            t, h, f, p = pullout(part, key)
            Text += t
            Html += h
            Files.update(f)
            Parts += p
        return Text, Html, Files, Parts

    fn = m.get_filename()
    if fn:  # It's an attachment
        _remember_file(m, key, fn, None, Files)
        return Text, Html, Files, 1

    # Not an attachment: see where this belongs. Text, Html or other data.
    ctype = m.get_content_type()
    if ctype == "text/plain":
        Text += _text_of(m)
    elif ctype == "text/html":
        Html += _text_of(m)
    else:
        # Some packed file whose name is given in the content-type header
        # instead of a content-disposition header.
        header = m.get("content-type")
        try:
            cid = disgra(m.get("content-id"))
        except AttributeError:
            # No content-id header (None) or a value without strip().
            cid = None
        start = header.find("name=")
        if start != -1:
            end = header.find(";", start)
            if end == -1:
                end = None
            # Skip the five characters of "name=" itself.
            fn = disqo(header[start + 5:end])
            _remember_file(m, key, fn, cid, Files)
    return Text, Html, Files, 1
def extract(msgfile, key):
    """Parses an e-mail and returns all data extracted from it as a dictionary.

    msgfile -- A file-like readable object
    key -- Some ID string for that particular message. Can be a file name or anything.
    Returns dict()
    Keys: from, to, subject, date, text, html, parts[, files]
    The key files is present only when pullout() reported packed files.
    For more see __doc__ for pullout() and caption() functions.
    """
    message = email.message_from_file(msgfile)
    sender, recipient, subject, date = caption(message)
    text, html, files, parts = pullout(message, key)

    # Filled in the same key order as the dictionary has always had.
    info = {}
    info["subject"] = subject
    info["from"] = sender
    info["to"] = recipient
    info["date"] = date
    info["text"] = text.strip()
    info["html"] = html.strip()
    info["parts"] = parts
    if files:
        info["files"] = files
    return info
def caption(origin):
    """Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()

    origin -- Message() object
    Returns tuple(From, To, Subject, Date)
    If the message doesn't contain one or more of them, empty strings are
    returned in their place.
    """
    # One lookup per header, in the order of the returned tuple; the "in"
    # operator replaces the direct __contains__() calls.
    From, To, Subject, Date = [
        origin[name].strip() if name in origin else ""
        for name in ("from", "to", "subject", "date")
    ]
    return From, To, Subject, Date
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
21718 次 |
| 最近记录: |