Apr*_*cot 5 csv gmail parsing python-2.7
我已从我的 Gmail 帐户下载了邮件存档。我正在使用以下来自博客的 python(2.7) 代码将存档的内容转换为 csv。
import mailbox
import csv
writer = csv.writer(open(("clean_mail.csv", "wb"))
for message in mailbox.mbox('archive.mbox'):
writer.writerow([message['subject'], message['from'], message['date']])
Run Code Online (Sandbox Code Playgroud)
我也想包含邮件正文(实际消息)...但不知道如何。我之前没有使用过python,有人可以帮忙吗?我已经使用了其他给出的选项,但无法通过。
为了完成相同的任务,我也使用了以下代码:但是第 60 行出现缩进错误:return json_msg。我尝试了不同的缩进选项,但没有任何改进。
import sys
import mailbox
import email
import quopri
import json
import time
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse
MBOX = 'Users/mymachine/client1/Takeout/Mail/archive.mbox'
OUT_FILE = 'Users/mymachine/client1/Takeout/Mail/archive.mbox.json'
def cleanContent(msg):
msg = quopri.decodestring(msg)
try:
soup = BeautifulSoup(msg)
except:
return ''
return ''.join(soup.findAll(text=True))
# There's a lot of data to process, and the Pythonic way to do it is with a
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object
# serialization.
class Encoder(json.JSONEncoder):
def default(self, o): return list(o)
def gen_json_msgs(mb):
while 1:
msg = mb.next()
if msg is None:
break
yield jsonifyMessage(msg)
def jsonifyMessage(msg):
json_msg = {'parts': []}
for (k, v) in msg.items():
json_msg[k] = v.decode('utf-8', 'ignore')
for k in ['To', 'Cc', 'Bcc']:
if not json_msg.get(k):
continue
json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace('\r', '')\
.replace(' ', '').decode('utf-8', 'ignore').split(',')
for part in msg.walk():
json_part = {}
if part.get_content_maintype() == 'multipart':
continue
json_part['contentType'] = part.get_content_type()
content = part.get_payload(decode=False).decode('utf-8', 'ignore')
json_part['content'] = cleanContent(content)
json_msg['parts'].append(json_part)
then = parse(json_msg['Date'])
millis = int(time.mktime(then.timetuple())*1000 + then.microsecond/1000)
json_msg['Date'] = {'$date' : millis}
return json_msg
mbox = mailbox.UnixMailbox(open(MBOX, 'rb'), email.message_from_file)
f = open(OUT_FILE, 'w')
for msg in gen_json_msgs(mbox):
if msg != None:
f.write(json.dumps(msg, cls=Encoder) + '\n')
f.close()
Run Code Online (Sandbox Code Playgroud)
尝试这个。
import mailbox
import csv
writer = csv.writer(open(("clean_mail.csv", "wb"))
for message in mailbox.mbox('archive.mbox'):
if message.is_multipart():
content = ''.join(part.get_payload() for part in message.get_payload())
else:
content = message.get_payload()
writer.writerow([message['subject'], message['from'], message['date'],content])
Run Code Online (Sandbox Code Playgroud)
或这个:
import mailbox
import csv
def get_message(message):
if not message.is_multipart():
return message.get_payload()
contents = ""
for msg in message.get_payload():
contents = contents + str(msg.get_payload()) + '\n'
return contents
if __name__ == "__main__":
writer = csv.writer(open("clean_mail.csv", "wb"))
for message in mailbox.mbox("archive.mbox"):
contents = get_message(message)
writer.writerow([message["subject"], message["from"], message["date"],contents])
Run Code Online (Sandbox Code Playgroud)
在此处查找文档。