sec*_*ind 7 python csv twitter json
I'm trying to gather twitter statistics from a specific dataset that was provided to me. I have no control over how the data is formatted before it is given to me so I'm locked into this messy for.
I would like some suggestions on how I can build a python program to parse this sort of input and outputting something more along the lines of a CSV file with the field titles as header and the values below.
I want to use python because eventually I would like to use some statistical tools that I've already put together.
Additionally the output of a CSV type format is preferred because I may input it into something like SPSS for statistical verification.
Here's a sample of what a single post looks like from the dataset:
{"text":"A gente todos os dias arruma os cabelos: por que não o coração?","contributors":null,"geo":null,"retweeted":false,"in_reply_to_screen_name":null,"truncated":false,"entities":{"urls":[],"hashtags":[],"user_mentions":[]},"in_reply_to_status_id_str":null,"id":50270714498002945,"source":"web","in_reply_to_user_id_str":null,"favorited":false,"in_reply_to_status_id":null,"created_at":"Tue Mar 22 19:00:46 +0000 2011","in_reply_to_user_id":null,"retweet_count":0,"id_str":"50270714498002945","place":null,"user":{"location":"Brasil, Recife-PE","statuses_count":16,"profile_background_tile":true,"lang":"en","profile_link_color":"867c5f","id":59154474,"following":null,"favourites_count":0,"protected":false,"profile_text_color":"91957f","verified":false,"contributors_enabled":false,"description":"","profile_sidebar_border_color":"eae2bc","name":"Natalia Aráujo","profile_background_color":"eae2bc","created_at":"Wed Jul 22 15:27:15 +0000 2009","followers_count":10,"geo_enabled":false,"profile_background_image_url":"http://a3.twimg.com/profile_background_images/220796682/music-2.png","follow_request_sent":null,"url":null,"utc_offset":-10800,"time_zone":"Brasilia","notifications":null,"profile_use_background_image":true,"friends_count":18,"profile_sidebar_fill_color":"eae2bc","screen_name":"nat_araujo","id_str":"59154474","show_all_inline_media":false,"profile_image_url":"http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG","listed_count":1,"is_translator":false},"coordinates":null}
Run Code Online (Sandbox Code Playgroud)
数据集是一个连续的行,帖子之间没有行返回.实际帖子之间唯一的分隔符是:
所有帖子都以
{"text":
Run Code Online (Sandbox Code Playgroud)
结束
null}
Run Code Online (Sandbox Code Playgroud)
任何建议都将不胜感激,我当然很乐意与大家分享我的成果.
编辑
根据每个人的说法,我从以下开始:
import sys import json from pprint import pprint if len(sys.argv) != 2: print 'To Use: twitterjson2cvs.py (path/filename)' sys.exit() inputfile = open(sys.argv[1]) jsondatain = json.load(inputfile) pprint(jsondatain) inputfile.close()
以下列形式输出更清洁的东西:
{u'contributors': None, u'coordinates': None, u'created_at': u'Tue Mar 22 19:00:46 +0000 2011', u'entities': {u'hashtags': [], u'urls': [], u'user_mentions': []}, u'favorited': False, u'geo': None, u'id': 50270714498002945L, u'id_str': u'50270714498002945', u'in_reply_to_screen_name': None, u'in_reply_to_status_id': None, u'in_reply_to_status_id_str': None, u'in_reply_to_user_id': None, u'in_reply_to_user_id_str': None, u'place': None, u'retweet_count': 0, u'retweeted': False, u'source': u'web', u'text': u'A gente todos os dias arruma os cabelos: por que n\xe3o o cora\xe7\xe3o?', u'truncated': False, u'user': {u'contributors_enabled': False, u'created_at': u'Wed Jul 22 15:27:15 +0000 2009', u'description': u'', u'favourites_count': 0, u'follow_request_sent': None, u'followers_count': 10, u'following': None, u'friends_count': 18, u'geo_enabled': False, u'id': 59154474, u'id_str': u'59154474', u'is_translator': False, u'lang': u'en', u'listed_count': 1, u'location': u'Brasil, Recife-PE', u'name': u'Natalia Ar\xe1ujo', u'notifications': None, u'profile_background_color': u'eae2bc', u'profile_background_image_url': u'http://a3.twimg.com/profile_background_images/220796682/music-2.png', u'profile_background_tile': True, u'profile_image_url': u'http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG', u'profile_link_color': u'867c5f', u'profile_sidebar_border_color': u'eae2bc', u'profile_sidebar_fill_color': u'eae2bc', u'profile_text_color': u'91957f', u'profile_use_background_image': True, u'protected': False, u'screen_name': u'nat_araujo', u'show_all_inline_media': False, u'statuses_count': 16, u'time_zone': u'Brasilia', u'url': None, u'utc_offset': -10800, u'verified': False}
编辑
我已经添加到上一个代码中,试图输出到csv文件:
import sys import json #from pprint import pprint import csv if len(sys.argv) != 2: print 'To Use: twitterjson2cvs.py (path/filename)' sys.exit() inputfile = open(sys.argv[1]) jsondatain = json.load(inputfile) f=csv.writer(open("test.csv","wb+")) f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"]) for x in jsondatain: f.writerow([x["contributors"],x["fields"]["coordinates"],x["fields"]["created_at"],x["fields"]["entities"],x["fields"]["hashtags"],x["fields"]["urls"],x["fields"]["user_mentions"],x["fields"]["favorited"],x["fields"]["geo"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["in_reply_to_screen_name"],x["fields"]["in_reply_to_status_id"],x["fields"]["in_reply_to_status_id_str"],x["fields"]["in_reply_to_user_id"],x["fields"]["in_reply_to_user_id_str"],x["fields"]["place"],x["fields"]["retweet_count"],x["fields"]["retweeted"],x["fields"]["source"],x["fields"]["text"],x["fields"]["truncated"],x["fields"]["user"],x["fields"]["contributors_enabled"],x["fields"]["created_at"],x["fields"]["description"],x["fields"]["favourites_count"],x["fields"]["follow_request_sent"],x["fields"]["followers_count"],x["fields"]["following"],x["fields"]["friends_count"],x["fields"]["geo_enabled"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["is_translator"],x["fields"]["lang"],x["fields"]["listed_count"],x["fields"]["location"],x["fields"]["name"],x["fields"]["notifications"],x["fields"]["profile_background_color"],x["fields"]["profile_background_image_url"],x["fields"]["profile_background_tile"],x["fields"]["profile_image_url"],x["fields"]["profile_link_color"],x["fields"]["profile_sidebar_border_color"],x["fields"]["profile_sidebar_fill_color"],x["fields"]["profile_text_color"],x["fields"]["profile_use_background_image"],x["fields"]["protected"],x["fields"]["screen_name"],x["fields"]["show_all_inline_media"],x["fields"]["statuses_count"],x["fields"]["time_zone"],x["fields"]["url"],x["fields"]["utc_offset"],x["fields"]["verified"]]) #pprint(jsondatain) inputfile.close()
但是,当我运行它时,我得到:
File "twitterjson2cvs.py", line 28, in f.writerow([x["contributors"],x["fields"]["coordinates"],x["fields"]["created_at"],x["fields"]["entities"],x["fields"]["hashtags"],x["fields"]["urls"],x["fields"]["user_mentions"],x["fields"]["favorited"],x["fields"]["geo"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["in_reply_to_screen_name"],x["fields"]["in_reply_to_status_id"],x["fields"]["in_reply_to_status_id_str"],x["fields"]["in_reply_to_user_id"],x["fields"]["in_reply_to_user_id_str"],x["fields"]["place"],x["fields"]["retweet_count"],x["fields"]["retweeted"],x["fields"]["source"],x["fields"]["text"],x["fields"]["truncated"],x["fields"]["user"],x["fields"]["contributors_enabled"],x["fields"]["created_at"],x["fields"]["description"],x["fields"]["favourites_count"],x["fields"]["follow_request_sent"],x["fields"]["followers_count"],x["fields"]["following"],x["fields"]["friends_count"],x["fields"]["geo_enabled"],x["fields"]["id"],x["fields"]["id_str"],x["fields"]["is_translator"],x["fields"]["lang"],x["fields"]["listed_count"],x["fields"]["location"],x["fields"]["name"],x["fields"]["notifications"],x["fields"]["profile_background_color"],x["fields"]["profile_background_image_url"],x["fields"]["profile_background_tile"],x["fields"]["profile_image_url"],x["fields"]["profile_link_color"],x["fields"]["profile_sidebar_border_color"],x["fields"]["profile_sidebar_fill_color"],x["fields"]["profile_text_color"],x["fields"]["profile_use_background_image"],x["fields"]["protected"],x["fields"]["screen_name"],x["fields"]["show_all_inline_media"],x["fields"]["statuses_count"],x["fields"]["time_zone"],x["fields"]["url"],x["fields"]["utc_offset"],x["fields"]["verified"]]) TypeError: string indices must be integers
该错误与字段的格式有关,但我没有看到它.
编辑
我更新了代码以反映您的格式建议,如下所示:
import sys import json import csv if len(sys.argv) != 2: print 'To Use: twitterjson2cvs.py (path/filename)' sys.exit() inputfile = open(sys.argv[1]) jsondatain = json.load(inputfile) f=csv.writer(open("test.csv","wb+")) f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"]) for x in jsondatain: f.writerow( ( x['contributors'], x['coordinates'], x['created_at'], x['entities']['hashtags'], x['entities']['urls'], x['entities']['user_mentions'], x['favorited'], x['geo'], x['id'], x['id_str'], x['in_reply_to_screen_name'], x['in_reply_to_status_id'], x['in_reply_to_status_id_str'], x['in_reply_to_user_id'], x['in_reply_to_user_id_str'], x['place'], x['retweet_count'], x['retweeted'], x['source'], x['text'].encode('utf8'), x['truncated'], x['user']['contributors_enabled'], x['user']['created_at'], x['user']['description'], x['user']['favourites_count'], x['user']['follow_request_sent'], x['user']['followers_count'], x['user']['following'], x['user']['friends_count'], x['user']['geo_enabled'], x['user']['id'], x['user']['id_str'], x['user']['is_translator'], x['user']['lang'], x['user']['listed_count'], x['user']['location'], x['user']['name'].encode('utf8'), x['user']['notifications'], x['user']['profile_background_color'], x['user']['profile_background_image_url'], x['user']['profile_background_tile'], x['user']['profile_image_url'], x['user']['profile_link_color'], x['user']['profile_sidebar_border_color'], x['user']['profile_sidebar_fill_color'], x['user']['profile_text_color'], x['user']['profile_use_background_image'], x['user']['protected'], x['user']['screen_name'], x['user']['show_all_inline_media'], x['user']['statuses_count'], x['user']['time_zone'], x['user']['url'], x['user']['utc_offset'], x['user']['verified'] ) ) inputfile.close()
我仍然得到以下错误:
twitterjson2cvs.py TweetFile1300820340639.tcm.online Traceback (most recent call last): File "workspace/coalmine-datafilter/src/twitterjson2csv.py", line 30, in x['contributors'], TypeError: string indices must be integers
编辑
对于单个json格式的输入文件,现在一切都很好.前面的例子json字符串输入到这个程序中:
import sys import json import csv if len(sys.argv) != 2: print 'To Use: twitterjson2cvs.py (path/filename)' sys.exit() inputfile = open(sys.argv[1]) jsonindata = json.load(inputfile) f=csv.writer(open("test.csv","wb+")) f.writerow(["contributors","coordinates","created_at","entities","hashtags","urls","user_mentions","favorited","geo","id","id_str","in_reply_to_screen_name","in_reply_to_status_id","in_reply_to_status_id_str","in_reply_to_user_id","in_reply_to_user_id_str","place","retweet_count","retweeted","source","text","truncated","user","contributors_enabled","created_at","description","favourites_count","follow_request_sent","followers_count","following","friends_count","geo_enabled","id","id_str","is_translator","lang","listed_count","location","name","notifications","profile_background_color","profile_background_image_url","profile_background_tile","profile_image_url","profile_link_color","profile_sidebar_border_color","profile_sidebar_fill_color","profile_text_color","profile_use_background_image","protected","screen_name","show_all_inline_media","statuses_count","time_zone","url","utc_offset","verified"]) f.writerow( ( jsonindata['contributors'], jsonindata['coordinates'], jsonindata['created_at'], jsonindata['entities']['hashtags'], jsonindata['entities']['urls'], jsonindata['entities']['user_mentions'], jsonindata['favorited'], jsonindata['geo'], jsonindata['id'], jsonindata['id_str'], jsonindata['in_reply_to_screen_name'], jsonindata['in_reply_to_status_id'], jsonindata['in_reply_to_status_id_str'], jsonindata['in_reply_to_user_id'], jsonindata['in_reply_to_user_id_str'], jsonindata['place'], jsonindata['retweet_count'], jsonindata['retweeted'], jsonindata['source'], jsonindata['text'].encode('utf8'), jsonindata['truncated'], jsonindata['user']['contributors_enabled'], jsonindata['user']['created_at'], jsonindata['user']['description'], jsonindata['user']['favourites_count'], jsonindata['user']['follow_request_sent'], jsonindata['user']['followers_count'], jsonindata['user']['following'], jsonindata['user']['friends_count'], jsonindata['user']['geo_enabled'], jsonindata['user']['id'], jsonindata['user']['id_str'], jsonindata['user']['is_translator'], jsonindata['user']['lang'], jsonindata['user']['listed_count'], jsonindata['user']['location'], jsonindata['user']['name'].encode('utf8'), jsonindata['user']['notifications'], jsonindata['user']['profile_background_color'], jsonindata['user']['profile_background_image_url'], jsonindata['user']['profile_background_tile'], jsonindata['user']['profile_image_url'], jsonindata['user']['profile_link_color'], jsonindata['user']['profile_sidebar_border_color'], jsonindata['user']['profile_sidebar_fill_color'], jsonindata['user']['profile_text_color'], jsonindata['user']['profile_use_background_image'], jsonindata['user']['protected'], jsonindata['user']['screen_name'], jsonindata['user']['show_all_inline_media'], jsonindata['user']['statuses_count'], jsonindata['user']['time_zone'], jsonindata['user']['url'], jsonindata['user']['utc_offset'], jsonindata['user']['verified'] ) ) inputfile.close()
results in a nicely formatted output ready for tools like SPSS to use as follows:
contributors,coordinates,created_at,entities,hashtags,urls,user_mentions,favorited,geo,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,place,retweet_count,retweeted,source,text,truncated,user,contributors_enabled,created_at,description,favourites_count,follow_request_sent,followers_count,following,friends_count,geo_enabled,id,id_str,is_translator,lang,listed_count,location,name,notifications,profile_background_color,profile_background_image_url,profile_background_tile,profile_image_url,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,protected,screen_name,show_all_inline_media,statuses_count,time_zone,url,utc_offset,verified ,,Tue Mar 22 19:00:46 +0000 2011,[],[],[],False,,50270714498002945,50270714498002945,,,,,,,0,False,web,A gente todos os dias arruma os cabelos: por que não o coração?,False,False,Wed Jul 22 15:27:15 +0000 2009,,0,,10,,18,False,59154474,59154474,False,en,1,"Brasil, Recife-PE",Natalia Aráujo,,eae2bc,http://a3.twimg.com/profile_background_images/220796682/music-2.png,True,http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG,867c5f,eae2bc,eae2bc,91957f,True,False,nat_araujo,False,16,Brasilia,,-10800,False
Now the only problem is that my I have input files that have multiple json strings inline with each other all on one continuous line. When I attempt to run the same program on those files I get the following error:
Traceback (most recent call last): File "workspace/coalmine-datafilter/src/twitterjson2cvs.py", line 22, in jsonindata = json.load(inputfile) File "/usr/lib/python2.6/json/__init__.py", line 267, in load parse_constant=parse_constant, **kw) File "/usr/lib/python2.6/json/__init__.py", line 307, in loads return _default_decoder.decode(s) File "/usr/lib/python2.6/json/decoder.py", line 322, in decode raise ValueError(errmsg("Extra data", s, end, len(s))) ValueError: Extra data: line 1 column 1514 - line 2 column 1 (char 1514 - 2427042)
The input file is very large, (ie: multiple thousands of twitter posts), I don't know if the error is due to the number of posts or if it's because the file has multiple {"...."}{"...."} all on the same line. Any ideas? Do I perhaps need to add a line return somehow after each feed?
这应该可以帮助您开始...您将需要处理嵌套对象
import json
import csv
f = file('test.json', 'r')
data = json.load(f)
#result = []
for k,v in data.iteritems():
print k,v
#result.append(v)
f = file('output.csv', 'w')
writer = csv.writer(f)
writer.writerows(result)
Run Code Online (Sandbox Code Playgroud)
归档时间: |
|
查看次数: |
3387 次 |
最近记录: |