我使用 tweepy 提取了一些 Twitter 数据,但得到的并不是格式规范的 JSON。我想不出一种不必耗费数小时就能读取并处理这些数据的办法(我大约有 100 个文件,总共约一百万行 Twitter 数据)。
下面附上我的 JSON 文件中一(1)行内容的示例。
任何能够做到这一点的方法,我都将不胜感激。
谢谢!
user_mentions":[{"screen_name":"nparama1951","name":"N.Paramasivam","id":2783847553,"id_str":"2783847553","indices":[0,12]},{"screen_name":"RBhamaria","name":"Capt. Ritesh Bhamaria \ud83c\uddee\ud83c\uddf3\ud83c\udde6\ud83c\uddfa","id":326234086,"id_str":"326234086","indices":[13,23]},{"screen_name":"jyotsnavarma9","name":"Jyotsna Varma \ud83c\uddee\ud83c\uddf3","id":3166184569,"id_str":"3166184569","indices":[24,38]},{"screen_name":"JaganNKaushik","name":"JN Kaushik","id":878303278420840448,"id_str":"878303278420840448","
这看起来像是 ndjson(以换行符分隔的 JSON)。ndjson 包和 jsonlite 包都可以处理它。
# Parse the newline-delimited JSON file with the ndjson package. As the
# printed result below shows, nested fields come back flattened into
# dot-separated columns (e.g. entities.user_mentions.0.id_str).
dat <- ndjson::stream_in(path = "data/INCIndia26febru.json")
# Source: local data table [1,584 x 2,814]
#
# # A tibble: 1,584 x 2,814
# contributors coordinates created_at entities.hashta… entities.symbols entities.urls entities.user_m…
# <int> <int> <chr> <int> <int> <int> <dbl>
# 1 NA NA Tue Feb 2… NA NA NA 1.15e 9
# 2 NA NA Tue Feb 2… NA NA NA 1.15e 9
# 3 NA NA Tue Feb 2… NA NA NA 2.23e 8
# 4 NA NA Tue Feb 2… NA NA NA 7.44e 7
# 5 NA NA Tue Feb 2… NA NA NA 1.06e18
# 6 NA NA Tue Feb 2… NA NA NA 1.47e 8
# 7 NA NA Tue Feb 2… NA NA NA 7.44e 7
# 8 NA NA Tue Feb 2… NA NA NA 1.15e 9
# 9 NA NA Tue Feb 2… NA NA NA 1.15e 9
# 10 NA NA Tue Feb 2… NA NA NA 7.44e 7
# # … with 1,574 more rows, and 2,807 more variables: entities.user_mentions.0.id_str <chr>,
# # entities.user_mentions.0.indices.0 <dbl>, entities.user_mentions.0.indices.1 <dbl>,
# # entities.user_mentions.0.name <chr>, entities.user_mentions.1.id <dbl>,
# # entities.user_mentions.0.screen_name <chr>, entities.user_mentions.1.id_str <chr>,
# # entities.user_mentions.1.indices.0 <dbl>, retweeted_status.entities.user_mentions.0.name <chr>,
# # retweeted_status.entities.user_mentions.0.screen_name <chr>,
# # retweeted_status.extended_tweet.display_text_range.0 <dbl>,
# # retweeted_status.entities.user_mentions.0.id_str <chr>,
# # retweeted_status.entities.user_mentions.0.indices.0 <dbl>,
# # retweeted_status.entities.user_mentions.0.indices.1 <dbl>, retweeted_status.entities.urls.0.url <chr>,
# # retweeted_status.entities.user_mentions.0.id <dbl>,
# # retweeted_status.extended_tweet.entities.media.0.media_url_https <chr>,
# # retweeted_status.extended_tweet.entities.media.0.sizes.large.h <dbl>,
# # retweeted_status.extended_tweet.entities.media.0.sizes.medium.h <dbl>,
# # retweeted_status.extended_tweet.entities.media.0.sizes.large.resize <chr>,
# # retweeted_status.extended_tweet.entities.media.0.sizes.large.w <dbl>,
# # retweeted_status.extended_tweet.entities.media.0.indices.0 <dbl>,
# # retweeted_status.extended_tweet.entities.media.0.indices.1 <dbl>,
# # retweeted_status.extended_tweet.entities.media.0.media_url <chr>, in_reply_to_user_id_str <chr>,
# # in_reply_to_status_id_str <chr>, is_quote_status <lgl>, lang <chr>, id_str <chr>,
# # in_reply_to_screen_name <chr>, in_reply_to_status_id <dbl>, in_reply_to_user_id <dbl>, retweet_count <dbl>,
# # place <int>, quote_count <dbl>, retweeted <lgl>, retweeted_status.contributors <int>, reply_count <dbl>,
# # retweeted_status.coordinates <int>, retweeted_status.created_at <chr>, user.created_at <chr>,
# # user.contributors_enabled <lgl>, retweeted_status.user.utc_offset <int>,
# # retweeted_status.user.verified <lgl>, source <chr>, text <chr>, timestamp_ms <chr>, truncated <lgl>,
# # user.default_profile <lgl>, user.following <int>, user.default_profile_image <lgl>, user.description <chr>,
# # user.follow_request_sent <int>, user.followers_count <dbl>, user.friends_count <dbl>,
# # user.favourites_count <dbl>, quoted_status.extended_tweet.extended_entities.media.0.sizes.thumb.resize <chr>,
# # quoted_status.extended_tweet.extended_entities.media.0.sizes.thumb.w <dbl>,
# # quoted_status.extended_tweet.extended_entities.media.0.video_info.aspect_ratio.0 <dbl>,
# # quoted_status.extended_tweet.extended_entities.media.0.video_info.aspect_ratio.1 <dbl>,
# # quoted_status.extended_tweet.extended_entities.media.0.type <chr>,
# # quoted_status.extended_tweet.extended_entities.media.0.video_info.duration_millis <dbl>,
# # quoted_status.extended_tweet.extended_entities.media.0.video_info.variants.0.bitrate <dbl>,
# # quoted_status.extended_tweet.extended_entities.media.0.url <chr>,
# # retweeted_status.entities.hashtags.0.indices.0 <dbl>, retweeted_status.entities.hashtags.0.indices.1 <dbl>,
# # retweeted_status.entities.hashtags.0.text <chr>, favorite_count <dbl>,
# # entities.user_mentions.1.indices.1 <dbl>, entities.user_mentions.1.name <chr>,
# # entities.user_mentions.1.screen_name <chr>, favorited <lgl>, filter_level <chr>, geo <int>, id <dbl>,
# # retweeted_status.favorite_count <dbl>, retweeted_status.favorited <lgl>, retweeted_status.filter_level <chr>,
# # retweeted_status.extended_tweet.full_text <chr>,
# # retweeted_status.extended_tweet.extended_entities.media.1.sizes.thumb.resize <chr>,
# # retweeted_status.extended_tweet.extended_entities.media.1.sizes.thumb.w <dbl>,
# # retweeted_status.extended_tweet.extended_entities.media.1.type <chr>,
# # retweeted_status.extended_tweet.extended_entities.media.1.url <chr>, user.id_str <chr>,
# # user.is_translator <lgl>, user.lang <chr>, user.listed_count <dbl>, user.location <chr>, user.name <chr>,
# # user.geo_enabled <lgl>, user.id <dbl>, retweeted_status.display_text_range.0 <dbl>,
# # retweeted_status.display_text_range.1 <dbl>, retweeted_status.entities.hashtags <int>,
# # retweeted_status.entities.symbols <int>, retweeted_status.entities.urls.0.display_url <chr>,
# # retweeted_status.entities.urls.0.expanded_url <chr>, retweeted_status.entities.urls.0.indices.0 <dbl>,
# # retweeted_status.entities.urls.0.indices.1 <dbl>, retweeted_status.geo <int>, …
Run Code Online (Sandbox Code Playgroud)
或者:
# Alternative: read the same file with jsonlite, streaming line-delimited JSON
# from a file connection. In the glimpse output below, nested objects (user,
# entities, retweeted_status, ...) are kept as data-frame columns rather than
# being flattened, and `id` is read as a double, so `id_str` is the safer
# column when exact tweet ids are needed.
dat <- jsonlite::stream_in(con = file("data/INCIndia26febru.json"))
# Print a compact, one-line-per-column overview of the result.
tibble::glimpse(x = dat)
# Observations: 1,584
# Variables: 36
# $ created_at <chr> "Tue Feb 26 13:09:36 +0000 2019", "Tue Feb 26 13:09:38 +0000 2019", "Tue Feb …
# $ id <dbl> 1.100382e+18, 1.100382e+18, 1.100382e+18, 1.100382e+18, 1.100382e+18, 1.10038…
# $ id_str <chr> "1100382373823299586", "1100382382396448770", "1100382385013645314", "1100382…
# $ text <chr> "RT @INCIndia: Congress President @RahulGandhi addresses North-East DCC &…
# $ source <chr> "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for…
# $ truncated <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
# $ in_reply_to_status_id <dbl> NA, 1.100360e+18, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# $ in_reply_to_status_id_str <chr> NA, "1100359823630237697", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# $ in_reply_to_user_id <dbl> NA, 1.153045e+09, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# $ in_reply_to_user_id_str <chr> NA, "1153045459", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# $ in_reply_to_screen_name <chr> NA, "INCIndia", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ user <data.frame> <data.frame[38 x 39]>
# $ geo <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ coordinates <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ place <data.frame> <data.frame[38 x 9]>
# $ contributors <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ retweeted_status <data.frame> <data.frame[38 x 34]>
# $ is_quote_status <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FA…
# $ quote_count <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ reply_count <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ retweet_count <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ favorite_count <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ entities <data.frame> <data.frame[38 x 5]>
# $ favorited <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
# $ retweeted <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
# $ filter_level <chr> "low", "low", "low", "low", "low", "low", "low", "low", "low", "low", "low", …
# $ lang <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en",…
# $ timestamp_ms <chr> "1551186576692", "1551186578736", "1551186579360", "1551186579893", "15511865…
# $ display_text_range <list> [NULL, <23, 100>, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL…
# $ possibly_sensitive <lgl> NA, NA, NA, NA, FALSE, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# $ quoted_status_id <dbl> NA, NA, NA, NA, NA, NA, 1.100365e+18, NA, NA, 1.100365e+18, NA, NA, 1.100365e…
# $ quoted_status_id_str <chr> NA, NA, NA, NA, NA, NA, "1100364859768782849", NA, NA, "1100364859768782849",…
# $ quoted_status <data.frame> <data.frame[38 x 32]>
# $ quoted_status_permalink <data.frame> <data.frame[38 x 3]>
# $ extended_tweet <data.frame> <data.frame[38 x 4]>
# $ extended_entities <data.frame> <data.frame[38 x 1]>
Run Code Online (Sandbox Code Playgroud)