通过在 dplyr 中求和来创建“其他”类别

And*_*ora 1 r dplyr

我有以下数据集,其中包含 ID 和每个 ID 的数值。我需要显示前 10 个最大的 ID,并将其余的分组到“其他”类别下。

我已经研究了这个问题/答案,但它适用于计数而不是总和。所以这种方法不适用于我的情况

我想要的结果

ID          Value
63366849    42084408
63475529    21879648
6774736     21321888
91274582    17393328
63445915    15215002
1097911044  9926442
1095812405  8971332
1097910557  5388376
37548920    5114854
OTHERS      153587848
Run Code Online (Sandbox Code Playgroud)

我的数据:

df=structure(list(ID= c("63366849", "63475529", "6774736", 
"91274582", "63445915", "1097911044", "1095812405", "1097910557", 
"37548920", "1005153076", "13513021", "51991938", "1010090155", 
"91265898", "91237574", "91344448", "1095956598", "28065538", 
"63341531", "63335642", "5637749", "17419836", "5567029", "5651301", 
"37839500", "63450446", "28424298", "29264885", "63328148", "63562603", 
"51702988", "7416450", "1005234045", "91520220", "91159937", 
"13801492", "9260536", "37722978", "63355924", "1098711718", 
"63443769", "60306461", "28485149", "2151526", "2127233", "1096063398", 
"27948572", "13834945", "27938888", "91179848", "41678031", "37837953", 
"91233367", "5557908", "28012681", "33446782", "91243709", "1007790961", 
"1095826946", "1095926858", "20189860", "24049724", "27903426", 
"28133395", "41446577", "37319481", "2033552", "27950302", "91200190", 
"91205127", "9527919", "1095303440", "13826841", "91218008", 
"63280472", "24074310", "37805433", "13801074", "28037341", "63353740", 
"1095793583", "1102371309", "1116493966", "2127316", "2153230", 
"37316948", "51778219", "5503745", "1099735096", "1222254600", 
"27786180", "2162536", "27964677", "2020408", "91349686", "37514200", 
"30208806", "1102351224", "28354311", "27929976", "63523710", 
"13838438", "28296329", "28410262", "28494063", "5721778", "91275248", 
"91353706", "37751608", "1097610166", "1098080648", "13810090", 
"63444976", "1221464992", "1097497475", "91100475", "1095312933", 
"28131889", "63506738", "7922341", "28238547", "27985819", "27958501", 
"23912889", "5707630", "1102388693", "24028654", "28128637", 
"5554160", "63369230", "36521093", "91525097", "27929902", "63441041", 
"1097919703", "40984557", "1097129811", "13843871", "63301433", 
"28236969", "37831774", "1000942168", "1005220322", "1098661759", 
"1095843775", "28399609", "37211106", "5683927", "28254346", 
"91268076", "28295325", "28496274", "13820294", "28354968", "37793321", 
"63495244", "63299126", "13849979", "63357743", "60262321", "1098753025", 
"91435400", "2195064", "63345546", "63251135", "13827671", "2007126", 
"40378506", "1097121371", "37828831", "5707059", "63324477", 
"1102644458", "37796841", "28334234", "28010824", "28495363", 
"27929493", "5558783", "51974763", "28296333", "63489129", "1102385538", 
"13814150", "13829893", "28377189", "91342964", "164497637", 
"52326169", "63286143", "1099738639", "63503298", "1097913686", 
"91070753", "37652636", "60352132"), Value= c(42084408, 
21879648, 21321888, 17393328, 15215002, 9926442, 8971332, 5388376, 
5114854, 5093730, 5004621, 3696756, 3424872, 3347432, 3337895, 
3105759, 3104409, 3056238, 2748870, 2738808, 2662730, 2555784, 
2268480, 2126700, 2051811, 2035200, 2025495, 1962000, 1926012, 
1829736, 1822980, 1657659, 1574901, 1546320, 1539355, 1480530, 
1456140, 1372302, 1372302, 1367235, 1350330, 1330560, 1327761, 
1289520, 1265556, 1258350, 1245060, 1228770, 1228545, 1209705, 
1196100, 1180533, 1175274, 1170286, 1144792, 1106820, 1106820, 
1080621, 1080621, 1080621, 1080621, 1080621, 1080621, 1080621, 
1080621, 1080264, 1067400, 1067400, 1003000, 993510, 992520, 
988560, 957408, 957408, 922350, 889893, 888300, 883800, 855150, 
835920, 833481, 833481, 833481, 833481, 833481, 833481, 833481, 
833481, 823458, 775300, 741420, 734760, 734760, 729882, 672660, 
666740, 631020, 621000, 621000, 600651, 598050, 553410, 553410, 
553410, 553410, 553410, 553410, 553410, 552690, 503340, 468120, 
439800, 416040, 415136, 403235, 398700, 383629, 383629, 383629, 
367380, 364941, 360207, 360088, 346680, 339582, 339201, 339201, 
339201, 339201, 303120, 300450, 289080, 282042, 238080, 234060, 
219531, 211300, 195447, 195447, 192960, 192960, 169000, 169000, 
169000, 163868, 163868, 156540, 143460, 136080, 132048, 127818, 
127818, 124920, 124920, 124920, 124920, 105120, 93690, 93690, 
77580, 66024, 64650, 62460, 62460, 62376, 58800, 58338, 57240, 
50524, 49260, 49260, 48240, 46528, 44100, 43110, 42930, 41640, 
40755, 38715, 37050, 35685, 33012, 30687, 28620, 28620, 24678, 
24570, 15189, 15120, 15120, 14504, 14310, 11424, 6000, 5940, 
5428)), row.names = c(NA, -196L), class = c("tbl_df", "tbl", 
"data.frame"))
Run Code Online (Sandbox Code Playgroud)

Rob*_*ach 5

使用该forcats包的一个稍微干净的选项:

\n
df %>%\n  mutate(ID = fct_lump_n(factor(ID), n = 9, w = Value, other_level = "OTHERS")) %>%\n  group_by(ID) %>%\n  summarize(Value = sum(Value)) %>%\n  arrange(ID == "OTHERS", desc(Value))\n\n# A tibble: 10 \xc3\x97 2\n# ID    Value\n# <fct> <dbl>\n# 63366849  42084408\n# 63475529  21879648\n# 6774736   21321888\n# 91274582  17393328\n# 63445915  15215002\n# 1097911044    9926442\n# 1095812405    8971332\n# 1097910557    5388376\n# 37548920  5114854\n# OTHERS    153587848\n
Run Code Online (Sandbox Code Playgroud)\n