使用(d)plyr创建一系列新列

Sté*_*ent 4 r plyr dplyr data.table

比如说我有一个带有列的数据框,a我想a^i为几个值创建列i.

> dat <- data.frame(a=1:5)
> dat
    a
1   1
2   2
3   3
4   4
5   5
Run Code Online (Sandbox Code Playgroud)

举个例子,我想要的输出i=2:5:

  a power_2 power_3 power_4 power_5
1 1       1       1       1       1
2 2       4       8      16      32
3 3       9      27      81     243
4 4      16      64     256    1024
5 5      25     125     625    3125
Run Code Online (Sandbox Code Playgroud)

目前我得到的输出data.table如下:

DT <- data.table(dat)
exponents <- 2:5
DT[, paste0("power_",exponents):=lapply(exponents, function(p) a^p)]
Run Code Online (Sandbox Code Playgroud)

怎么办plyr/ dplyr?当然,我可以通过键入power_i=a^i每个来做如下,i但这不是我想要的.

mutate(dat, power_2=a^2, power_3=a^3, ...)
Run Code Online (Sandbox Code Playgroud)

结论回答后

已经提出了几个答案,并且已经通过@docendo discimus进行了比较.我只是加上比较data.table.

library(data.table)
library(dplyr)
set.seed(2015)
dat <- data.frame(a = sample(1000))
i <- 2:5
n <- c(names(dat), paste0("power_", i))
DT <-  data.table(dat)

library(microbenchmark)

microbenchmark(
  data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  times = 30,
  unit = "relative"
)
Unit: relative
       expr       min        lq      mean    median        uq       max neval cld
 data.table  1.022945  1.039674  1.108558  1.026319  1.083644  2.370180    30  a 
     Henrik  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    30  a 
      dd.do  1.149195  1.160735  1.167672  1.158141  1.150280  1.268279    30  a 
      dd.bc 14.350034 13.982658 13.737964 13.632361 13.606221 15.866711    30   b
Run Code Online (Sandbox Code Playgroud)

通过两个base解决方案更新基准,Henrik2和josh(来自他的评论),这是最快的:

set.seed(2015)
dat <- data.frame(a = sample(1000))

microbenchmark(
  data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)),  `^`)),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
  times = 30,
  unit = "relative"
)

# Unit: relative
#       expr       min        lq      mean    median        uq       max neval  cld
# data.table  1.991613  2.029778  1.982169  1.990417  1.946677  1.694030    30  bc 
#     Henrik  2.026345  2.017179  1.996419  2.003189  2.030176  1.733583    30  bc 
#    Henrik2  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    30 a   
#      dd.do  2.356886  2.375713  2.322452  2.348053  2.304826  2.101494    30   c 
#      dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116    30    d
#       josh  1.725750  1.699887  1.641290  1.625331  1.637823  1.330598    30  b
Run Code Online (Sandbox Code Playgroud)

Hen*_*rik 7

一种可能性是使用outerin do,然后设置名称setNames

i <- 2:5
dat %>%
  do(data.frame(., outer(.$a, i, `^`))) %>%
  setNames(., c("a", paste0("power_", i)))

#   a power_2 power_3 power_4 power_5
# 1 1       1       1       1       1
# 2 2       4       8      16      32
# 3 3       9      27      81     243
# 4 4      16      64     256    1024
# 5 5      25     125     625    3125
Run Code Online (Sandbox Code Playgroud)

如果你首先命名'power vector'"i",你可以调用cbind而不是dodata.frame,并且dplyr在这种特殊情况下我看不到立即需要函数.

cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)),  `^`))
#   a power_2 power_3 power_4 power_5
# 1 1       1       1       1       1
# 2 2       4       8      16      32
# 3 3       9      27      81     243
# 4 4      16      64     256    1024
# 5 5      25     125     625    3125
Run Code Online (Sandbox Code Playgroud)

base,非do代码是你的大样本数据的速度更快.我还添加了base@Josh O'Brien 的解决方案.

set.seed(2015)
dat <- data.frame(a = sample(1000))

microbenchmark(
  data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)),  `^`)),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
  times = 30,
  unit = "relative"
)

# Unit: relative
#       expr       min        lq      mean    median        uq       max neval  cld
# data.table  1.991613  2.029778  1.982169  1.990417  1.946677  1.694030    30  bc 
#     Henrik  2.026345  2.017179  1.996419  2.003189  2.030176  1.733583    30  bc 
#    Henrik2  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    30 a   
#      dd.do  2.356886  2.375713  2.322452  2.348053  2.304826  2.101494    30   c 
#      dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116    30    d
#       josh  1.725750  1.699887  1.641290  1.625331  1.637823  1.330598    30  b
Run Code Online (Sandbox Code Playgroud)


tal*_*lat 5

这是一个使用选项do:

i <- 2:5
n <- c(names(dat), paste0("power_", i))
dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n)
#  a power_2 power_3 power_4 power_5
#1 1       1       1       1       1
#2 2       4       8      16      32
#3 3       9      27      81     243
#4 4      16      64     256    1024
#5 5      25     125     625    3125
Run Code Online (Sandbox Code Playgroud)

另一种选择,使用bind_cols:

dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n)
#  a power_2 power_3 power_4 power_5
#1 1       1       1       1       1
#2 2       4       8      16      32
#3 3       9      27      81     243
#4 4      16      64     256    1024
#5 5      25     125     625    3125
Run Code Online (Sandbox Code Playgroud)

评论后编辑:

@Henrik的解决方案比我的快:

set.seed(2015)
dat <- data.frame(a = sample(1000))
i <- 2:5
n <- c(names(dat), paste0("power_", i))

library(microbenchmark)

microbenchmark(
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  times = 30,
  unit = "relative"
  )
Unit: relative
   expr       min        lq    median        uq       max neval
 Henrik  1.000000  1.000000  1.000000  1.000000  1.000000    30
  dd.do  1.138506  1.179104  1.173298  1.149581  2.660237    30
  dd.bc 18.862923 18.702178 18.058984 17.537727 16.426538    30
Run Code Online (Sandbox Code Playgroud)