使用非空列的名称创建新列

Vit*_*nko 6 r data.table

我的数据集如下所示:

library(data.table)

df <- data.table(a = c(1,2,3,4,5),
                 b = c(1,0,2,5,1),
                 c = c(0,1,1,0,0),
                 d = c(1,0,0,2,2))

df
#    a b c d
# 1: 1 1 0 1
# 2: 2 0 1 0
# 3: 3 2 1 0
# 4: 4 5 0 2
# 5: 5 1 0 2
Run Code Online (Sandbox Code Playgroud)

我想创建一个具有非空列名称的新列.结果将是:

df_result <- data.table(a = c(1,2,3,4,5),
                        z = c('b_d', 'c', 'b_c', 'b_d', 'b_d'))

df_result
#    a   z
# 1: 1 b_d
# 2: 2   c
# 3: 3 b_c
# 4: 4 b_d
# 5: 5 b_d
Run Code Online (Sandbox Code Playgroud)

ale*_*laz 12

假设nrow >> ncol,您可以按列工作

ff = function(x)
{
    ans = character(nrow(x))
    for(j in seq_along(x)) {
        i = x[[j]] > 0L
        ans[i] = paste(ans[i], names(x)[[j]], sep = "_")
    }
    return(gsub("^_", "", ans))
}
ff(df[, -1L, with = FALSE]) #or, `df[, ff(.SD), .SDcols = -1L]` from David Arenburg
#[1] "b_d" "c"   "b_c" "b_d" "b_d"
Run Code Online (Sandbox Code Playgroud)

  • 是的,但你可以做'df [,z:= ff(.SD),. SDcols = -1]这样的东西,并避免使用深层拷贝,这与你的原始解决方案不同 - 因为`with = FALSE`总是会产生深拷贝. (2认同)
  • @DavidArenburg:没有意识到这一点; 谢谢. (2认同)

akr*_*run 8

一种选择是使用将格式从'wide'转换为'long' melt.由'a'分组,我们paste'变量'元素对应于'value'中的非零元素(在'i'中作为逻辑条件提供).

melt(df, id.var='a')[value!=0, 
      .(z=paste(variable, collapse="_")), keyby =a]
#   a   z
#1: 1 b_d
#2: 2   c
#3: 3 b_c
#4: 4 b_d
#5: 5 b_d
Run Code Online (Sandbox Code Playgroud)

或代替meltING,我们可以用"a",组unlistData.table的子集(.SD)和paste所述names对应的非零元素("I1")列.

df[, {i1 <- !!unlist(.SD)
       paste(names(.SD)[i1], collapse="_")} , by= a]
Run Code Online (Sandbox Code Playgroud)

基准

set.seed(24)
df1 <- data.table(a=1:1e6, b = sample(0:5, 1e6, 
   replace=TRUE), c = sample(0:4, 1e6, replace=TRUE), 
    d = sample(0:3, 1e6, replace=TRUE))

akrun1 <- function() {
   melt(df1, id.var='a')[value!=0, 
      .(z=paste(variable, collapse="_")), keyby =a]
    }

 akrun2 <- function() {
   df1[, {i1 <- !!unlist(.SD)
       paste(names(.SD)[i1], collapse="_")} , by= a]
   }

 ronak <- function() {
    data.table(z = lapply(apply(df1, 1, function(x)
                which(x[-1]!= 0)), 
       function(x) paste0(names(x), collapse = "_")))
   }

eddi <- function(){
 df1[, newcol := gsub("NA_|_NA|NA", "",                          
   do.call(function(...) paste(..., sep = "_"),            
     Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD)))
 , .SDcols = b:d]

 }

alexis = function(x)
   {
   ans = character(nrow(x))
   for(j in seq_along(x)) {
    i = x[[j]] > 0L
    ans[i] = paste(ans[i], names(x)[[j]], sep = "_")
   }
  return(gsub("^_", "", ans))
}





system.time(akrun1())
#   user  system elapsed 
#  22.04    0.15   22.36 
 system.time(akrun2())
#   user  system elapsed 
# 26.33    0.00   26.41 
 system.time(ronak())
#   user  system elapsed 
#  25.60    0.26   25.96 


system.time(alexis(df1[, -1L, with = FALSE]))
#   user  system elapsed 
#   1.92    0.06    2.09 

system.time(eddi())
#  user  system elapsed 
#   2.41    0.06    3.19 
Run Code Online (Sandbox Code Playgroud)

  • @akrun也请为其他两个答案添加基准 (3认同)
  • @akrun嗯,好吧,如果我有任何想做的话,那很久以前就会完成.你可以改进*你的答案,因为它包含了最慢答案的基准,并省略了两个更快的答案. (2认同)

edd*_*ddi 8

这是一个直接的方法:

df[, newcol := gsub("NA_|_NA|NA", "",                           # remove unwanted text
       do.call(function(...) paste(..., sep = "_"),             # paste colnames together
         Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD))) # convert data to colnames
   , .SDcols = b:d]
#   a b c d newcol
#1: 1 1 0 1    b_d
#2: 2 0 1 0      c
#3: 3 2 1 0    b_c
#4: 4 5 0 2    b_d
#5: 5 1 0 2    b_d
Run Code Online (Sandbox Code Playgroud)

在akrun的测试数据上,它的速度提高了10倍.