我的数据集如下所示:
library(data.table)
# Example input: `a` is the row id; `b`, `c` and `d` hold the values to inspect.
df <- data.table(
  a = c(1, 2, 3, 4, 5),
  b = c(1, 0, 2, 5, 1),
  c = c(0, 1, 1, 0, 0),
  d = c(1, 0, 0, 2, 2)
)
df
#    a b c d
# 1: 1 1 0 1
# 2: 2 0 1 0
# 3: 3 2 1 0
# 4: 4 5 0 2
# 5: 5 1 0 2
Run Code Online (Sandbox Code Playgroud)
我想创建一个新列,其内容是每一行中取值非零的那些列的列名(用下划线连接). 期望的结果是:
# Desired output: for every row, `z` holds the names of the non-zero columns
# joined with "_".
df_result <- data.table(
  a = c(1, 2, 3, 4, 5),
  z = c("b_d", "c", "b_c", "b_d", "b_d")
)
df_result
#    a   z
# 1: 1 b_d
# 2: 2   c
# 3: 3 b_c
# 4: 4 b_d
# 5: 5 b_d
Run Code Online (Sandbox Code Playgroud)
ale*_*laz 12
假设 nrow >> ncol,可以逐列进行处理:
# Label every row of `x` with the names of its non-zero columns.
#
# Works column by column (one vectorised step per column), which is cheap
# when there are many more rows than columns.
#
# x: a data.frame / data.table (or any named list of equal-length numeric
#    columns).
# Returns a character vector of length nrow(x): the matching column names
# joined with "_", or "" for a row without any non-zero value.
ff <- function(x) {
  ans <- character(nrow(x))
  col_names <- names(x)
  for (j in seq_along(x)) {
    # `!= 0` (not `> 0`) so negative values count as non-zero too.
    # which() drops NA, so a missing value is treated as "not set" instead of
    # making the subscripted assignment below fail.
    i <- which(x[[j]] != 0)
    ans[i] <- paste(ans[i], col_names[[j]], sep = "_")
  }
  # Every label was appended as "_name"; remove the single leading separator.
  sub("^_", "", ans)
}
# Drop the first column (the id `a`) and label the remaining columns.
# Alternative from David Arenburg: `df[, ff(.SD), .SDcols = -1L]`
ff(df[, -1L, with = FALSE])
# [1] "b_d" "c"   "b_c" "b_d" "b_d"
Run Code Online (Sandbox Code Playgroud)
一种选择是使用 melt 将数据从"宽"格式转换为"长"格式. 然后按 'a' 分组,把 'value' 中非零元素(在 'i' 中以逻辑条件指定)所对应的 'variable' 元素用 paste 连接起来.
# Reshape to long format (one row per id/column pair), keep only the non-zero
# values, then collapse the surviving column names per `a`.
# `id.vars` is spelled out in full instead of relying on partial argument
# matching of `id.var`.
# Note: a row of `df` without any non-zero value does not appear in the result,
# because all of its long-format rows are filtered out.
melt(df, id.vars = "a")[
  value != 0,
  .(z = paste(variable, collapse = "_")),
  keyby = a
]
#    a   z
# 1: 1 b_d
# 2: 2   c
# 3: 3 b_c
# 4: 4 b_d
# 5: 5 b_d
Run Code Online (Sandbox Code Playgroud)
或者不使用 melt,我们可以按 'a' 分组,对 data.table 的子集(.SD)调用 unlist,再用 paste 把非零元素('i1')所对应的列名(names)连接起来.
# Group by `a`, locate the non-zero values among the remaining columns and
# join the matching column names with "_".
# - The result column is named `z` explicitly; an unnamed `j` expression would
#   be auto-named `V1`.
# - which() drops NA, so a missing value is skipped rather than pasted as "NA".
# NOTE(review): indexing names(.SD) this way assumes each value of `a` occurs
# in exactly one row — confirm for your data.
df[, {
  i1 <- which(unlist(.SD) != 0)
  list(z = paste(names(.SD)[i1], collapse = "_"))
}, by = a]
Run Code Online (Sandbox Code Playgroud)
# Benchmark data: one million rows with a fixed seed.
# The three sample() calls stay in their original order so the same random
# stream fills the same columns.
set.seed(24)
df1 <- data.table(
  a = seq_len(1e6),
  b = sample(0:5, size = 1e6, replace = TRUE),
  c = sample(0:4, size = 1e6, replace = TRUE),
  d = sample(0:3, size = 1e6, replace = TRUE)
)
# Benchmark candidate: melt the global `df1` to long format, keep the non-zero
# values and paste the remaining column names per `a`.
# `id.vars` is spelled out in full instead of relying on partial argument
# matching of `id.var`.
# Returns a data.table keyed by `a` with the label column `z`.
akrun1 <- function() {
  long <- melt(df1, id.vars = "a")
  long[value != 0, .(z = paste(variable, collapse = "_")), keyby = a]
}
# Benchmark candidate: for every group of `a` in the global `df1`, flatten the
# remaining columns and paste the names of those holding a non-zero value.
# Returns a data.table with `a` and the (auto-named) label column.
akrun2 <- function() {
  df1[, {
    nonzero <- unlist(.SD) != 0
    paste(names(.SD)[nonzero], collapse = "_")
  }, by = a]
}
# Benchmark candidate: row-wise apply() over the global `df1`.
# Returns a one-column data.table whose `z` holds the "_"-joined labels.
ronak <- function() {
  # Per row: named positions of the non-zero entries, first column excluded.
  # NOTE(review): apply() only returns a list when rows differ in their number
  # of non-zero entries; otherwise it simplifies to a vector/matrix.
  nonzero_pos <- apply(df1, 1, function(row) which(row[-1] != 0))
  labels <- lapply(
    nonzero_pos,
    function(pos) paste0(names(pos), collapse = "_")
  )
  data.table(z = labels)
}
# Benchmark candidate: adds `newcol` to the global `df1` by reference.
# For each of the columns b..d the Map() step yields the column name where the
# value is non-zero and NA where it is zero; the pieces are pasted row-wise
# with "_" and the "NA" placeholders are then stripped.
eddi <- function() {
  df1[, newcol := gsub(
    "NA_|_NA|NA", "",
    do.call(
      paste,
      c(
        # nm has length 1: index 1 gives the name, index 2 gives NA
        Map(function(nm, col) nm[(col == 0) + 1], names(.SD), .SD),
        sep = "_"
      )
    )
  ), .SDcols = b:d]
}
# Benchmark candidate: label every row of `x` with the names of its non-zero
# columns, working column by column.
#
# x: a data.frame / data.table (or any named list of equal-length numeric
#    columns).
# Returns a character vector of length nrow(x): the matching column names
# joined with "_", or "" for a row without any non-zero value.
alexis <- function(x) {
  ans <- character(nrow(x))
  col_names <- names(x)
  for (j in seq_along(x)) {
    # `!= 0` (not `> 0`) so negative values count as non-zero too.
    # which() drops NA, so a missing value is treated as "not set" instead of
    # making the subscripted assignment below fail.
    i <- which(x[[j]] != 0)
    ans[i] <- paste(ans[i], col_names[[j]], sep = "_")
  }
  # Every label was appended as "_name"; remove the single leading separator.
  sub("^_", "", ans)
}
# Timings in seconds for each candidate on `df1`.
# Note: eddi() adds a `newcol` column to `df1` by reference, so running it
# before the other candidates would change their input.
system.time(akrun1())
#  user  system elapsed
# 22.04    0.15   22.36
system.time(akrun2())
#  user  system elapsed
# 26.33    0.00   26.41
system.time(ronak())
#  user  system elapsed
# 25.60    0.26   25.96
system.time(alexis(df1[, -1L, with = FALSE]))
#  user  system elapsed
#  1.92    0.06    2.09
system.time(eddi())
#  user  system elapsed
#  2.41    0.06    3.19
Run Code Online (Sandbox Code Playgroud)
这是一个直接的方法:
# Adds `newcol` to `df` by reference. For each of the columns b..d the Map()
# step yields the column name where the value is non-zero and NA where it is
# zero; the pieces are pasted row-wise with "_" and the "NA" placeholders are
# then removed.
# NOTE(review): the regex also strips a literal "NA" that is part of a column
# name (e.g. a column called "NAME") — confirm this cannot occur in your data.
df[, newcol := gsub("NA_|_NA|NA", "", # strip the NA placeholders and their separators
do.call(function(...) paste(..., sep = "_"), # paste the per-column labels row-wise
Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD))) # x has length 1: index 1 = name, index 2 = NA
, .SDcols = b:d]
#    a b c d newcol
# 1: 1 1 0 1    b_d
# 2: 2 0 1 0      c
# 3: 3 2 1 0    b_c
# 4: 4 5 0 2    b_d
# 5: 5 1 0 2    b_d
Run Code Online (Sandbox Code Playgroud)
在 akrun 的测试数据上,这种方法的速度大约快 10 倍.