将数据集中的每个单元格除以 R 中的列总和

Question

将数据集中的每个单元格除以 R 中的列总和

我试图将数据框中的每个单元格除以列的总和。例如，我有一个数据框 df：

sample   a   b   c
a2      1    4    6
a3      5    5    4

Run Code Online (Sandbox Code Playgroud)

我想创建一个新的数据框，将每个单元格放入并除以列的总和，如下所示：

sample   a   b   c
a2      .167  .444  .6
a3      .833  .556  .4

Run Code Online (Sandbox Code Playgroud)

我已经使用sweep() 看到了答案，但这看起来像是用于矩阵，而且我有数据框。我了解如何使用 colSums()，但我不确定如何编写一个函数来循环遍历列中的每个单元格，然后除以列总和。谢谢您的帮助！

Answer 1

www*_*www 15

解决方案1

这里有两个dplyr解决方案。我们可以使用mutate_at或mutate_if来有效地指定我们想要应用操作的列，或者我们想要应用操作的条件。

library(dplyr)

# Apply the operation to all column except sample
dat2 <- dat %>%
  mutate_at(vars(-sample), funs(./sum(.))) 
dat2
#   sample         a         b   c
# 1     a2 0.1666667 0.4444444 0.6
# 2     a3 0.8333333 0.5555556 0.4

# Apply the operation if the column is numeric 
dat2 <- dat %>%
  mutate_if(is.numeric, funs(./sum(.))) 
dat2
#   sample         a         b   c
# 1     a2 0.1666667 0.4444444 0.6
# 2     a3 0.8333333 0.5555556 0.4

Run Code Online (Sandbox Code Playgroud)

解决方案2

我们还可以使用purrr包中的map_atandmap_if函数。然而，由于输出是一个列表，我们需要从基础 R 或从dplyr将列表转换为数据框。as.data.frameas_data_frame

library(dplyr)
library(purrr)

# Apply the operation to column a, b, and c    
dat2 <- dat %>%
  map_at(c("a", "b", "c"), ~./sum(.)) %>% 
  as_data_frame()
dat2
# # A tibble: 2 x 4
#   sample     a     b     c
#   <chr>  <dbl> <dbl> <dbl>
# 1 a2     0.167 0.444 0.600
# 2 a3     0.833 0.556 0.400

# Apply the operation if the column is numeric
dat2 <- dat %>%
  map_if(is.numeric, ~./sum(.)) %>%
  as_data_frame()
dat2
# # A tibble: 2 x 4
#   sample     a     b     c
#   <chr>  <dbl> <dbl> <dbl>
# 1 a2     0.167 0.444 0.600
# 2 a3     0.833 0.556 0.400

Run Code Online (Sandbox Code Playgroud)

解决方案3

我们还可以使用.SD并.SDcols从data.table包。

library(data.table)

# Convert to data.table
setDT(dat)
dat2 <- copy(dat)
dat2[, (c("a", "b", "c")) := lapply(.SD, function(x) x/sum(x)), .SDcols = c("a", "b", "c")]
dat2[]
#    sample         a         b   c
# 1:     a2 0.1666667 0.4444444 0.6
# 2:     a3 0.8333333 0.5555556 0.4

Run Code Online (Sandbox Code Playgroud)

解决方案4

我们还可以使用该lapply函数循环遍历除第一列之外的所有列来执行操作。

dat2 <- dat
dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))
dat2
#   sample         a         b   c
# 1     a2 0.1666667 0.4444444 0.6
# 2     a3 0.8333333 0.5555556 0.4

Run Code Online (Sandbox Code Playgroud)

我们还可以使用 apply 循环遍历所有列，但在函数中添加 if-else 语句以确保仅对数字列执行操作。

dat2 <- dat
dat2[] <- lapply(dat2[], function(x){
  # Check if the column is numeric
  if (is.numeric(x)){
    return(x/sum(x))
  } else{
    return(x)
  }
})
dat2
#   sample         a         b   c
# 1     a2 0.1666667 0.4444444 0.6
# 2     a3 0.8333333 0.5555556 0.4

Run Code Online (Sandbox Code Playgroud)

解决方案5

一个dplyr和tidyr解决方案基于gather和spread。

library(dplyr)
library(tidyr)

dat2 <- dat %>%
  gather(Column, Value, -sample) %>%
  group_by(Column) %>%
  mutate(Value = Value/sum(Value)) %>%
  spread(Column, Value)
dat2
# # A tibble: 2 x 4
#   sample     a     b     c
# * <chr>  <dbl> <dbl> <dbl>
# 1 a2     0.167 0.444 0.600
# 2 a3     0.833 0.556 0.400

Run Code Online (Sandbox Code Playgroud)

绩效评估

我很好奇哪种方法的性能最好。因此，我使用microbenchmark包进行了以下性能评估，其中数据框与 OP 的示例具有相同的列名，但有 1000000 行。

library(dplyr)
library(tidyr)
library(purrr)
library(data.table)
library(microbenchmark)

set.seed(100)

dat <- data_frame(sample = paste0("a", 1:1000000),
                  a = rpois(1000000, lambda = 3),
                  b = rpois(1000000, lambda = 3),
                  c = rpois(1000000, lambda = 3))

# Convert the data frame to a data.table for later perofrmance evaluation
dat_dt <- as.data.table(dat)    

head(dat)
# # A tibble: 6 x 4
#   sample     a     b     c
#   <chr>  <int> <int> <int>
# 1 a1         2     5     2
# 2 a2         2     5     5
# 3 a3         3     2     4
# 4 a4         1     2     2
# 5 a5         3     3     1
# 6 a6         3     6     1

Run Code Online (Sandbox Code Playgroud)

除了我提出的所有方法之外，我还对其他人提出的另外两种方法感兴趣：prop.tableHenrik 在评论中提出的方法，以及Spacedman提出的apply方法。我用. 如果在一个解决方案中有两种方法，我曾经将它们分开。我也将方法称为 as和方法 as 。请注意，我修改为将输出作为数据框，以便所有方法都可以有数据框、tibble 或 data.table 输出。m1_1, m1_2, m2_1, ... to m5_prop.tablem6applym7m6

这是我用来评估性能的代码。

per <- microbenchmark(m1_1 = {dat2 <- dat %>% mutate_at(vars(-sample), funs(./sum(.)))},
                      m1_2 = {dat2 <- dat %>% mutate_if(is.numeric, funs(./sum(.)))},
                      m2_1 = {dat2 <- dat %>%
                        map_at(c("a", "b", "c"), ~./sum(.)) %>% 
                        as_data_frame()
                      },
                      m2_2 = {dat2 <- dat %>%
                        map_if(is.numeric, ~./sum(.)) %>%
                        as_data_frame()},
                      m3 = {dat_dt2 <- copy(dat_dt)
                            dat_dt2[, c("a", "b", "c") := lapply(.SD, function(x) x/sum(x)), 
                                      .SDcols = c("a", "b", "c")]},
                      m4_1 = {dat2 <- dat
                              dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))},
                      m4_2 = {dat2 <- dat
                              dat2[] <- lapply(dat2[], function(x){
                        if (is.numeric(x)){
                          return(x/sum(x))
                        } else{
                          return(x)
                        }
                      })},
                      m5 = {dat2 <- dat %>%
                        gather(Column, Value, -sample) %>%
                        group_by(Column) %>%
                        mutate(Value = Value/sum(Value)) %>%
                        spread(Column, Value)},
                      m6 = {dat2 <- dat
                            dat2[-1] <- prop.table(as.matrix(dat2[-1]), margin = 2)},
                      m7 = {dat2 <- dat
                            dat2[, -1] = apply(dat2[, -1], 2, function(x) {x/sum(x)})}
                      )
print(per)
# Unit: milliseconds
# expr         min          lq       mean      median          uq        max neval
# m1_1   23.335600   24.326445   28.71934   25.134798   27.465017   75.06974   100
# m1_2   20.373093   21.202780   29.73477   21.967439   24.897305  216.27853   100
# m2_1    9.452987    9.817967   17.83030   10.052634   11.056073  175.00184   100
# m2_2   10.009197   10.342819   16.43832   10.679270   11.846692  163.62731   100
#   m3   16.195868   17.154327   34.40433   18.975886   46.521868  190.50681   100
# m4_1    8.100504    8.342882   12.66035    8.778545    9.348634  181.45273   100
# m4_2    8.130833    8.499926   15.84080    8.766979    9.732891  172.79242   100
#   m5 5373.395308 5652.938528 5791.73180 5737.383894 5825.141584 6660.35354   100
#   m6  117.038355  150.688502  191.43501  166.665125  218.837502  325.58701   100
#   m7  119.680606  155.743991  199.59313  174.007653  215.295395  357.02775   100


library(ggplot2)
autoplot(per)

Run Code Online (Sandbox Code Playgroud)

结果表明，基于lapply( m4_1and m4_2)的tidyr方法最快，而基于( m5)的方法最慢，说明当行数较大时，使用gatherandspread方法不是一个好主意。

数据

dat <- read.table(text = "sample   a   b   c
a2      1    4    6
                  a3      5    5    4",
                  header = TRUE, stringsAsFactors = FALSE)

Run Code Online (Sandbox Code Playgroud)

Answer 2

Spa*_*man 7

鉴于这种：

> d = data.frame(sample=c("a2","a3"),a=c(1,5),b=c(4,5),c=c(6,4))
> d
  sample a b c
1     a2 1 4 6
2     a3 5 5 4

Run Code Online (Sandbox Code Playgroud)

您可以通过应用其余列来替换除第一列之外的每一列：

> d[,-1] = apply(d[,-1],2,function(x){x/sum(x)})

> d
  sample         a         b   c
1     a2 0.1666667 0.4444444 0.6
2     a3 0.8333333 0.5555556 0.4

Run Code Online (Sandbox Code Playgroud)

如果您不想d被踩踏，请事先制作一份副本。

归档时间：	7 年，11 月前
查看次数：	8845 次
最近记录：	7 年，11 月前