如何在ggplot2中创建多个y轴(每个变量一个)

Viv*_*ien 1 r ggplot2 parallel-coordinates

我想绘制一个类似于平行坐标图的图来进行描述性统计。我想绘制按性别分层的每个变量的平均值和标准差。

在此输入图像描述

不幸的是,我无法找到为每个变量创建自己的 y 轴的方法。

它看起来应该与此图类似,但使用平均值和标准差而不是每一行。

在此输入图像描述

这篇文章中,他们创建了另一个图,但我无法根据我的需求和数据进行调整,因此平均值和标准差如上图所示。

这是我的数据和代码:


library(dplyr)
library(tidyr)
library(ggplot2)

my_data <- data.frame(
  sex = c("m", "w", "m", "w", "m", "w"),
  age = c(25, 30, 22, 35, 28, 46),
  testosterone = c(450, 200, 400, 300, 500, 350),
  cognition = c(75, 80, 70, 85, 78, 90),
  estrogen = c(20, 40, 15, 50, 10, 45)
)

numeric_vars <- c("age", "testosterone", "cognition", "estrogen")

# Calculate means and standard deviations for each sex and each variable
df_means <- my_data %>%
  group_by(sex) %>%
  summarise(across(all_of(numeric_vars), mean, na.rm = TRUE))

df_sds <- my_data %>%
  group_by(sex) %>%
  summarise(across(all_of(numeric_vars), sd, na.rm = TRUE))

# Reshape data for ggplot
df_melted <- reshape2::melt(df_means, id.vars = 'sex')
df_sds_melted <- reshape2::melt(df_sds, id.vars = 'sex')

# Combine means and standard deviations
df_combined <- merge(df_melted, df_sds_melted, by = c('sex', 'variable'))
names(df_combined)[names(df_combined) == 'value.x'] <- 'mean'
names(df_combined)[names(df_combined) == 'value.y'] <- 'sd'

# Create a parallel coordinates plot without normalization
ggplot(df_combined, aes(x = variable, y = mean, group = sex, color = sex)) +
  geom_line() +
  geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = sex), alpha = 0.2) +
  scale_color_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
  scale_fill_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
  theme_minimal() 


Run Code Online (Sandbox Code Playgroud)

r2e*_*ans 5

像这样的东西吗?

其中唯一的“常数”是:

  • tick_ends定义刻度线段的偏移量;和
  • hjust = 1.2,在每个文本标签的段和右侧之间提供一点距离。
  • (我猜想pretty(n=8),对于要添加的大致刻度数)也是一个神奇常数。)

我选择设置variable因子的级别以匹配起始图中变量的顺序(与您的代码不匹配),请随意取消设置或更改levels=

library(dplyr)
library(tidyr)
library(ggplot2)

long_data <- my_data %>%
  summarize(
    across(all_of(numeric_vars), list(mu = mean, sd = sd, min = min, max = max)),
    .by = sex
  ) %>%
  pivot_longer(-sex, names_pattern = "(.*)_(.*)", names_to = c("variable", ".value")) %>%
  mutate(
    variable = factor(variable, levels = c("age", "testosterone", "cognition", "estrogen")),
    x = as.numeric(variable)
  ) %>%
  mutate(
    min = min(min), max = max(max),
    mu_01 = scales::rescale(mu, from = c(min[1], max[1]), to = 0:1),
    sd_01 = sd / abs(min[1] - max[1]),
    .by = variable
  )
ticks <- summarize(long_data, .by = c(variable, x),
                   y = pretty(c(min, max), n = 8),
                   y_01 = scales::rescale(y, from = c(min[1], max[1]), to = 0:1))
tick_ends <- c(-0.05, 0)

ggplot(long_data, aes(x, mu_01, group = sex, color = sex)) +
  geom_segment(
    aes(x = x + tick_ends[1], xend = x + tick_ends[2], y = y_01, yend = y_01),
    data = ticks, inherit.aes = FALSE
  ) +
  geom_line(
    aes(x = x, y = y_01, group = variable),
    data = summarize(ticks, .by = variable, x = x[1] + tick_ends[2], y_01 = range(y_01)),
    inherit.aes = FALSE
  ) +
  geom_ribbon(aes(ymin = mu_01 - sd_01, ymax = mu_01 + sd_01, fill = sex), alpha = 0.2) +
  geom_line() +
  geom_text(
    aes(x = x + tick_ends[1], y = y_01, label = y),
    hjust = 1.2, data = ticks, inherit.aes = FALSE
  ) +
  scale_x_continuous(
    name = NULL,
    breaks = seq_along(levels(long_data$variable)),
    labels = levels(long_data$variable),
    limits = range(as.numeric(long_data$variable)) + c(-0.1, 0.1),
    minor_breaks = seq(1, 10, 1),
    position = "top"
  ) +
  scale_y_continuous(name = NULL, breaks = NULL) +
  scale_color_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
  scale_fill_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
  theme_minimal()
Run Code Online (Sandbox Code Playgroud)

在此输入图像描述

.by=dplyr 动词中使用“requires”dplyr_1.1.0或“稍后”;如果您有旧版本的 dplyr,请删除并在相应动词前.by=c(..)添加相应的内容。group_by(..)

根据观测值缩放所有变量的一种可能(低级?)“风险”[0,1]是,它可能隐藏或夸大变量之间推断的相对大小/重要性。不确定这里是否有简单的方法来解决这个问题。


扩大

为了使内轴对齐,我不知道是否有比以这种方式控制重新缩放更好的方法。现在使用新数据:

my_data <- data.frame( sex = c("m", "w", "m", "w", "m", "w"), age = c(25, 30, 22, 35, 28, 46), testosterone = c(450, 5, 400, 10, 500, 15), cognition = c(0.1, 0.5, 0.3, 0.6, 0.4, 0.1), estrogen = c(20, 80, 15, 60, 10, 70), Var6 = c(1000, 900, 600, 800, 700, 500), Var7 = c(3, 5, 6, 7, 3, 4), Var8 = c(20, 15, 30, 35, 25, 60), Var9 = c(0.6, 0.9, 1.4, 2.1, 1.2, 0.6), Var10 = c(200, 150, 300, 200, 130, 400), Var11 = c(900, 800, 450, 600, 650, 750), Var12 = c(30, 45, 60, 40, 30, 45) ) 
numeric_vars <- setdiff(names(my_data), "sex")
Run Code Online (Sandbox Code Playgroud)

使用该代码和稍微更新的代码:

extend_rescale <- function(x, name = "variable") {
  p <- pretty(range(x))
  d <- diff(p)[1]
  p <- c(p[1], p[length(p)]) + d * c(-1, 1)
  out <- data.frame(
    x = scales::rescale(x, from = p, to = 0:1),
    step = d, min = p[1], max = p[2]
  )
  names(out)[1] <- name
  out
}

long_data <- my_data %>%
  summarize(
    across(all_of(numeric_vars), list(mu = mean, sd = sd, min = min, max = max)),
    .by = sex
  ) %>%
  pivot_longer(-sex, names_pattern = "(.*)_(.*)", names_to = c("variable", ".value")) %>%
  mutate(
    variable = factor(variable), # set levels= if desired
    x = as.numeric(variable)
  ) %>%
  mutate(
    extend_rescale(c(range(c(mu + sd, mu - sd)), mu), name = "mu_01")[-(1:2),],
    sd_01 = sd / abs(max[1] - min[1]),
    .by = variable)
ticks <- summarize(long_data, .by = c(variable, x),
                   y = seq(min[1], max[2], by = step[1]),
                   y_01 = scales::rescale(y, from = c(min[1], max[1]), to = 0:1))
tick_ends <- c(-0.05, 0)
Run Code Online (Sandbox Code Playgroud)

(相同的情节代码。)

在此输入图像描述