Viv*_*ien 1 r ggplot2 parallel-coordinates
我想绘制一个类似于平行坐标图的图来进行描述性统计。我想绘制按性别分层的每个变量的平均值和标准差。
不幸的是,我无法找到为每个变量创建自己的 y 轴的方法。
它看起来应该与此图类似,但使用平均值和标准差而不是每一行。
在这篇文章中,他们创建了另一个图,但我无法根据我的需求和数据进行调整,因此平均值和标准差如上图所示。
这是我的数据和代码:
library(dplyr)
library(tidyr)
library(ggplot2)
my_data <- data.frame(
sex = c("m", "w", "m", "w", "m", "w"),
age = c(25, 30, 22, 35, 28, 46),
testosterone = c(450, 200, 400, 300, 500, 350),
cognition = c(75, 80, 70, 85, 78, 90),
estrogen = c(20, 40, 15, 50, 10, 45)
)
numeric_vars <- c("age", "testosterone", "cognition", "estrogen")
# Calculate means and standard deviations for each sex and each variable
df_means <- my_data %>%
group_by(sex) %>%
summarise(across(all_of(numeric_vars), mean, na.rm = TRUE))
df_sds <- my_data %>%
group_by(sex) %>%
summarise(across(all_of(numeric_vars), sd, na.rm = TRUE))
# Reshape data for ggplot
df_melted <- reshape2::melt(df_means, id.vars = 'sex')
df_sds_melted <- reshape2::melt(df_sds, id.vars = 'sex')
# Combine means and standard deviations
df_combined <- merge(df_melted, df_sds_melted, by = c('sex', 'variable'))
names(df_combined)[names(df_combined) == 'value.x'] <- 'mean'
names(df_combined)[names(df_combined) == 'value.y'] <- 'sd'
# Create a parallel coordinates plot without normalization
ggplot(df_combined, aes(x = variable, y = mean, group = sex, color = sex)) +
geom_line() +
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = sex), alpha = 0.2) +
scale_color_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
scale_fill_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
theme_minimal()
Run Code Online (Sandbox Code Playgroud)
像这样的东西吗?
其中唯一的“常数”是:
tick_ends定义刻度线段的偏移量;和hjust = 1.2,在每个文本标签的段和右侧之间提供一点距离。pretty(n=8),对于要添加的大致刻度数)也是一个神奇常数。)我选择设置variable因子的级别以匹配起始图中变量的顺序(与您的代码不匹配),请随意取消设置或更改levels=。
library(dplyr)
library(tidyr)
library(ggplot2)
long_data <- my_data %>%
summarize(
across(all_of(numeric_vars), list(mu = mean, sd = sd, min = min, max = max)),
.by = sex
) %>%
pivot_longer(-sex, names_pattern = "(.*)_(.*)", names_to = c("variable", ".value")) %>%
mutate(
variable = factor(variable, levels = c("age", "testosterone", "cognition", "estrogen")),
x = as.numeric(variable)
) %>%
mutate(
min = min(min), max = max(max),
mu_01 = scales::rescale(mu, from = c(min[1], max[1]), to = 0:1),
sd_01 = sd / abs(min[1] - max[1]),
.by = variable
)
ticks <- summarize(long_data, .by = c(variable, x),
y = pretty(c(min, max), n = 8),
y_01 = scales::rescale(y, from = c(min[1], max[1]), to = 0:1))
tick_ends <- c(-0.05, 0)
ggplot(long_data, aes(x, mu_01, group = sex, color = sex)) +
geom_segment(
aes(x = x + tick_ends[1], xend = x + tick_ends[2], y = y_01, yend = y_01),
data = ticks, inherit.aes = FALSE
) +
geom_line(
aes(x = x, y = y_01, group = variable),
data = summarize(ticks, .by = variable, x = x[1] + tick_ends[2], y_01 = range(y_01)),
inherit.aes = FALSE
) +
geom_ribbon(aes(ymin = mu_01 - sd_01, ymax = mu_01 + sd_01, fill = sex), alpha = 0.2) +
geom_line() +
geom_text(
aes(x = x + tick_ends[1], y = y_01, label = y),
hjust = 1.2, data = ticks, inherit.aes = FALSE
) +
scale_x_continuous(
name = NULL,
breaks = seq_along(levels(long_data$variable)),
labels = levels(long_data$variable),
limits = range(as.numeric(long_data$variable)) + c(-0.1, 0.1),
minor_breaks = seq(1, 10, 1),
position = "top"
) +
scale_y_continuous(name = NULL, breaks = NULL) +
scale_color_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
scale_fill_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
theme_minimal()
Run Code Online (Sandbox Code Playgroud)
.by=dplyr 动词中使用“requires”dplyr_1.1.0或“稍后”;如果您有旧版本的 dplyr,请删除并在相应动词前.by=c(..)添加相应的内容。group_by(..)
根据观测值缩放所有变量的一种可能(低级?)“风险”[0,1]是,它可能隐藏或夸大变量之间推断的相对大小/重要性。不确定这里是否有简单的方法来解决这个问题。
为了使内轴对齐,我不知道是否有比以这种方式控制重新缩放更好的方法。现在使用新数据:
my_data <- data.frame( sex = c("m", "w", "m", "w", "m", "w"), age = c(25, 30, 22, 35, 28, 46), testosterone = c(450, 5, 400, 10, 500, 15), cognition = c(0.1, 0.5, 0.3, 0.6, 0.4, 0.1), estrogen = c(20, 80, 15, 60, 10, 70), Var6 = c(1000, 900, 600, 800, 700, 500), Var7 = c(3, 5, 6, 7, 3, 4), Var8 = c(20, 15, 30, 35, 25, 60), Var9 = c(0.6, 0.9, 1.4, 2.1, 1.2, 0.6), Var10 = c(200, 150, 300, 200, 130, 400), Var11 = c(900, 800, 450, 600, 650, 750), Var12 = c(30, 45, 60, 40, 30, 45) )
numeric_vars <- setdiff(names(my_data), "sex")
Run Code Online (Sandbox Code Playgroud)
使用该代码和稍微更新的代码:
extend_rescale <- function(x, name = "variable") {
p <- pretty(range(x))
d <- diff(p)[1]
p <- c(p[1], p[length(p)]) + d * c(-1, 1)
out <- data.frame(
x = scales::rescale(x, from = p, to = 0:1),
step = d, min = p[1], max = p[2]
)
names(out)[1] <- name
out
}
long_data <- my_data %>%
summarize(
across(all_of(numeric_vars), list(mu = mean, sd = sd, min = min, max = max)),
.by = sex
) %>%
pivot_longer(-sex, names_pattern = "(.*)_(.*)", names_to = c("variable", ".value")) %>%
mutate(
variable = factor(variable), # set levels= if desired
x = as.numeric(variable)
) %>%
mutate(
extend_rescale(c(range(c(mu + sd, mu - sd)), mu), name = "mu_01")[-(1:2),],
sd_01 = sd / abs(max[1] - min[1]),
.by = variable)
ticks <- summarize(long_data, .by = c(variable, x),
y = seq(min[1], max[2], by = step[1]),
y_01 = scales::rescale(y, from = c(min[1], max[1]), to = 0:1))
tick_ends <- c(-0.05, 0)
Run Code Online (Sandbox Code Playgroud)
(相同的情节代码。)