加速模拟

Question

加速模拟

sta*_*oob 9 algorithm performance r random-walk

我有以下情况：

如果该马尔可夫链从位置 = 5 开始，则变量在 prob=0.5 时移动 +1，在 prob=-0.5 时移动 -1

情况 1：变量首次达到位置 = 0 的预期时间是多少？
情况 2：变量在至少一次达到position=10 后首次达到position=0 的预期时间是多少？

我尝试按如下方式执行此操作（我创建了一个跟踪变量来查看模拟卡在哪里）：

# the simulations can take a long time to run, I interrupted them

library(ggplot2)
library(gridExtra)

n_sims <- 100
times_to_end_0 <- numeric(n_sims)
times_to_end_0_after_10 <- numeric(n_sims)
paths_0 <- vector("list", n_sims)
paths_0_after_10 <- vector("list", n_sims)

for (i in 1:n_sims) {
    print(paste("Running simulation", i, "for situation 1..."))
    y <- 5
    time <- 0
    path_0 <- c(y)

    while(y > 0) {
        step <- sample(c(-1, 1), 1, prob = c(0.5, 0.5))
        y <- y + step
        path_0 <- c(path_0, y)
        time <- time + 1

        if (y == 0) {
            times_to_end_0[i] <- time
            paths_0[[i]] <- data.frame(time = 1:length(path_0), y = path_0, sim = i)
            break
        }
    }

    print(paste("Running simulation", i, "for situation 2..."))
    y <- 5
    time <- 0
    reached_10 <- FALSE
    path_0_after_10 <- c(y)

    while(y > 0 || !reached_10) {
        step <- sample(c(-1, 1), 1, prob = c(0.5, 0.5))
        y <- y + step
        path_0_after_10 <- c(path_0_after_10, y)
        time <- time + 1

        if (y == 10) {
            reached_10 <- TRUE
        }

        if (y == 0 && reached_10) {
            times_to_end_0_after_10[i] <- time
            paths_0_after_10[[i]] <- data.frame(time = 1:length(path_0_after_10), y = path_0_after_10, sim = i)
            break
        }
    }
}

df1 <- data.frame(time = times_to_end_0)
df2 <- data.frame(time = times_to_end_0_after_10[times_to_end_0_after_10 > 0])

mean1 <- mean(log(df1$time))
mean2 <- mean(log(df2$time))

p1 <- ggplot(df1, aes(x = log(time))) +
    geom_density() +
    geom_vline(aes(xintercept = exp(mean1)), color = "red", linetype = "dotted") +
    labs(title = paste("Density of Times to Reach 0 - Mean Time:", round(exp(mean1), 2)), x = "Time", y = "Density") + theme_bw()

p2 <- ggplot(df2, aes(x = log(time))) +
    geom_density() +
    geom_vline(aes(xintercept = exp(mean2)), color = "red", linetype = "dotted") +
    labs(title = paste("Density of Times to Reach 0 After Reaching 10 - Mean Time:", round(exp(mean2), 2)), x = "Time", y = "Density") + theme_bw() 

plot_df_0 <- do.call(rbind, paths_0)

p3 <- ggplot(plot_df_0, aes(x = log(time), y = y, group = sim)) +
    geom_line() +
    labs(title = "Paths of First Simulation", x = "Time", y = "Y") +
    theme_bw()

plot_df_0_after_10 <- do.call(rbind, paths_0_after_10)

p4 <- ggplot(plot_df_0_after_10, aes(x = log(time), y = y, group = sim)) +
    geom_line() +
    labs(title = "Paths of Second Simulation", x = "Time", y = "Y") +
    theme_bw()

grid.arrange(p1, p2, p3, p4, ncol = 2)

Run Code Online (Sandbox Code Playgroud)

我的问题：我可以做些什么来提高模拟的效率吗？

谢谢！

Answer 1

jbl*_*d94 10

众所周知，在对称随机游走中从到移动的预期时间是无限的。我们仍然可以模拟右删失步行时间。kk + 1

下面是更直接的模拟方法。考虑一下从 5 步到 6 步。它总是需要奇数步。它采取步骤的概率是。一旦达到 6，从 6 到 7 的步数与从 5 到 6 所需的步数无关。2*n - 1abs(choose(0.5, n))

为了加快模拟速度，首先预先计算移动1个空间的步数的CDF。如果我们想模拟移动空间总共需要多少步n，我们可以对n预先计算的 CDF 中的随机样本求和。（该分布是非常重尾的，因此我们将让它运行到最大步数，达到该步数后模拟将提前终止并返回Inf。）

将模拟实现为函数：

max.steps <- 1e8 + 1
p <- c(0, cumsum(abs(choose(0.5, 1:((max.steps + 1)/2)))), 1)

f <- function(n, path) {
  m <- matrix(2*findInterval(runif(sum(abs(diff(path)))*n), p) - 1, n)
  m[m > max.steps] <- Inf
  rowSums(m)
}

Run Code Online (Sandbox Code Playgroud)

模拟情况1：

# moving from 5 to 0
path <- c(5, 0)
f(10, path)
#>  [1]   21  189   25   15   19   31  137   25   43 1199
# probability of stopping the simulation early (steps = Inf)
1 - p[length(p) - 1]^sum(abs(diff(path)))
#> [1] 0.0003988786

Run Code Online (Sandbox Code Playgroud)

模拟情况2：

path <- c(5, 10, 0)
f(10, path)
#>  [1]   34459      33     545     115     237     109     143   14287 1251545
#> [10]     613
# probability of stopping the simulation early (steps = Inf)
1 - p[length(p) - 1]^sum(abs(diff(path)))
#> [1] 0.001196159

Run Code Online (Sandbox Code Playgroud)

预计算需要一些时间，但只需要计算一次。一旦完成，模拟运行得非常快：

# pre-calculation time
system.time(p <- c(0, cumsum(abs(choose(0.5, 1:((max.steps + 1)/2)))), 1))
#>    user  system elapsed 
#>   18.02    0.47   18.50
# time 1M simulation runs
system.time(f(1e6, c(5, 10, 0)))
#>    user  system elapsed 
#>    0.83    0.02    0.86

Run Code Online (Sandbox Code Playgroud)

Answer 2

Til*_*ill 9

调用sample()for/while 循环的每个循环会产生大量开销\n并导致模拟运行缓慢。

\n

下面的解决方案sample()每次模拟运行一次，大小为1e6(1,000,000)。\n你也可以做得更大，1e7在我的机器上看起来相当快，并且所有周期\n都0在1e7. 事情1e8变得很慢，\nat1e9对于大多数机器来说可能太多了。您设置的样本大小越小，0您获得的运行次数就越多。

\n

功能

\n

该函数依赖于矢量化来加快速度。

\n

sim <- function(y = 5,\n                size = 1e6) {\n  steps <- sample(c(-1, 1),\n                  size,\n                  prob = c(0.5, 0.5),\n                  replace = TRUE)\n  y_steps <- c(y, steps)\n  y_steps_cum <- cumsum(y_steps)\n  times <- which(y_steps_cum == 0)\n  times_to_0 <- times[1]\n  path <- if (!is.na(times_to_0))\n    y_steps_cum[1:times_to_0]\n  else\n    NA\n  touched_10 <- which(y_steps_cum == 10)[1]\n  times_to_0_after_10 <-\n    if (!is.na(touched_10))\n      times[times > touched_10][1]\n  else\n    NA\n  path_to_0_after_10 <-\n    if (!is.na(times_to_0_after_10))\n      y_steps_cum[1:times_to_0_after_10]\n  else\n    NA\n  \n  list(\n    times_to_0 = times_to_0,\n    path = list(path),\n    touched_10 = touched_10,\n    times_to_0_after_10 = times_to_0_after_10,\n    path_to_0_after_10 = list(path_to_0_after_10)\n  )\n}\n

Run Code Online (Sandbox Code Playgroud)\n

单一模拟：

\n

sim()\n#> $times_to_0\n#> [1] 314\n#> \n#> $path\n#> $path[[1]]\n#>   [1]  5  4  3  2  3  4  3  4  3  2  3  4  5  6  7  8  9  8  9 10  9 10  9  8  7\n#>  [26]  6  7  6  5  4  5  6  7  8  9 10  9 10 11 12 11 12 11 12 13 12 11 12 13 12\n#>  [51] 13 12 11 12 11 10 11 10 11 10 11 12 13 12 13 14 13 12 11 10  9  8  7  6  7\n#>  [76]  6  7  8  9  8  7  8  9  8  7  8  7  6  7  8  9  8  9  8  7  6  5  6  5  4\n#> [101]  3  4  5  6  5  6  5  6  7  8  7  8  7  6  7  6  7  8  9  8  7  8  9  8  9\n#> [126]  8  7  6  5  6  7  8  9  8  7  8  7  8  9 10  9 10  9  8  9  8  9  8  9 10\n#> [151] 11 12 11 12 11 10 11 10 11 12 11 12 13 12 13 14 15 16 15 14 15 14 13 12 13\n#> [176] 14 15 14 15 16 17 18 19 18 19 20 19 18 17 16 15 14 13 12 11 10 11 10  9 10\n#> [201]  9 10 11 12 11 10 11 10  9 10 11 10 11 12 13 12 13 14 15 14 15 16 17 16 15\n#> [226] 14 15 14 13 12 11 10  9  8  9  8  7  8  9 10 11 10 11 12 11 10 11 10  9 10\n#> [251]  9 10 11 12 11 12 11 10  9 10 11 12 11 12 13 12 11 12 13 12 13 14 13 12 13\n#> [276] 12 11 12 11 10  9  8  7  6  7  8  7  6  7  6  5  6  5  4  5  4  3  2  3  4\n#> [301]  5  4  5  4  5  4  5  4  3  2  1  2  1  0\n#> \n#> \n#> $touched_10\n#> [1] 20\n#> \n#> $times_to_0_after_10\n#> [1] 314\n#> \n#> $path_to_0_after_10\n#> $path_to_0_after_10[[1]]\n#>   [1]  5  4  3  2  3  4  3  4  3  2  3  4  5  6  7  8  9  8  9 10  9 10  9  8  7\n#>  [26]  6  7  6  5  4  5  6  7  8  9 10  9 10 11 12 11 12 11 12 13 12 11 12 13 12\n#>  [51] 13 12 11 12 11 10 11 10 11 10 11 12 13 12 13 14 13 12 11 10  9  8  7  6  7\n#>  [76]  6  7  8  9  8  7  8  9  8  7  8  7  6  7  8  9  8  9  8  7  6  5  6  5  4\n#> [101]  3  4  5  6  5  6  5  6  7  8  7  8  7  6  7  6  7  8  9  8  7  8  9  8  9\n#> [126]  8  7  6  5  6  7  8  9  8  7  8  7  8  9 10  9 10  9  8  9  8  9  8  9 10\n#> [151] 11 12 11 12 11 10 11 10 11 12 11 12 13 12 13 14 15 16 15 14 15 14 13 12 13\n#> [176] 14 15 14 15 16 17 18 19 18 19 20 19 18 17 16 15 14 13 12 11 10 11 10  9 10\n#> [201]  9 10 11 12 11 10 11 10  9 10 11 10 11 12 13 12 13 14 15 14 15 16 17 16 15\n#> [226] 14 15 14 13 12 11 10  9  8  9  8  7  8  9 10 11 10 11 12 11 10 11 10  9 10\n#> [251]  9 10 11 12 11 12 11 10  9 10 11 12 11 12 13 12 11 12 13 12 13 14 13 12 13\n#> [276] 12 11 12 11 10  9  8  7  6  7  8  7  6  7  6  5  6  5  4  5  4  3  2  3  4\n#> [301]  5  4  5  4  5  4  5  4  3  2  1  2  1  0\n

Run Code Online (Sandbox Code Playgroud)\n

100 次模拟：

\n

library(tidyverse)\n\ntictoc::tic() # tic() and toc() are for benchmarking\n\nres <-\n  lapply(rep(5, 100),\n         sim) |>\n  bind_rows(.id = "sim")\n\ntictoc::toc()\n#> 2.596 sec elapsed\n\nres\n#> # A tibble: 100 \xc3\x97 6\n#>    sim   times_to_0 path       touched_10 times_to_0_after_10 path_to_0_after_10\n#>    <chr>      <int> <list>          <int>               <int> <list>            \n#>  1 1           3202 <dbl>              26                3202 <dbl [3,202]>     \n#>  2 2           1690 <dbl>              16                1690 <dbl [1,690]>     \n#>  3 3             32 <dbl [32]>      11586               12012 <dbl [12,012]>    \n#>  4 4            386 <dbl>              54                 386 <dbl [386]>       \n#>  5 5            280 <dbl>              20                 280 <dbl [280]>       \n#>  6 6             20 <dbl [20]>        272                2890 <dbl [2,890]>     \n#>  7 7           3520 <dbl>              20                3520 <dbl [3,520]>     \n#>  8 8            160 <dbl>               8                 160 <dbl [160]>       \n#>  9 9         141814 <dbl>               8              141814 <dbl [141,814]>   \n#> 10 10          1474 <dbl>              10                1474 <dbl [1,474]>     \n#> # \xe2\x84\xb9 90 more rows\n

Run Code Online (Sandbox Code Playgroud)\n

绘制结果

\n

library(ggplot2)\nlibrary(gridExtra)\n#> \n#> Attaching package: \'gridExtra\'\n#> The following object is masked from \'package:dplyr\':\n#> \n#>     combine\n\nmean1 <- mean(log(res$times_to_0), na.rm = TRUE)\nmean2 <- mean(log(res$times_to_0_after_10), na.rm = TRUE)\n\np1 <-\n  ggplot(res, aes(x = log(times_to_0))) +\n  geom_density() +\n  geom_vline(aes(xintercept = exp(mean1)),\n             color = "red",\n             linetype = "dotted") +\n  labs(\n    title = paste("Density of Times to Reach 0 - Mean Time:", round(exp(mean1), 2)),\n    x = "Time",\n    y = "Density"\n  ) + theme_bw()\n\np2 <- \n  ggplot(res, aes(x = log(times_to_0_after_10))) +\n  geom_density() +\n  geom_vline(aes(xintercept = exp(mean2)),\n             color = "red",\n             linetype = "dotted") +\n  labs(\n    title = paste(\n      "Density of Times to Reach 0 After Reaching 10 - Mean Time:",\n      round(exp(mean2), 2)\n    ),\n    x = "Time",\n    y = "Density"\n  ) + theme_bw()\n\np3 <-\n  res |>\n  select(sim, path) |>\n  unnest(path) |>\n  mutate(time = 1:n(), .by = sim) |>\n  ggplot(aes(\n    x = log(time),\n    y = path,\n    group = sim\n  )) +\n  geom_line(linewidth = .05) +\n  labs(title = "Paths of First Simulation", x = "Time", y = "Y") +\n  theme_bw()\n\np4 <-\n  res |>\n  select(sim, path_to_0_after_10) |>\n  unnest(path_to_0_after_10) |>\n  mutate(time = 1:n(), .by = sim) |>\n  ggplot(aes(\n    x = log(time),\n    y = path_to_0_after_10,\n    group = sim\n  )) +\n  geom_line(linewidth = .05) +\n  labs(title = "Paths of Second Simulation", x = "Time", y = "Y") +\n  theme_bw()\n\ngrid.arrange(p1, p2, p3, p4, ncol = 2)\n#> Warning: Removed 1 rows containing non-finite values (`stat_density()`).\n#> Warning: Removed 3 rows containing non-finite values (`stat_density()`).\n#> Warning: Removed 1 row containing missing values (`geom_line()`).\n#> Warning: Removed 3 rows containing missing values (`geom_line()`).\n

Run Code Online (Sandbox Code Playgroud)\n

\n

Answer 3

Tho*_*ing 5

免责声明：这里不是模拟工作，而是我对情况 1 的0一些想法，其中我试图获得从到达的时间平均值的解析解5。

假设你有u时间+1（其中u应该是任何非负整数，并且你需要有u+5时间-1可以0从到达5.

从这个意义上说，你有总u + u + 5 = 2u+5步骤，因此概率是

P(u) = (choose(2*u+4,u)-choose(2*u+4,u-1))*0.5^(2*u+5)

Run Code Online (Sandbox Code Playgroud)

其中是在最后一步之前choose(2*u+4,u)-choose(2*u+4,u-1)未命中的路径数。为了概括，可以将其表述为好像应该有多次，例如，从起始值到第一次到达。0choose(2*u+K-1,u)-choose(2*u+K-1,u-1)u+1u+K-1K0

由于u应该是任何非负整数，因此到达时间的期望5应该按如下方式计算

  sum_{u=0}^{Inf} (2*u+5)*P(u) 
= sum_{u=0}^{Inf} (2*u+5)*(choose(2*u+4,u)-choose(2*u+4,u-1))*0.5^(2*u+5)

Run Code Online (Sandbox Code Playgroud)

然而，不幸的是，上面显示的系列总和有所不同。

下面只是一个有关分布的说明示例

u <- 0:500
N <- 2 * u + 5
P <- (choose(N-1, u) - choose(N-1, u + 5)) * 0.5^(N)
plot(N, P)

Run Code Online (Sandbox Code Playgroud)

归档时间：	2 年，4 月前
查看次数：	326 次
最近记录：	2 年，4 月前