JS4*_*137 0 benchmarking vector rust
为了满足高性能数学库的需求,我一直在对各种方法进行基准测试,以在 Rust vec 上进行就地操作(最好通过引用)。这些方法是:
map(),收集到一个新的 vec 并用它覆盖现有的向量;以及 for_each()。这是基准测试代码:
use std::hint::black_box;
use std::time::{Duration, Instant};
// Number of elements in each benchmark vector.
const N_ITEMS: usize = 100000;
// How many times each benchmark function is invoked per measurement.
const N_BENCH_TIMES: i32 = 100;
fn bench_simple() {
let mut a: Vec<i32> = vec![5; N_ITEMS];
for i in a.iter_mut() {
*i += 1;
}
}
fn bench_iterator() {
let mut a: Vec<i32> = vec![5; N_ITEMS];
let a: Vec<i32> = a.iter_mut().map(|x| *x + 1).collect();
}
fn bench_foreach() {
let mut a: Vec<i32> = vec![5; N_ITEMS];
a.iter_mut().for_each(|x| *x += 1);
}
/// Run `f` N_BENCH_TIMES times, timing each call with `Instant`, and
/// print the total and average wall-clock duration under `name`.
fn benchmark(name: &str, f: &dyn Fn()) {
    println!("Benchmark: {}", name);
    // Sum the per-call elapsed times; `Duration` implements `Sum`.
    let total: Duration = (0..N_BENCH_TIMES)
        .map(|_| {
            let start = Instant::now();
            f();
            start.elapsed()
        })
        .sum();
    println!("Total: {:?} over {} instances", total, N_BENCH_TIMES);
    println!("Average: {:?}", total.div_f64(N_BENCH_TIMES as f64));
}
/// Entry point: run each benchmark variant in turn.
fn main() {
    // Name/function pairs, executed in declaration order.
    let cases: [(&str, &dyn Fn()); 3] = [
        ("Simple", &bench_simple),
        ("Iterator", &bench_iterator),
        ("ForEach", &bench_foreach),
    ];
    for (name, f) in cases {
        benchmark(name, f);
    }
}
Run Code Online (Sandbox Code Playgroud)
这些是结果:
Benchmark: Simple
Total: 402.619146ms over 100 instances
Average: 4.026191ms
Benchmark: Iterator
Total: 587.769728ms over 100 instances
Average: 5.877697ms
Benchmark: ForEach
Total: 409.527297ms over 100 instances
Average: 4.095273ms
Run Code Online (Sandbox Code Playgroud)
Rust 文档说:
“在某些情况下
for_each也可能比循环更快,因为它将在适配器上使用内部迭代,例如Chain”
所以我期望for_each()比 for 循环更快,并且都比 快得多map(),这需要创建一个新向量并使用它来覆盖现有向量。但在基准测试中显然并非如此?我对各个迭代器如何工作的理解不正确吗?
我非常有信心您没有使用 cargo run --release 进行构建,从而使您的整个基准测试完全毫无意义。
这是我的一台相当旧的笔记本电脑上的基准测试的输出,没有 --release:
Benchmark: Simple
Total: 545.9198ms over 100 instances
Average: 5.459198ms
Benchmark: Iterator
Total: 461.5787ms over 100 instances
Average: 4.615787ms
Benchmark: ForEach
Total: 334.7625ms over 100 instances
Average: 3.347625ms
Run Code Online (Sandbox Code Playgroud)
这是 --release:
Benchmark: Simple
Total: 10.6028ms over 100 instances
Average: 106.028µs
Benchmark: Iterator
Total: 176.4µs over 100 instances
Average: 1.764µs
Benchmark: ForEach
Total: 5.8382ms over 100 instances
Average: 58.382µs
Run Code Online (Sandbox Code Playgroud)
那么这是怎么回事?
\n由此产生的程序集揭示了一些情况:
example::bench_iterator:
        mov rax, qword ptr [rip + __rust_no_alloc_shim_is_unstable@GOTPCREL]
        movzx ecx, byte ptr [rax]
        mov ecx, 99968
        test rcx, rcx
        je .LBB1_3
.LBB1_2:
        add rcx, -32
        test rcx, rcx
        jne .LBB1_2
.LBB1_3:
        movzx eax, byte ptr [rax]
        ret
Run Code Online (Sandbox Code Playgroud)
编译器几乎完全优化掉了 Iterator 基准测试。
这就是black_box存在的原因。它告诉编译器不要优化某些东西。
这是您的代码,black_box在适当的位置插入了适当的 es:
Benchmark: Simple\nTotal: 545.9198ms over 100 instances\nAverage: 5.459198ms\nBenchmark: Iterator\nTotal: 461.5787ms over 100 instances\nAverage: 4.615787ms\nBenchmark: ForEach\nTotal: 334.7625ms over 100 instances\nAverage: 3.347625ms\nRun Code Online (Sandbox Code Playgroud)\n那么现在的输出是什么?以下是我在笔记本电脑上的几次运行:
\nBenchmark: Simple\nTotal: 13.9338ms over 100 instances\nAverage: 139.338\xc2\xb5s\nBenchmark: Iterator\nTotal: 4.9353ms over 100 instances\nAverage: 49.353\xc2\xb5s\nBenchmark: ForEach\nTotal: 7.554ms over 100 instances\nAverage: 75.54\xc2\xb5s\nRun Code Online (Sandbox Code Playgroud)\nBenchmark: Simple\nTotal: 6.1412ms over 100 instances\nAverage: 61.412\xc2\xb5s\nBenchmark: Iterator\nTotal: 12.5715ms over 100 instances\nAverage: 125.715\xc2\xb5s\nBenchmark: ForEach\nTotal: 14.4137ms over 100 instances\nAverage: 144.137\xc2\xb5s\nRun Code Online (Sandbox Code Playgroud)\n正如您所看到的,差异是巨大的。
\n为了缓解这种情况,我们需要使用适当的基准包criterion,例如 ,它可以执行诸如预热和动态重复计数之类的操作。
这是您重写的代码criterion:
请注意,这段代码不放在 main.rs 中,而是放在 benches/ 目录下的源文件中。有关更多信息,请阅读 criterion 用户指南。
Benchmark: Simple\nTotal: 10.6028ms over 100 instances\nAverage: 106.028\xc2\xb5s\nBenchmark: Iterator\nTotal: 176.4\xc2\xb5s over 100 instances\nAverage: 1.764\xc2\xb5s\nBenchmark: ForEach\nTotal: 5.8382ms over 100 instances\nAverage: 58.382\xc2\xb5s\nRun Code Online (Sandbox Code Playgroud)\nexample::bench_iterator:\n mov rax, qword ptr [rip + __rust_no_alloc_shim_is_unstable@GOTPCREL]\n movzx ecx, byte ptr [rax]\n mov ecx, 99968\n test rcx, rcx\n je .LBB1_3\n.LBB1_2:\n add rcx, -32\n test rcx, rcx\n jne .LBB1_2\n.LBB1_3:\n movzx eax, byte ptr [rax]\n ret\nRun Code Online (Sandbox Code Playgroud)\n请注意,Simple和Foreach几乎相同,但Iterator似乎速度较慢。为什么是这样?
原因是您的 Iterator 版本的实现使用了 iter_mut,这使原始向量保持存活状态。然而,这在任何方面都不是必需的,因此我们可以改用 into_iter,如下所示:
use std::hint::black_box;\nuse std::time::{Duration, Instant};\n\nconst N_ITEMS: usize = 100000;\nconst N_BENCH_TIMES: i32 = 100;\n\npub fn bench_simple() {\n let mut a: Vec<i32> = black_box(vec![5; N_ITEMS]);\n for i in a.iter_mut() {\n *i += 1;\n }\n black_box(a);\n}\n\npub fn bench_iterator() {\n let mut a: Vec<i32> = black_box(vec![5; N_ITEMS]);\n let a: Vec<i32> = a.iter_mut().map(|x| *x + 1).collect();\n black_box(a);\n}\n\npub fn bench_foreach() {\n let mut a: Vec<i32> = black_box(vec![5; N_ITEMS]);\n a.iter_mut().for_each(|x| *x += 1);\n black_box(a);\n}\n\nfn benchmark(name: &str, f: &dyn Fn()) {\n println!("Benchmark: {}", name);\n let mut time = Duration::new(0, 0);\n for _ in 0..N_BENCH_TIMES {\n let start_time = Instant::now();\n f();\n time += start_time.elapsed();\n }\n println!("Total: {:?} over {} instances", time, N_BENCH_TIMES);\n println!("Average: {:?}", time.div_f64(N_BENCH_TIMES as f64));\n}\n\nfn main() {\n benchmark("Simple", &bench_simple);\n benchmark("Iterator", &bench_iterator);\n benchmark("ForEach", &bench_foreach);\n}\nRun Code Online (Sandbox Code Playgroud)\n使用新版本重新运行基准测试会产生:
\nBenchmark: Simple\nTotal: 13.9338ms over 100 instances\nAverage: 139.338\xc2\xb5s\nBenchmark: Iterator\nTotal: 4.9353ms over 100 instances\nAverage: 49.353\xc2\xb5s\nBenchmark: ForEach\nTotal: 7.554ms over 100 instances\nAverage: 75.54\xc2\xb5s\nRun Code Online (Sandbox Code Playgroud)\nBenchmark: Simple\nTotal: 6.1412ms over 100 instances\nAverage: 61.412\xc2\xb5s\nBenchmark: Iterator\nTotal: 12.5715ms over 100 instances\nAverage: 125.715\xc2\xb5s\nBenchmark: ForEach\nTotal: 14.4137ms over 100 instances\nAverage: 144.137\xc2\xb5s\nRun Code Online (Sandbox Code Playgroud)\n现在您可以看到这三个都是相同的。这是预期的,因为 Rust 迭代器(如果使用得当)是零成本抽象。
\n是的,即使是.collect()基于的向量也是相同的 - Rust 编译器可以发现旧向量被破坏,并分配了一个相同大小的新向量,并相应地进行优化。
生成的程序集显示了这一点Simple,并Foreach编译为几乎完全相同的代码。事实上,它Iterator确实编译成不同的代码;但它的表现似乎仍然相同。
这是因为虽然该Iterator版本有更多的设置/拆卸代码,但实际的主力代码仍然相同:
# x86-64 with SSE2\n xor edx, edx # EDX = 0\n pcmpeqd xmm0, xmm0 # set1_epi32(-1), cheaper than loading a vector of ones\n.LBB1_9: # do{\n movdqu xmm1, xmmword ptr [rbx + 4*rdx] # load src[rdx+0..3] with a scaled-index addressing mode\n movdqu xmm2, xmmword ptr [rbx + 4*rdx + 16] # load dst[rdx+4..7]\n psubd xmm1, xmm0 # packed subtract of dword (32-bit) elements\n psubd xmm2, xmm0\n movdqu xmmword ptr [rax + 4*rdx], xmm1 # store to a separate destination (RAX)\n movdqu xmmword ptr [rax + 4*rdx + 16], xmm2\n add rdx, 8\n cmp rcx, rdx\n jne .LBB1_9 # }while(end != idx)\nRun Code Online (Sandbox Code Playgroud)\n补充说明:
\n如果您的代码确实对性能至关重要,请考虑添加target-cpu=native到您的编译标志。这使得您的代码不再可移植,但会利用特定 CPU 上的附加 CPU 功能。
启用它后,我得到:
\nuse criterion::{criterion_group, criterion_main, Criterion};\nuse std::hint::black_box;\n\nconst N_ITEMS: usize = 100000;\n\npub fn run_simple(mut a: Vec<i32>) -> Vec<i32> {\n for i in a.iter_mut() {\n *i += 1;\n }\n a\n}\n\npub fn run_iterator(mut a: Vec<i32>) -> Vec<i32> {\n a.iter_mut().map(|x| *x + 1).collect()\n}\n\npub fn run_foreach(mut a: Vec<i32>) -> Vec<i32> {\n a.iter_mut().for_each(|x| *x += 1);\n a\n}\n\nfn do_bench(c: &mut Criterion, name: &str, f: impl Fn(Vec<i32>) -> Vec<i32>) {\n let mut a = Some(vec![5; N_ITEMS]);\n c.bench_function(name, |b| {\n b.iter(|| {\n let data = black_box(a.take().unwrap());\n let data = f(data);\n a = Some(black_box(data));\n })\n });\n}\n\nfn bench_simple(c: &mut Criterion) {\n do_bench(c, "Simple", run_simple);\n}\nfn bench_iterator(c: &mut Criterion) {\n do_bench(c, "Iterator", run_iterator);\n}\nfn bench_foreach(c: &mut Criterion) {\n do_bench(c, "Foreach", run_foreach);\n}\n\ncriterion_group!(benches, bench_simple, bench_iterator, bench_foreach);\ncriterion_main!(benches);\nRun Code Online (Sandbox Code Playgroud)\n虽然速度并没有快很多,但确实有所不同。
\n如果数据适合 L2 或尤其是 L1d 缓存,则在具有 256 位向量指令(用于 256 位整数的 AVX2)可用的典型 CPU 上,它的运行速度大约是 2 倍,并且成本与 128 位向量大致相同,但是每条指令完成的工作量是原来的两倍。Haswell 以及后来的 Intel、AMD 的 Zen 2。但 Alder Lake E 核心则不然。
\n| 归档时间: |
|
| 查看次数: |
92 次 |
| 最近记录: |