Rust 循环性能与 python 相同

Mar*_*tin 4 python performance for-loop rust

我正在通过实现 Mandelbrot 算法来学习 Rust,结果发现一个空的 2500 万次迭代的循环(大约相当于一张 6k 图像)需要 0.5 秒。我觉得这相当慢。于是我用 Python 做了同样的测试,发现花费的时间几乎相同。难道 Python 的 for 循环真的是近乎零成本的抽象吗?这真的是我用 Intel i7 能得到的最好结果吗?

Rust:

use std::time::Instant;
/// Times an empty 5000 x 5000 nested `for` loop (25 million iterations in total)
/// and prints the elapsed wall-clock time.
fn main() {
    // Start the timer immediately before the loops.
    let before = Instant::now();

    // Both loop bodies are empty, so only the loop machinery itself is being timed.
    for i in 0..5000 {
        for j in 0..5000 {}
    }
    // `{:.2?}` prints the `Duration` using its `Debug` format with two decimal places.
    println!("Elapsed time: {:.2?}", before.elapsed());
}

>>> Elapsed time: 406.90ms
Run Code Online (Sandbox Code Playgroud)

Python:

import time

# Record the wall-clock start time (seconds, as a float).
s = time.time()

# Empty 5000 x 5000 nested loop (25 million iterations in total); `pass` does
# no work, so only the cost of iterating is measured.
for i in range(5000):
    for j in range(5000):
        pass

# Print the elapsed wall-clock time in seconds.
print(time.time()-s)
>>> 0.5715351104736328
Run Code Online (Sandbox Code Playgroud)

更新:如果我使用预先初始化好的元组而不是 range,Python 甚至比 Rust 更快 -> 0.33s

Joh*_*ica 13

如果您正在进行性能测试,请始终使用 `--release`。默认情况下,Cargo 构建时会启用调试信息并禁用优化。启用优化后,优化器会将这些循环完全消除。在 Playground 上,耗时从 975 毫秒下降到 1.25µs。

\n

让我们看一下Godbolt上的汇编,其中只有循环,没有计时器:

\n
/// Runs an empty 5000 x 5000 nested `for` loop and does nothing else.
///
/// There is no timer in this version, so the generated assembly contains only
/// the loop machinery.
pub fn main() {
    // Both loop bodies are intentionally empty: there is no observable work.
    for i in 0..5000 {
        for j in 0..5000 {}
    }
}
Run Code Online (Sandbox Code Playgroud)\n

没有优化

\n
<i32 as core::iter::range::Step>::forward_unchecked:\n        push    rax\n        mov     eax, esi\n        add     edi, eax\n        mov     dword ptr [rsp + 4], edi\n        mov     eax, dword ptr [rsp + 4]\n        mov     dword ptr [rsp], eax\n        mov     eax, dword ptr [rsp]\n        pop     rcx\n        ret\n\ncore::intrinsics::copy_nonoverlapping:\n        push    rax\n        mov     qword ptr [rsp], rsi\n        mov     rsi, rdi\n        mov     rdi, qword ptr [rsp]\n        shl     rdx, 2\n        call    memcpy@PLT\n        pop     rax\n        ret\n\ncore::cmp::impls::<impl core::cmp::PartialOrd for i32>::lt:\n        mov     eax, dword ptr [rdi]\n        cmp     eax, dword ptr [rsi]\n        setl    al\n        and     al, 1\n        movzx   eax, al\n        ret\n\ncore::mem::replace:\n        sub     rsp, 40\n        mov     qword ptr [rsp], rdi\n        mov     dword ptr [rsp + 12], esi\n        mov     byte ptr [rsp + 23], 0\n        mov     byte ptr [rsp + 23], 1\n        mov     rax, qword ptr [rip + core::ptr::read@GOTPCREL]\n        call    rax\n        mov     ecx, eax\n        mov     dword ptr [rsp + 16], ecx\n        jmp     .LBB3_1\n.LBB3_1:\n        mov     esi, dword ptr [rsp + 12]\n        mov     rdi, qword ptr [rsp]\n        mov     byte ptr [rsp + 23], 0\n        mov     rcx, qword ptr [rip + core::ptr::write@GOTPCREL]\n        call    rcx\n        jmp     .LBB3_4\n.LBB3_2:\n        test    byte ptr [rsp + 23], 1\n        jne     .LBB3_8\n        jmp     .LBB3_7\n        mov     rcx, rax\n        mov     eax, edx\n        mov     qword ptr [rsp + 24], rcx\n        mov     dword ptr [rsp + 32], eax\n        jmp     .LBB3_2\n.LBB3_4:\n        mov     eax, dword ptr [rsp + 16]\n        add     rsp, 40\n        ret\n.LBB3_5:\n        jmp     .LBB3_2\n        mov     rcx, rax\n        mov     eax, edx\n        mov     qword ptr [rsp + 24], rcx\n        mov     dword ptr [rsp + 32], eax\n        jmp     .LBB3_5\n.LBB3_7:\n        mov   
  rdi, qword ptr [rsp + 24]\n        call    _Unwind_Resume@PLT\n        ud2\n.LBB3_8:\n        jmp     .LBB3_7\n\ncore::ptr::read:\n        sub     rsp, 24\n        mov     qword ptr [rsp + 8], rdi\n        mov     eax, dword ptr [rsp + 20]\n        mov     dword ptr [rsp + 16], eax\n        jmp     .LBB4_2\n.LBB4_2:\n        mov     rdi, qword ptr [rsp + 8]\n        lea     rsi, [rsp + 16]\n        mov     edx, 1\n        call    qword ptr [rip + core::intrinsics::copy_nonoverlapping@GOTPCREL]\n        mov     eax, dword ptr [rsp + 16]\n        mov     dword ptr [rsp + 4], eax\n        mov     eax, dword ptr [rsp + 4]\n        add     rsp, 24\n        ret\n\ncore::ptr::write:\n        sub     rsp, 4\n        mov     dword ptr [rsp], esi\n        mov     eax, dword ptr [rsp]\n        mov     dword ptr [rdi], eax\n        add     rsp, 4\n        ret\n\ncore::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next:\n        push    rax\n        call    qword ptr [rip + <core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next@GOTPCREL]\n        mov     dword ptr [rsp], eax\n        mov     dword ptr [rsp + 4], edx\n        mov     edx, dword ptr [rsp + 4]\n        mov     eax, dword ptr [rsp]\n        pop     rcx\n        ret\n\ncore::clone::impls::<impl core::clone::Clone for i32>::clone:\n        mov     eax, dword ptr [rdi]\n        ret\n\n<I as core::iter::traits::collect::IntoIterator>::into_iter:\n        mov     edx, esi\n        mov     eax, edi\n        ret\n\n<core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next:\n        sub     rsp, 40\n        mov     rsi, rdi\n        mov     qword ptr [rsp + 16], rsi\n        mov     rdi, rsi\n        add     rsi, 4\n        call    core::cmp::impls::<impl core::cmp::PartialOrd for i32>::lt\n        mov     byte ptr [rsp + 31], al\n        mov     al, byte ptr [rsp + 31]\n        test    al, 1\n        jne     .LBB9_3\n        jmp  
   .LBB9_2\n.LBB9_2:\n        mov     dword ptr [rsp + 32], 0\n        jmp     .LBB9_7\n.LBB9_3:\n        mov     rdi, qword ptr [rsp + 16]\n        call    core::clone::impls::<impl core::clone::Clone for i32>::clone\n        mov     dword ptr [rsp + 12], eax\n        mov     edi, dword ptr [rsp + 12]\n        mov     esi, 1\n        call    <i32 as core::iter::range::Step>::forward_unchecked\n        mov     dword ptr [rsp + 8], eax\n        mov     esi, dword ptr [rsp + 8]\n        mov     rdi, qword ptr [rsp + 16]\n        call    qword ptr [rip + core::mem::replace@GOTPCREL]\n        mov     dword ptr [rsp + 4], eax\n        mov     eax, dword ptr [rsp + 4]\n        mov     dword ptr [rsp + 36], eax\n        mov     dword ptr [rsp + 32], 1\n.LBB9_7:\n        mov     eax, dword ptr [rsp + 32]\n        mov     edx, dword ptr [rsp + 36]\n        add     rsp, 40\n        ret\n\nexample::main:\n        sub     rsp, 72\n        mov     dword ptr [rsp + 24], 0\n        mov     dword ptr [rsp + 28], 5000\n        mov     edi, dword ptr [rsp + 24]\n        mov     esi, dword ptr [rsp + 28]\n        call    qword ptr [rip + <I as core::iter::traits::collect::IntoIterator>::into_iter@GOTPCREL]\n        mov     dword ptr [rsp + 16], eax\n        mov     dword ptr [rsp + 20], edx\n        mov     eax, dword ptr [rsp + 20]\n        mov     ecx, dword ptr [rsp + 16]\n        mov     dword ptr [rsp + 32], ecx\n        mov     dword ptr [rsp + 36], eax\n.LBB10_2:\n        mov     rax, qword ptr [rip + core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next@GOTPCREL]\n        lea     rdi, [rsp + 32]\n        call    rax\n        mov     dword ptr [rsp + 44], edx\n        mov     dword ptr [rsp + 40], eax\n        mov     eax, dword ptr [rsp + 40]\n        test    rax, rax\n        je      .LBB10_5\n        jmp     .LBB10_13\n.LBB10_13:\n        jmp     .LBB10_6\n        ud2\n.LBB10_5:\n        add     rsp, 72\n        
ret\n.LBB10_6:\n        mov     dword ptr [rsp + 48], 0\n        mov     dword ptr [rsp + 52], 5000\n        mov     edi, dword ptr [rsp + 48]\n        mov     esi, dword ptr [rsp + 52]\n        call    qword ptr [rip + <I as core::iter::traits::collect::IntoIterator>::into_iter@GOTPCREL]\n        mov     dword ptr [rsp + 8], eax\n        mov     dword ptr [rsp + 12], edx\n        mov     eax, dword ptr [rsp + 12]\n        mov     ecx, dword ptr [rsp + 8]\n        mov     dword ptr [rsp + 56], ecx\n        mov     dword ptr [rsp + 60], eax\n.LBB10_8:\n        mov     rax, qword ptr [rip + core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next@GOTPCREL]\n        lea     rdi, [rsp + 56]\n        call    rax\n        mov     dword ptr [rsp + 68], edx\n        mov     dword ptr [rsp + 64], eax\n        mov     eax, dword ptr [rsp + 64]\n        test    rax, rax\n        je      .LBB10_11\n        jmp     .LBB10_14\n.LBB10_14:\n        jmp     .LBB10_12\n        ud2\n.LBB10_11:\n        jmp     .LBB10_2\n.LBB10_12:\n        jmp     .LBB10_8\n\n__rustc_debug_gdb_scripts_section__:\n        .asciz  "\\001gdb_load_rust_pretty_printers.py"\n\nDW.ref.rust_eh_personality:\n        .quad   rust_eh_personality\n
Run Code Online (Sandbox Code Playgroud)\n

有优化

\n
example::main:\n        ret\n
Run Code Online (Sandbox Code Playgroud)\n

  • 虽然就 Rust 而言这是一个很好的答案,但我认为更完整的答案还可以提及 CPython 中 `for` 循环的实现方式:与 while 循环不同,像这样的裸 for 循环,其所有迭代和边界检查都是在 C 代码中完成的,而不是直接通过 Python 字节码执行 (2认同)