zha*_*tar 3 c performance x86 clang
代码行
\nnext += val;\nRun Code Online (Sandbox Code Playgroud)\n性能下降到 10 倍,我检查了 ASM 代码,而不是结果。
\n为什么这行代码会使性能下降 10 倍?
\n结果如下:
\n\xe2\x9e\x9c ~ clang-13 1.c -O3\n\xe2\x9e\x9c ~ ./a.out\nrand_read_1\nsum = 2624b18779c40, time = 0.19s\nrand_read_2\nsum = 2624b18779c40, time = 1.24s\nRun Code Online (Sandbox Code Playgroud)\nCPU:Intel(R) Xeon(R) Silver 4210 CPU @ 2.20GHz
\n这是代码:
\n#include <stdio.h>\n#include <time.h>\n#include <stdint.h>\n#include <unistd.h>\n#include <string.h>\n#include <assert.h>\n#include <stdlib.h>\n\n#define CCR_MULTIPLY_64 6364136223846793005\n#define CCR_ADD_64 1\nstatic inline uint64_t my_rand64(uint64_t *r)\n{\n *r = *r * CCR_MULTIPLY_64 + CCR_ADD_64;\n return *r;\n}\n\n#define NUM 10000000UL\n\nuint64_t rand_read_1(uint64_t *ptr, uint64_t nr_words)\n{\n uint64_t i, next, val = 0;\n uint64_t sum;\n\n next = 0;\n sum = 0;\n for (i = 0; i < NUM; i++) {\n my_rand64(&next);\n next %= nr_words;\n val = ptr[next];\n sum += val ^ next;\n // printf("next1:%ld\\n", next);\n }\n\n return sum;\n}\n\nuint64_t rand_read_2(uint64_t *ptr, uint64_t nr_words)\n{\n uint64_t i, next, val ,next2 = 0;\n uint64_t sum;\n\n next = 0;\n sum = 0;\n for (i = 0; i < NUM; i++) {\n my_rand64(&next);\n next %= nr_words;\n val = ptr[next];\n sum += val ^ next;\n next += val;\n }\n\n return sum;\n}\n\n#define SIZE (1024*1024*1024)\n\nstatic uint64_t get_ns(void)\n{\n struct timespec val;\n uint64_t v;\n int ret;\n\n ret = clock_gettime(CLOCK_REALTIME, &val);\n if (ret != 0) {\n perror("clock_gettime");\n exit(1);\n }\n v = (uint64_t) val.tv_sec * 1000000000LL;\n v += (uint64_t) val.tv_nsec;\n return v;\n}\n\nint main(int argc, char *argv[])\n{\n uint64_t *ptr;\n uint64_t sum;\n uint64_t t0, t1, td, t2;\n\n ptr = (uint64_t *)malloc(SIZE);\n assert(ptr);\n\n memset(ptr, 0, SIZE);\n\n t0 = get_ns();\n printf("rand_read_1\\n");\n sum = rand_read_1(ptr, SIZE/8);\n t1 = get_ns();\n td = t1 - t0;\n printf("sum = %lx, time = %.2fs\\n", sum, td/1E9);\n printf("rand_read_2\\n");\n sum = rand_read_2(ptr, SIZE/8);\n t2 = get_ns();\n td = t2 - t1;\n printf("sum = %lx, time = %.2fs\\n", sum, td/1E9);\n return 0;\n}\nRun Code Online (Sandbox Code Playgroud)\n
基准测试的方法有点狡猾,但这是真实的效果。
next += val;改变了代码结构的一些基本内容:它使每次内存读取都依赖于前一次读取的结果。如果没有该行,读取是独立的(通过 存在较短的循环携带依赖链my_rand64,内存读取不属于该链)。
本质上,有了这条线,它就是一个延迟基准,如果没有这条线,它就是一个吞吐量基准。对于内存读取来说,延迟和吞吐量相差 10 倍是合理的。
在汇编级别,如果没有该行,使用 Clang 编译时 asm 看起来像这样
.LBB2_3: # =>This Inner Loop Header: Depth=1
imul rcx, r15
add rcx, 1
mov edx, ecx
and edx, 134217727
xor rdx, qword ptr [r14 + 8*rdx]
mov esi, r15d
imul esi, ecx
add rdx, rbx
add esi, 1
and esi, 134217727
mov rbx, qword ptr [r14 + 8*rsi]
xor rbx, rsi
add rbx, rdx
mov rcx, rsi
add rax, -2
jne .LBB2_3
Run Code Online (Sandbox Code Playgroud)
uiCA估计每次迭代 9.16 个周期(循环以 2 倍展开,因此这对应于原始循环每次迭代大约 4.5 个周期),但它没有考虑缓存未命中。
通过该行,程序集看起来几乎相同,但这并不意味着它以几乎相同的方式运行:
.LBB2_6: # =>This Inner Loop Header: Depth=1
imul ecx, r15d
add ecx, 1
and ecx, 134217727
mov rdx, qword ptr [r14 + 8*rcx]
mov rsi, rcx
xor rsi, rdx
add rsi, rbx
add edx, ecx
imul edx, r15d
add edx, 1
and edx, 134217727
mov rcx, qword ptr [r14 + 8*rdx]
mov rbx, rdx
xor rbx, rcx
add rbx, rsi
add rdx, rcx
mov rcx, rdx
add rax, -2
jne .LBB2_6
Run Code Online (Sandbox Code Playgroud)
现在 uiCA估计每次迭代有 24.11 个周期(该循环也展开了 2 倍),同样没有考虑缓存未命中。