Akt*_*tau 12 c optimization performance assembly simd
我的问题是基于另一个SO问题:为什么_mm_stream_ps会产生L1/LL缓存未命中?
在阅读并被它吸引之后,我试图复制结果并亲眼看看哪个更快:天真循环,展开的幼稚循环,_mm_stream_ps(展开),_mm_store_ps(展开)以及最后但并非最不重要memset_pattern4.(最后一个采用4字节模式,例如浮点数,并在目标数组上填充它,这应该与所有其他函数相同,但它可能是OS X独有的).
我已确保将数组的开头对齐在高速缓存行(64字节,我检查过),并在参数中传递数组以及上一个问题中提到的任何其他性能调整.
有人想在gamedev上知道同样的事情:http://www.gamedev.net/topic/532112-fast-memset/
该线程的结论反映了我自己:当目标数组小于最大(L3)缓存时,_mm_store_ps速度快于_mm_stream_ps.目标数组越大,_mm_stream_ps速度越快.我不完全确定为什么__mm_store_ps在第一种情况下速度更快,因为我从不在缓存中使用这些值,但我知道为什么_mm_stream_ps在后一种情况下胜出.它适用于这种情况:将字节写入内存,您不需要立即(或永远).
以下是使用gcc 4.8编译的目标数组比L3缓存大256倍(在我的情况下为1.5GB)的一些结果:
gcc-4.8 stream.c -o stream -std=c11 -O3 -g3 -ftree-vectorize -march=native -minline-all-stringops && ./stream
bench L3-MASS, array 1610612736 bytes (402653184 floats, 0 remainder, 0x104803040 pointer)
warm up round...
6% ( 20.81148 ms) : MEMSET CHEAT
8% ( 28.49419 ms) : MEMSET PATTER
100% ( 371.40385 ms) : NAIVE NORMAL
54% ( 202.01147 ms) : NAIVE UNROLL
31% ( 113.53433 ms) : STREAM NORMAL
30% ( 111.41691 ms) : STREAM UNROLL
51% ( 190.70412 ms) : STORE NORMAL
51% ( 189.15338 ms) : STORE UNROLL
51% ( 189.36182 ms) : STORE PREFET
Run Code Online (Sandbox Code Playgroud)
那么我们从中学到了什么呢?memset_pattern4快得令人难以置信.我包括了沼泽标准,memset即使它只使用1字节模式进行比较.从本质上说,memset作弊,但memset_pattern4不是,它仍然是快速的邪恶.
我已经尝试查看程序memset_pattern4集中我认为是OS X字符串库中的源代码:
memset_pattern4://www.opensource.apple.com/source/Libc/Libc-825.25/string/memset_pattern.c? txt我对asm的了解(到现在为止)足够远,我看到他们正在使用重要的movdqa指令(在本LAlignedLoop节中),这基本上是整数(不是浮点数)的SSE移动指令,内在的:_mm_store_si128.这并不重要,比特和字节,对吧?
memset_pattern4,它似乎有所不同,因为它没有调用bcopy:http://www.opensource.apple.com/source/Libc/Libc-763.13/x86_64/string/memset.s(EDIT:这是正确的,通过在gdb下运行验证)...该死的,这个似乎使用非时间(_mm_stream_ps存储为非常长的数组=> movntdq %xmm0,(%rdi,%rcx)...,查看功能LVeryLong部分),这正是我所做的!那怎么能更快呢?也许这不是memset_pattern4我想要的.
那么,memset_pattern4在幕后做什么,为什么它比我最好的尝试快5倍?即使我一直在努力学习足够的x86程序集以便能够剖析函数,我担心现在有点超出我的联盟调试优化到死亡函数的性能问题.
注意:对于那些好奇的人来说,这个微基准测试也可以用来说明clang及其先进的矢量化(-fslp-vectorize)的绝对优秀,它几乎在所有情况下都设法使朴素循环成为memset的最快版本.这似乎是对的最佳组合好_mm_store_ps和_mm_stream_ps.
代码:这是我用来执行基准测试的代码(如gist:https://gist.github.com/6571379):
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
/**
* compile and run:
*
* OSX:
* clang stream.c -o stream -std=c11 -O3 -g -ftree-vectorize -fslp-vectorize -march=native -minline-all-stringops && ./stream
* gcc-4.8 stream.c -o stream -std=c11 -O3 -g3 -ftree-vectorize -march=native -minline-all-stringops && ./stream
*
* linux:
* clang stream.c -o stream -lrt -std=c11 -O3 -ftree-vectorize -fslp-vectorize -march=native && ./stream
* gcc-4.8 stream.c -o stream -lrt -std=c11 -O3 -ftree-vectorize -march=native && ./stream
*
* to generate the assembly:
* gcc-4.8 -S stream.c -o stream.s -std=c11 -O3 -g3 -ftree-vectorize -march=native -minline-all-stringops
* gobjdump -dS stream > stream.obj.s
*
* clang is the (very clear) winner here, the SLP vectorizer is absolutely killer, it even turns the
* plain naive loop into something hyper-performant
*/
/* posix headers */
#include <sys/time.h>
/* intrinsics */
#include <x86intrin.h>
#define ARRAY_SIZE(x) ((sizeof(x)/sizeof(0[x])) / ((size_t)(!(sizeof(x) % sizeof(0[x])))))
/**
* some stats from my system
*
* sudo sysctl -a | grep cache
*
* hw.cachelinesize = 64
* hw.l1icachesize = 32768
* hw.l1dcachesize = 32768
* hw.l2cachesize = 262144
* hw.l3cachesize = 6291456
*/
/* most processors these days (2013) have a 64 byte cache line */
#define FACTOR 1024
#define CACHE_LINE 64
#define FLOATS_PER_LINE (CACHE_LINE / sizeof(float))
#define L1_CACHE_BYTES 32768
#define L2_CACHE_BYTES 262144
#define L3_CACHE_BYTES 6291456
#ifdef __MACH__
#include <mach/mach_time.h>
double ns_conversion_factor;
double us_conversion_factor;
double ms_conversion_factor;
void timeinit() {
mach_timebase_info_data_t timebase;
mach_timebase_info(&timebase);
ns_conversion_factor = (double)timebase.numer / (double)timebase.denom;
us_conversion_factor = (double)timebase.numer / (double)timebase.denom / 1000;
ms_conversion_factor = (double)timebase.numer / (double)timebase.denom / 1000000;
}
double nsticks() {
return mach_absolute_time() * ns_conversion_factor;
}
double msticks() {
return mach_absolute_time() * ms_conversion_factor;
}
#else
void timeinit() {
/* do nothing */
}
double nsticks() {
timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ((double)ts.tv_sec) / 1000000000 + ((double)ts.tv_nsec);
}
double msticks() {
timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ((double)ts.tv_sec) / 1000 + ((double)ts.tv_nsec) * 1000000;
}
#endif
void *aligned_malloc(size_t size, size_t alignment) {
void *pa, *ptr;
pa = malloc((size+alignment-1)+sizeof(void *));
if (!pa) return NULL;
ptr=(void*)( ((intptr_t)pa+sizeof(void *)+alignment-1)&~(alignment-1) );
*((void **)ptr-1)=pa;
return ptr;
}
void aligned_free(void *ptr) {
if (ptr) free(*((void **)ptr-1));
}
void pollute_cache(uint8_t volatile *arr, size_t length) {
for (int i = 0; i < length; ++i) {
arr[i] = (arr[i] > 0xFE) ? 0xAA : 0x55;
}
}
void pollute_cache_standalone() {
const size_t pollute_len = 2 * L3_CACHE_BYTES;
uint8_t *arr = aligned_malloc(pollute_len * sizeof(uint8_t), 64);
for (int i = 0; i < pollute_len; ++i) {
arr[i] = (arr[i] > 0xFE) ? 0xAA : 0x55;
}
aligned_free(arr);
}
/**
* returns the time passed, in milliseconds
*/
double tim(const char *name, double baseline, void (*pre)(void), void (*func)(float *, size_t), float * restrict arr, size_t length) {
struct timeval t1, t2;
if (pre) pre();
const double ms1 = msticks();
func(arr, length);
const double ms2 = msticks();
const double ms = (ms2 - ms1);
if (baseline == -2.0) return ms;
/* first run, equal to baseline (itself) by definition */
if (baseline == -1.0) baseline = ms;
if (baseline != 0.0) {
fprintf(stderr, "%7.0f%% (%10.5f ms) : %s\n", (ms / baseline) * 100, ms, name);
}
else {
fprintf(stderr, "%7.3f ms : %s\n", ms, name);
}
return ms;
}
void func0(float * const restrict arr, size_t length) {
memset(arr, 0x05, length);
}
#ifdef __MACH__
void funcB(float * const restrict arr, size_t length) {
const float val = 5.0f;
memset_pattern4(arr, &val,length);
}
#endif
void func1(float * const restrict arr, size_t length) {
for (int i = 0; i < length; ++i) {
arr[i] = 5.0f;
}
}
void func2(float * const restrict arr, size_t length) {
for(int i = 0; i < length; i += 4) {
arr[i] = 5.0f;
arr[i+1] = 5.0f;
arr[i+2] = 5.0f;
arr[i+3] = 5.0f;
}
}
void func3(float * const restrict arr, size_t length) {
const __m128 buf = _mm_setr_ps(5.0f, 5.0f, 5.0f, 5.0f);
for (int i = 0; i < length; i += 4) {
_mm_stream_ps(&arr[i], buf);
}
_mm_mfence();
}
void func4(float * const restrict arr, size_t length) {
const __m128 buf = _mm_setr_ps(5.0f, 5.0f, 5.0f, 5.0f);
for (int i = 0; i < length; i += 16) {
_mm_stream_ps(&arr[i + 0], buf);
_mm_stream_ps(&arr[i + 4], buf);
_mm_stream_ps(&arr[i + 8], buf);
_mm_stream_ps(&arr[i + 12], buf);
}
_mm_mfence();
}
void func5(float * const restrict arr, size_t length) {
const __m128 buf = _mm_setr_ps(5.0f, 5.0f, 5.0f, 5.0f);
for (int i = 0; i < length; i += 4) {
_mm_store_ps(&arr[i], buf);
}
}
void fstore_prefetch(float * const restrict arr, size_t length) {
const __m128 buf = _mm_setr_ps(5.0f, 5.0f, 5.0f, 5.0f);
for (int i = 0; i < length; i += 16) {
__builtin_prefetch(&arr[i + FLOATS_PER_LINE * 32], 1, 0);
_mm_store_ps(&arr[i + 0], buf);
_mm_store_ps(&arr[i + 4], buf);
_mm_store_ps(&arr[i + 8], buf);
_mm_store_ps(&arr[i + 12], buf);
}
}
void func6(float * const restrict arr, size_t length) {
const __m128 buf = _mm_setr_ps(5.0f, 5.0f, 5.0f, 5.0f);
for (int i = 0; i < length; i += 16) {
_mm_store_ps(&arr[i + 0], buf);
_mm_store_ps(&arr[i + 4], buf);
_mm_store_ps(&arr[i + 8], buf);
_mm_store_ps(&arr[i + 12], buf);
}
}
#ifdef __AVX__
void func7(float * restrict arr, size_t length) {
const __m256 buf = _mm256_setr_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f);
for (int i = 0; i < length; i += 8) {
_mm256_stream_ps(&arr[i], buf);
}
}
void func8(float * restrict arr, size_t length) {
const __m256 buf = _mm256_setr_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f);
for (int i = 0; i < length; i += 32) {
_mm256_stream_ps(&arr[i + 0], buf);
_mm256_stream_ps(&arr[i + 8], buf);
_mm256_stream_ps(&arr[i + 16], buf);
_mm256_stream_ps(&arr[i + 24], buf);
}
}
void func9(float * restrict arr, size_t length) {
const __m256 buf = _mm256_setr_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f);
for (int i = 0; i < length; i += 8) {
_mm256_store_ps(&arr[i], buf);
}
}
void funcA(float * restrict arr, size_t length) {
const __m256 buf = _mm256_setr_ps(5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f);
for (int i = 0; i < length; i += 32) {
_mm256_store_ps(&arr[i + 0], buf);
_mm256_store_ps(&arr[i + 8], buf);
_mm256_store_ps(&arr[i + 16], buf);
_mm256_store_ps(&arr[i + 24], buf);
}
}
#endif
void bench(const char * restrict name, float * restrict arr, size_t length) {
fprintf(stderr, "bench %s, array %zu bytes (%zu floats, %zu remainder, %p pointer)\n", name, length, length / sizeof(float), length % sizeof(float), arr);
size_t nfloats = length / sizeof(float);
fprintf(stderr, "warm up round...");
func1(arr, nfloats);
fprintf(stderr, "done\n");
double baseline = tim("func1: NAIVE ", -2.0, NULL, func1, arr, nfloats);
tim("MEMSET CHEAT ", baseline, NULL, func0, arr, nfloats);
#ifdef __MACH__
tim("MEMSET PATTER", baseline, NULL, funcB, arr, nfloats);
#endif
tim("NAIVE NORMAL", -1.0, NULL, func1, arr, nfloats);
tim("NAIVE UNROLL", baseline, NULL, func2, arr, nfloats);
tim("STREAM NORMAL", baseline, NULL, func3, arr, nfloats);
tim("STREAM UNROLL", baseline, NULL, func4, arr, nfloats);
tim("STORE NORMAL", baseline, NULL, func5, arr, nfloats);
tim("STORE UNROLL", baseline, NULL, func6, arr, nfloats);
tim("STORE PREFET", baseline, NULL, fstore_prefetch, arr, nfloats);
// for (int i = 0; i < 1; ++i) {
// tim("func0: MEMSET (cache polluted)", NULL, func0, arr, nfloats);
// tim("func1: NAIVE (cache polluted)", pollute_cache_standalone, func1, arr, nfloats);
// tim("func2: UNROLL (cache polluted)", pollute_cache_standalone, func2, arr, nfloats);
// tim("func3: STREAM (cache polluted)", pollute_cache_standalone, func3, arr, nfloats);
// tim("func4: STRUN (cache polluted)", pollute_cache_standalone, func4, arr, nfloats);
// tim("func5: STORE (cache polluted)", pollute_cache_standalone, func5, arr, nfloats);
// tim("func6: STOUN (cache polluted)", pollute_cache_standalone, func6, arr, nfloats);
// }
}
int main() {
timeinit();
static const struct {
const char *name;
size_t bytes;
} sizes[] = {
{ "L1-HALF", L1_CACHE_BYTES / 2 },
{ "L1-FULL", L1_CACHE_BYTES },
{ "L2-HALF", L2_CACHE_BYTES / 2 },
{ "L2-FULL", L2_CACHE_BYTES },
{ "L3-HALF", L3_CACHE_BYTES / 2 },
{ "L3-FULL", L3_CACHE_BYTES },
{ "L3-DOUB", L3_CACHE_BYTES * 2 },
{ "L3-HUGE", L3_CACHE_BYTES * 64 },
{ "L3-MASS", L3_CACHE_BYTES * 256 }
};
for (int i = 0; i < ARRAY_SIZE(sizes); ++i) {
size_t bytes = sizes[i].bytes;
/* align to cache line */
float *arr = aligned_malloc(bytes, CACHE_LINE);
bench(sizes[i].name, arr, bytes);
aligned_free(arr);
}
return 0;
}
Run Code Online (Sandbox Code Playgroud)
编辑:我进一步深入挖掘并编辑了gcc生成的程序集,使其与苹果使用的程序大致相同(memset.s标签LVeryLong,即:movntdq紧密循环中的4个展开指令).令我惊讶的是,我获得与使用_mm_store_ps(movaps)的函数相同的性能.这让我感到困惑,正如我所预料的那样
memset_pattern4(大概是展开的movntdq)一样快_mm_stream_ps(movntdq)但不,它似乎是一样的_mm_store_ps,想象一下,也许我做错了什么.在生成的二进制文件上运行objdump确认它正在使用movntdq,这让我更加惊讶,到底是怎么回事?
因为我在那里遇到了死胡同,所以我决定在调试器中逐步执行可执行文件并设置断点memset_pattern4.走进这个功能,我注意到它完全按照我的想法行事,一个四个展开的紧密循环movntdq:
0x00007fff92a5f7d2 <+318>: jmp 0x7fff92a5f7e0 <memset_pattern4+332>
0x00007fff92a5f7d4 <+320>: nopw 0x0(%rax,%rax,1)
0x00007fff92a5f7da <+326>: nopw 0x0(%rax,%rax,1)
0x00007fff92a5f7e0 <+332>: movntdq %xmm0,(%rdi,%rcx,1)
0x00007fff92a5f7e5 <+337>: movntdq %xmm0,0x10(%rdi,%rcx,1)
0x00007fff92a5f7eb <+343>: movntdq %xmm0,0x20(%rdi,%rcx,1)
0x00007fff92a5f7f1 <+349>: movntdq %xmm0,0x30(%rdi,%rcx,1)
0x00007fff92a5f7f7 <+355>: add $0x40,%rcx
=> 0x00007fff92a5f7fb <+359>: jne 0x7fff92a5f7e0 <memset_pattern4+332>
0x00007fff92a5f7fd <+361>: sfence
Run Code Online (Sandbox Code Playgroud)
那么,是什么让Apple的酱汁比我的更神奇,我想...
编辑2:我在这里错了两次,Apple的魔法酱不是那么神奇,我只是传递了一个比我传递给我的函数小4倍的数组.感谢@PaulR注意!其次我正在编辑函数的程序集,但是gcc已经内联它了.所以我正在编辑一个从未使用过的副本.
结论:
我发现的其他一些事情:
memset调用中,清除了另一个令人困惑的结果.memset非常优化,它将根据要写入的数组的长度自动在常规存储和非临时存储(流)之间切换.我不确定在OSX以外的平台上有多少是真的编辑:我最近偶然发现了英特尔优化指南,如果对这些事情感兴趣,请先阅读这部分内容(或许从3.7.6开始).
我认为你这里有几个错误:
void func0(float * const restrict arr, size_t length) {
memset(arr, 0x05, length);
}
Run Code Online (Sandbox Code Playgroud)
和这里类似:
void funcB(float * const restrict arr, size_t length) {
const float val = 5.0f;
memset_pattern4(arr, &val,length);
}
Run Code Online (Sandbox Code Playgroud)
这些实际上应该是:
void func0(float * const restrict arr, size_t length) {
memset(arr, 0x05, length * sizeof(float));
}
Run Code Online (Sandbox Code Playgroud)
和:
void funcB(float * const restrict arr, size_t length) {
const float val = 5.0f;
memset_pattern4(arr, &val, length * sizeof(float));
}
Run Code Online (Sandbox Code Playgroud)
这将给出比这两种情况应有的乐观 4 倍的时机。
在我用了 3 年的 Core i7 MacBook Pro(8 GB RAM)上,固定代码给了我:
bench L3-HUGE, array 402653184 bytes (100663296 floats, 0 remainder, 0x108ed8040 pointer)
warm up round...done
99% ( 69.43037 ms) : MEMSET CHEAT
106% ( 73.98113 ms) : MEMSET PATTER
100% ( 72.40429 ms) : NAIVE NORMAL
120% ( 83.98352 ms) : NAIVE UNROLL
102% ( 71.75789 ms) : STREAM NORMAL
102% ( 71.59420 ms) : STREAM UNROLL
115% ( 80.63817 ms) : STORE NORMAL
123% ( 86.58758 ms) : STORE UNROLL
123% ( 86.22740 ms) : STORE PREFET
bench L3-MASS, array 1610612736 bytes (402653184 floats, 0 remainder, 0x108ed8040 pointer)
warm up round...done
83% ( 274.71955 ms) : MEMSET CHEAT
83% ( 275.19793 ms) : MEMSET PATTER
100% ( 272.21942 ms) : NAIVE NORMAL
94% ( 309.73151 ms) : NAIVE UNROLL
82% ( 271.38751 ms) : STREAM NORMAL
82% ( 270.27244 ms) : STREAM UNROLL
94% ( 308.49498 ms) : STORE NORMAL
94% ( 308.72266 ms) : STORE UNROLL
95% ( 311.64157 ms) : STORE PREFET
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
1497 次 |
| 最近记录: |