Ray*_*ger 6 c optimization assembly x86-64 intrinsics
我正在查看GCC-4.8为x86_64生成的代码,并想知道是否有更好(更快)的方法来计算三个值的最小值.
下面是Python的摘录收集模块,计算最低的m,rightindex+1和leftindex:
ssize_t m = n;
if (m > rightindex + 1)
m = rightindex + 1;
if (m > leftindex)
m = leftindex;
Run Code Online (Sandbox Code Playgroud)
GCC使用CMOV生成依赖于序列的依赖代码:
leaq 1(%rbp), %rdx
cmpq %rsi, %rdx
cmovg %rsi, %rdx
cmpq %rbx, %rdx
cmovg %rbx, %rdx
Run Code Online (Sandbox Code Playgroud)
是否有更快的代码可以通过删除数据依赖性来利用处理器无序并行执行?我想知道是否存在用于计算多个值的最小值而不使用条件或谓词指令的已知技巧.我也想知道是否有一些饱和的算术内在函数可以帮助解决这种情况.
EDITS:
以下是建议方法的基准测试结果.处理器是Intel Core-i7 2600K,运行频率为4GHz.每个循环的单位是纳秒(越小越好).测量包括伪随机测试数据生成和函数调用开销,以及测试中的实际min3代码.
gcc cmov conditional classical
sequence branches branchless
pseudo-random data 2.24 6.31 2.39
fixed data 2.24 2.99 2.39
Run Code Online (Sandbox Code Playgroud)
functions.s:评估的3个解决方案:
//----------------------------------------------------------------------------
.text
.intel_syntax noprefix
//-----------------------------------------------------------------------------
// uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_a
min3_a:
mov rax, rcx
cmp rax, rdx
cmovg rax, rdx
cmp rax, r8
cmovg rax, r8
retq
//-----------------------------------------------------------------------------
// uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_b
min3_b:
mov rax, rcx
cmp rax, rdx
jle skip1
mov rax, rdx
skip1:
cmp rax, r8
jle skip2
mov rax, r8
skip2:
retq
//-----------------------------------------------------------------------------
// uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_c
min3_c:
sub rdx, rcx
sbb rax, rax
and rax, rdx
add rcx, rax
sub r8, rcx
sbb rax, rax
and rax, r8
add rax, rcx
retq
//-----------------------------------------------------------------------------
Run Code Online (Sandbox Code Playgroud)
min3test.c,主程序(mingw64/windows):
#define __USE_MINGW_ANSI_STDIO 1
#include <windows.h>
#include <stdio.h>
#include <stdint.h>
uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8);
uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8);
uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8);
//-----------------------------------------------------------------------------
//
// queryPerformanceCounter - similar to QueryPerformanceCounter, but returns
// count directly.
uint64_t queryPerformanceCounter (void)
{
LARGE_INTEGER int64;
QueryPerformanceCounter (&int64);
return int64.QuadPart;
}
//-----------------------------------------------------------------------------
//
// queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns count direcly.
uint64_t queryPerformanceFrequency (void)
{
LARGE_INTEGER int64;
QueryPerformanceFrequency (&int64);
return int64.QuadPart;
}
//----------------------------------------------------------------------------
//
// lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation
//
static uint64_t lfsr64gpr (uint64_t data, uint64_t mask)
{
uint64_t carryOut = data >> 63;
uint64_t maskOrZ = -carryOut;
return (data << 1) ^ (maskOrZ & mask);
}
//---------------------------------------------------------------------------
uint64_t min3 (uint64_t a, uint64_t b, uint64_t c)
{
uint64_t smallest;
smallest = a;
if (smallest > b) smallest = b;
if (smallest > c) smallest = c;
return smallest;
}
//---------------------------------------------------------------------------
static void runtests (uint64_t pattern, uint64_t mask)
{
uint64_t startCount, elapsed, index, loops = 800000000;
double ns;
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_a (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns\n", ns);
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_b (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns\n", ns);
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_c (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns\n", ns);
}
//---------------------------------------------------------------------------
int main (void)
{
uint64_t index, pattern, mask;
uint64_t count_a = 0, count_b = 0, count_c = 0;
mask = 0xBEFFFFFFFFFFFFFF;
pattern = 1;
// sanity check the asm functions
for (index = 0; index < 1000000; index++)
{
uint64_t expected, result_a, result_b, result_c;
uint64_t pattern1 = (pattern >> 0) & 0xFFFFF;
uint64_t pattern2 = (pattern >> 20) & 0xFFFFF;
uint64_t pattern3 = (pattern >> 40) & 0xFFFFF;
pattern = lfsr64gpr (pattern, mask);
expected = min3 (pattern1, pattern2, pattern3);
result_a = min3_a (pattern1, pattern2, pattern3);
result_b = min3_b (pattern1, pattern2, pattern3);
result_c = min3_c (pattern1, pattern2, pattern3);
if (result_a != expected) printf ("min3_a fail, %llu %llu %llu %llu %llu\n", expected, result_a, pattern1, pattern2, pattern3);
if (result_b != expected) printf ("min3_b fail, %llu %llu %llu %llu %llu\n", expected, result_b, pattern1, pattern2, pattern3);
if (result_c != expected) printf ("min3_c fail, %llu %llu %llu %llu %llu\n", expected, result_c, pattern1, pattern2, pattern3);
if (expected == pattern1) count_a++;
if (result_b == pattern2) count_b++;
if (result_c == pattern3) count_c++;
}
printf ("pseudo-random distribution: %llu, %llu, %llu\n", count_a, count_b, count_c);
// raise our priority to increase measurement accuracy
SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);
printf ("using pseudo-random data\n");
runtests (1, mask);
printf ("using fixed data\n");
runtests (0, mask);
return 0;
}
//---------------------------------------------------------------------------
Run Code Online (Sandbox Code Playgroud)
构建命令行:
gcc -Wall -Wextra -O3 -omin3test.exe min3test.c functions.s
Run Code Online (Sandbox Code Playgroud)
最少两个无符号数具有经典解决方案:
; eax = min(eax, ebx), ecx - scratch register.
.min2:
sub ebx, eax
sbb ecx, ecx
and ecx, ebx
add eax, ecx
Run Code Online (Sandbox Code Playgroud)
这种方法可能比使用cmov的解决方案更快,但是为了更高的速度,指令必须由其他指令分开以进行并行执行.
可以为三个数字实现此方法:
; eax = min(eax, ebx, edx), ecx - scratch register.
.min3:
sub ebx, eax
sbb ecx, ecx
and ecx, ebx
add eax, ecx
sub edx, eax
sbb ecx, ecx
and ecx, edx
add eax, ecx
Run Code Online (Sandbox Code Playgroud)
另一种尝试是使用条件跳转来测试变体.对于现代处理器,它可能更快,特别是如果跳跃是高度可预测的:
.min3:
cmp eax, ebx
jle @f
mov eax, ebx
@@:
cmp eax, edx
jle @f
mov eax, edx
@@:
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
1256 次 |
| 最近记录: |