标签: c++ assembly sse simd
标题可能看似无稽之谈,但让我解释一下.前几天,当我遇到以下汇编代码时,我正在研究一个程序:
movaps xmm3, xmmword ptr [rbp-30h]   ; load 4 packed floats from the local at [rbp-30h] (aligned load)
lea rdx, [rdi+1320h]                 ; rdx = destination address (some member at offset 0x1320)
movaps xmm5, xmm3                    ; three working copies of the source vector
movaps xmm6, xmm3
movaps xmm0, xmm3
movss dword ptr [rdx], xmm3          ; store element 0 (already in the low lane)
shufps xmm5, xmm3, 55h               ; 0x55 = 01 01 01 01 -> broadcast element 1 into xmm5
shufps xmm6, xmm3, 0AAh              ; 0xAA = 10 10 10 10 -> broadcast element 2
shufps xmm0, xmm3, 0FFh              ; 0xFF = 11 11 11 11 -> broadcast element 3
movaps xmm4, xmm3
movss dword ptr [rdx+4], xmm5        ; store element 1
movss dword ptr [rdx+8], xmm6        ; store element 2
movss dword ptr [rdx+0Ch], xmm0      ; store element 3
mulss xmm4, xmm3                     ; scalar multiply of element 0 with itself -- start of a following computation, not part of the copy
Run Code Online (Sandbox Code Playgroud)
而且它似乎主要只是把 [rbp-30h] 处的四个浮点数复制到 [rdx]。这些 shufps 仅用于从 xmm3 的四个浮点数中选出一个(例如 shufps xmm5, xmm3, 55h 选出第二个浮点数并放入 xmm5)。
这让我怀疑:编译器这样做,是否因为 shufps 实际上比多次内存访问(例如 movss xmm0, dword ptr [rbp-30h] 加 movss dword ptr [rdx], xmm0)更快。
所以我写了一些测试来比较这两种方法,发现 shufps 总是慢于多次内存访问。现在我想,也许使用 shufps 与性能无关,它可能只是用来混淆代码,让反编译器不能轻易生成干净的代码(我用 IDA Pro 试过,反编译结果确实过于复杂)。
虽然我可能永远不会在实际程序中显式地使用 shufps(即通过 _mm_shuffle_ps),因为编译器多半比我更聪明,但我仍然想知道:编译那个程序的编译器为什么会生成这样的代码?它既不更快也不更小,这说不通。
无论如何,我将提供我在下面写的测试.
#include <Windows.h>
#include <iostream>
using namespace std;
// Runs `routine(arg)` the requested number of times and returns the elapsed
// wall-clock time in milliseconds.
// NOTE(review): GetTickCount has only ~10-16 ms resolution and wraps every
// ~49.7 days; adequate for the multi-second loops used below, not for
// micro-timing single calls.
// noinline keeps the timing harness itself out of the measured code.
__declspec(noinline) DWORD profile_routine(void (*routine)(void *), void *arg, int iterations = 1)
{
    const DWORD begin = GetTickCount();
    while (iterations--)
        routine(arg);
    return GetTickCount() - begin;
}
// Plain 16-byte aggregate holding the four shuffled floats, returned by
// value from shuffle1/shuffle2. On Win64 a struct this size comes back via
// a hidden pointer in rcx -- visible as `mov rax,rcx` in the disassembly
// listings below.
struct Struct
{
float x, y, z, w;
};
// Permutes the four floats at `arr` into {arr[3], arr[2], arr[0], arr[1]}
// using plain scalar loads; noinline keeps it an out-of-line call for the
// benchmark so the compiler cannot fold the work into the caller.
__declspec(noinline) Struct shuffle1(float *arr)
{
    Struct out;
    out.x = arr[3];
    out.y = arr[2];
    out.z = arr[0];
    out.w = arr[1];
    return out;
}
// shufps immediates: 0x00/0x55/0xAA/0xFF broadcast element 0/1/2/3.
#define SS0 (0x00)
#define SS1 (0x55)
#define SS2 (0xAA)
#define SS3 (0xFF)
// Same permutation as shuffle1 ({arr[3], arr[2], arr[0], arr[1]}), but done
// with one packed load plus shufps broadcasts.
__declspec(noinline) Struct shuffle2(float *arr)
{
    Struct r;
    // BUG FIX: the original dereferenced reinterpret_cast<__m128 *>(arr),
    // which (a) violates strict aliasing and (b) compiles to an aligned
    // movaps load that faults if `arr` is not 16-byte aligned -- and the
    // plain float[4] stack arrays in main() are only guaranteed 4-byte
    // alignment. _mm_loadu_ps is the alignment-safe, well-defined way to
    // load 4 packed floats from a float pointer.
    __m128 packed = _mm_loadu_ps(arr);
    __m128 x = _mm_shuffle_ps(packed, packed, SS3); // broadcast element 3
    __m128 y = _mm_shuffle_ps(packed, packed, SS2); // broadcast element 2
    __m128 z = _mm_shuffle_ps(packed, packed, SS0); // broadcast element 0
    __m128 w = _mm_shuffle_ps(packed, packed, SS1); // broadcast element 1
    _mm_store_ss(&r.x, x); // each store writes the selected element's low lane
    _mm_store_ss(&r.y, y);
    _mm_store_ss(&r.z, z);
    _mm_store_ss(&r.w, w);
    return r;
}
void profile_shuffle_r1(void *arg)
{
float *arr = static_cast<float *>(arg);
Struct q = shuffle1(arr);
arr[0] += q.w;
arr[1] += q.z;
arr[2] += q.y;
arr[3] += q.x;
}
void profile_shuffle_r2(void *arg)
{
float *arr = static_cast<float *>(arg);
Struct q = shuffle2(arr);
arr[0] += q.w;
arr[1] += q.z;
arr[2] += q.y;
arr[3] += q.x;
}
int main(int argc, char **argv)
{
int n = argc + 3;
float arr1[4], arr2[4];
for (int i = 0; i < 4; i++)
{
arr1[i] = static_cast<float>(n + i);
arr2[i] = static_cast<float>(n + i);
}
int iterations = 20000000;
DWORD time1 = profile_routine(profile_shuffle_r1, arr1, iterations);
cout << "time1 = " << time1 << endl;
DWORD time2 = profile_routine(profile_shuffle_r2, arr2, iterations);
cout << "time2 = " << time2 << endl;
return 0;
}
Run Code Online (Sandbox Code Playgroud)
在上面的测试中,我有两个shuffle方法,shuffle1并shuffle2做同样的事情.使用MSVC -O2编译时,它会生成以下代码:
shuffle1:                                ; MSVC -O2: pure scalar 32-bit copies, no xmm use
mov eax,dword ptr [rdx+0Ch]              ; eax = arr[3]          (rdx = arr)
mov dword ptr [rcx],eax                  ; r.x = arr[3]          (rcx = hidden return-struct pointer)
mov eax,dword ptr [rdx+8]                ; eax = arr[2]
mov dword ptr [rcx+4],eax                ; r.y = arr[2]
mov eax,dword ptr [rdx]                  ; eax = arr[0]
mov dword ptr [rcx+8],eax                ; r.z = arr[0]
mov eax,dword ptr [rdx+4]                ; eax = arr[1]
mov dword ptr [rcx+0Ch],eax              ; r.w = arr[1]
mov rax,rcx                              ; Win64: return the struct pointer in rax
ret
shuffle2:                                ; SSE version: one packed load + shufps broadcasts
movaps xmm2,xmmword ptr [rdx]            ; load all 4 floats at once (movaps requires 16-byte alignment!)
mov rax,rcx                              ; return value = hidden struct pointer
movaps xmm0,xmm2
shufps xmm0,xmm2,0FFh                    ; broadcast element 3
movss dword ptr [rcx],xmm0               ; r.x = arr[3]
movaps xmm0,xmm2
shufps xmm0,xmm2,0AAh                    ; broadcast element 2
movss dword ptr [rcx+4],xmm0             ; r.y = arr[2]
movss dword ptr [rcx+8],xmm2             ; r.z = arr[0] (element 0 is already in the low lane)
shufps xmm2,xmm2,55h                     ; broadcast element 1
movss dword ptr [rcx+0Ch],xmm2           ; r.w = arr[1]
ret
Run Code Online (Sandbox Code Playgroud)
shuffle1始终比shuffle2我的机器快至少30%.我注意到shuffle2有两个更多的指令,shuffle1实际上使用eax而不是xmm0所以我想如果我添加一些垃圾算术运算,结果会有所不同.
所以我修改了它们如下:
// Scalar variant with extra dependent additions, added so both versions do
// comparable arithmetic work on the xmm register file.
// NOTE: the left-to-right add order of the original is preserved exactly --
// float addition is not associative, so reordering would change results.
__declspec(noinline) Struct shuffle1(float *arr)
{
    const float a = arr[3]; // x0
    const float b = arr[2]; // y0
    const float c = arr[0]; // z0
    const float d = arr[1]; // w0
    return {a + b + c,      // x = x0 + y0 + z0
            b + c + d,      // y = y0 + z0 + w0
            c + d + a,      // z = z0 + w0 + x0
            d + a + b};     // w = w0 + x0 + y0
}
// shufps immediates: 0x00/0x55/0xAA/0xFF broadcast element 0/1/2/3.
#define SS0 (0x00)
#define SS1 (0x55)
#define SS2 (0xAA)
#define SS3 (0xFF)
// SSE variant with the same extra additions as shuffle1; _mm_add_ss only
// operates on the low lane, which holds the broadcast element after shufps.
__declspec(noinline) Struct shuffle2(float *arr)
{
    Struct r;
    // BUG FIX: the original dereferenced reinterpret_cast<__m128 *>(arr),
    // which violates strict aliasing and emits an aligned movaps load that
    // faults when `arr` is not 16-byte aligned (main()'s float[4] arrays
    // are only guaranteed 4-byte alignment). _mm_loadu_ps is the safe load.
    __m128 packed = _mm_loadu_ps(arr);
    __m128 x0 = _mm_shuffle_ps(packed, packed, SS3); // element 3
    __m128 y0 = _mm_shuffle_ps(packed, packed, SS2); // element 2
    __m128 z0 = _mm_shuffle_ps(packed, packed, SS0); // element 0
    __m128 w0 = _mm_shuffle_ps(packed, packed, SS1); // element 1
    __m128 yz = _mm_add_ss(y0, z0); // shared subterm y0+z0
    __m128 x = _mm_add_ss(x0, yz);  // x = x0 + (y0+z0)
    __m128 y = _mm_add_ss(w0, yz);  // y = w0 + (y0+z0)
    __m128 wx = _mm_add_ss(w0, x0); // shared subterm w0+x0
    __m128 z = _mm_add_ss(z0, wx);  // z = z0 + (w0+x0)
    __m128 w = _mm_add_ss(y0, wx);  // w = y0 + (w0+x0)
    _mm_store_ss(&r.x, x);
    _mm_store_ss(&r.y, y);
    _mm_store_ss(&r.z, z);
    _mm_store_ss(&r.w, w);
    return r;
}
Run Code Online (Sandbox Code Playgroud)
现在组件看起来更公平,因为它们具有相同数量的指令并且都需要使用xmm寄存器.
shuffle1:                                ; scalar loads, then the 6-add chain
movss xmm5,dword ptr [rdx+8]             ; xmm5 = arr[2] = y0
mov rax,rcx                              ; rax = hidden return-struct pointer
movss xmm3,dword ptr [rdx+0Ch]           ; xmm3 = arr[3] = x0
movaps xmm0,xmm5
movss xmm2,dword ptr [rdx]               ; xmm2 = arr[0] = z0
addss xmm0,xmm3                          ; xmm0 = y0+x0
movss xmm4,dword ptr [rdx+4]             ; xmm4 = arr[1] = w0
movaps xmm1,xmm2
addss xmm1,xmm5                          ; xmm1 = z0+y0
addss xmm0,xmm2                          ; xmm0 = x0+y0+z0 = x
addss xmm1,xmm4                          ; xmm1 = y0+z0+w0 = y
movss dword ptr [rcx],xmm0               ; r.x
movaps xmm0,xmm4
addss xmm0,xmm2                          ; xmm0 = w0+z0
addss xmm4,xmm3                          ; xmm4 = w0+x0
movss dword ptr [rcx+4],xmm1             ; r.y
addss xmm0,xmm3                          ; xmm0 = z0+w0+x0 = z
addss xmm4,xmm5                          ; xmm4 = w0+x0+y0 = w
movss dword ptr [rcx+8],xmm0             ; r.z
movss dword ptr [rcx+0Ch],xmm4           ; r.w
ret
shuffle2:                                ; packed load + shufps, then the same adds
movaps xmm4,xmmword ptr [rdx]            ; aligned load of all 4 floats
mov rax,rcx                              ; return-struct pointer
movaps xmm3,xmm4                         ; xmm3 low lane = element 0 = z0
movaps xmm5,xmm4
shufps xmm5,xmm4,0AAh                    ; xmm5 = broadcast element 2 = y0
movaps xmm2,xmm4
shufps xmm2,xmm4,0FFh                    ; xmm2 = broadcast element 3 = x0
movaps xmm0,xmm5
addss xmm0,xmm3                          ; xmm0 = y0+z0 (the shared `yz` term)
shufps xmm4,xmm4,55h                     ; xmm4 = broadcast element 1 = w0
movaps xmm1,xmm4
addss xmm1,xmm2                          ; xmm1 = w0+x0 (the shared `wx` term)
addss xmm2,xmm0                          ; xmm2 = x0+yz = x
addss xmm4,xmm0                          ; xmm4 = w0+yz = y
addss xmm3,xmm1                          ; xmm3 = z0+wx = z
addss xmm5,xmm1                          ; xmm5 = y0+wx = w
movss dword ptr [rcx],xmm2               ; r.x
movss dword ptr [rcx+4],xmm4             ; r.y
movss dword ptr [rcx+8],xmm3             ; r.z
movss dword ptr [rcx+0Ch],xmm5           ; r.w
ret
Run Code Online (Sandbox Code Playgroud)
但没关系.shuffle1仍然快30%!
如果没有更广泛的上下文,很难确定,但是……在针对较新的处理器进行优化时,您必须考虑不同执行端口的使用。请参阅 Agner Fog 的指令表: http://www.agner.org/optimize/instruction_tables.pdf
在这种情况下,虽然看起来不太可能,但如果我们假设装配实际上是经过优化的,那么我就会想到一些可能性。
最后,具体到这种优化以及我使用类似内容的地方。假设您有一个运行时几乎 100% 可预测的分支,但在编译时则不然。让我们想象一下,假设在分支之后有一次读取,通常是缓存未命中。你想尽快阅读。如果您不使用读取端口,无序调度程序将提前读取并开始执行该读取。这可以使 shufps 指令本质上“自由”执行。这是那个例子:
MOV ecx, [some computed, mostly constant at run-time global]  ; loop trip count, unknown at compile time
label loop:
ADD rdi, 16                              ; advance source/dest pointers one 16-byte vector per iteration
ADD rbp, 16
CALL shuffle                             ; the shufps-based copy under discussion
SUB ecx, 1
JNE loop                                 ; fall through when counter hits zero
MOV rax, [rdi]                           ; load left free by the shuffle code's low load-port usage
;do a read that could be "predicted" properly
MOV rbx, [rax]                           ; dependent (pointer-chasing) read the OoO core can start early
但老实说,它看起来就像是写得不好的汇编或生成得不好的机器代码,所以我不会花太多心思。我给出的例子可能性不大。
| 归档时间: |
|
| 查看次数: |
340 次 |
| 最近记录: |