bka*_*sbk 8 optimization x86 assembly sse visual-c++
我正在尝试找到RGB8到RGB32图像转换的汇编优化方法.
Source是8位灰度图像,Destination应该是32位灰度图像(BGRA),第4通道(alpha)可以忽略.源地址不保证16字节对齐,Count是16的倍数,目标地址是16字节对齐.
这是我的优化汇编代码.是否有更快的转换方式?
/*
 * Expands an 8-bit grey image into 32-bit BGRA pixels (B = G = R = grey, A = 0)
 * with SSSE3 PSHUFB, written as MSVC x86 inline assembly.
 *
 * Source       grey pixels; read with MOVDQU, so no alignment is required.
 * Destination  receives 4 bytes per pixel; must be 16-byte aligned (MOVNTDQ).
 * Count        number of source pixels. Work is done in groups of 16, so a
 *              partial last group is processed as a full group; pass a
 *              multiple of 16. Count == 0 converts nothing.
 */
void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
    // One PSHUFB control dword per output pixel: source byte k is replicated into
    // the three low bytes, and 0x80 in the top byte makes PSHUFB write zero there.
    static const unsigned int __declspec(align(64)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };
    __asm {
        // Register roles: esi = source base, ecx = source byte offset,
        // edi = destination cursor, edx = Count, xmm4-xmm7 = shuffle masks.
        mov     esi, Source
        mov     edi, Destination
        mov     edx, Count
        test    edx, edx
        jz      done                            // the loop below is bottom-tested and would run once for Count == 0
        xor     ecx, ecx
        movdqa  xmm4, xmmword ptr [Masks + 0]
        movdqa  xmm5, xmmword ptr [Masks + 16]
        movdqa  xmm6, xmmword ptr [Masks + 32]
        movdqa  xmm7, xmmword ptr [Masks + 48]
    l1:
        movdqu  xmm0, xmmword ptr [esi + ecx]   // 16 grey pixels, unaligned load
        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm0
        pshufb  xmm0, xmm4                      // pixels 0-3   -> 4 BGRA pixels
        pshufb  xmm1, xmm5                      // pixels 4-7
        pshufb  xmm2, xmm6                      // pixels 8-11
        pshufb  xmm3, xmm7                      // pixels 12-15
        movntdq [edi + 0], xmm0
        movntdq [edi + 16], xmm1
        movntdq [edi + 32], xmm2
        movntdq [edi + 48], xmm3
        add     edi, 64
        add     ecx, 16
        cmp     ecx, edx
        jb      l1
    done:
        sfence                                  // order the weakly-ordered non-temporal stores before later stores
    }
}
Run Code Online (Sandbox Code Playgroud)
还有另外一种使用PUNPCKLBW和PUNPCKHBW的方法,但这似乎比较慢.
更新:这是基本的非优化算法:
// Scalar reference version of the conversion: one grey source byte becomes one
// BGRA destination pixel. The initialisers are elided ("...") in this sketch.
BGRA* Destination = ...
unsigned char* Source ...
for (unsigned int i = 0; i < Size; i++) {
// The same grey value goes into all three colour channels; no alpha member is written here.
Destination[i].Blue = Source[i];
Destination[i].Green = Source[i];
Destination[i].Red = Source[i];
}
Run Code Online (Sandbox Code Playgroud)
PS:我也尝试过用MS VS2008的SSE编译器内在函数(intrinsics)编写C代码.结果发现,编译器生成了大量不必要的内存移动,导致代码比纯汇编版本慢10-20%.
更新2:这是仅使用内在函数(intrinsics)的相同代码.
/*
 * Intrinsics version: expands 8-bit grey pixels into 32-bit BGRA pixels
 * (B = G = R = grey, A = 0) using SSSE3 _mm_shuffle_epi8.
 *
 * Source       grey pixels; read with unaligned loads, so no alignment is required.
 * Destination  receives 4 bytes per pixel; must be 16-byte aligned (_mm_stream_si128).
 * Count        number of source pixels; Count / 16 full groups are converted and
 *              any remainder is left untouched.
 */
void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
    // One shuffle control dword per output pixel: source byte k is replicated into
    // the three low bytes, and 0x80 in the top byte makes the shuffle write zero there.
    static const unsigned int __declspec(align(64)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };
    const __m128i m0 = _mm_load_si128((const __m128i*) (Masks + 0));
    const __m128i m1 = _mm_load_si128((const __m128i*) (Masks + 4));
    const __m128i m2 = _mm_load_si128((const __m128i*) (Masks + 8));
    const __m128i m3 = _mm_load_si128((const __m128i*) (Masks + 12));
    for (unsigned int i = 0; i < Count / 16; i++) {
        // The source may be unaligned; _mm_load_si128 would fault on such an address.
        const __m128i r0 = _mm_loadu_si128(Source + i);
        _mm_stream_si128(Destination + (i * 4) + 0, _mm_shuffle_epi8(r0, m0));
        _mm_stream_si128(Destination + (i * 4) + 1, _mm_shuffle_epi8(r0, m1));
        _mm_stream_si128(Destination + (i * 4) + 2, _mm_shuffle_epi8(r0, m2));
        _mm_stream_si128(Destination + (i * 4) + 3, _mm_shuffle_epi8(r0, m3));
    }
    // Streaming stores are weakly ordered; fence them before later stores.
    _mm_sfence();
}
Run Code Online (Sandbox Code Playgroud)
更新3:这是编译器生成的代码(美化)(Visual Studio 2012,所有优化):
; Compiler output for the intrinsics version. Each pass loads 16 source bytes,
; applies the four PSHUFB masks and writes four 16-byte results with ordinary
; aligned stores (movdqa, not movntdq).
; NOTE(review): eax and ecx are read below but never set in this listing -
; presumably they hold Destination and Count on entry; confirm against the
; unedited compiler output.
push ebp
mov ebp, esp
mov edx, dword ptr [ebp+8]                 ; edx = first stack argument; used as the load pointer in the loop
movdqa xmm1, xmmword ptr ds:[Masks + 0]    ; xmm1-xmm4 = the four shuffle masks
movdqa xmm2, xmmword ptr ds:[Masks + 16]
movdqa xmm3, xmmword ptr ds:[Masks + 32]
movdqa xmm4, xmmword ptr ds:[Masks + 48]
push esi                                   ; esi becomes the pass counter; restored at l2
test ecx, ecx
je l2                                      ; skip the loop when ecx is zero
lea esi, [ecx-1]
shr esi, 4
inc esi                                    ; esi = ((ecx - 1) >> 4) + 1 passes
l1:
mov ecx, edx
movdqu xmm0, xmmword ptr [ecx]             ; unaligned 16-byte load from [edx]
mov ecx, eax
movdqa xmm5, xmm0                          ; xmm5 = scratch copy; xmm0 keeps the loaded bytes
pshufb xmm5, xmm1
movdqa xmmword ptr [ecx], xmm5             ; store to [eax] (ecx was just copied from eax)
movdqa xmm5, xmm0
pshufb xmm5, xmm2
movdqa xmmword ptr [eax+10h], xmm5
movdqa xmm5, xmm0
pshufb xmm5, xmm3
movdqa xmmword ptr [eax+20h], xmm5
lea ecx, [eax+30h]                         ; address of the fourth store, taken before eax advances
add edx, 10h                               ; next 16 source bytes
add eax, 40h                               ; next 64 destination bytes
dec esi                                    ; sets ZF for the jne below; pshufb and movdqa leave the flags alone
pshufb xmm0, xmm4
movdqa xmmword ptr [ecx], xmm0
jne l1
l2:
pop esi
pop ebp
ret
Run Code Online (Sandbox Code Playgroud)
看起来,将movdqa与pshufb交错排列会稍微快一些.
更新4:这似乎是最优手动优化代码:
__asm {
    // Expands 16 grey pixels per pass into 16 BGRA pixels (B = G = R = grey, A = 0).
    // Register roles: esi = source cursor, edi = destination cursor,
    // ecx = 16-pixel groups still to convert, xmm4-xmm7 = PSHUFB masks.
    // Destination must be 16-byte aligned (movdqa stores); Source may be unaligned.
    mov     esi, Source
    mov     edi, Destination
    mov     ecx, Count
    shr     ecx, 4                          // Count is in pixels (as in the intrinsics version); one pass consumes 16
    movdqa  xmm4, xmmword ptr [Masks + 0]
    movdqa  xmm5, xmmword ptr [Masks + 16]
    movdqa  xmm6, xmmword ptr [Masks + 32]
    movdqa  xmm7, xmmword ptr [Masks + 48]
    jmp     next_group                      // test the counter first so that Count == 0 converts nothing
l1:
    movdqu  xmm0, xmmword ptr [esi]         // loaded only for groups that exist, so nothing is read past the end
    lea     esi, [esi + 16]
    lea     edi, [edi + 64]
    movdqa  xmm1, xmm0
    movdqa  xmm2, xmm0
    movdqa  xmm3, xmm0
    pshufb  xmm0, xmm4
    movdqa  [edi - 64], xmm0
    pshufb  xmm1, xmm5
    movdqa  [edi - 48], xmm1
    pshufb  xmm2, xmm6
    movdqa  [edi - 32], xmm2
    pshufb  xmm3, xmm7
    movdqa  [edi - 16], xmm3
next_group:
    sub     ecx, 1                          // SUB (unlike DEC) updates CF; a borrow means no groups are left
    jnc     l1
}
Run Code Online (Sandbox Code Playgroud)
更新5:下面的转换算法使用punpck指令.不过,这个转换例程比使用pshufb掩码的版本稍慢一点.
// Expands 16 grey pixels per pass by duplicating every byte twice with the
// unpack instructions, so each source byte ends up in all four bytes of its
// output pixel (the alpha byte receives the grey value instead of zero).
// Destination must be 16-byte aligned; Source may be unaligned.
for (unsigned int i = 0; i < Count; i += 16) {
    // The source is not guaranteed to be 16-byte aligned, so use an unaligned load.
    const __m128i r0 = _mm_loadu_si128(Source++);
    const __m128i r1 = _mm_unpackhi_epi8(r0, r0);   // pixels 8-15, each byte doubled
    const __m128i r2 = _mm_unpacklo_epi8(r0, r0);   // pixels 0-7,  each byte doubled
    const __m128i r3 = _mm_unpackhi_epi8(r1, r1);   // pixels 12-15, each byte x4
    const __m128i r4 = _mm_unpacklo_epi8(r1, r1);   // pixels 8-11
    const __m128i r5 = _mm_unpackhi_epi8(r2, r2);   // pixels 4-7
    const __m128i r6 = _mm_unpacklo_epi8(r2, r2);   // pixels 0-3
    _mm_store_si128(Destination++, r6);
    _mm_store_si128(Destination++, r5);
    _mm_store_si128(Destination++, r4);
    _mm_store_si128(Destination++, r3);
}
Run Code Online (Sandbox Code Playgroud)
更新6:为了完整起见,这是将32位转换回8位灰度图像的逆方法.
/*
 * Packs 32-bit pixels back into 8-bit grey by keeping byte 0 of every pixel.
 *
 * Each pass reads four aligned 16-byte vectors (16 pixels) from Source and
 * writes one 16-byte vector to Destination with a streaming store, so both
 * pointers must be 16-byte aligned. Count / 64 passes are executed.
 * NOTE(review): Count therefore appears to be the source size in bytes -
 * confirm against the callers.
 */
static void ConvertRgb32ToGrey(const __m128i* Source, __m128i* Destination, unsigned int Count) {
    // Row k selects bytes 0, 4, 8 and 12 of its input into output bytes 4k..4k+3;
    // 0x80 entries make the shuffle write zero so the four rows can be OR-ed together.
    static const unsigned char __declspec(align(64)) Masks[] = {
        0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c,
    };
    const __m128i* const maskRows = (const __m128i*) Masks;
    const __m128i pick0 = _mm_load_si128(maskRows + 0);
    const __m128i pick1 = _mm_load_si128(maskRows + 1);
    const __m128i pick2 = _mm_load_si128(maskRows + 2);
    const __m128i pick3 = _mm_load_si128(maskRows + 3);

    const unsigned int passes = Count / 64;
    const __m128i* in = Source;
    __m128i* out = Destination;
    for (unsigned int pass = 0; pass < passes; ++pass, in += 4, ++out) {
        const __m128i q0 = _mm_shuffle_epi8(_mm_load_si128(in + 0), pick0);
        const __m128i q1 = _mm_shuffle_epi8(_mm_load_si128(in + 1), pick1);
        const __m128i q2 = _mm_shuffle_epi8(_mm_load_si128(in + 2), pick2);
        const __m128i q3 = _mm_shuffle_epi8(_mm_load_si128(in + 3), pick3);
        _mm_stream_si128(out, _mm_or_si128(_mm_or_si128(q0, q1), _mm_or_si128(q2, q3)));
    }
}
Run Code Online (Sandbox Code Playgroud)
会尝试:
__asm {
    // Expands 16 grey pixels per pass into 16 BGRA pixels (B = G = R = grey, A = 0).
    // The load for the next group is issued at the end of the current pass so its
    // latency overlaps the loop overhead. The last group is peeled out of the loop
    // so that this preload never reads past the end of the source.
    // Register roles: esi = source cursor, edi = destination cursor,
    // ecx = 16-pixel groups still to load, xmm4-xmm7 = PSHUFB masks.
    mov     esi, Source
    mov     edi, Destination
    mov     ecx, Count
    movdqa  xmm4, xmmword ptr [Masks + 0]
    movdqa  xmm5, xmmword ptr [Masks + 16]
    movdqa  xmm6, xmmword ptr [Masks + 32]
    movdqa  xmm7, xmmword ptr [Masks + 48]
    shr     ecx, 4                          // Count is in pixels; one pass consumes 16
    jz      finish                          // nothing to convert
    movdqu  xmm0, xmmword ptr [esi]         // preload group 0
    dec     ecx
    jz      last_group                      // only one group: skip the pipelined loop
l1:
    lea     edi, [edi + 64]
    lea     esi, [esi + 16]
    movdqa  xmm1, xmm0
    movdqa  xmm2, xmm0
    movdqa  xmm3, xmm0
    pshufb  xmm0, xmm4
    pshufb  xmm1, xmm5
    pshufb  xmm2, xmm6
    pshufb  xmm3, xmm7
    movntdq [edi - 64], xmm0
    movntdq [edi - 48], xmm1
    movntdq [edi - 32], xmm2
    movntdq [edi - 16], xmm3
    movdqu  xmm0, xmmword ptr [esi]         // preload the next group (always in range here)
    dec     ecx
    jnz     l1                              // jnz, not ja: DEC leaves CF untouched, so ja would test a stale carry
last_group:
    movdqa  xmm1, xmm0
    movdqa  xmm2, xmm0
    movdqa  xmm3, xmm0
    pshufb  xmm0, xmm4
    pshufb  xmm1, xmm5
    pshufb  xmm2, xmm6
    pshufb  xmm3, xmm7
    movntdq [edi + 0], xmm0
    movntdq [edi + 16], xmm1
    movntdq [edi + 32], xmm2
    movntdq [edi + 48], xmm3
finish:
    sfence                                  // order the weakly-ordered non-temporal stores before later stores
}
但我还没有对它做基准测试.这些改动背后的假设是:
1. movdqu xmm0,... 的加载延迟可以在循环内被更好地隐藏(在你的代码中,加载xmm0之后紧接着就是使用该寄存器值的指令).
2. 对两个寄存器的add操作以及cmp其实并非都是必需的;可以改用地址生成(lea)以及dec/ja的隐式零测试.这样,由于循环中唯一的ALU操作是递减循环计数器,对ecx/esi/edi的操作就不会产生EFLAGS依赖.(注意:dec不会修改CF,而ja同时检查CF和ZF,因此严格来说这里应当使用jnz.)
最后,这段代码在任何情况下都很可能受限于加载/存储带宽,因此算术运算基本上是"免费"的;所以即使有上述论据,我预计差别也不会很大.
如果输入很大,那么把"未对齐的头/尾"单独剥离出来处理是有意义的,即对开头/末尾的[0..15]字节使用Duff's device处理,主循环则使用movdqa.
编辑:
用gcc -msse4.2 -O8 -c(GCC 4.7.1)编译内在函数(intrinsics)版本的源码,得到如下汇编:
# objdump listing of the GCC build of the intrinsics version.
# Registers read on entry: rdi (copied to rax and used as the load pointer),
# rsi (store pointer) and edx (loop bound) - presumably Source, Destination
# and Count; confirm against the calling convention of the build.
Disassembly of section .text:

0000000000000000 <ConvertGreyToRgb32Assembler>:
# Skip everything when edx is zero.
   0: 85 d2                         test edx,edx
   2: 74 76                         je 7a <ConvertGreyToRgb32Assembler+0x7a>
# Load the four shuffle masks into xmm5, xmm4, xmm3, xmm2. The displacements
# are zero in this listing, presumably because the object file is not linked.
   4: 66 0f 6f 2d 00 00 00 00       movdqa xmm5,XMMWORD PTR [rip+0x0] # c <ConvertGreyToRgb32Assembler+0xc>
   c: 48 89 f8                      mov rax,rdi
   f: 66 0f 6f 25 00 00 00 00       movdqa xmm4,XMMWORD PTR [rip+0x0] # 17 <ConvertGreyToRgb32Assembler+0x17>
  17: 66 0f 6f 1d 00 00 00 00       movdqa xmm3,XMMWORD PTR [rip+0x0] # 1f <ConvertGreyToRgb32Assembler+0x1f>
  1f: 66 0f 6f 15 00 00 00 00       movdqa xmm2,XMMWORD PTR [rip+0x0] # 27 <ConvertGreyToRgb32Assembler+0x27>
# Multi-byte nop: the loop head that follows lands on address 0x30.
  27: 66 0f 1f 84 00 00 00 00 00    nop WORD PTR [rax+rax*1+0x0]
# Loop body: one unaligned 16-byte load, four shuffles, four non-temporal stores.
  30: f3 0f 6f 00                   movdqu xmm0,XMMWORD PTR [rax]
  34: 48 89 f1                      mov rcx,rsi
  37: 48 83 c0 10                   add rax,0x10
  3b: 66 0f 6f c8                   movdqa xmm1,xmm0
  3f: 66 0f 38 00 cd                pshufb xmm1,xmm5
  44: 66 0f e7 0e                   movntdq XMMWORD PTR [rsi],xmm1
  48: 66 0f 6f c8                   movdqa xmm1,xmm0
  4c: 66 0f 38 00 cc                pshufb xmm1,xmm4
  51: 66 0f e7 4e 10                movntdq XMMWORD PTR [rsi+0x10],xmm1
  56: 66 0f 6f c8                   movdqa xmm1,xmm0
  5a: 66 0f 38 00 c2                pshufb xmm0,xmm2
  5f: 66 0f 38 00 cb                pshufb xmm1,xmm3
  64: 66 0f e7 4e 20                movntdq XMMWORD PTR [rsi+0x20],xmm1
  69: 66 0f e7 41 30                movntdq XMMWORD PTR [rcx+0x30],xmm0
# Loop test: ecx = eax - edi is the number of source bytes consumed so far;
# repeat while edx is (unsigned) greater than that.
  6e: 89 c1                         mov ecx,eax
  70: 29 f9                         sub ecx,edi
  72: 48 83 c6 40                   add rsi,0x40
  76: 39 ca                         cmp edx,ecx
  78: 77 b6                         ja 30 <ConvertGreyToRgb32Assembler+0x30>
  7a: f3 c3                         repz ret
这让我强烈地联想到你最初的汇编代码.如果MSVC生成的代码比这更糟糕,我认为那是你所用编译器(版本)的缺陷或局限.
归档时间: |
|
查看次数: |
3135 次 |
最近记录: |