Joh*_*ith 19 c optimization assembly sse x86-64
I am trying to compute the dot product between a float vector and a bit vector in the most efficient way on an i7. In reality I do this on 128- or 256-dimensional vectors, but for exposition, let me write 64-dimensional code to illustrate the problem:
// a has 64 elements. b is a bitvector of 64 dimensions.
float dot(float *restrict a, uint64_t b) {
    float sum = 0;
    for(int i=0; b && i<64; i++, b>>=1) {
        if (b & 1) sum += a[i];
    }
    return sum;
}
Of course this works, but the problem is that it is the hot spot of the whole program (it takes 95% of the CPU time of a 50-minute run), so I desperately need to make it faster.
My guess is that the branch above is the game killer (it prevents out-of-order execution and causes bad branch prediction). I'm not sure whether vector instructions could be used and would help here. Using gcc 4.8 with -std=c99 -march=native -mtune=native -Ofast -funroll-loops, I currently get this output:
movl $4660, %edx
movl $5, %ecx
xorps %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L4:
testb $1, %cl
je .L2
addss (%rdx), %xmm0
.L2:
leaq 4(%rdx), %rax
shrq %rcx
testb $1, %cl
je .L8
addss 4(%rdx), %xmm0
.L8:
shrq %rcx
testb $1, %cl
je .L9
addss 4(%rax), %xmm0
.L9:
shrq %rcx
testb $1, %cl
je .L10
addss 8(%rax), %xmm0
.L10:
shrq %rcx
testb $1, %cl
je .L11
addss 12(%rax), %xmm0
.L11:
shrq %rcx
testb $1, %cl
je .L12
addss 16(%rax), %xmm0
.L12:
shrq %rcx
testb $1, %cl
je .L13
addss 20(%rax), %xmm0
.L13:
shrq %rcx
testb $1, %cl
je .L14
addss 24(%rax), %xmm0
.L14:
leaq 28(%rax), %rdx
shrq %rcx
cmpq $4916, %rdx
jne .L4
ret
Edit: The data can be permuted (as long as the permutation is the same for all arguments); the ordering doesn't matter.
I'd like to know whether there is something that can run at 3x the speed of Chris Dodd's SSE2 code.
New note: AVX/AVX2 code is also welcome!
Edit 2: Given one bit vector, I have to multiply it with 128 (or 256, if it is 256 bits) different float vectors (so more than a single float vector may be involved at a time). This is the whole process. Anything that speeds up the whole process is also welcome!
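For illustration only, here is a minimal sketch of what that outer step might look like with the 64-dimensional exposition above (dot_batch, the shapes and names are my assumptions, not the actual code):

// A minimal sketch of the surrounding batch described in Edit 2 (assumed
// shape, hypothetical name): one shared bit vector applied to many
// different float vectors, reusing dot() from above.
void dot_batch(float (*restrict vecs)[64], // e.g. 64 vectors of 64 floats
               uint64_t b,                 // the single shared bit vector
               float *restrict out) {
    for (int v = 0; v < 64; v++)
        out[v] = dot(vecs[v], b);
}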
Chr*_*odd 16
Your best bet is going to be the SSE ps instructions, which operate on 4 floats at a time. You can take advantage of the fact that a float 0.0 is all 0 bits and use an andps instruction to mask off the unwanted elements:
#include <stdint.h>
#include <xmmintrin.h>

union {
    uint32_t i[4];
    __m128   xmm;
} mask[16] = {
    {  0,  0,  0,  0 },
    { ~0,  0,  0,  0 },
    {  0, ~0,  0,  0 },
    { ~0, ~0,  0,  0 },
    {  0,  0, ~0,  0 },
    { ~0,  0, ~0,  0 },
    {  0, ~0, ~0,  0 },
    { ~0, ~0, ~0,  0 },
    {  0,  0,  0, ~0 },
    { ~0,  0,  0, ~0 },
    {  0, ~0,  0, ~0 },
    { ~0, ~0,  0, ~0 },
    {  0,  0, ~0, ~0 },
    { ~0,  0, ~0, ~0 },
    {  0, ~0, ~0, ~0 },
    { ~0, ~0, ~0, ~0 },
};

float dot(__m128 *a, uint64_t b) {
    __m128 sum = { 0.0 };
    for (int i = 0; i < 16; i++, b >>= 4)
        sum += _mm_and_ps(a[i], mask[b & 0xf].xmm);
    return sum[0] + sum[1] + sum[2] + sum[3];
}
If you expect the mask to contain a lot of 0s, it might be faster to short-circuit the zeros:
for (int i = 0; b; i++, b >>= 4)
    if (b & 0xf)
        sum += _mm_and_ps(a[i], mask[b&0xf].xmm);
but if b is random, this will be slower.
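(Note that sum += on an __m128 and the sum[0] indexing above rely on GCC/Clang vector extensions.) Here is a minimal usage sketch, not part of the answer and with a hypothetical wrapper name, showing how a plain 64-float array can be fed to this routine:

#include <stdint.h>
#include <xmmintrin.h>

float dot(__m128 *a, uint64_t b);               /* the routine above */

float dot64(const float *restrict a64, uint64_t b){
    __m128 blocks[16];                          /* 16 blocks of 4 floats */
    for (int i = 0; i < 16; i++)
        blocks[i] = _mm_loadu_ps(a64 + 4*i);    /* copy into 16-byte-aligned blocks */
    return dot(blocks, b);
}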
To expand on Aki Suihkonen's answer, reshaping the bitstring is helpful for conditionally moving floats. In the solution below, a two-stage bit permutation using the SSE instructions PMOVMASKB and PSHUFB, plus the instruction BLENDVPS, is used to achieve 1.25 elements handled per cycle on a Core 2 Duo 2.26 GHz, which is 20 times the speed of my reference C code.
[EDIT: An AVX2 implementation was added. Performance is unknown because I cannot test it myself, but is expected to be double the speed. ]
Here are my implementation and test bench; the explanation follows.
/* Includes */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <smmintrin.h> /* SSE 4.1 */
#include <time.h>
/* Defines */
#define ALIGNTO(n) __attribute__((aligned(n)))
#define USE_PINSRW 1
#define NUM_ITERS 2260000
/**
* Bit mask shuffle.
*
* This version uses a loop to store eight u16 and reloads them as one __m128i.
*/
__m128 bitMaskShuffleStoreAndReload(__m128i mask){
    const __m128i perm ALIGNTO(16) = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4,
                                                  11, 3, 10, 2, 9, 1, 8, 0);
    int i;
    uint16_t interMask[8] ALIGNTO(16);

    /* Shuffle bitmask */
    /* Stage 1 */
    for(i=7;i>=0;i--){
        interMask[i] = _mm_movemask_epi8(mask);
        mask = _mm_slli_epi32(mask, 1);
    }

    /* Stage 2 */
    return _mm_castsi128_ps(
               _mm_shuffle_epi8(
                   _mm_load_si128((const __m128i*)interMask),
                   perm)
           );
}
/**
* Bit mask shuffle.
*
* This version uses the PINSRW instruction.
*/
__m128 bitMaskShufflePINSRW(__m128i mask){
    const __m128i perm ALIGNTO(16) = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4,
                                                  11, 3, 10, 2, 9, 1, 8, 0);
    __m128i imask ALIGNTO(16);

    /* Shuffle bitmask */
    /* Stage 1 */
    imask = _mm_insert_epi16(imask, _mm_movemask_epi8(mask), 7);
    mask = _mm_slli_epi16(mask, 1);
    imask = _mm_insert_epi16(imask, _mm_movemask_epi8(mask), 6);
    mask = _mm_slli_epi16(mask, 1);
    imask = _mm_insert_epi16(imask, _mm_movemask_epi8(mask), 5);
    mask = _mm_slli_epi16(mask, 1);
    imask = _mm_insert_epi16(imask, _mm_movemask_epi8(mask), 4);
    mask = _mm_slli_epi16(mask, 1);
    imask = _mm_insert_epi16(imask, _mm_movemask_epi8(mask), 3);
    mask = _mm_slli_epi16(mask, 1);
    imask = _mm_insert_epi16(imask, _mm_movemask_epi8(mask), 2);
    mask = _mm_slli_epi16(mask, 1);
    imask = _mm_insert_epi16(imask, _mm_movemask_epi8(mask), 1);
    mask = _mm_slli_epi16(mask, 1);
    imask = _mm_insert_epi16(imask, _mm_movemask_epi8(mask), 0);

    /* Stage 2 */
    return _mm_castsi128_ps(
               _mm_shuffle_epi8(
                   imask,
                   perm)
           );
}
/**
* SSE 4.1 implementation.
*/
float dotSSE41(__m128 f[32], unsigned char maskArg[16]){
    int i, j, k;
    __m128i mask ALIGNTO(16) = _mm_load_si128((const __m128i*)maskArg);
    __m128 shufdMask ALIGNTO(16);
    __m128 zblended ALIGNTO(16);
    __m128 sums ALIGNTO(16) = _mm_setzero_ps();
    float sumsf[4] ALIGNTO(16);

    /* Shuffle bitmask */
#if USE_PINSRW
    shufdMask = bitMaskShufflePINSRW(mask);
#else
    shufdMask = bitMaskShuffleStoreAndReload(mask);
#endif

    /* Dot product */
    for(i=1;i>=0;i--){
        for(j=1;j>=0;j--){
            for(k=7;k>=0;k--){
                zblended = _mm_setzero_ps();
                zblended = _mm_blendv_ps(zblended, f[i*16+j+k*2], shufdMask);
                sums = _mm_add_ps(sums, zblended);
                shufdMask = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(shufdMask), 1));
            }
        }
    }

    /* Final Summation */
    _mm_store_ps(sumsf, sums);
    return sumsf[0] + sumsf[1] + sumsf[2] + sumsf[3];
}
/**
* Reference C implementation
*/
float dotRefC(float f[128], unsigned char mask[16]){
    float sum = 0.0;
    int i;

    for(i=0;i<128;i++){
        sum += ((mask[i>>3]>>(i&7))&1) ? f[i] : 0.0;
    }

    return sum;
}
/**
* Main
*/
int main(void){
    /* Variables */
    /* Loop Counter */
    int i;

    /* Data to process */
    float data[128] ALIGNTO(16);
    unsigned char mask[16] ALIGNTO(16);
    float refCSum, sseSum;

    /* Time tracking */
    clock_t t1, t2, t3;
    double refCTime, sseTime;

    /* Initialize mask and float arrays with some random data. */
    for(i=0;i<128;i++){
        if(i<16)
            mask[i]=rand();
        data[i] = random();
    }

    /* RUN TESTS */
    t1 = clock();
    for(i=0;i<NUM_ITERS;i++){
        refCSum = dotRefC(data, mask);
    }
    t2 = clock();
    for(i=0;i<NUM_ITERS;i++){
        sseSum = dotSSE41((__m128*)data, mask);
    }
    t3 = clock();

    /* Compute time taken */
    refCTime = (double)(t2-t1)/CLOCKS_PER_SEC;
    sseTime = (double)(t3-t2)/CLOCKS_PER_SEC;

    /* Print out results */
    printf("Results:\n"
           "RefC: Time: %f Value: %f\n"
           "SSE: Time: %f Value: %f\n",
           refCTime, refCSum,
           sseTime, sseSum);

    return 0;
}
BLENDVPS uses the top bit in all four 32-bit lanes of the 128-bit register XMM0 to determine whether to move or not to move the value in the corresponding lane of its source operand into its destination operand. When loading data with MOVAPS, one gets 4 consecutive floats: For instance, the 8th, 9th, 10th and 11th floats. Of course, their selection or deselection must be controlled by the corresponding set of bits: For instance, the 8th, 9th, 10th and 11th bits in the bit string.
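As a small standalone illustration of this (my own addition, not part of the answer's code), the following shows that _mm_blendv_ps selects each lane purely on bit 31 of its control operand and ignores the lower bits:

#include <stdio.h>
#include <smmintrin.h>   /* SSE4.1 */

int main(void){
    __m128 src  = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    __m128 zero = _mm_setzero_ps();
    /* Lanes 0 and 2 have bit 31 set; lanes 1 and 3 do not. */
    __m128 ctl  = _mm_castsi128_ps(_mm_set_epi32((int)0x7fffffff, (int)0x80000000,
                                                 (int)0x00000001, (int)0x80000000));
    float out[4];
    _mm_storeu_ps(out, _mm_blendv_ps(zero, src, ctl));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* prints 1 0 3 0 */
    return 0;
}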
The problem is that when the mask is first loaded, the bits of these sets are right beside each other (in the 8th, 9th, 10th and 11th positions), when in fact they should be 32 bits apart; remember, at some point they will have to occupy the 31st bit position of each lane (the 31st, 63rd, 95th and 127th bit positions within the XMM0 register).
Ideally, there would be a bit transposition that puts bits 0, 4, 8, 12, ... in lane 0, bits 1, 5, 9, 13, ... in lane 1, bits 2, 6, 10, 14, ... in lane 2, and bits 3, 7, 11, 15, ... in lane 3. Every set of 4 previously-consecutive bits would then span 32 bits, one bit in each of the four 32-bit lanes. All that would be needed afterwards is a loop that iterates 32 times, each time shifting a new set of 4 bits into the top bit position of each lane.
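To make that target layout concrete, here is a scalar sketch (my own addition, not in the answer; the name is hypothetical) of what the ideal transposition would compute:

#include <stdint.h>

/* Reference-only sketch of the ideal transposition described above:
 * bit i of the 128-bit mask ends up in lane (i % 4) at bit position
 * (i / 4), so 4 consecutive bits land 32 bits apart. */
static void idealTranspose(const uint8_t mask[16], uint32_t lane[4]){
    for(int l=0;l<4;l++){
        uint32_t v = 0;
        for(int j=0;j<32;j++){
            int i = 4*j + l;                               /* source bit index     */
            v |= (uint32_t)((mask[i>>3]>>(i&7))&1) << j;   /* destination position */
        }
        lane[l] = v;
    }
}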
Unfortunately, x86 does not have good bit-manipulation instructions, so for lack of a terse way of doing this perfect transposition, a reasonable compromise is the following.
In the mask, the 128 bits
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
are permuted, using eight PMOVMASKB and eight PSLLW instructions, first to
0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120
1 9 17 25 33 41 49 57 65 73 81 89 97 105 113 121
2 10 18 26 34 42 50 58 66 74 82 90 98 106 114 122
3 11 19 27 35 43 51 59 67 75 83 91 99 107 115 123
4 12 20 28 36 44 52 60 68 76 84 92 100 108 116 124
5 13 21 29 37 45 53 61 69 77 85 93 101 109 117 125
6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 126
7 15 23 31 39 47 55 63 71 79 87 95 103 111 119 127
and then, with a single PSHUFB instruction, to
0 8 16 24 32 40 48 56 4 12 20 28 36 44 52 60
64 72 80 88 96 104 112 120 68 76 84 92 100 108 116 124
1 9 17 25 33 41 49 57 5 13 21 29 37 45 53 61
65 73 81 89 97 105 113 121 69 77 85 93 101 109 117 125
2 10 18 26 34 42 50 58 6 14 22 30 38 46 54 62
66 74 82 90 98 106 114 122 70 78 86 94 102 110 118 126
3 11 19 27 35 43 51 59 7 15 23 31 39 47 55 63
67 75 83 91 99 107 115 123 71 79 87 95 103 111 119 127
. We now iterate over four "runs", each run containing eight sets of 4 bits spaced 32 bits apart (just as we wanted), and use those sets as the mask control for BLENDVPS. The inherent awkwardness of the bit shuffle leads to the clumsy-looking triple-nested loop in dotSSE41(), but with
clang -Ofast -ftree-vectorize -finline-functions -funroll-loops -msse4.1 -mssse3 dot.c -o dottest
Clang unrolls the loop anyway. The innermost loop body consists of 16 repetitions of:
blendvps 0x90(%rsi),%xmm1
addps %xmm4,%xmm1
pslld $0x1,%xmm2
movdqa %xmm2,%xmm0
xorps %xmm4,%xmm4
Incidentally, I could not determine which of my two shuffle versions is fastest, so I have given both implementations in this answer. Below is the AVX2 implementation, followed by its test bench.
/* Includes */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <immintrin.h> /* AVX2 */
#include <time.h>
/* Defines */
#define ALIGNTO(n) __attribute__((aligned(n)))
#define NUM_ITERS 2260000
/**
* Bit mask shuffle.
*
* This version uses the PINSRW instruction.
*/
__m256 bitMaskShufflePINSRW(__m256i mask){
    __m256i imask ALIGNTO(32);

    /* Shuffle bitmask */
    imask = _mm256_insert_epi32(imask, _mm256_movemask_epi8(mask), 7);
    mask = _mm256_slli_epi32(mask, 1);
    imask = _mm256_insert_epi32(imask, _mm256_movemask_epi8(mask), 6);
    mask = _mm256_slli_epi32(mask, 1);
    imask = _mm256_insert_epi32(imask, _mm256_movemask_epi8(mask), 5);
    mask = _mm256_slli_epi32(mask, 1);
    imask = _mm256_insert_epi32(imask, _mm256_movemask_epi8(mask), 4);
    mask = _mm256_slli_epi32(mask, 1);
    imask = _mm256_insert_epi32(imask, _mm256_movemask_epi8(mask), 3);
    mask = _mm256_slli_epi32(mask, 1);
    imask = _mm256_insert_epi32(imask, _mm256_movemask_epi8(mask), 2);
    mask = _mm256_slli_epi32(mask, 1);
    imask = _mm256_insert_epi32(imask, _mm256_movemask_epi8(mask), 1);
    mask = _mm256_slli_epi32(mask, 1);
    imask = _mm256_insert_epi32(imask, _mm256_movemask_epi8(mask), 0);

    /* Return bitmask */
    return _mm256_castsi256_ps(imask);
}
/**
* AVX2 implementation.
*/
float dotAVX2(__m256 f[16], unsigned char maskArg[16]){
    int i, j, k;
    /* Use _mm_loadu_si128 */
    __m256i mask ALIGNTO(32) = _mm256_castsi128_si256(_mm_load_si128((const __m128i*)maskArg));
    __m256 shufdMask ALIGNTO(32);
    __m256 zblended ALIGNTO(32);
    __m256 sums ALIGNTO(32) = _mm256_setzero_ps();
    float sumsf[8] ALIGNTO(32);

    /* Shuffle bitmask */
    shufdMask = bitMaskShufflePINSRW(mask);
    shufdMask = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(shufdMask), 16));

    /* Dot product */
    for(i=15;i>=0;i--){
        zblended = _mm256_setzero_ps();
        /* Replace f[i] with _mm256_loadu_ps((float*)&f[i]) */
        zblended = _mm256_blendv_ps(zblended, f[i], shufdMask);
        sums = _mm256_add_ps(sums, zblended);
        shufdMask = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(shufdMask), 1));
    }

    /* Final Summation */
    _mm256_store_ps(sumsf, sums);
    return sumsf[0] + sumsf[1] + sumsf[2] + sumsf[3] + sumsf[4] + sumsf[5] + sumsf[6] + sumsf[7];
}
/**
* Reference C implementation
*/
float dotRefC(float f[128], unsigned char mask[16]){
    float sum = 0.0;
    int i;

    for(i=0;i<128;i++){
        sum += ((mask[i>>3]>>(i&7))&1) ? f[i] : 0.0;
    }

    return sum;
}
/**
* Main
*/
int main(void){
    /* Variables */
    /* Loop Counter */
    int i;

    /* Data to process */
    float data[128] ALIGNTO(32);
    unsigned char mask[16] ALIGNTO(32);
    float refCSum, sseSum;

    /* Time tracking */
    clock_t t1, t2, t3;
    double refCTime, sseTime;

    /* Initialize mask and float arrays with some random data. */
    for(i=0;i<128;i++){
        if(i<16)
            mask[i]=rand();
        data[i] = random();
    }

    /* RUN TESTS */
    t1 = clock();
    for(i=0;i<NUM_ITERS;i++){
        refCSum = dotRefC(data, mask);
    }
    t2 = clock();
    for(i=0;i<NUM_ITERS;i++){
        sseSum = dotAVX2((__m256*)data, mask);
    }
    t3 = clock();

    /* Compute time taken */
    refCTime = (double)(t2-t1)/CLOCKS_PER_SEC;
    sseTime = (double)(t3-t2)/CLOCKS_PER_SEC;

    /* Print out results */
    printf("Results:\n"
           "RefC: Time: %f Value: %f\n"
           "SSE: Time: %f Value: %f\n",
           refCTime, refCSum,
           sseTime, sseSum);

    return 0;
}
The same concept as for SSE4.1 is used. The difference is that now we're processing 8 floats at a time and making use of AVX2's 256-bit registers and PMOVMASKB from ymm registers (which gather 256/8 = 32 bits). Because of this, we now have a simpler bitmask shuffle, and a much simpler loop.
In the mask, the 256 bits
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
are permuted, using 8 PMOVMASKB and 8 PSLLW instructions, to
0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120
128 136 144 152 160 168 176 184 192 200 208 216 224 232 240 248
1 9 17 25 33 41 49 57 65 73 81 89 97 105 113 121
129 137 145 153 161 169 177 185 193 201 209 217 225 233 241 249
2 10 18 26 34 42 50 58 66 74 82 90 98 106 114 122
130 138 146 154 162 170 178 186 194 202 210 218 226 234 242 250
3 11 19 27 35 43 51 59 67 75 83 91 99 107 115 123
131 139 147 155 163 171 179 187 195 203 211 219 227 235 243 251
4 12 20 28 36 44 52 60 68 76 84 92 100 108 116 124
132 140 148 156 164 172 180 188 196 204 212 220 228 236 244 252
5 13 21 29 37 45 53 61 69 77 85 93 101 109 117 125
133 141 149 157 165 173 181 189 197 205 213 221 229 237 245 253
6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 126
134 142 150 158 166 174 182 190 198 206 214 222 230 238 246 254
7 15 23 31 39 47 55 63 71 79 87 95 103 111 119 127
135 143 151 159 167 175 183 191 199 207 215 223 231 239 247 255
. For 128-element bit-with-float dot-products, we then iterate in parallel on eight sets of 16 elements. This implementation can easily be extended for 256-element DPs by iterating on 32-element sets. Only one loop is required now.
Specifically, to change this to work for 256-element dot products, you would (a combined sketch follows this list):
- Double the size of the function arguments: __m256 f[32], unsigned char maskArg[32].
- Replace the mask load statement (= _mm256_castsi128_si256(_mm_load_si128((const __m128i*)maskArg));) with = _mm256_load_si256((const __m256i*)maskArg);.
- Delete the compensating shift left by 16 just below the call to bitMaskShufflePINSRW.
- Run the loop down from set 31 instead of 15: for(i=31;i>=0;i--).
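Putting those changes together, an untested sketch of the 256-element variant might look like this (dotAVX2_256 is a hypothetical name; it reuses ALIGNTO and the AVX2 bitMaskShufflePINSRW above):

float dotAVX2_256(__m256 f[32], unsigned char maskArg[32]){
    int i;
    __m256i mask ALIGNTO(32) = _mm256_load_si256((const __m256i*)maskArg);
    __m256 shufdMask ALIGNTO(32);
    __m256 zblended ALIGNTO(32);
    __m256 sums ALIGNTO(32) = _mm256_setzero_ps();
    float sumsf[8] ALIGNTO(32);

    /* Full 256-bit mask load; no compensating shift by 16 is needed here. */
    shufdMask = bitMaskShufflePINSRW(mask);

    /* 32 sets of 8 elements instead of 16. */
    for(i=31;i>=0;i--){
        zblended = _mm256_setzero_ps();
        zblended = _mm256_blendv_ps(zblended, f[i], shufdMask);
        sums = _mm256_add_ps(sums, zblended);
        shufdMask = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(shufdMask), 1));
    }

    _mm256_store_ps(sumsf, sums);
    return sumsf[0] + sumsf[1] + sumsf[2] + sumsf[3] +
           sumsf[4] + sumsf[5] + sumsf[6] + sumsf[7];
}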
I can neither test nor even run this code, as my CPU only has SSE4.1, but Clang with
clang -Ofast -ftree-vectorize -finline-functions -funroll-loops -mavx2 -msse4.1 -mssse3 dotavx2.c -o dottest
compiled cleanly (you may get faster code without unrolling), producing this:
(gdb) disas dotAVX2
Dump of assembler code for function dotAVX2:
0x0000000100001730 <+0>: push %rbp
0x0000000100001731 <+1>: mov %rsp,%rbp
0x0000000100001734 <+4>: vmovdqa (%rsi),%xmm0
0x0000000100001738 <+8>: vpslld $0x1,%ymm0,%ymm1
0x000000010000173d <+13>: vpslld $0x1,%ymm1,%ymm2
0x0000000100001742 <+18>: vpmovmskb %ymm2,%eax
0x0000000100001746 <+22>: vpslld $0x1,%ymm2,%ymm2
0x000000010000174b <+27>: vpmovmskb %ymm2,%ecx
0x000000010000174f <+31>: vxorps %ymm3,%ymm3,%ymm3
0x0000000100001753 <+35>: vmovd %ecx,%xmm4
0x0000000100001757 <+39>: vpinsrd $0x1,%eax,%xmm4,%xmm4
0x000000010000175d <+45>: vpmovmskb %ymm1,%eax
0x0000000100001761 <+49>: vpinsrd $0x2,%eax,%xmm4,%xmm1
0x0000000100001767 <+55>: vpslld $0x1,%ymm2,%ymm2
0x000000010000176c <+60>: vpslld $0x1,%ymm2,%ymm4
0x0000000100001771 <+65>: vpslld $0x1,%ymm4,%ymm5
0x0000000100001776 <+70>: vpmovmskb %ymm0,%eax
0x000000010000177a <+74>: vpinsrd $0x3,%eax,%xmm1,%xmm0
0x0000000100001780 <+80>: vpmovmskb %ymm5,%eax
0x0000000100001784 <+84>: vpslld $0x1,%ymm5,%ymm1
0x0000000100001789 <+89>: vpmovmskb %ymm1,%ecx
0x000000010000178d <+93>: vmovd %ecx,%xmm1
0x0000000100001791 <+97>: vpinsrd $0x1,%eax,%xmm1,%xmm1
0x0000000100001797 <+103>: vpmovmskb %ymm4,%eax
0x000000010000179b <+107>: vpinsrd $0x2,%eax,%xmm1,%xmm1
0x00000001000017a1 <+113>: vpmovmskb %ymm2,%eax
0x00000001000017a5 <+117>: vpinsrd $0x3,%eax,%xmm1,%xmm1
0x00000001000017ab <+123>: vinserti128 $0x1,%xmm0,%ymm1,%ymm0
0x00000001000017b1 <+129>: vpslld $0x10,%ymm0,%ymm0
0x00000001000017b6 <+134>: vblendvps %ymm0,0x1e0(%rdi),%ymm3,%ymm1
0x00000001000017c0 <+144>: vpslld $0x1,%ymm0,%ymm0
0x00000001000017c5 <+149>: vblendvps %ymm0,0x1c0(%rdi),%ymm3,%ymm2
0x00000001000017cf <+159>: vaddps %ymm2,%ymm1,%ymm1
0x00000001000017d3 <+163>: vpslld $0x1,%ymm0,%ymm0
0x00000001000017d8 <+168>: vblendvps %ymm0,0x1a0(%rdi),%ymm3,%ymm2
0x00000001000017e2 <+178>: vaddps %ymm2,%ymm1,%ymm1
0x00000001000017e6 <+182>: vpslld $0x1,%ymm0,%ymm0
0x00000001000017eb <+187>: vblendvps %ymm0,0x180(%rdi),%ymm3,%ymm2
0x00000001000017f5 <+197>: vaddps %ymm2,%ymm1,%ymm1
0x00000001000017f9 <+201>: vpslld $0x1,%ymm0,%ymm0
0x00000001000017fe <+206>: vblendvps %ymm0,0x160(%rdi),%ymm3,%ymm2
0x0000000100001808 <+216>: vaddps %ymm2,%ymm1,%ymm1
0x000000010000180c <+220>: vpslld $0x1,%ymm0,%ymm0
0x0000000100001811 <+225>: vblendvps %ymm0,0x140(%rdi),%ymm3,%ymm2
0x000000010000181b <+235>: vaddps %ymm2,%ymm1,%ymm1
0x000000010000181f <+239>: vpslld $0x1,%ymm0,%ymm0
0x0000000100001824 <+244>: vblendvps %ymm0,0x120(%rdi),%ymm3,%ymm2
0x000000010000182e <+254>: vaddps %ymm2,%ymm1,%ymm1
0x0000000100001832 <+258>: vpslld $0x1,%ymm0,%ymm0
0x0000000100001837 <+263>: vblendvps %ymm0,0x100(%rdi),%ymm3,%ymm2
0x0000000100001841 <+273>: vaddps %ymm2,%ymm1,%ymm1
0x0000000100001845 <+277>: vpslld $0x1,%ymm0,%ymm0
0x000000010000184a <+282>: vblendvps %ymm0,0xe0(%rdi),%ymm3,%ymm2
0x0000000100001854 <+292>: vaddps %ymm2,%ymm1,%ymm1
0x0000000100001858 <+296>: vpslld $0x1,%ymm0,%ymm0
0x000000010000185d <+301>: vblendvps %ymm0,0xc0(%rdi),%ymm3,%ymm2
0x0000000100001867 <+311>: vaddps %ymm2,%ymm1,%ymm1
0x000000010000186b <+315>: vpslld $0x1,%ymm0,%ymm0
0x0000000100001870 <+320>: vblendvps %ymm0,0xa0(%rdi),%ymm3,%ymm2
0x000000010000187a <+330>: vaddps %ymm2,%ymm1,%ymm1
0x000000010000187e <+334>: vpslld $0x1,%ymm0,%ymm0
0x0000000100001883 <+339>: vblendvps %ymm0,0x80(%rdi),%ymm3,%ymm2
0x000000010000188d <+349>: vaddps %ymm2,%ymm1,%ymm1
0x0000000100001891 <+353>: vpslld $0x1,%ymm0,%ymm0
0x0000000100001896 <+358>: vblendvps %ymm0,0x60(%rdi),%ymm3,%ymm2
0x000000010000189d <+365>: vaddps %ymm2,%ymm1,%ymm1
0x00000001000018a1 <+369>: vpslld $0x1,%ymm0,%ymm0
0x00000001000018a6 <+374>: vblendvps %ymm0,0x40(%rdi),%ymm3,%ymm2
0x00000001000018ad <+381>: vaddps %ymm2,%ymm1,%ymm1
0x00000001000018b1 <+385>: vpslld $0x1,%ymm0,%ymm0
0x00000001000018b6 <+390>: vblendvps %ymm0,0x20(%rdi),%ymm3,%ymm2
0x00000001000018bd <+397>: vaddps %ymm2,%ymm1,%ymm1
0x00000001000018c1 <+401>: vpslld $0x1,%ymm0,%ymm0
0x00000001000018c6 <+406>: vblendvps %ymm0,(%rdi),%ymm3,%ymm0
0x00000001000018cc <+412>: vaddps %ymm0,%ymm1,%ymm0
0x00000001000018d0 <+416>: vpshufd $0x1,%xmm0,%xmm1
0x00000001000018d5 <+421>: vaddss %xmm1,%xmm0,%xmm1
0x00000001000018d9 <+425>: vmovhlps %xmm0,%xmm0,%xmm2
0x00000001000018dd <+429>: vaddss %xmm1,%xmm2,%xmm1
0x00000001000018e1 <+433>: vpshufd $0x3,%xmm0,%xmm2
0x00000001000018e6 <+438>: vaddss %xmm1,%xmm2,%xmm1
0x00000001000018ea <+442>: vextracti128 $0x1,%ymm0,%xmm0
0x00000001000018f0 <+448>: vaddss %xmm1,%xmm0,%xmm1
0x00000001000018f4 <+452>: vpshufd $0x1,%xmm0,%xmm2
0x00000001000018f9 <+457>: vaddss %xmm1,%xmm2,%xmm1
0x00000001000018fd <+461>: vpshufd $0x3,%xmm0,%xmm2
0x0000000100001902 <+466>: vmovhlps %xmm0,%xmm0,%xmm0
0x0000000100001906 <+470>: vaddss %xmm1,%xmm0,%xmm0
0x000000010000190a <+474>: vaddss %xmm0,%xmm2,%xmm0
0x000000010000190e <+478>: pop %rbp
0x000000010000190f <+479>: vzeroupper
0x0000000100001912 <+482>: retq
End of assembler dump.
As we can see, the kernel is 3 instructions (vblendvps, vaddps, vpslld) now.
Here are some things to try.
Try getting the compiler to use CMOV instead of a branch. (Note that using a union this way is well-defined in C11 but undefined in C++11.)
union {
    int i;
    float f;
} u;
u.i = 0;
if (b & 1) {
    u.f = a[i];
}
sum += u.f;
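A sketch of the whole loop with this trick dropped in (my own wrapping of the snippet above, with a hypothetical function name, not the answer's exact code):

#include <stdint.h>

/* Every iteration does the same work; the branch only guards a register
 * move that the compiler can hopefully turn into a CMOV. */
float dot_cmov(const float *restrict a, uint64_t b){
    float sum = 0;
    for(int i=0; i<64; i++, b>>=1){
        union { uint32_t i; float f; } u;
        u.i = 0;
        if (b & 1) u.f = a[i];
        sum += u.f;
    }
    return sum;
}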
Use a multiplication instead of a branch.
sum += (b & 1) * a[i];
Keep several sums and add them together at the end to reduce data-flow dependencies. (You could combine either of the above suggestions with this one.)
float sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
for (int i = 0; i < 64; i += 4, b >>= 4) {
    if (b & 1) sum0 += a[i];
    if (b & 2) sum1 += a[i+1];
    if (b & 4) sum2 += a[i+2];
    if (b & 8) sum3 += a[i+3];
}
return sum0 + sum1 + sum2 + sum3;
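For example, combining the multiplication trick with four accumulators might look like this (my own combination of the suggestions above; dot_mul4 is a hypothetical name):

#include <stdint.h>

/* Branchless body plus four independent partial sums. */
float dot_mul4(const float *restrict a, uint64_t b){
    float sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
    for (int i = 0; i < 64; i += 4, b >>= 4) {
        sum0 += (float)( b       & 1) * a[i];
        sum1 += (float)((b >> 1) & 1) * a[i+1];
        sum2 += (float)((b >> 2) & 1) * a[i+2];
        sum3 += (float)((b >> 3) & 1) * a[i+3];
    }
    return (sum0 + sum1) + (sum2 + sum3);
}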
Reduce the number of branches by handling several bits at a time:
for (int i = 0; i < 64; i += 4, b >>= 4) {
    switch (b & 0xf) {
      case 0:
        break;
      case 1:
        sum += a[i];
        break;
      case 2:
        sum += a[i + 1];
        break;
      case 3:
        sum += a[i] + a[i+1];
        break;
      case 4:
        sum += a[i+2];
        break;
      // etc. for cases up to and including 15
    }
}
You can keep several sums, and for each sum handle several bits at a time. In that case, you would probably want to use a macro or an inline function and invoke it four times.
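A sketch of that last idea (my own reading of the suggestion; sum4 and dot_switch4 are hypothetical names): an inline helper handles 4 bits with the switch technique, and it is invoked four times per iteration so four partial sums proceed independently.

#include <stdint.h>

/* Sum of the elements of a[0..3] selected by the low 4 bits. */
static inline float sum4(const float *a, unsigned bits){
    switch (bits & 0xf) {
      case 0:  return 0.0f;
      case 1:  return a[0];
      case 2:  return a[1];
      case 3:  return a[0] + a[1];
      case 4:  return a[2];
      case 5:  return a[0] + a[2];
      case 6:  return a[1] + a[2];
      case 7:  return a[0] + a[1] + a[2];
      case 8:  return a[3];
      case 9:  return a[0] + a[3];
      case 10: return a[1] + a[3];
      case 11: return a[0] + a[1] + a[3];
      case 12: return a[2] + a[3];
      case 13: return a[0] + a[2] + a[3];
      case 14: return a[1] + a[2] + a[3];
      default: return a[0] + a[1] + a[2] + a[3]; /* case 15 */
    }
}

float dot_switch4(const float *restrict a, uint64_t b){
    float sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
    for (int i = 0; i < 64; i += 16, b >>= 16) {
        sum0 += sum4(a + i,      (unsigned)(b      ));
        sum1 += sum4(a + i + 4,  (unsigned)(b >>  4));
        sum2 += sum4(a + i + 8,  (unsigned)(b >>  8));
        sum3 += sum4(a + i + 12, (unsigned)(b >> 12));
    }
    return (sum0 + sum1) + (sum2 + sum3);
}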