use*_*112 6 c++ arrays cpu optimization performance
我有700个项目,每个循环700项,我获得项目的三个属性,并执行一些基本的计算.我使用两种技术实现了这个:
1)三个700元素阵列,三个属性中的每一个都有一个阵列.所以:
item0.a = array1[0]
item0.b = array2[0]
item0.e = array3[0]
Run Code Online (Sandbox Code Playgroud)
2)一个2100个元素的数组,连续包含三个属性的数据.所以:
item0.a = array[(0*3)+0]
item0.b = array[(0*3)+1]
item0.e = array[(0*3)+2]
Run Code Online (Sandbox Code Playgroud)
现在是三个项目属性a,b并且e在循环中一起使用 - 因此,如果将它们存储在一个数组中,性能应该比使用三阵列技术(由于空间局部性)更好.然而:
以下是2100阵列技术的代码:
unsigned int x;
unsigned int y;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array[2100];
//I have left out code for simplicity. You can assume by now the array is populated.
start = __rdtscp(&x);
for(int i=0; i < 700; i++){
unsigned short j = i * 3;
unsigned int a = array[j + 0];
unsigned int b = array[j + 1];
data_for_all_items = data_for_all_items & (a!= -1 & b != -1);
unsigned int e = array[j + 2];
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
Run Code Online (Sandbox Code Playgroud)
这是三个700元素数组技术的代码:
unsigned int x;
unsigned int y;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array1[700];
unsigned int array2[700];
unsigned int array3[700];
//I have left out code for simplicity. You can assume by now the arrays are populated.
start = __rdtscp(&x);
for(int i=0; i < 700; i++){
unsigned int a= array1[i]; //Array 1
unsigned int b= array2[i]; //Array 2
data_for_all_items = data_for_all_items & (a!= -1 & b != -1);
unsigned int e = array3[i]; //Array 3
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
Run Code Online (Sandbox Code Playgroud)
为什么不使用one-2100元素阵列的技术更快?这应该是因为每700个项目一起使用这三个属性.
我使用了MSVC 2012,Win 7 64
3x 700元素阵列技术的装配:
start = __rdtscp(&x);
rdtscp
shl rdx,20h
lea r8,[this]
or rax,rdx
mov dword ptr [r8],ecx
mov r8d,8ch
mov r9,rax
lea rdx,[rbx+0Ch]
for(int i=0; i < 700; i++){
sub rdi,rbx
unsigned int a = array1[i];
unsigned int b = array2[i];
data_for_all_items = data_for_all_items & (a != -1 & b != -1);
cmp dword ptr [rdi+rdx-0Ch],0FFFFFFFFh
lea rdx,[rdx+14h]
setne cl
cmp dword ptr [rdi+rdx-1Ch],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdi+rdx-18h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdi+rdx-10h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdi+rdx-14h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-20h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-1Ch],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-18h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-10h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-14h],0FFFFFFFFh
setne al
and cl,al
and r15b,cl
dec r8
jne 013F26DA53h
unsigned int e = array3[i];
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r8,[y]
or rax,rdx
mov dword ptr [r8],ecx
Run Code Online (Sandbox Code Playgroud)
2100元素阵列技术的汇编程序:
start = __rdtscp(&x);
rdtscp
lea r8,[this]
shl rdx,20h
or rax,rdx
mov dword ptr [r8],ecx
for(int i=0; i < 700; i++){
xor r8d,r8d
mov r10,rax
unsigned short j = i*3;
movzx ecx,r8w
add cx,cx
lea edx,[rcx+r8]
unsigned int a = array[j + 0];
unsigned int b = array[j + 1];
data_for_all_items = data_for_all_items & (best_ask != -1 & best_bid != -1);
movzx ecx,dx
cmp dword ptr [r9+rcx*4+4],0FFFFFFFFh
setne dl
cmp dword ptr [r9+rcx*4],0FFFFFFFFh
setne al
inc r8d
and dl,al
and r14b,dl
cmp r8d,2BCh
jl 013F05DA10h
unsigned int e = array[pos + 2];
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r8,[y]
or rax,rdx
mov dword ptr [r8],ecx
Run Code Online (Sandbox Code Playgroud)
“空间局部性”的概念让你有点困惑。对于这两种解决方案,您的处理器可能会尽力缓存阵列。
不幸的是,使用一个数组的代码版本还执行了一些额外的数学运算。这可能是您额外的周期所花费的地方。
| 归档时间: |
|
| 查看次数: |
148 次 |
| 最近记录: |