hus*_*sik 1 c++ optimization assembly
我刚刚开始学习汇编并制作一些自定义循环,用于使用C++的asm {}体交换两个变量,使用C-Free 5.0中的Digital-Mars编译器
启用-o(优化)
并得到了结果:
time of for-loop(cycles) 844
time of while-loop(cycles) 735
time of custom-loop-1(cycles) 562
time of custom-loop-2(cycles) 469
Run Code Online (Sandbox Code Playgroud)
我无法找到Digital-Mars编译器"asm output"选项进行比较.构建选项中没有其他优化选项.我应该改变我的编译器吗?如果是的话,哪一个?你能看一下下面的代码并告诉我为什么自定义循环更快?
这是循环的标准:
t1=clock();
for(int i=0;i<200000000;i++)
{
temp=a;//instruction 1
a=b;//instruction 2
b=temp;//3 instructions total
}
t2=clock();
printf("\n time of for-loop(increasing) %i \n",(t2-t1));
Run Code Online (Sandbox Code Playgroud)
这是标准的while循环:
t1=clock();
while(j<200000000)
{
temp=a;//again it is three instructions
a=b;
b=temp;
j++;
}
t2=clock();
printf("\n time of while-loop(cycles) %i \n",(t2-t1));
Run Code Online (Sandbox Code Playgroud)
这是我的自定义循环1:
t1=clock();
j=200000000;//setting the count
__asm
{
pushf //backup
push eax //backup
push ebx //backup
push ecx //backup
push edx //backup
mov ecx,0 //init of loop range(0 to 200000000)
mov edx,j
do_it_again: //begin to loop
mov eax,a //basic swap steps between cpu and mem(cache)
mov ebx,b
mov b,eax
mov a,ebx //four instructions total
inc ecx // j++
cmp ecx,edx //i<200000000 ?
jb do_it_again // end of loop block
pop edx //rolling back to history
pop ecx
pop ebx
pop eax
popf
}
t2=clock();
printf("\n time of custom-loop-1(cycles) %i \n",(t2-t1));
Run Code Online (Sandbox Code Playgroud)
这是我的第二个自定义循环:
t1=clock();
j=200000000;//setting the count
__asm
{
pushf //backup
push eax
push ebx
push ecx
push edx
mov ecx,0 //init of loop range(0 to 200000000)
mov edx,j
mov eax,a //getting variables to registers
mov ebx,b
do_it_again2: //begin to loop
//swapping with using only 2 variables(only in cpu)
sub eax,ebx //a is now a-b
add ebx,eax //b is now a
sub eax,ebx //a is now -b
xor eax,80000000h //a is now b and four instructions total
inc ecx // j++
cmp ecx,edx //i<200000000 ?
jb do_it_again2 // end of loop block
pop edx //rollback
pop ecx
pop ebx
pop eax
popf
}
t2=clock();
printf("\n time of custom-loop-2(cycles) %i \n",(t2-t1));
Run Code Online (Sandbox Code Playgroud)
完整代码:
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
int main()
{
int j=0;
int a=0,b=0,temp=0;
srand(time(0));
time_t t1=0;
time_t t2=0;
t1=clock();
for(int i=0;i<200000000;i++)
{
temp=a;//instruction 1
a=b;//instruction 2
b=temp;//3 instructions total
}
t2=clock();
printf("\n time of for-loop(cycles) %i \n",(t2-t1));
t1=clock();
while(j<200000000)
{
temp=a;//again it is three instructions
a=b;
b=temp;
j++;
}
t2=clock();
printf("\n time of while-loop(cycles) %i \n",(t2-t1));
t1=clock();
j=200000000;//setting the count
__asm
{
pushf //backup
push eax //backup
push ebx //backup
push ecx //backup
push edx //backup
mov ecx,0 //init of loop range(0 to 200000000)
mov edx,j
do_it_again: //begin to loop
mov eax,a //basic swap steps between cpu and mem(cache)
mov ebx,b
mov b,eax
mov a,ebx //four instructions total
inc ecx // j++
cmp ecx,edx //i<200000000 ?
jb do_it_again // end of loop block
pop edx //rolling back to history
pop ecx
pop ebx
pop eax
popf
}
t2=clock();
printf("\n time of custom-loop-1(cycles) %i \n",(t2-t1));
t1=clock();
j=200000000;//setting the count
__asm
{
pushf //backup
push eax
push ebx
push ecx
push edx
mov ecx,0 //init of loop range(0 to 200000000)
mov edx,j
mov eax,a //getting variables to registers
mov ebx,b
do_it_again2: //begin to loop
//swapping with using only 2 variables(only in cpu)
sub eax,ebx //a is now a-b
add ebx,eax //b is now a
sub eax,ebx //a is now -b
xor eax,80000000h //a is now b and four instructions total
inc ecx // j++
cmp ecx,edx //i<200000000 ?
jb do_it_again2 // end of loop block
pop edx //rollback
pop ecx
pop ebx
pop eax
popf
}
t2=clock();
printf("\n time of custom-loop-2(cycles) %i \n",(t2-t1));
return 0;
}
Run Code Online (Sandbox Code Playgroud)
我只是学习c ++和汇编,并想知道事情是怎么回事.谢谢
windows xp,奔腾4(2 GHz)数字火星在C-Free中
在没有看到它创建的汇编语言结果的情况下,有点难以猜测编译器可能正在做什么.使用VC++ 10,我得到以下结果:
time of for-loop(cycles) 155
time of while-loop(cycles) 158
time of custom-loop-1(cycles) 369
time of custom-loop-2(cycles) 314
Run Code Online (Sandbox Code Playgroud)
我没有看输出,但我的直接猜测是for和while循环之间的区别只是噪音.两者显然比你手写的汇编代码快一点.
编辑:查看汇编代码,我是对的 - 代码for和它while是完全相同的.它看起来像这样:
call _clock
mov ecx, DWORD PTR _a$[ebp]
cdq
mov ebx, edx
mov edx, DWORD PTR _b$[ebp]
mov edi, eax
mov esi, 200000000
$LL2@main:
; Line 28
dec esi
; Line 30
mov eax, ecx
; Line 31
mov ecx, edx
; Line 32
mov edx, eax
jne SHORT $LL2@main
mov DWORD PTR _b$[ebp], edx
mov DWORD PTR _a$[ebp], ecx
; Line 35
call _clock
Run Code Online (Sandbox Code Playgroud)
虽然可以说比第二个循环更"聪明",但现代CPU倾向于使用简单的代码做得最好.它在循环中只有较少的指令(并且根本不引用循环内的内存).这些并不是衡量效率的唯一手段,但通过这种简单的循环,它们是相当具有指示性的.
编辑2:
只是为了好玩,我写了一个新版本,它增加了三重XOR交换,以及一个使用CPU xchg指令(因为如果我不太关心速度等,我可能会手动编写它).虽然英特尔/ AMD通常建议不要使用更复杂的指令,但它似乎没有引起问题 - 它似乎至少与其他任何东西一样快:
time of for-loop(cycles) 156
time of while-loop(cycles) 160
time swap between register and cache 284
time to swap using add/sub: 308
time to swap using xchg: 155
time to swap using triple-xor 233
Run Code Online (Sandbox Code Playgroud)
资源:
// Note: updated source -- it was just too ugly to live. Same results though.
#include<stdlib.h>
#include<time.h>
#include <iostream>
#include <string>
#include <iomanip>
#include <sstream>
namespace {
int a, b;
const int loops = 200000000;
}
template <class swapper>
struct timer {
timer(std::string const &label) {
clock_t t1 = clock();
swapper()();
clock_t t2 = clock();
std::ostringstream buffer;
buffer << "Time for swap using " << label;
std::cout << std::left << std::setw(30) << buffer.str() << " = " << (t2-t1) << "\n";
}
};
struct for_loop {
void operator()() {
int temp;
for(int i=0;i<loops;i++) {
temp=a;//instruction 1
a=b;//instruction 2
b=temp;//3 instructions total
}
}
};
struct while_loop {
void operator()() {
int j = 0;
int temp;
while(j<loops) {
temp=a;//again it is three instructions
a=b;
b=temp;
j++;
}
}
};
struct reg_mem {
void operator()() {
int j=loops;//setting the count
__asm {
mov ecx,0 //init of loop range(0 to 200000000)
mov edx,j
do_it_again: //begin to loop
mov eax,a //basic swap steps between cpu and mem(cache)
mov ebx,b
mov b,eax
mov a,ebx //four instructions total
inc ecx // j++
cmp ecx,edx //i<200000000 ?
jb do_it_again // end of loop block
}
}
};
struct add_sub {
void operator()() {
int j=loops;//setting the count
__asm {
mov ecx,0 //init of loop range(0 to 200000000)
mov edx,j
mov eax,a //getting variables to registers
mov ebx,b
do_it_again2: //begin to loop
//swapping with using only 2 variables(only in cpu)
sub eax,ebx //a is now a-b
add ebx,eax //b is now a
sub eax,ebx //a is now -b
xor eax,80000000h //a is now b and four instructions total
inc ecx // j++
cmp ecx,edx //i<200000000 ?
jb do_it_again2 // end of loop block
mov a, eax
mov b, ebx
}
}
};
struct xchg {
void operator()() {
__asm {
mov ecx, loops
mov eax, a
mov ebx, b
do_it_again3:
dec ecx
xchg eax, ebx
jne do_it_again3
mov a, eax
mov b, ebx
}
}
};
struct xor3 {
void operator()() {
_asm {
mov ecx, loops
mov eax, a
mov edx, b
do_swap4:
xor eax, edx
xor edx, eax
xor eax, edx
dec ecx
jnz do_swap4
mov a, eax
mov b, edx
}
}
};
int main() {
timer<for_loop>("for loop");
timer<while_loop>("while loop");
timer<reg_mem>("reg<->mem");
timer<add_sub>("add/sub");
timer<xchg>("xchg");
timer<xor3>("triple xor");
return 0;
}
Run Code Online (Sandbox Code Playgroud)
一句话:至少对于这项微不足道的任务而言,你不会在足够关心的情况下打败一个不错的编译器(可能根本不会,除了可能的微小代码).
该编译器生成的代码非常糟糕.在反汇编目标文件之后objconv,这就是我对第一个for循环的看法.
?_001: cmp dword [ebp-4H], 200000000 ; 0053 _ 81. 7D, FC, 0BEBC200
jge ?_002 ; 005A _ 7D, 17
inc dword [ebp-4H] ; 005C _ FF. 45, FC
mov eax, dword [ebp-18H] ; 005F _ 8B. 45, E8
mov dword [ebp-10H], eax ; 0062 _ 89. 45, F0
mov eax, dword [ebp-14H] ; 0065 _ 8B. 45, EC
mov dword [ebp-18H], eax ; 0068 _ 89. 45, E8
mov eax, dword [ebp-10H] ; 006B _ 8B. 45, F0
mov dword [ebp-14H], eax ; 006E _ 89. 45, EC
jmp ?_001 ; 0071 _ EB, E0
Run Code Online (Sandbox Code Playgroud)
任何看过一些集会的人都应该清楚这些问题.
循环非常依赖于放入的值eax.由于每个下一条指令在该寄存器上创建的依赖性,这使得任何乱序执行几乎不可能.
有可用的6个通用寄存器(因为ebp并esp没有真正通用的大部分设置的),但是你的编译器使用没有他们,回落到使用本地栈.当速度是优化目标时,这是绝对不可接受的.我们甚至可以看到当前循环索引存储在[ebp-4H],而它可以很容易地存储在寄存器中.
该cmp指令使用内存和立即操作数.这是操作数的最慢混合,并且在性能受到威胁时不应该使用.
并且不要让我开始使用代码大小.这些说明中有一半是不必要的.
总而言之,我要做的第一件事就是尽早放弃编译器.但话说回来,看到它提供"记忆模型"作为其选择之一,人们似乎真的没有太多希望.