自定义循环更快的原因是什么?坏编译器?不安全的自定义代码?运气?(幸运缓存命中)

hus*_*sik 1 c++ optimization assembly

我刚刚开始学习汇编并制作一些自定义循环,用于使用C++的asm {}体交换两个变量,使用C-Free 5.0中的Digital-Mars编译器

启用-o(优化)

并得到了结果:

 time of for-loop(cycles)        844
 time of while-loop(cycles)      735
 time of custom-loop-1(cycles)   562
 time of custom-loop-2(cycles)   469
Run Code Online (Sandbox Code Playgroud)

我无法找到Digital-Mars编译器"asm output"选项进行比较.构建选项中没有其他优化选项.我应该改变我的编译器吗?如果是的话,哪一个?你能看一下下面的代码并告诉我为什么自定义循环更快?

这是循环的标准:

t1=clock(); 
for(int i=0;i<200000000;i++)
{
    temp=a;//instruction 1
    a=b;//instruction 2
    b=temp;//3 instructions total   
}   
t2=clock();
printf("\n time of for-loop(increasing) %i  \n",(t2-t1));
Run Code Online (Sandbox Code Playgroud)

这是标准的while循环:

t1=clock();
while(j<200000000)
{
    temp=a;//again it is three instructions
    a=b;
    b=temp; 
            j++;
}
t2=clock();
printf("\n time of while-loop(cycles)  %i  \n",(t2-t1));
Run Code Online (Sandbox Code Playgroud)

这是我的自定义循环1:

t1=clock();
j=200000000;//setting the count
    __asm
    {
        pushf           //backup
        push eax        //backup
        push ebx        //backup
        push ecx        //backup
        push edx        //backup

        mov ecx,0       //init of loop range(0 to 200000000)
        mov edx,j

        do_it_again:    //begin to loop


        mov eax,a       //basic swap steps between cpu and mem(cache)
        mov ebx,b       
        mov b,eax       
        mov a,ebx       //four instructions total

        inc ecx         // j++
        cmp ecx,edx     //i<200000000  ?
        jb do_it_again  // end of loop block

        pop edx     //rolling back to history   
        pop ecx         
        pop ebx         
        pop eax         
        popf            
    }

t2=clock();
printf("\n time of custom-loop-1(cycles)   %i   \n",(t2-t1));
Run Code Online (Sandbox Code Playgroud)

这是我的第二个自定义循环:

t1=clock();
j=200000000;//setting the count
    __asm
    {
        pushf           //backup
        push eax        
        push ebx        
        push ecx        
        push edx        

        mov ecx,0       //init of loop range(0 to 200000000)
        mov edx,j

        mov eax,a       //getting variables to registers
        mov ebx,b

        do_it_again2:   //begin to loop

        //swapping with using only 2 variables(only in cpu)
        sub eax,ebx         //a is now a-b
        add ebx,eax         //b is now a
        sub eax,ebx         //a is now -b
        xor eax,80000000h   //a is now b and four instructions total

        inc ecx         // j++
        cmp ecx,edx     //i<200000000  ?
        jb do_it_again2  // end of loop block

        pop edx         //rollback
        pop ecx         
        pop ebx         
        pop eax         
        popf            
    }

t2=clock();
printf("\n time of custom-loop-2(cycles)  %i   \n",(t2-t1));
Run Code Online (Sandbox Code Playgroud)

完整代码:

#include<stdio.h>
#include<stdlib.h>
#include<time.h>

int main()
{
int j=0;

int a=0,b=0,temp=0;

srand(time(0));
time_t t1=0;
time_t t2=0;


t1=clock(); 
for(int i=0;i<200000000;i++)
{
    temp=a;//instruction 1
    a=b;//instruction 2
    b=temp;//3 instructions total   
}   
t2=clock();
printf("\n time of for-loop(cycles) %i  \n",(t2-t1));


t1=clock();
while(j<200000000)
{
    temp=a;//again it is three instructions
    a=b;
    b=temp; 
    j++;
}
t2=clock();
printf("\n time of while-loop(cycles)  %i  \n",(t2-t1));


t1=clock();
j=200000000;//setting the count
    __asm
    {
        pushf           //backup
        push eax        //backup
        push ebx        //backup
        push ecx        //backup
        push edx        //backup

        mov ecx,0       //init of loop range(0 to 200000000)
        mov edx,j

        do_it_again:    //begin to loop


        mov eax,a       //basic swap steps between cpu and mem(cache)
        mov ebx,b       
        mov b,eax       
        mov a,ebx       //four instructions total

        inc ecx         // j++
        cmp ecx,edx     //i<200000000  ?
        jb do_it_again  // end of loop block

        pop edx     //rolling back to history   
        pop ecx         
        pop ebx         
        pop eax         
        popf            
    }

t2=clock();
printf("\n time of custom-loop-1(cycles)   %i   \n",(t2-t1));


t1=clock();
j=200000000;//setting the count
    __asm
    {
        pushf           //backup
        push eax        
        push ebx        
        push ecx        
        push edx        

        mov ecx,0       //init of loop range(0 to 200000000)
        mov edx,j

        mov eax,a       //getting variables to registers
        mov ebx,b

        do_it_again2:   //begin to loop

        //swapping with using only 2 variables(only in cpu)
        sub eax,ebx         //a is now a-b
        add ebx,eax         //b is now a
        sub eax,ebx         //a is now -b
        xor eax,80000000h   //a is now b and four instructions total

        inc ecx         // j++
        cmp ecx,edx     //i<200000000  ?
        jb do_it_again2  // end of loop block

        pop edx         //rollback
        pop ecx         
        pop ebx         
        pop eax         
        popf            
    }

t2=clock();
printf("\n time of custom-loop-2(cycles)  %i   \n",(t2-t1));

return 0;

}
Run Code Online (Sandbox Code Playgroud)

我只是学习c ++和汇编,并想知道事情是怎么回事.谢谢

windows xp,奔腾4(2 GHz)数字火星在C-Free中

Jer*_*fin 5

在没有看到它创建的汇编语言结果的情况下,有点难以猜测编译器可能正在做什么.使用VC++ 10,我得到以下结果:

time of for-loop(cycles) 155

time of while-loop(cycles)  158

time of custom-loop-1(cycles)   369

time of custom-loop-2(cycles)  314
Run Code Online (Sandbox Code Playgroud)

我没有看输出,但我的直接猜测是forwhile循环之间的区别只是噪音.两者显然比你手写的汇编代码快一点.

编辑:查看汇编代码,我是对的 - 代码for和它while是完全相同的.它看起来像这样:

        call    _clock
        mov     ecx, DWORD PTR _a$[ebp]
        cdq
        mov     ebx, edx
        mov     edx, DWORD PTR _b$[ebp]
        mov     edi, eax
        mov     esi, 200000000
$LL2@main:
; Line 28
        dec     esi
; Line 30
        mov     eax, ecx
; Line 31
        mov     ecx, edx
; Line 32
        mov     edx, eax
        jne     SHORT $LL2@main
        mov     DWORD PTR _b$[ebp], edx
        mov     DWORD PTR _a$[ebp], ecx
; Line 35
        call    _clock
Run Code Online (Sandbox Code Playgroud)

虽然可以说比第二个循环更"聪明",但现代CPU倾向于使用简单的代码做得最好.它在循环中只有较少的指令(并且根本不引用循环内的内存).这些并不是衡量效率的唯一手段,但通过这种简单的循环,它们是相当具有指示性的.

编辑2:

只是为了好玩,我写了一个新版本,它增加了三重XOR交换,以及一个使用CPU xchg指令(因为如果我不太关心速度等,我可能会手动编写它).虽然英特尔/ AMD通常建议不要使用更复杂的指令,但它似乎没有引起问题 - 它似乎至少与其他任何东西一样快:

 time of for-loop(cycles) 156

 time of while-loop(cycles)  160

 time swap between register and cache  284

 time to swap using add/sub:  308

 time to swap using xchg:  155

 time to swap using triple-xor  233
Run Code Online (Sandbox Code Playgroud)

资源:

// Note: updated source -- it was just too ugly to live. Same results though.
#include<stdlib.h>
#include<time.h>
#include <iostream>
#include <string>
#include <iomanip>
#include <sstream>

namespace { 
    int a, b;
    const int loops = 200000000;
}

template <class swapper>
struct timer {
    timer(std::string const &label) { 
        clock_t t1 = clock();
        swapper()();
        clock_t t2 = clock();
        std::ostringstream buffer;
        buffer << "Time for swap using " << label;
        std::cout << std::left << std::setw(30) << buffer.str() << " = " << (t2-t1) << "\n";
    }
};

struct for_loop {
    void operator()() {
        int temp;
        for(int i=0;i<loops;i++) {
            temp=a;//instruction 1
            a=b;//instruction 2
            b=temp;//3 instructions total   
        }
    }
};

struct while_loop {
    void operator()() { 
        int j = 0;
        int temp;
        while(j<loops) {
            temp=a;//again it is three instructions
            a=b;
            b=temp; 
            j++;
        }
    }
};

struct reg_mem {
    void operator()() {
        int j=loops;//setting the count
        __asm {
            mov ecx,0       //init of loop range(0 to 200000000)
            mov edx,j
    do_it_again:    //begin to loop
            mov eax,a       //basic swap steps between cpu and mem(cache)
            mov ebx,b       
            mov b,eax       
            mov a,ebx       //four instructions total

            inc ecx         // j++
            cmp ecx,edx     //i<200000000  ?
            jb do_it_again  // end of loop block
        }
    }
};

struct add_sub {
    void operator()() { 
        int j=loops;//setting the count
        __asm {
            mov ecx,0       //init of loop range(0 to 200000000)
            mov edx,j

            mov eax,a       //getting variables to registers
            mov ebx,b

    do_it_again2:   //begin to loop

            //swapping with using only 2 variables(only in cpu)
            sub eax,ebx         //a is now a-b
            add ebx,eax         //b is now a
            sub eax,ebx         //a is now -b
            xor eax,80000000h   //a is now b and four instructions total

            inc ecx         // j++
            cmp ecx,edx     //i<200000000  ?
            jb do_it_again2  // end of loop block

            mov a, eax
            mov b, ebx
        }
    }
};

struct xchg {
    void operator()() {
        __asm {
            mov ecx, loops
            mov eax, a
            mov ebx, b
    do_it_again3:
            dec ecx
            xchg eax, ebx
            jne do_it_again3
            mov a, eax
            mov b, ebx
        }
    }
};

struct xor3 {
    void operator()() { 
        _asm { 
            mov ecx, loops
            mov eax, a
            mov edx, b
    do_swap4:
            xor eax, edx
            xor edx, eax
            xor eax, edx
            dec ecx
            jnz do_swap4

            mov a, eax
            mov b, edx
        }
    }
};

int main() {
    timer<for_loop>("for loop");
    timer<while_loop>("while loop");
    timer<reg_mem>("reg<->mem");
    timer<add_sub>("add/sub");
    timer<xchg>("xchg");
    timer<xor3>("triple xor");
    return 0;
}
Run Code Online (Sandbox Code Playgroud)

一句话:至少对于这项微不足道的任务而言,你不会在足够关心的情况下打败一个不错的编译器(可能根本不会,除了可能的微小代码).


Dan*_*zar 5

该编译器生成的代码非常糟糕.在反汇编目标文件之后objconv,这就是我对第一个for循环的看法.

?_001:  cmp     dword [ebp-4H], 200000000               ; 0053 _ 81. 7D, FC, 0BEBC200
        jge     ?_002                                   ; 005A _ 7D, 17
        inc     dword [ebp-4H]                          ; 005C _ FF. 45, FC
        mov     eax, dword [ebp-18H]                    ; 005F _ 8B. 45, E8
        mov     dword [ebp-10H], eax                    ; 0062 _ 89. 45, F0
        mov     eax, dword [ebp-14H]                    ; 0065 _ 8B. 45, EC
        mov     dword [ebp-18H], eax                    ; 0068 _ 89. 45, E8
        mov     eax, dword [ebp-10H]                    ; 006B _ 8B. 45, F0
        mov     dword [ebp-14H], eax                    ; 006E _ 89. 45, EC
        jmp     ?_001                                   ; 0071 _ EB, E0
Run Code Online (Sandbox Code Playgroud)

任何看过一些集会的人都应该清楚这些问题.

  1. 循环非常依赖于放入的值eax.由于每个下一条指令在该寄存器上创建的依赖性,这使得任何乱序执行几乎不可能.

  2. 有可用的6个通用寄存器(因为ebpesp没有真正通用的大部分设置的),但是你的编译器使用没有他们,回落到使用本地栈.当速度是优化目标时,这是绝对不可接受的.我们甚至可以看到当前循环索引存储在[ebp-4H],而它可以很容易地存储在寄存器中.

  3. cmp指令使用内存和立即操作数.这是操作数的最慢混合,并且在性能受到威胁时不应该使用.

  4. 并且不要让我开始使用代码大小.这些说明中有一半是不必要的.

总而言之,我要做的第一件事就是尽早放弃编译器.但话说回来,看到它提供"记忆模型"作为其选择之一,人们似乎真的没有太多希望.

  • 是的,对齐可能是一个问题 - 但在这种情况下,问题在于一个糟糕的编译器,它似乎不知道除累加器之外的任何其他寄存器. (3认同)
  • 请阅读[英特尔基础架构手册](http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol- 1-manual.pdf)再次运行编译器/汇编器之前. (3认同)