Cor*_*rey 14 c++ optimization inlining expression-templates
我已经完成了第一个数学库版本,下一步我想转向表达式模板来提高代码的性能.但是,我的初步结果与我的预期不同.我正在MSVC 2010中编译,在vanilla Release模式下(并且可以使用C++ 0x).
先为接下来要展示的大量代码道歉:在能让大家看清我在做什么的前提下,我已经尽量把它精简了。性能测试框架:
#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <iterator>
#include <limits>
#include <type_traits>
#include <vector>
namespace math
{
    class vector; // forward declaration only; the full definition is supplied separately

    // Writes the four components of `vec` to `stream`, each one followed by a
    // single space, and returns the stream so calls can be chained.
    std::ostream& operator<<(std::ostream& stream, const vector& vec)
    {
        std::size_t index = 0;
        while (index != 4)
        {
            stream << vec[index] << " ";
            ++index;
        }
        return stream;
    }
}
// test framework
// Three input sequences; test() reads element i of each and adds them.
typedef std::vector<math::vector> array_type[3];
// A single growable sequence of math::vector values (inputs and results).
typedef std::vector<math::vector> vector_type;
// Returns the next value of the C library's rand() sequence, converted to float.
float generate_float()
{
    const int raw = rand();
    return static_cast<float>(raw);
}
// Builds one math::vector from four consecutive generate_float() values.
//
// The draws are bound to named locals before the constructor call: the order
// in which function arguments are evaluated is unspecified, so calling
// generate_float() directly inside the argument list lets each compiler pick
// which draw ends up in x, y, z and w. Sequencing them here makes the
// generated data identical across compilers for the same rand() sequence.
math::vector generate_vector()
{
    const float x = generate_float();
    const float y = generate_float();
    const float z = generate_float();
    const float w = generate_float();
    return math::vector(x, y, z, w);
}
// Returns a sequence of `count` vectors, each produced by generate_vector().
// Capacity is reserved first, so appending `count` elements does not reallocate.
vector_type generate_source(std::size_t count)
{
    vector_type generated;
    generated.reserve(count);
    for (std::size_t produced = 0; produced != count; ++produced)
        generated.push_back(generate_vector());
    return generated;
}
// Runs one timed pass of the benchmark.
//
// For every i in [0, iterations) the i-th vectors of the three source
// sequences are added together and the sum is appended to `results`.
// The time measured with std::clock() around that loop is printed and
// returned, in seconds.
//
// Nothing here checks sizes: each source sequence must already hold at least
// `iterations` elements.
double test(const array_type& source,
            vector_type& results, std::size_t iterations)
{
    // start of the timed region
    std::clock_t begin = std::clock();
    for (std::size_t i = 0; i < iterations; ++i)
    {
        const math::vector& v0 = source[0][i];
        const math::vector& v1 = source[1][i];
        const math::vector& v2 = source[2][i];
        // the expression under test
        math::vector result(v0 + v1 + v2);
        results.push_back(result);
    }
    std::clock_t end = std::clock();
    // convert clock ticks to seconds, print, and hand the value back
    double elapsed = static_cast<double>(end - begin) / CLOCKS_PER_SEC;
    std::cout << "time: " << elapsed << "\n";
    return elapsed;
}
// Benchmark driver.
//
// Builds three input sequences of test_count vectors, runs a few warm-up
// passes whose timings are discarded, then records time_count timed passes.
// Finally it prints the sum and average of the recorded timings with one
// minimum and one maximum sample left out, plus the minimum and maximum.
int main()
{
    // prepare tests
    const std::size_t time_count = 50; // number of times to get time count
    const std::size_t test_count = 10000000; // number of iterations in a test
    std::cout << "allocating..." << std::endl;
    std::vector<double> timeResults; timeResults.reserve(time_count);
    array_type source;
    for (std::size_t i = 0; i < 3; ++i)
        source[i] = generate_source(test_count);
    vector_type results;
    results.reserve(test_count);
    // pre tests: time_count / 10 warm-up passes, timings thrown away below
    std::cout << "pre-testing..." << std::endl;
    for (std::size_t i = 0; i < time_count / 10; ++i)
    {
        timeResults.push_back(test(source, results, test_count));
        results.clear(); // keeps capacity, so later passes do not reallocate
    }
    timeResults.clear();
    // begin tests
    std::cout << "testing..." << std::endl;
    for (std::size_t i = 0; i < time_count; ++i)
    {
        timeResults.push_back(test(source, results, test_count));
        results.clear();
    }
    // can be turned into functors for non-C++0x, for testing in C++03
    double min = std::numeric_limits<double>::max();
    // numeric_limits<double>::min() is the smallest POSITIVE normalised value,
    // not the most negative one, so it is the wrong seed for a running
    // maximum: any sample at or below it (e.g. a pass timed as 0) would never
    // replace it. Seed with the most negative finite double instead.
    double max = -std::numeric_limits<double>::max();
    std::for_each(timeResults.begin(), timeResults.end(),
                    [&min, &max](double x)
                    {
                        min = std::min(x, min);
                        max = std::max(x, max);
                    });
    double sum = 0; // throws out max and min results
    bool minFlag = false, maxFlag = false;
    std::for_each(timeResults.begin(), timeResults.end(),
                    [min, max, &sum, &minFlag, &maxFlag](double x)
                    {
                        // drop exactly one sample equal to the minimum and
                        // one equal to the maximum; everything else is summed
                        if (!minFlag && x <= min)
                            minFlag = true; // skip
                        else if (!maxFlag && x >= max)
                            maxFlag = true; // skip
                        else
                            sum += x; // add
                    });
    // print results
    // two samples were dropped above, hence size() - 2
    double average = sum / (timeResults.size() - 2);
    std::cout << "\ntotal time: " << sum << " average time: " << average
                << "\n" << "min: " << min << " max: " << max << std::endl;
}
表达式模板版本的向量:
namespace math
{
    // Core expression template: the CRTP base shared by every vector-like
    // expression. E is the concrete expression type; get<I>() forwards to
    // E::get<I>() through a static downcast, so no virtual call is involved.
    template <typename E>
    class vector_expression
    {
    public:
        // Returns component I of the concrete expression.
        template <std::size_t I>
        float get() const
        {
            // `template` is required: get is a member template of the
            // dependent type E, and without the keyword a conforming compiler
            // parses `<` as the less-than operator.
            return static_cast<const E&>(*this).template get<I>();
        }
    protected:
        // Not a public base: objects are never destroyed through this type.
        // NOTE: with MSVC 2010 this user-provided (empty) destructor was
        // observed to stop operator+ from being inlined; removing it restored
        // inlining.
        ~vector_expression() {}
    };

    // Concrete four-component float vector.
    class vector : public vector_expression<vector>
    {
    public:
        // Zero-initialises all four components.
        vector()
        {
            data[0] = data[1] = data[2] = data[3] = 0;
        }

        // Initialises the components from x, y, z, w in that order.
        vector(float x, float y, float z, float w)
        {
            data[0] = x; data[1] = y; data[2] = z; data[3] = w;
        }

        // Evaluates an arbitrary expression component by component.
        // Intentionally implicit so `vector v = a + b;` works.
        template <typename E>
        vector(const vector_expression<E>& e)
        {
            evaluate<0>(e);
        }

        // Compile-time indexed component access.
        template <std::size_t I>
        float get() const
        {
            static_assert(I < 4, "vector component index out of range");
            return data[I];
        }

        // Run-time indexed component access; `index` is not range-checked.
        float operator[](std::size_t index) const
        {
            return data[index];
        }
    private:
        // Compile-time unrolled copy: stores component I, then recurses on
        // I + 1. Selected while I < 4.
        template <std::size_t I, typename E>
        void evaluate(const vector_expression<E>& e,
                        typename std::enable_if<(I < 4)>::type* = nullptr)
        {
            data[I] = e.template get<I>();
            evaluate<I + 1>(e);
        }

        // Recursion terminator, selected once I reaches 4.
        template <std::size_t I, typename E>
        void evaluate(const vector_expression<E>&,
                        typename std::enable_if<(I >= 4)>::type* = nullptr)
        {
            // done
        }

        float data[4];
    };

    // Lazy component-wise sum of two expressions.
    //
    // Both operands are held by reference, so a sum object must not outlive
    // them: consume it within the full expression that created it (as
    // `vector r(a + b + c);` does) rather than storing it in a variable.
    template <typename E1, typename E2>
    class vector_expression_sum :
        public vector_expression<vector_expression_sum<E1, E2>>
    {
    public:
        vector_expression_sum(const vector_expression<E1>& first,
                                const vector_expression<E2>& second) :
        mFirst(first),
        mSecond(second)
        {}

        // Component I of the sum, computed on demand.
        template <std::size_t I>
        float get() const
        {
            return mFirst.template get<I>() + mSecond.template get<I>();
        }
    private:
        const vector_expression<E1>& mFirst;
        const vector_expression<E2>& mSecond;
    };

    // Builds the lazy sum node; no arithmetic happens until a component is
    // requested (normally by vector's converting constructor).
    template <typename E1, typename E2>
    vector_expression_sum<E1, E2>
        operator+(const vector_expression<E1>& first,
                    const vector_expression<E2>& second)
    {
        return vector_expression_sum<E1, E2>(first, second);
    }
}
手动内联:
namespace math
{
    // same definition (elided in this snippet; presumably the math::vector
    // that provides get<I>(), since the code below calls it -- confirm)
}
// ...
// Manually inlined variant of the benchmark pass. Only the construction of
// `result` is shown; the parts marked "// ..." are elided in this snippet.
double test(const array_type& source,
            vector_type& results, std::size_t iterations)
{
    // ...
    {
        // ...
        // each component is summed by hand via get<I>() instead of going
        // through operator+ on whole vectors
        math::vector result(v0.get<0>() + v1.get<0>() + v2.get<0>(),
                            v0.get<1>() + v1.get<1>() + v2.get<1>(),
                            v0.get<2>() + v1.get<2>() + v2.get<2>(),
                            v0.get<3>() + v1.get<3>() + v2.get<3>());
        // ...
    }
    // ...
}
// ...
结果:
表达式模板:
总时间:14.172 平均时间:0.29525
最小值:0.281 最大值:0.422
手动内联:
总时间:8.438 平均时间:0.175792
最小值:0.171 最大值:0.188
正如您所看到的,表达式模板(显然)并没有被编译成完全内联的代码。下面是 test() 的反汇编,截至最后一次调用 std::clock() 为止:
表达式模板版本的汇编:
test:
00401110  push        ebp
00401111  mov         ebp,esp
00401113  sub         esp,38h
00401116  mov         eax,dword ptr [___security_cookie (404018h)]
0040111B  xor         eax,ebp
0040111D  mov         dword ptr [ebp-4],eax
00401120  push        ebx
00401121  push        esi
00401122  mov         esi,ecx
00401124  mov         dword ptr [ebp-28h],esi
00401127  call        dword ptr [__imp__clock (4030DCh)]
0040112D  xor         ebx,ebx
0040112F  mov         dword ptr [ebp-1Ch],eax
00401132  mov         dword ptr [ebp-24h],ebx
00401135  jmp         test+2Ah (40113Ah)
00401137  mov         esi,dword ptr [ebp-28h]
0040113A  mov         eax,dword ptr [esi+20h]
0040113D  mov         edx,dword ptr [esi+10h]
00401140  mov         ecx,dword ptr [esi]
00401142  add         eax,ebx
00401144  mov         dword ptr [ebp-18h],eax
00401147  add         edx,ebx
00401149  add         ecx,ebx
0040114B  lea         eax,[ebp-30h]
0040114E  call        math::operator+<math::vector,math::vector> (401E60h)
00401153  mov         edx,dword ptr [ebp-18h]
00401156  mov         ecx,eax
00401158  lea         eax,[ebp-38h]
0040115B  call        math::operator+<math::vector,math::vector> (401E60h)
00401160  mov         ecx,dword ptr [eax]
00401162  mov         edx,dword ptr [ecx+4]
00401165  fld         dword ptr [edx]
00401167  mov         edx,dword ptr [ecx]
00401169  fadd        dword ptr [edx]
0040116B  mov         eax,dword ptr [eax+4]
0040116E  mov         edx,dword ptr [ecx+4]
00401171  fstp        dword ptr [ebp-18h]
00401174  fld         dword ptr [ebp-18h]
00401177  fadd        dword ptr [eax]
00401179  fstp        dword ptr [ebp-14h]
0040117C  fld         dword ptr [edx+4]
0040117F  mov         edx,dword ptr [ecx]
00401181  fadd        dword ptr [edx+4]
00401184  mov         edx,dword ptr [ecx+4]
00401187  fstp        dword ptr [ebp-18h]
0040118A  fld         dword ptr [ebp-18h]
0040118D  fadd        dword ptr [eax+4]
00401190  fstp        dword ptr [ebp-10h]
00401193  fld         dword ptr [edx+8]
00401196  mov         edx,dword ptr [ecx]
00401198  fadd        dword ptr [edx+8]
0040119B  mov         edx,dword ptr [ecx+4]
0040119E  mov         ecx,dword ptr [ecx]
004011A0  fstp        dword ptr [ebp-18h]
004011A3  fld         dword ptr [ebp-18h]
004011A6  fadd        dword ptr [eax+8]
004011A9  fstp        dword ptr [ebp-0Ch]
004011AC  fld         dword ptr [edx+0Ch]
004011AF  lea         edx,[ebp-14h]
004011B2  fadd        dword ptr [ecx+0Ch]
004011B5  fstp        dword ptr [ebp-18h]
004011B8  fld         dword ptr [ebp-18h]
004011BB  fadd        dword ptr [eax+0Ch]
004011BE  mov         eax,dword ptr [edi+4]
004011C1  fstp        dword ptr [ebp-8]
004011C4  cmp         edx,eax
004011C6  jae         test+12Ch (40123Ch)
004011C8  mov         edx,dword ptr [edi]
004011CA  lea         ecx,[ebp-14h]
004011CD  cmp         edx,ecx
004011CF  ja          test+12Ch (40123Ch)
004011D1  mov         esi,ecx
004011D3  mov         ecx,dword ptr [edi+8]
004011D6  sub         esi,edx
004011D8  cmp         eax,ecx
004011DA  jne         test+10Bh (40121Bh)
004011DC  sub         eax,edx
004011DE  sar         eax,4
004011E1  cmp         eax,0FFFFFFEh
004011E6  ja          test+201h (401311h)
004011EC  sub         ecx,edx
004011EE  inc         eax
004011EF  sar         ecx,4
004011F2  cmp         eax,ecx
004011F4  jbe         test+10Bh (40121Bh)
004011F6  mov         edx,ecx
004011F8  shr         edx,1
004011FA  mov         ebx,0FFFFFFFh
004011FF  sub         ebx,edx
00401201  cmp         ebx,ecx
00401203  jae         test+0F9h (401209h)
00401205  xor         ecx,ecx
00401207  jmp         test+0FBh (40120Bh)
00401209  add         ecx,edx
0040120B  cmp         ecx,eax
0040120D  jae         test+101h (401211h)
0040120F  mov         ecx,eax
00401211  mov         edx,edi
00401213  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (401930h)
00401218  mov         ebx,dword ptr [ebp-24h]
0040121B  mov         eax,dword ptr [edi+4]
0040121E  and         esi,0FFFFFFF0h
00401221  add         esi,dword ptr [edi]
00401223  test        eax,eax
00401225  je          test+18Fh (40129Fh)
00401227  mov         edx,dword ptr [esi]
00401229  mov         dword ptr [eax],edx
0040122B  mov         ecx,dword ptr [esi+4]
0040122E  mov         dword ptr [eax+4],ecx
00401231  mov         edx,dword ptr [esi+8]
00401234  mov         dword ptr [eax+8],edx
00401237  mov         ecx,dword ptr [esi+0Ch]
0040123A  jmp         test+18Ch (40129Ch)
0040123C  mov         ecx,dword ptr [edi+8]
0040123F  cmp         eax,ecx
00401241  jne         test+171h (401281h)
00401243  mov         edx,dword ptr [edi]
00401245  sub         eax,edx
00401247  sar         eax,4
0040124A  cmp         eax,0FFFFFFEh
0040124F  ja          test+201h (401311h)
00401255  sub         ecx,edx
00401257  inc         eax
00401258  sar         ecx,4
0040125B  cmp         eax,ecx
0040125D  jbe         test+171h (401281h)
0040125F  mov         edx,ecx
00401261  shr         edx,1
00401263  mov         esi,0FFFFFFFh
00401268  sub         esi,edx
0040126A  cmp         esi,ecx
0040126C  jae         test+162h (401272h)
0040126E  xor         ecx,ecx
00401270  jmp         test+164h (401274h)
00401272  add         ecx,edx
00401274  cmp         ecx,eax
00401276  jae         test+16Ah (40127Ah)
00401278  mov         ecx,eax
0040127A  mov         edx,edi
0040127C  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (401930h)
00401281  mov         eax,dword ptr [edi+4]
00401284  test        eax,eax
00401286  je          test+18Fh (40129Fh)
00401288  mov         edx,dword ptr [ebp-14h]
0040128B  mov         ecx,dword ptr [ebp-10h]
0040128E  mov         dword ptr [eax],edx
00401290  mov         edx,dword ptr [ebp-0Ch]
00401293  mov         dword ptr [eax+4],ecx
00401296  mov         ecx,dword ptr [ebp-8]
00401299  mov         dword ptr [eax+8],edx
0040129C  mov         dword ptr [eax+0Ch],ecx
0040129F  add         dword ptr [edi+4],10h
004012A3  add         ebx,10h
004012A6  mov         dword ptr [ebp-24h],ebx
004012A9  cmp         ebx,9896800h
004012AF  jb          test+27h (401137h)
004012B5  call        dword ptr [__imp__clock (4030DCh)]
手动内联汇编:
test:
004010B0  push        ebp
004010B1  mov         ebp,esp
004010B3  sub         esp,28h
004010B6  mov         eax,dword ptr [___security_cookie (404018h)]
004010BB  xor         eax,ebp
004010BD  mov         dword ptr [ebp-4],eax
004010C0  push        ebx
004010C1  push        esi
004010C2  mov         esi,ecx
004010C4  mov         dword ptr [ebp-24h],esi
004010C7  call        dword ptr [__imp__clock (4030DCh)]
004010CD  xor         ebx,ebx
004010CF  mov         dword ptr [ebp-1Ch],eax
004010D2  mov         dword ptr [ebp-18h],ebx
004010D5  mov         eax,dword ptr [esi]
004010D7  mov         ecx,dword ptr [esi+10h]
004010DA  fld         dword ptr [eax+ebx]
004010DD  fadd        dword ptr [ecx+ebx]
004010E0  mov         edx,dword ptr [esi+20h]
004010E3  add         eax,ebx
004010E5  add         ecx,ebx
004010E7  fadd        dword ptr [edx+ebx]
004010EA  add         edx,ebx
004010EC  fstp        dword ptr [ebp-14h]
004010EF  fld         dword ptr [ecx+4]
004010F2  fadd        dword ptr [eax+4]
004010F5  fadd        dword ptr [edx+4]
004010F8  fstp        dword ptr [ebp-10h]
004010FB  fld         dword ptr [ecx+8]
004010FE  fadd        dword ptr [eax+8]
00401101  fadd        dword ptr [edx+8]
00401104  fstp        dword ptr [ebp-0Ch]
00401107  fld         dword ptr [ecx+0Ch]
0040110A  lea         ecx,[ebp-14h]
0040110D  fadd        dword ptr [eax+0Ch]
00401110  mov         eax,dword ptr [edi+4]
00401113  fadd        dword ptr [edx+0Ch]
00401116  fstp        dword ptr [ebp-8]
00401119  cmp         ecx,eax
0040111B  jae         test+0E4h (401194h)
0040111D  mov         edx,dword ptr [edi]
0040111F  cmp         edx,ecx
00401121  ja          test+0E4h (401194h)
00401123  mov         esi,ecx
00401125  mov         ecx,dword ptr [edi+8]
00401128  sub         esi,edx
0040112A  cmp         eax,ecx
0040112C  jne         test+0BDh (40116Dh)
0040112E  sub         eax,edx
00401130  sar         eax,4
00401133  cmp         eax,0FFFFFFEh
00401138  ja          test+1BCh (40126Ch)
0040113E  sub         ecx,edx
00401140  inc         eax
00401141  sar         ecx,4
00401144  cmp         eax,ecx
00401146  jbe         test+0BDh (40116Dh)
00401148  mov         edx,ecx
0040114A  shr         edx,1
0040114C  mov         ebx,0FFFFFFFh
00401151  sub         ebx,edx
00401153  cmp         ebx,ecx
00401155  jae         test+0ABh (40115Bh)
00401157  xor         ecx,ecx
00401159  jmp         test+0ADh (40115Dh)
0040115B  add         ecx,edx
0040115D  cmp         ecx,eax
0040115F  jae         test+0B3h (401163h)
00401161  mov         ecx,eax
00401163  mov         edx,edi
00401165  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (401890h)
0040116A  mov         ebx,dword ptr [ebp-18h]
0040116D  mov         eax,dword ptr [edi+4]
00401170  and         esi,0FFFFFFF0h
00401173  add         esi,dword ptr [edi]
00401175  test        eax,eax
00401177  je          test+0DFh (40118Fh)
00401179  mov         edx,dword ptr [esi]
0040117B  mov         dword ptr [eax],edx
0040117D  mov         ecx,dword ptr [esi+4]
00401180  mov         dword ptr [eax+4],ecx
00401183  mov         edx,dword ptr [esi+8]
00401186  mov         dword ptr [eax+8],edx
00401189  mov         ecx,dword ptr [esi+0Ch]
0040118C  mov         dword ptr [eax+0Ch],ecx
0040118F  mov         esi,dword ptr [ebp-24h]
00401192  jmp         test+14Ah (4011FAh)
00401194  mov         ecx,dword ptr [edi+8]
00401197  cmp         eax,ecx
00401199  jne         test+12Ch (4011DCh)
0040119B  mov         edx,dword ptr [edi]
0040119D  sub         eax,edx
0040119F  sar         eax,4
004011A2  cmp         eax,0FFFFFFEh
004011A7  ja          test+1BCh (40126Ch)
004011AD  sub         ecx,edx
004011AF  inc         eax
004011B0  sar         ecx,4
004011B3  cmp         eax,ecx
004011B5  jbe         test+12Ch (4011DCh)
004011B7  mov         edx,ecx
004011B9  shr         edx,1
004011BB  mov         esi,0FFFFFFFh
004011C0  sub         esi,edx
004011C2  cmp         esi,ecx
004011C4  jae         test+11Ah (4011CAh)
004011C6  xor         ecx,ecx
004011C8  jmp         test+11Ch (4011CCh)
004011CA  add         ecx,edx
004011CC  cmp         ecx,eax
004011CE  jae         test+122h (4011D2h)
004011D0  mov         ecx,eax
004011D2  mov         edx,edi
004011D4  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (401890h)
004011D9  mov         esi,dword ptr [ebp-24h]
004011DC  mov         eax,dword ptr [edi+4]
004011DF  test        eax,eax
004011E1  je          test+14Ah (4011FAh)
004011E3  mov         edx,dword ptr [ebp-14h]
004011E6  mov         ecx,dword ptr [ebp-10h]
004011E9  mov         dword ptr [eax],edx
004011EB  mov         edx,dword ptr [ebp-0Ch]
004011EE  mov         dword ptr [eax+4],ecx
004011F1  mov         ecx,dword ptr [ebp-8]
004011F4  mov         dword ptr [eax+8],edx
004011F7  mov         dword ptr [eax+0Ch],ecx
004011FA  add         dword ptr [edi+4],10h
004011FE  add         ebx,10h
00401201  mov         dword ptr [ebp-18h],ebx
00401204  cmp         ebx,9896800h
0040120A  jb          test+25h (4010D5h)
00401210  call        dword ptr [__imp__clock (4030DCh)]
结论:不知出于什么原因,MSVC 2010 没有内联对 operator+ 的调用。有谁知道这是为什么吗?即使加上 __forceinline(我本想避免使用它),也仍然不会内联。
更新:正如jdv-Jan de Vaan所提到的,当我删除析构函数时:
// ~vector_expression() {} // not a public base
编译器就会内联 operator+。奇怪的是,内联后生成的汇编与手动内联版本并不相同;我的测试表明,这份输出虽然比最初的版本快,但仍然达不到手动内联版本的水平。有人知道这是为什么吗?
00A710B0  push        ebp  
00A710B1  mov         ebp,esp  
00A710B3  sub         esp,28h  
00A710B6  mov         eax,dword ptr [___security_cookie (0A74018h)]  
00A710BB  xor         eax,ebp  
00A710BD  mov         dword ptr [ebp-4],eax  
00A710C0  push        ebx  
00A710C1  push        esi  
00A710C2  mov         esi,ecx  
00A710C4  mov         dword ptr [ebp-24h],esi  
00A710C7  call        dword ptr [__imp__clock (0A730DCh)]  
00A710CD  xor         ebx,ebx  
00A710CF  mov         dword ptr [ebp-1Ch],eax  
00A710D2  mov         dword ptr [ebp-28h],ebx  
00A710D5  mov         eax,dword ptr [esi]  
00A710D7  mov         ecx,dword ptr [esi+10h]  
00A710DA  fld         dword ptr [eax+ebx]  
00A710DD  fadd        dword ptr [ecx+ebx]  
00A710E0  mov         edx,dword ptr [esi+20h]  
00A710E3  add         eax,ebx  
00A710E5  add         ecx,ebx  
00A710E7  fstp        dword ptr [ebp-18h]  
00A710EA  add         edx,ebx  
00A710EC  fld         dword ptr [ebp-18h]  
00A710EF  fadd        dword ptr [edx]  
00A710F1  fstp        dword ptr [ebp-14h]  
00A710F4  fld         dword ptr [eax+4]  
00A710F7  fadd        dword ptr [ecx+4]  
00A710FA  fstp        dword ptr [ebp-18h]  
00A710FD  fld         dword ptr [ebp-18h]  
00A71100  fadd        dword ptr [edx+4]  
00A71103  fstp        dword ptr [ebp-10h]  
00A71106  fld         dword ptr [eax+8]  
00A71109  fadd        dword ptr [ecx+8]  
00A7110C  fstp        dword ptr [ebp-18h]  
00A7110F  fld         dword ptr [ebp-18h]  
00A71112  fadd        dword ptr [edx+8]  
00A71115  fstp        dword ptr [ebp-0Ch]  
00A71118  fld         dword ptr [eax+0Ch]  
00A7111B  mov         eax,dword ptr [edi+4]  
00A7111E  fadd        dword ptr [ecx+0Ch]  
00A71121  lea         ecx,[ebp-14h]  
00A71124  fstp        dword ptr [ebp-18h]  
00A71127  fld         dword ptr [ebp-18h]  
00A7112A  fadd        dword ptr [edx+0Ch]  
00A7112D  fstp        dword ptr [ebp-8]  
00A71130  cmp         ecx,eax  
00A71132  jae         test+0FBh (0A711ABh)  
00A71134  mov         edx,dword ptr [edi]  
00A71136  cmp         edx,ecx  
00A71138  ja          test+0FBh (0A711ABh)  
00A7113A  mov         esi,ecx  
00A7113C  mov         ecx,dword ptr [edi+8]  
00A7113F  sub         esi,edx  
00A71141  cmp         eax,ecx  
00A71143  jne         test+0D4h (0A71184h)  
00A71145  sub         eax,edx  
00A71147  sar         eax,4  
00A7114A  cmp         eax,0FFFFFFEh  
00A7114F  ja          test+1D3h (0A71283h)  
00A71155  sub         ecx,edx  
00A71157  inc         eax  
00A71158  sar         ecx,4  
00A7115B  cmp         eax,ecx  
00A7115D  jbe         test+0D4h (0A71184h)  
00A7115F  mov         edx,ecx  
00A71161  shr         edx,1  
00A71163  mov         ebx,0FFFFFFFh  
00A71168  sub         ebx,edx  
00A7116A  cmp         ebx,ecx  
00A7116C  jae         test+0C2h (0A71172h)  
00A7116E  xor         ecx,ecx  
00A71170  jmp         test+0C4h (0A71174h)  
00A71172  add         ecx,edx  
00A71174  cmp         ecx,eax  
00A71176  jae         test+0CAh (0A7117Ah)  
00A71178  mov         ecx,eax  
00A7117A  mov         edx,edi  
00A7117C  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (0A718A0h)  
00A71181  mov         ebx,dword ptr [ebp-28h]  
00A71184  mov         eax,dword ptr [edi+4]  
00A71187  and         esi,0FFFFFFF0h  
00A7118A  add         esi,dword ptr [edi]  
00A7118C  test        eax,eax  
00A7118E  je          test+0F6h (0A711A6h)  
00A71190  mov         edx,dword ptr [esi]  
00A71192  mov         dword ptr [eax],edx  
00A71194  mov         ecx,dword ptr [esi+4]  
00A71197  mov         dword ptr [eax+4],ecx  
00A7119A  mov         edx,dword ptr [esi+8]  
00A7119D  mov         dword ptr [eax+8],edx  
00A711A0  mov         ecx,dword ptr [esi+0Ch]  
00A711A3  mov         dword ptr [eax+0Ch],ecx  
00A711A6  mov         esi,dword ptr [ebp-24h]  
00A711A9  jmp         test+161h (0A71211h)  
00A711AB  mov         ecx,dword ptr [edi+8]  
00A711AE  cmp         eax,ecx  
00A711B0  jne         test+143h (0A711F3h)  
00A711B2  mov         edx,dword ptr [edi]  
00A711B4  sub         eax,edx  
00A711B6  sar         eax,4  
00A711B9  cmp         eax,0FFFFFFEh  
00A711BE  ja          test+1D3h (0A71283h)  
00A711C4  sub         ecx,edx  
00A711C6  inc         eax  
00A711C7  sar         ecx,4  
00A711CA  cmp         eax,ecx  
00A711CC  jbe         test+143h (0A711F3h)  
00A711CE  mov         edx,ecx  
00A711D0  shr         edx,1  
00A711D2  mov         esi,0FFFFFFFh  
00A711D7  sub         esi,edx  
00A711D9  cmp         esi,ecx  
00A711DB  jae         test+131h (0A711E1h)  
00A711DD  xor         ecx,ecx  
00A711DF  jmp         test+133h (0A711E3h)  
00A711E1  add         ecx,edx  
00A711E3  cmp         ecx,eax  
00A711E5  jae         test+139h (0A711E9h)  
00A711E7  mov         ecx,eax  
00A711E9  mov         edx,edi  
00A711EB  call        std::vector<math::vector,std::allocator<math::vector> >::reserve (0A718A0h)  
00A711F0  mov         esi,dword ptr [ebp-24h]  
00A711F3  mov         eax,dword ptr [edi+4]  
00A711F6  test        eax,eax  
00A711F8  je          test+161h (0A71211h)  
00A711FA  mov         edx,dword ptr [ebp-14h]  
00A711FD  mov         ecx,dword ptr [ebp-10h]  
00A71200  mov         dword ptr [eax],edx  
00A71202  mov         edx,dword ptr [ebp-0Ch]  
00A71205  mov         dword ptr [eax+4],ecx  
00A71208  mov         ecx,dword ptr [ebp-8]  
00A7120B  mov         dword ptr [eax+8],edx  
00A7120E  mov         dword ptr [eax+0Ch],ecx  
00A71211  add         dword ptr [edi+4],10h  
00A71215  add         ebx,10h  
00A71218  mov         dword ptr [ebp-28h],ebx  
00A7121B  cmp         ebx,9896800h  
00A71221  jb          test+25h (0A710D5h)  
00A71227  call        dword ptr [__imp__clock (0A730DCh)]
小智 4
我之前已经评论过这个问题。我担心存在空的用户定义析构函数,这可能会禁用内联。经过一番谷歌搜索后,我更有信心这可能就是答案。
这个答案描述的情况与您在问题中描述的情况非常接近:在那里,即使设置了 __forceinline,用户定义的析构函数也会阻止 operator+ 被内联。那里还可以找到一些有用的调试技巧。
microsoft connect 中还有一个错误报告。我第一次听说它是在 Channel9 上有关 safeint 库的讨论中。