alp*_*pha 9 c++ performance shared-ptr c++11
我刚刚编写了一个测试程序,想找出分配和释放大量由 shared_ptr 管理的对象的最快方法。
我尝试了以下几种方式:shared_ptr 配合 new、shared_ptr 配合 pool、make_shared 以及 allocate_shared。令我惊讶的是,allocate_shared 竟然比 shared_ptr 配合 pool 还要慢。
我在 vs2017+win10 上用发布(Release)版本测试了代码,发布版本的设置为默认值(/O2)。我也在 gcc4.8.5+centos6.2 上用 g++ -std=c++11 -O3 进行了测试。
代码是:
#include <assert.h>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <mutex>
#include <new>
#include <vector>
using namespace std;
// Mixin base: a type that derives from it can be neither copied nor moved.
// The default constructor and the destructor are protected, so the type can
// only be used as a base class.
struct noncopyable {
private:
    noncopyable(noncopyable&&) = delete;
    noncopyable(const noncopyable&) = delete;
    noncopyable& operator=(noncopyable&&) = delete;
    noncopyable& operator=(const noncopyable&) = delete;

protected:
    ~noncopyable() = default;
    noncopyable() = default;
};
// Mutex-protected free list of fixed-size raw memory blocks obtained from
// malloc.
//
// pop() hands out one uninitialized block of size() bytes and push() gives it
// back. Blocks are returned to the C heap only in the destructor, which
// asserts (when assert is enabled) that every block ever allocated has been
// pushed back.
class BlockPool : noncopyable {
public:
    // block_size: number of bytes in every block handed out by pop().
    BlockPool(size_t block_size) : block_size_(block_size) {}

    ~BlockPool() {
        // All blocks must be back in the free list before the pool dies;
        // otherwise some caller still holds memory that is about to be freed.
        assert(total_count_ == datas_.size());
        for (void* block : datas_) {
            std::free(block);
        }
    }

    // Size in bytes of each block.
    size_t size() const { return block_size_; }

    // Removes one block from the free list and returns it. When the free list
    // is empty, 1024 new blocks are allocated first.
    // Throws std::bad_alloc if that allocation fails.
    void* pop() {
        std::lock_guard<std::mutex> lock(mutex_);
        if (datas_.empty()) {
            const size_t kNextSize = 1024;
            grow_locked(kNextSize);
        }
        void* p = datas_.back();
        datas_.pop_back();
        return p;
    }

    // Puts a block back on the free list. `data` is expected to be a block
    // previously returned by pop() of this pool.
    void push(void* data) {
        std::lock_guard<std::mutex> lock(mutex_);
        datas_.push_back(data);
    }

    // Ensures at least `count` blocks are sitting in the free list.
    // Throws std::bad_alloc if the required allocation fails.
    void reserve(size_t count) {
        std::lock_guard<std::mutex> lock(mutex_);
        if (count <= datas_.size()) return;
        grow_locked(count - datas_.size());
    }

private:
    // Allocates `extra` new blocks and appends them to the free list.
    // The caller must already hold mutex_.
    void grow_locked(size_t extra) {
        // Reserve room for every block the pool will own, not just the free
        // ones. The appends below then never reallocate (so a fresh block
        // cannot leak), and push() of a pool-owned block never has to grow
        // the vector, which matters because push() runs inside deleters.
        datas_.reserve(total_count_ + extra);
        for (size_t i = 0; i < extra; ++i) {
            // malloc(0) may legitimately return null, so request at least one
            // byte to keep "null" meaning "out of memory".
            void* p = std::malloc(block_size_ != 0 ? block_size_ : 1);
            if (p == nullptr) {
                throw std::bad_alloc();
            }
            datas_.push_back(p);
            // Count per block so the destructor invariant still holds if a
            // later iteration throws.
            ++total_count_;
        }
    }

    size_t const block_size_;
    size_t total_count_{ 0 };
    std::vector<void*> datas_;
    std::mutex mutex_;
};
// Benchmark payload: a non-copyable, non-movable object holding 1000 bytes.
struct Packet : noncopyable {
    char data_[1000];

    Packet() = default;
    ~Packet() = default;
};
// Number of objects each benchmark creates and then destroys.
const uint32_t kLoopCount = 1000 * 1000;
// Block pool shared by the pool-based benchmarks; each block is
// sizeof(Packet) + 64 bytes.
// NOTE(review): the extra 64 bytes presumably leave room for the shared_ptr
// control block that allocate_shared stores in the same allocation - confirm
// against the standard library in use.
BlockPool pool(sizeof(Packet) + 64);
// Keeps every shared_ptr created by a benchmark alive until that benchmark
// clears the vector.
std::vector<shared_ptr<Packet>> packets;
void test_make_shared() {
auto begin = std::chrono::steady_clock::now();
for (uint32_t i = 0; i < kLoopCount; ++i) {
auto packet = make_shared<Packet>();
packets.emplace_back(std::move(packet));
}
packets.clear();
auto end = std::chrono::steady_clock::now();
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
std::cout << "make_shared: " << ms << " ms\n";
}
void test_shared_ptr_with_pool() {
auto begin = std::chrono::steady_clock::now();
for (uint32_t i = 0; i < kLoopCount; ++i) {
Packet* p = (Packet*)pool.pop();
new(p)Packet();
shared_ptr<Packet> packet(p, [](Packet* packet) {
packet->~Packet();
pool.push(packet);
});
packets.emplace_back(std::move(packet));
}
packets.clear();
auto end = std::chrono::steady_clock::now();
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
std::cout << "shared_ptr with pool: " << ms << " ms\n";
}
void test_shared_ptr_with_new() {
auto begin = std::chrono::steady_clock::now();
for (uint32_t i = 0; i < kLoopCount; ++i) {
shared_ptr<Packet> packet(new Packet);
packets.emplace_back(std::move(packet));
}
packets.clear();
auto end = std::chrono::steady_clock::now();
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
std::cout << "shared_ptr with new: " << ms << " ms\n";
}
template <class T>
struct Mallocator {
typedef T value_type;
Mallocator(BlockPool* pool) : pool_(pool) { }
template <class U> Mallocator(const Mallocator<U>& u) {
pool_ = u.pool_;
}
inline T* allocate(std::size_t n) {
#ifdef _DEBUG
assert(n == 1);
auto len = n * sizeof(T);
assert(len <= pool_->size());
#endif
return static_cast<T*>(pool_->pop());
}
inline void deallocate(T* p, std::size_t n) {
#ifdef _DEBUG
assert(n == 1);
auto len = n * sizeof(T);
assert(len <= pool_->size());
#endif
pool_->push(p);
}
BlockPool* pool_;
};
// Two Mallocators are interchangeable only when they draw from the same pool:
// a block allocated through one may then be deallocated through the other.
// Allocators bound to different pools must compare unequal, otherwise a block
// could be returned to a pool with a different block size.
template <class T, class U>
bool operator==(const Mallocator<T>& lhs, const Mallocator<U>& rhs) { return lhs.pool_ == rhs.pool_; }
// Exact negation of operator== so the two can never disagree.
template <class T, class U>
bool operator!=(const Mallocator<T>& lhs, const Mallocator<U>& rhs) { return !(lhs == rhs); }
void test_allocate_shared() {
Mallocator<Packet> alloc(&pool);
auto begin = std::chrono::steady_clock::now();
for (uint32_t i = 0; i < kLoopCount; ++i) {
shared_ptr<Packet> packet = allocate_shared<Packet, Mallocator<Packet>>(alloc);
packets.emplace_back(std::move(packet));
}
packets.clear();
auto end = std::chrono::steady_clock::now();
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
std::cout << "allocate_shared: " << ms << " ms\n";
}
void test_new_delete() {
std::vector<Packet*> raw_packets;
raw_packets.reserve(kLoopCount);
auto begin = std::chrono::steady_clock::now();
for (uint32_t i = 0; i < kLoopCount; ++i) {
raw_packets.push_back(new Packet);
}
for (uint32_t i = 0; i < kLoopCount; ++i) {
delete raw_packets[i];
}
raw_packets.clear();
auto end = std::chrono::steady_clock::now();
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
std::cout << "new_delete: " << ms << " ms\n";
}
// Runs every benchmark three times in a fixed order, printing a separator
// line between the groups.
int main() {
    const auto run_thrice = [](void (*bench)()) {
        for (int round = 0; round < 3; ++round) {
            bench();
        }
    };
    const char* const separator = "======\n";

    std::cout << "loop for " << kLoopCount << " times to ceate and free shared_ptr\n\n";

    // Size the shared_ptr vector up front so its growth is not measured.
    packets.reserve(kLoopCount);
    run_thrice(test_make_shared);
    std::cout << separator;

    // Pre-fill the pool before any pool-based benchmark runs.
    pool.reserve(kLoopCount);
    run_thrice(test_shared_ptr_with_new);
    std::cout << separator;

    run_thrice(test_shared_ptr_with_pool);
    std::cout << separator;

    run_thrice(test_allocate_shared);
    std::cout << separator;

    run_thrice(test_new_delete);
    return 0;
}
Run Code Online (Sandbox Code Playgroud)
在我的电脑(vs2017,Windows 10)中,结果是:
loop for 1000000 times to ceate and free shared_ptr
make_shared: 616 ms
make_shared: 586 ms
make_shared: 581 ms
======
shared_ptr with new: 532 ms
shared_ptr with new: 541 ms
shared_ptr with new: 525 ms
======
shared_ptr with pool: 292 ms
shared_ptr with pool: 293 ms
shared_ptr with pool: 290 ms
======
allocate_shared: 346 ms
allocate_shared: 340 ms
allocate_shared: 345 ms
======
new_delete: 424 ms
new_delete: 408 ms
new_delete: 403 ms
Run Code Online (Sandbox Code Playgroud)
我也用 gcc 4.8、centos6.2 测试过,结果是一样的,即按速度从快到慢排序为:shared_ptr_with_pool > allocate_shared > shared_ptr_with_new > make_shared。
据我所知,shared_ptr::shared_ptr(T* p) 需要额外分配一小块内存来保存引用计数和删除器,所以总共需要两次分配;make_shared 只需要一次分配;而 allocate_shared(配合内存池)甚至连一次分配都不需要。
据我的理解,速度关系应该是 allocate_shared > shared_ptr_with_pool > make_shared > shared_ptr_with_new,而不是实际测得的 shared_ptr_with_pool > allocate_shared > shared_ptr_with_new > make_shared。
有人能告诉我原因,非常感谢!
更新:
在 vs2017 + windows10 上做了一些挖掘之后,我发现 std::allocate_shared 和 boost::allocate_shared 都调用了 memset(p, 0, sizeof(Packet)),这拖慢了整个操作。
这是因为在vs2017库头中有些代码如下所示:
// Simplified sketch of the library pattern under discussion: a variadic
// constructor whose parameter pack is expanded into the initializer of a
// large member. With an empty pack the initializer reads v_(), which is
// value-initialization of the member.
// std::_Align_type is an internal type of the vs2017 standard library, so
// this snippet is not portable.
class Pair {
public:
template<class ... T>
Pair(T&...t) : v_(std::forward<T>(t)...){
}
std::_Align_type<char, 1500> v_;
};
// Constructs a Pair with no arguments, so the constructor's pack is empty.
void test_align() {
Pair p;
}
Run Code Online (Sandbox Code Playgroud)
Pair 的构造函数会调用 memset(addr, 0, sizeof(Pair))。
我不知道为什么 Pair 的构造函数会调用 memset,于是写了一些测试代码:
// Repro, variant 1: A has no user-provided constructor.
struct A {
char data_[1500];
};
// B forwards its (possibly empty) constructor arguments to the member a_.
class B {
public:
// With no arguments the pack is empty and the initializer becomes a_(),
// i.e. value-initialization of A. Because A has no user-provided default
// constructor, value-initialization zero-initializes all 1500 bytes.
template<class ... T> B(T&...t)
: a_(std::forward<T>(t)...) {
}
A a_;
};
// Default-constructs a B so the constructor above runs with an empty pack.
int main() {
B b;
return 0;
}
Run Code Online (Sandbox Code Playgroud)
我用vs2017编译了代码,发现memset(addr,0,1500)被调用.asm代码(Debug build,Release build相同)是:
class B {
public:
template<class ... T> B(T&...t)
: a_(std::forward<T>(t)...) {
00C516A0 push ebp
00C516A1 mov ebp,esp
00C516A3 sub esp,0CCh
00C516A9 push ebx
00C516AA push esi
00C516AB push edi
00C516AC push ecx
00C516AD lea edi,[ebp-0CCh]
00C516B3 mov ecx,33h
00C516B8 mov eax,0CCCCCCCCh
00C516BD rep stos dword ptr es:[edi]
00C516BF pop ecx
00C516C0 mov dword ptr [this],ecx
00C516C3 push 5DCh
00C516C8 push 0
00C516CA mov eax,dword ptr [this]
00C516CD push eax
00C516CE call _memset (0C510BEh)
00C516D3 add esp,0Ch
}
00C516D6 mov eax,dword ptr [this]
00C516D9 pop edi
00C516DA pop esi
00C516DB pop ebx
00C516DC add esp,0CCh
00C516E2 cmp ebp,esp
00C516E4 call __RTC_CheckEsp (0C51118h)
00C516E9 mov esp,ebp
00C516EB pop ebp
00C516EC ret
Run Code Online (Sandbox Code Playgroud)
如果我添加一个空构造函数,如下所示:
// Repro, variant 2: A now has a user-provided (empty) default constructor.
struct A {
// User-provided and empty: value-initializing an A only calls this
// constructor, and data_ is left uninitialized.
A() {}
char data_[1500];
};
// B forwards its (possibly empty) constructor arguments to the member a_.
class B {
public:
// With no arguments the initializer becomes a_(), which here simply calls
// A::A().
template<class ... T> B(T&...t)
: a_(std::forward<T>(t)...) {
}
A a_;
};
// Default-constructs a B so the constructor above runs with an empty pack.
int main() {
B b;
return 0;
}
Run Code Online (Sandbox Code Playgroud)
asm代码(Debug build,Release build相同)是:
class B {
public:
template<class ... T> B(T&...t)
: a_(std::forward<T>(t)...) {
010A1D40 push ebp
010A1D41 mov ebp,esp
010A1D43 sub esp,0CCh
010A1D49 push ebx
010A1D4A push esi
010A1D4B push edi
010A1D4C push ecx
010A1D4D lea edi,[ebp-0CCh]
010A1D53 mov ecx,33h
010A1D58 mov eax,0CCCCCCCCh
010A1D5D rep stos dword ptr es:[edi]
010A1D5F pop ecx
010A1D60 mov dword ptr [this],ecx
010A1D63 mov ecx,dword ptr [this]
010A1D66 call A::A (010A1456h)
}
010A1D6B mov eax,dword ptr [this]
010A1D6E pop edi
010A1D6F pop esi
010A1D70 pop ebx
010A1D71 add esp,0CCh
010A1D77 cmp ebp,esp
010A1D79 call __RTC_CheckEsp (010A126Ch)
010A1D7E mov esp,ebp
010A1D80 pop ebp
010A1D81 ret
Run Code Online (Sandbox Code Playgroud)
原来的 call _memset (0C510BEh) 变成了 call A::A (010A1456h)。
所以看起来,如果类型 A 具有构造函数,a_(std::forward<T>(t)...) 就会调用该构造函数;如果类型 A 没有构造函数,a_(std::forward<T>(t)...) 则会调用 memset(addr,0,sizeof(A))。(为什么?)
std :: allocate_shared调用memset的原因是因为下面的代码(vs2017, xutility, in my computer, at C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.10.25017\include
):
template<class _Ty1,
class _Ty2>
class _Compressed_pair<_Ty1, _Ty2, false> final
{ // store a pair of values, not deriving from first
private:
_Ty1 _Myval1;
_Ty2 _Myval2;
public:
template<class... _Other2>
constexpr explicit _Compressed_pair(_Zero_then_variadic_args_t,
_Other2&&... _Val2)
: _Myval1(), _Myval2(_STD forward<_Other2>(_Val2)...)
{ // construct from forwarded values
}
template<class _Other1,
class... _Other2>
_Compressed_pair(_One_then_variadic_args_t,
_Other1&& _Val1, _Other2&&... _Val2)
: _Myval1(_STD forward<_Other1>(_Val1)),
_Myval2(_STD forward<_Other2>(_Val2)...)
{ // construct from forwarded values
}
Run Code Online (Sandbox Code Playgroud)
_Myval2的类型是std :: _ Align_type,它定义为
// Storage helper as quoted from the vs2017 library headers. It declares no
// constructor, so value-initializing it zero-initializes the whole union.
template<class _Ty,
size_t _Len>
union _Align_type
{ // union with size _Len bytes and alignment of _Ty
_Ty _Val;
char _Pad[_Len];
};
Run Code Online (Sandbox Code Playgroud)
_Align_type 没有构造函数,所以 _Myval2(_STD forward<_Other2>(_Val2)...) 会调用 memset(addr,0, sizeof(T))。
于是我修改了 _Align_type 的定义(添加了一个空的 dummy 构造函数)并再次测试,发现 std::allocate_shared 不再调用 memset,并且比以前快得多。
// The author's locally modified copy of the library's storage helper, used
// only for the experiment described in the post.
template<class _Ty,
size_t _Len>
union _Align_type
{ // union with size _Len bytes and alignment of _Ty
_Ty _Val;
char _Pad[_Len];
// Added empty user-provided constructor: value-initialization now just calls
// it instead of zero-initializing the storage first.
_Align_type() { }
};
Run Code Online (Sandbox Code Playgroud)
在我更改_Align_type的定义之后,现在速度test_allocate_shared
等于或略快于test_shared_ptr_with_pool
.
到目前为止,我已经知道 std::allocate_shared 为什么这么慢了,但我仍然不明白:为什么当类型 T 没有构造函数时,下面的代码会调用 memset,而当 T 具有构造函数时却不会调用 memset。
template<class ... T> B(T&...t)
: a_(std::forward<T>(t)...) {}
Run Code Online (Sandbox Code Playgroud)
它是c ++标准吗?
而且,由于allocate_shared不应该调用memset(sizeof(T)),它是否是编译器的错误?
更新:
// Repro, variant 3: checks whether a non-template constructor that writes
// a_() behaves like the variadic one in B.
struct A {
//A() {}
char data_[1500];
// Overwrites every byte of data_ with a value derived from rand().
void dummy() {
for (int i = 0; i < sizeof(data_); ++i) {
data_[i] = rand();
}
}
// Sums all bytes of data_; returning the sum from main keeps the compiler
// from optimizing the object away.
int dummy2() { // consume the data so the compiler cannot discard it
int ret = 0;
for (int i = 0; i < sizeof(data_); ++i) {
ret += data_[i];
}
return ret;
}
};
// Variadic forwarding constructor, as in the earlier snippets (unused here
// because "B b;" is commented out in main).
class B {
public:
template<class ... T> B(T&...t)
: a_(std::forward<T>(t)...) {
}
A a_;
};
// Plain constructor that explicitly value-initializes the member with a_().
class C {
public:
C() : a_() {
}
A a_;
};
// Builds a C, fills its buffer, and returns the byte sum as the exit code.
int main() {
//B b;
C c;
c.a_.dummy();
return c.a_.dummy2();
}
Run Code Online (Sandbox Code Playgroud)
我通过vs2017,x86发布版本编译上面的代码,asm代码是:
int main() {
009E1000 push ebp
009E1001 mov ebp,esp
009E1003 sub esp,5E0h
009E1009 mov eax,dword ptr [__security_cookie (09E3004h)]
009E100E xor eax,ebp
009E1010 mov dword ptr [ebp-4],eax
009E1013 push ebx
009E1014 push esi
009E1015 push edi
//B b;
C c;
009E1016 push 5DCh
009E101B lea eax,[c]
009E1021 push 0
009E1023 push eax
009E1024 call _memset (09E1BCAh)
c.a_.dummy();
009E1029 mov edi,dword ptr [__imp__rand (09E20B4h)]
//B b;
C c;
009E102F add esp,0Ch
c.a_.dummy();
009E1032 xor esi,esi
009E1034 call edi
009E1036 mov byte ptr c[esi],al
009E103D inc esi
009E103E cmp esi,5DCh
009E1044 jb main+34h (09E1034h)
return c.a_.dummy2();
009E1046 xor esi,esi
009E1048 xor edx,edx
009E104A xor edi,edi
009E104C xor ebx,ebx
return c.a_.dummy2();
009E104E xchg ax,ax
009E1050 movsx eax,byte ptr c[edx]
009E1058 movsx ecx,byte ptr [ebp+edx-5DEh]
009E1060 add esi,eax
009E1062 movsx eax,byte ptr [ebp+edx-5DFh]
009E106A add edi,ecx
009E106C add edx,3
009E106F add ebx,eax
009E1071 cmp edx,5DCh
009E1077 jb main+50h (09E1050h)
}
009E1079 mov ecx,dword ptr [ebp-4]
009E107C lea eax,[edi+ebx]
009E107F pop edi
009E1080 add eax,esi
009E1082 xor ecx,ebp
009E1084 pop esi
009E1085 pop ebx
009E1086 call __security_check_cookie (09E108Fh)
009E108B mov esp,ebp
009E108D pop ebp
009E108E ret
Run Code Online (Sandbox Code Playgroud)
还有一个memset(addr,0,1500)!
更新:Visual Studio 2017 的 std::allocate_shared 似乎有一个错误。其代码试图通过完美转发来构造一个没有构造函数的 std::_Align_type,因此会对 std::_Align_type 进行值初始化,也就是调用 memset。
阅读完《为什么 C++ 使用 memset(addr,0,sizeof(T)) 来构造对象?标准或编译器错误?》和《默认初始化与零初始化》之后,我现在明白为什么会有一个 memset 了。
这是因为 vs2017 中的 allocate_shared 实现使用了类型 _Align_type,而这个类型没有构造函数。当 allocate_shared 尝试对 _Align_type 进行值初始化时,就会调用 memset。
这似乎是vs2017的一个错误。
在错误修复之前,我认为解决它可能不是个好主意。
更新:
我向 MS 发布了错误报告,他们已经确认了。
更新:这个bug在vs2017 update 3中依然存在。
归档时间: |
|
查看次数: |
713 次 |
最近记录: |