Aur*_*oms 5 c++ performance stdvector
在我实施Eratosthenes筛选时,我遇到了一个问题std::vector<bool>:无法访问原始数据.
所以我决定使用自定义简约实现,我可以访问数据指针.
#ifndef LIB_BITS_T_H
#define LIB_BITS_T_H
#include <algorithm>
template <typename B>
class bits_t{
public:
typedef B block_t;
static const size_t block_size = sizeof(block_t) * 8;
block_t* data;
size_t size;
size_t blocks;
class bit_ref{
public:
block_t* const block;
const block_t mask;
bit_ref(block_t& block, const block_t mask) noexcept : block(&block), mask(mask){}
inline void operator=(bool v) const noexcept{
if(v) *block |= mask;
else *block &= ~mask;
}
inline operator bool() const noexcept{
return (bool)(*block & mask);
}
};
bits_t() noexcept : data(nullptr){}
void resize(const size_t n, const bool v) noexcept{
block_t fill = v ? ~block_t(0) : block_t(0);
size = n;
blocks = (n + block_size - 1) / block_size;
data = new block_t[blocks];
std::fill(data, data + blocks, fill);
}
inline block_t& block_at_index(const size_t i) const noexcept{
return data[i / block_size];
}
inline size_t index_in_block(const size_t i) const noexcept{
return i % block_size;
}
inline bit_ref operator[](const size_t i) noexcept{
return bit_ref(block_at_index(i), block_t(1) << index_in_block(i));
}
~bits_t(){
delete[] data;
}
};
#endif // LIB_BITS_T_H
Run Code Online (Sandbox Code Playgroud)
代码几乎与/usr/include/c++/4.7/bits/stl_bvector.h中的代码相同,但速度较慢.
我尝试过优化,
#ifndef LIB_BITS_T_H
#define LIB_BITS_T_H
#include <algorithm>
template <typename B>
class bits_t{
const B mask[64] = {
0b0000000000000000000000000000000000000000000000000000000000000001,
0b0000000000000000000000000000000000000000000000000000000000000010,
0b0000000000000000000000000000000000000000000000000000000000000100,
0b0000000000000000000000000000000000000000000000000000000000001000,
0b0000000000000000000000000000000000000000000000000000000000010000,
0b0000000000000000000000000000000000000000000000000000000000100000,
0b0000000000000000000000000000000000000000000000000000000001000000,
0b0000000000000000000000000000000000000000000000000000000010000000,
0b0000000000000000000000000000000000000000000000000000000100000000,
0b0000000000000000000000000000000000000000000000000000001000000000,
0b0000000000000000000000000000000000000000000000000000010000000000,
0b0000000000000000000000000000000000000000000000000000100000000000,
0b0000000000000000000000000000000000000000000000000001000000000000,
0b0000000000000000000000000000000000000000000000000010000000000000,
0b0000000000000000000000000000000000000000000000000100000000000000,
0b0000000000000000000000000000000000000000000000001000000000000000,
0b0000000000000000000000000000000000000000000000010000000000000000,
0b0000000000000000000000000000000000000000000000100000000000000000,
0b0000000000000000000000000000000000000000000001000000000000000000,
0b0000000000000000000000000000000000000000000010000000000000000000,
0b0000000000000000000000000000000000000000000100000000000000000000,
0b0000000000000000000000000000000000000000001000000000000000000000,
0b0000000000000000000000000000000000000000010000000000000000000000,
0b0000000000000000000000000000000000000000100000000000000000000000,
0b0000000000000000000000000000000000000001000000000000000000000000,
0b0000000000000000000000000000000000000010000000000000000000000000,
0b0000000000000000000000000000000000000100000000000000000000000000,
0b0000000000000000000000000000000000001000000000000000000000000000,
0b0000000000000000000000000000000000010000000000000000000000000000,
0b0000000000000000000000000000000000100000000000000000000000000000,
0b0000000000000000000000000000000001000000000000000000000000000000,
0b0000000000000000000000000000000010000000000000000000000000000000,
0b0000000000000000000000000000000100000000000000000000000000000000,
0b0000000000000000000000000000001000000000000000000000000000000000,
0b0000000000000000000000000000010000000000000000000000000000000000,
0b0000000000000000000000000000100000000000000000000000000000000000,
0b0000000000000000000000000001000000000000000000000000000000000000,
0b0000000000000000000000000010000000000000000000000000000000000000,
0b0000000000000000000000000100000000000000000000000000000000000000,
0b0000000000000000000000001000000000000000000000000000000000000000,
0b0000000000000000000000010000000000000000000000000000000000000000,
0b0000000000000000000000100000000000000000000000000000000000000000,
0b0000000000000000000001000000000000000000000000000000000000000000,
0b0000000000000000000010000000000000000000000000000000000000000000,
0b0000000000000000000100000000000000000000000000000000000000000000,
0b0000000000000000001000000000000000000000000000000000000000000000,
0b0000000000000000010000000000000000000000000000000000000000000000,
0b0000000000000000100000000000000000000000000000000000000000000000,
0b0000000000000001000000000000000000000000000000000000000000000000,
0b0000000000000010000000000000000000000000000000000000000000000000,
0b0000000000000100000000000000000000000000000000000000000000000000,
0b0000000000001000000000000000000000000000000000000000000000000000,
0b0000000000010000000000000000000000000000000000000000000000000000,
0b0000000000100000000000000000000000000000000000000000000000000000,
0b0000000001000000000000000000000000000000000000000000000000000000,
0b0000000010000000000000000000000000000000000000000000000000000000,
0b0000000100000000000000000000000000000000000000000000000000000000,
0b0000001000000000000000000000000000000000000000000000000000000000,
0b0000010000000000000000000000000000000000000000000000000000000000,
0b0000100000000000000000000000000000000000000000000000000000000000,
0b0001000000000000000000000000000000000000000000000000000000000000,
0b0010000000000000000000000000000000000000000000000000000000000000,
0b0100000000000000000000000000000000000000000000000000000000000000,
0b1000000000000000000000000000000000000000000000000000000000000000
};
public:
typedef B block_t;
static const size_t block_size = sizeof(block_t) * 8;
block_t* data;
size_t size;
size_t blocks;
class bit_ref{
public:
block_t* const block;
const block_t mask;
bit_ref(block_t& block, const block_t mask) noexcept : block(&block), mask(mask){}
inline void operator=(bool v) const noexcept{
if(v) *block |= mask;
else *block &= ~mask;
}
inline operator bool() const noexcept{
return (bool)(*block & mask);
}
};
bits_t() noexcept : data(nullptr){}
void resize(const size_t n, const bool v) noexcept{
block_t fill = v ? ~block_t(0) : block_t(0);
size = n;
blocks = (n + block_size - 1) / block_size;
data = new block_t[blocks];
std::fill(data, data + blocks, fill);
}
inline block_t& block_at_index(const size_t i) const noexcept{
return data[i / block_size];
}
inline size_t index_in_block(const size_t i) const noexcept{
return i % block_size;
}
inline bit_ref operator[](const size_t i) noexcept{
return bit_ref(block_at_index(i), mask[index_in_block(i)]);
}
~bits_t(){
delete[] data;
}
};
#endif // LIB_BITS_T_H
Run Code Online (Sandbox Code Playgroud)
(用g ++ 4.7 -O3编译)
Eratosthenes筛分算法(33.333.333位)
std::vector<bool> 19.1s
bits_t<size_t> 19.9s
bits_t<size_t> (with lookup table) 19.7s
ctor + resize(33.333.333位)+ dtor
std::vector<bool> 120毫秒
bits_t<size_t> 150毫秒
问题:减速从何而来?
显然,函数中的包装i % block_size是罪魁祸首
inline size_t index_in_block ( const size_t i ) const noexcept {
return i % block_size;
}
inline bit_ref operator[] ( const size_t i ) noexcept {
return bit_ref( block_at_index( i ), block_t( 1 ) << index_in_block( i ) );
}
Run Code Online (Sandbox Code Playgroud)
所以将上面的代码替换为
inline bit_ref operator[] ( const size_t i ) noexcept {
return bit_ref( block_at_index( i ), block_t( 1 ) << ( i % block_size ) );
}
Run Code Online (Sandbox Code Playgroud)
解决了这个问题。然而,我仍然不知道为什么会这样。我最好的猜测是,我没有正确获得 index_in_block 的签名,因此优化器无法以与手动内联方式类似的方式内联此函数。
这是新代码。
#ifndef LIB_BITS_2_T_H
#define LIB_BITS_2_T_H
#include <algorithm>
template <typename B>
class bits_2_t {
public:
typedef B block_t;
static const int block_size = sizeof( block_t ) * __CHAR_BIT__;
private:
block_t* _data;
size_t _size;
size_t _blocks;
public:
class bit_ref {
public:
block_t* const block;
const block_t mask;
bit_ref ( block_t& block, const block_t mask) noexcept
: block( &block ), mask( mask ) {}
inline bool operator= ( const bool v ) const noexcept {
if ( v ) *block |= mask;
else *block &= ~mask;
return v;
}
inline operator bool() const noexcept {
return (bool)( *block & mask );
}
};
bits_2_t () noexcept : _data( nullptr ), _size( 0 ), _blocks( 0 ) {}
bits_2_t ( const size_t n ) noexcept : _data( nullptr ), _size( n ) {
_blocks = number_of_blocks_needed( n );
_data = new block_t[_blocks];
const block_t fill( 0 );
std::fill( _data, _data + _blocks, fill );
}
bits_2_t ( const size_t n, const bool v ) noexcept : _data( nullptr ), _size( n ) {
_blocks = number_of_blocks_needed( n );
_data = new block_t[_blocks];
const block_t fill = v ? ~block_t( 0 ) : block_t( 0 );
std::fill( _data, _data + _blocks, fill );
}
void resize ( const size_t n ) noexcept {
resize( n, false );
}
void resize ( const size_t n, const bool v ) noexcept {
const size_t tmpblocks = number_of_blocks_needed( n );
const size_t copysize = std::min( _blocks, tmpblocks );
block_t* tmpdata = new block_t[tmpblocks];
std::copy( _data, _data + copysize, tmpdata );
const block_t fill = v ? ~block_t( 0 ) : block_t( 0 );
std::fill( tmpdata + copysize, tmpdata + tmpblocks, fill );
delete[] _data;
_data = tmpdata;
_blocks = tmpblocks;
_size = n;
}
inline size_t number_of_blocks_needed ( const size_t n ) const noexcept {
return ( n + block_size - 1 ) / block_size;
}
inline block_t& block_at_index ( const size_t i ) const noexcept {
return _data[i / block_size];
}
inline bit_ref operator[] ( const size_t i ) noexcept {
return bit_ref( block_at_index( i ), block_t( 1 ) << ( i % block_size ) );
}
inline bool operator[] ( const size_t i ) const noexcept {
return (bool)( block_at_index( i ) & ( block_t( 1 ) << ( i % block_size ) ) );
}
inline block_t* data () {
return _data;
}
inline const block_t* data () const {
return _data;
}
inline size_t size () const {
return _size;
}
void clear () noexcept {
delete[] _data;
_size = 0;
_blocks = 0;
_data = nullptr;
}
~bits_2_t () {
clear();
}
};
#endif // LIB_BITS_2_T_H
Run Code Online (Sandbox Code Playgroud)
以下是这个新代码在我的 amd64 机器上的素数高达1.000.000.000(3 次运行中最好的,实时)的结果。
埃拉托色尼筛法,每个数字有 1 个存储单元(不跳过 2 的倍数)。
位_t<uint8_t>
真实 0m23.614s 用户 0m23.493s 系统 0m0.092s
位_t<uint16_t>
真实 0m24.399s 用户 0m24.294s 系统 0m0.084s
位_t<uint32_t>
真实 0m23.501s 用户 0m23.372s 系统 0m0.108s <-- 最佳
位_t<uint64_t>
真实 0m24.393s 用户 0m24.304s 系统 0m0.068s
std::向量<布尔>
真实 0m24.362s 用户 0m24.276s 系统 0m0.056s
std::向量<uint8_t>
真实 0m38.303s 用户 0m37.570s 系统 0m0.683s
这是筛子的代码(其中(...)应替换为您选择的位数组)。
#include <iostream>
typedef (...) array_t;
int main ( int argc, char const *argv[] ) {
if ( argc != 2 ) {
std::cout << "#0 missing" << std::endl;
return 1;
}
const size_t count = std::stoull( argv[1] );
array_t prime( count, true );
prime[0] = prime[1] = false;
for ( size_t k = 2 ; k * k < count ; ++k ) {
if ( prime[k] ) {
for ( size_t i = k * k ; i < count ; i += k ) {
prime[i] = false;
}
}
}
return 0;
}
Run Code Online (Sandbox Code Playgroud)