Dr.*_*eon 12 c c++ 64-bit bits bit-manipulation
好吧,听起来有点复杂,但这正是我想要做的:
10101010101{ 0, 2, 4, 6, 8, 10 }- 一个包含所有位置的数组这是我的代码:
UINT DQBitboard::firstBit(U64 bitboard)
{
    static const int index64[64] = {
    63,  0, 58,  1, 59, 47, 53,  2,
    60, 39, 48, 27, 54, 33, 42,  3,
    61, 51, 37, 40, 49, 18, 28, 20,
    55, 30, 34, 11, 43, 14, 22,  4,
    62, 57, 46, 52, 38, 26, 32, 41,
    50, 36, 17, 19, 29, 10, 13, 21,
    56, 45, 25, 31, 35, 16,  9, 12,
    44, 24, 15,  8, 23,  7,  6,  5  };
    static const U64 debruijn64 = 0x07EDD5E59A4E28C2ULL;
    #pragma warning (disable: 4146)
    return index64[((bitboard & -bitboard) * debruijn64) >> 58];  
}
vector<UINT> DQBitboard::bits(U64 bitboard)
{
    vector<UINT> res;
    while (bitboard)
    {
        UINT first = DQBitboard::firstBit(bitboard);
        res.push_back(first);
        bitboard &= ~(1ULL<<first);
    }
    return res;
}
代码肯定会起作用.
我的观点是:
提示:
UINT 是一个typedef unsigned intU64 是一个typedef unsigned long longstatic inline.这是另一个可以分析的建议(可以与其他建议结合进行进一步优化).注意,这里的循环是O(number of set bits).
vector<UINT> bits_set (UINT64 data) 
{
    UINT n;
    vector<UINT> res;
    res.reserve(64);
    for (n = 0; data != 0; n++, data &= (data - 1))
    {
        res.push_back(log2(data & ~(data-1)));
    }
    return res;
}
比特转换非常便宜.查找表会占用缓存空间,您的查找也会有整数乘法.我期待,蛮力将比聪明的技术更快.
vector<UINT> DQBitboard::bits(U64 bitboard)
{
    vector<UINT> res;
    res.reserve(64);
    uint_fast8_t pos = 1;
    do {
        if (bitboard & 1) res.push_back(pos);
        ++pos;
    } while (bitboard >>= 1);
    return res;
}
您可以稍微展开循环,这可能会使它更快.
std::vector到目前为止,这是最昂贵的部分.考虑直接使用位板.例如:
struct bitboard_end_iterator{};
struct bitboard_iterator
{
    U64 value;
    uint_fast8_t pos;
    bitboard_iterator(U64 bitboard) : value(bitboard), pos(0)
    {
        ++(*this);
    }
    UINT operator*() const { return pos + 1; }
    bool operator==( bitboard_end_iterator ) const { return pos == 64; }
    operator bool() const { return pos < 64; }
    bitboard_iterator& operator++()
    {
        while (U64 prev = value) {
            value >>= 1;
            ++pos;
            if (prev & 1) return *this;
        }
        pos = 64;
        return *this;
    }
};
现在你可以写了
for( bitboard_iterator it(bitboard); it; ++it )
    cout << *it;
而且我想你会得到你的位列表.
版本2:
class bitboard_fast_iterator
{
    U64 value;
    uint_fast8_t pos;
public:
    bitboard_fast_iterator(U64 bitboard = 0) : value(bitboard), pos(__builtin_ctzll(value)) {}
    UINT operator*() const { return pos + 1; }
    operator bool() const { return value != 0; }
    bitboard_iterator& operator++()
    {
        value &= ~(1ULL << pos);
        pos = __builtin_ctzll(value);
        return *this;
    }
};
我一直想知道它是否能更快地使用bst汇编指令.所以我尝试了3次实现,并获得了1000万次迭代的以下结果:
你的实施(Dr.Kameleon)1.77秒
log2()实现(icepack)2.17秒
我的装配实施(我)0.16秒
输出:
bits version:
Function started at 0
           ended at 177
              spent 177 (1.770000 seconds)
c version:
Function started at 177
           ended at 394
              spent 217 (2.170000 seconds)
c version:
Function started at 394
           ended at 410
              spent 16 (0.160000 seconds)
有关C/C++的一点,静态是可怕的.它实际上是在CPU指令列表中编译的(不是我所期望的!!!)而是在无名空间中使用函数外部的数组.这将产生预期的效果.虽然在汇编中你可以使用.long(或其他一些大小)然后%rip来引用来自IP的数据.
注意:编译后,我没有看到我的汇编版本中使用的size(n),所以我不太确定返回的数组是否有效.除此之外,代码本身变成了5个汇编指令的循环,因此速度微小增加(大约x10).
log2()慢的原因是它将数字转换为xmm寄存器,然后调用另一个函数.然后它将xmm寄存器转换回常规寄存器.
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <unistd.h>
#include <sys/times.h>
#include <string.h>
#include <math.h>
#include <vector>
using namespace std;
namespace
{
const int index64[64] = {
    63,  0, 58,  1, 59, 47, 53,  2,
    60, 39, 48, 27, 54, 33, 42,  3,
    61, 51, 37, 40, 49, 18, 28, 20,
    55, 30, 34, 11, 43, 14, 22,  4,
    62, 57, 46, 52, 38, 26, 32, 41,
    50, 36, 17, 19, 29, 10, 13, 21,
    56, 45, 25, 31, 35, 16,  9, 12,
    44, 24, 15,  8, 23,  7,  6,  5  };
const uint64_t debruijn64 = 0x07EDD5E59A4E28C2ULL;
}
int firstBit(uint64_t bitboard)
{
    return index64[((bitboard & -bitboard) * debruijn64) >> 58];  
}
vector<int> bits(uint64_t bitboard)
{
    vector<int> res;
    res.reserve(64);
    while(bitboard)
    {
        int first = firstBit(bitboard);
        res.push_back(first);
        bitboard &= ~(1ULL << first);
    }
    return res;
}
vector<int> bits_c(uint64_t bitboard)
{
    int n;
    vector<int> res;
    res.reserve(64);
    for (n = 0; bitboard != 0; n++, bitboard &= (bitboard - 1))
    {
        res.push_back(log2(bitboard & ~(bitboard - 1)));
    }
    return res;
}
vector<int> bits_asm(uint64_t bitboard)
{
    int64_t n(0);
    int res[64];
    asm(
    "bsf %[b], %%rax\n\t"
    "je exit\n\t"
    ".align 16\n"
"loop:\n\t"
    "mov %%eax, (%[r],%[n],4)\n\t"
    "btr %%rax, %[b]\n\t"
    "inc %[n]\n\t"
    "bsf %[b], %%rax\n\t"
    "je loop\n"
"exit:\n\t"
    : /* output */ "=r" (n)
    : /* input */ [n] "r" (n), [r] "r" (res), [b] "r" (bitboard)
    : /* state */ "eax", "cc"
    );
    return vector<int>(res, res + n);
}
class run_timer
{
public:
    run_timer()
    {
    }
    void start()
    {
        times(&f_start);
    }
    void stop()
    {
        times(&f_stop);
    }
    void report(const char *msg)
    {
        printf("%s:\n"
               "Function started at %ld\n"
               "           ended at %ld\n"
               "              spent %ld (%f seconds)\n",
               msg,
               f_start.tms_utime,
               f_stop.tms_utime,
               f_stop.tms_utime - f_start.tms_utime,
               (double)(f_stop.tms_utime - f_start.tms_utime)/(double)sysconf(_SC_CLK_TCK));
    }
    struct tms f_start;
    struct tms f_stop;
};
int main(int argc, char *argv[])
{
    run_timer t;
    t.start();
    for(int i(0); i < 10000000; ++i)
    {
        bits(rand());
    }
    t.stop();
    t.report("bits version");
    t.start();
    for(int i(0); i < 10000000; ++i)
    {
        bits_c(rand());
    }
    t.stop();
    t.report("c version");
    t.start();
    for(int i(0); i < 10000000; ++i)
    {
        bits_asm(rand());
    }
    t.stop();
    t.report("c version");
    return 0;
}
使用此命令行使用g ++编译:
c++ -msse4.2 -O2 -o bits -c bits.cpp
虽然您可能认为-msse4.2可能是log2()版本的问题,但我试过没有它,而log2()则慢了.
顺便说一句,我不推荐这种方法,因为它不便携.只有基于Intel的计算机才能理解这些说明.
firstBit使用BSF或BSR指令替换您的函数以获得大量加速.
在gcc中,它会__builtin_ffsll和__builtin_ctzll
使用Visual C +,_BitScanForward和_BitScanReverse