Dr.*_*eon 31 c++ performance 64-bit bit-manipulation bitboard
好吧,让我们考虑一个64位数字,其位形成一个8x8表.
例如
0 1 1 0 1 0 1 0 0 1 1 0 1 0 1 1 0 1 1 1 1 0 1 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 1 0 0 1 1 0 1 0 1 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 0
写作
a b c d e f g h
----------------
0 1 1 0 1 0 1 0
0 1 1 0 1 0 1 1
0 1 1 1 1 0 1 0
0 1 1 0 1 0 1 0
1 1 1 0 1 0 1 0
0 1 1 0 1 0 1 0
0 1 1 0 1 1 1 0
0 1 1 0 1 0 1 0
Run Code Online (Sandbox Code Playgroud)
现在,如果我们想要隔离JUST例如列d(00100000)(或任何行/对角线),该怎么办?
可以这样做吗? 如果是这样,怎么样?
提示:
(a)我的主要目标 - 虽然最初没有提到 - 是原始速度.我正在寻找最快的算法,因为"检索"功能每秒执行数百万次.
(b)这更接近我的意思:https://www.chessprogramming.org/Kindergarten_Bitboards
vir*_*tor 63
这是一个只有4个主要步骤的解决方案:
const uint64_t column_mask = 0x8080808080808080ull;
const uint64_t magic = 0x2040810204081ull;
int get_col(uint64_t board, int col) {
uint64_t column = (board << col) & column_mask;
column *= magic;
return (column >> 56) & 0xff;
}
Run Code Online (Sandbox Code Playgroud)
它的工作原理如下:
选择幻数以仅复制所需的位,并将其余位置放入未使用的位置/溢出该位数.过程看起来像这样(数字是位"ID",而不是数字本身):
original column: ...1.......2.......3.......4.......5.......6.......7.......8....
aligned column: 1.......2.......3.......4.......5.......6.......7.......8.......
multiplied: 123456782345678.345678..45678...5678....678.....78......8.......
shifted to right:........................................................12345678
Run Code Online (Sandbox Code Playgroud)
如果你添加const关键字,汇编实际上变得非常好:
get_col:
.LFB7:
.cfi_startproc
movl %esi, %ecx
movabsq $-9187201950435737472, %rax
salq %cl, %rdi
andq %rax, %rdi
movabsq $567382630219905, %rax
imulq %rax, %rdi
shrq $56, %rdi
movl %edi, %eax
ret
Run Code Online (Sandbox Code Playgroud)
没有分支,没有外部数据,每次计算大约0.4ns.
编辑:使用NPE的解决方案作为基线(下一个最快的一个)大约需要6个时间
是的,所以为了"解决"关于哪个更快/更慢/等等的争论,我已经将所有代码放入一个程序中[我希望我已经为正确的代码片段归功于合适的人].
代码可以在下面找到,以便在我将代码编入函数时正确地编写代码.我确实运行了它没有正确的输出并检查每个函数给出相同的结果[记住在某些情况下顺序略有不同 - 所以我做了一个变体来运行我的代码的另一种方式,只是为了看到它给出"正确"的结果].所以不用多说了,结果如下:
mats1 time in clocks per iteration 10.3457
mats2 time in clocks per iteration 10.4785
mats3 time in clocks per iteration 10.5538
viraptor time in clocks per iteration 6.24603
lemees time in clocks per iteration 14.4818
npe time in clocks per iteration 13.1455
alex time in clocks per iteration 24.8272
Run Code Online (Sandbox Code Playgroud)
(viraptor的结果来自核心i5,g ++ 4.7)
mats1 time in clocks per iteration 7.62338
mats2 time in clocks per iteration 7.36226
mats3 time in clocks per iteration 7.45361
viraptor time in clocks per iteration 2.09582
lemees time in clocks per iteration 9.43744
npe time in clocks per iteration 7.51016
alex time in clocks per iteration 19.3554
Run Code Online (Sandbox Code Playgroud)
(viraptor的结果来自核心i5,clang ++ 3.2)
mats1 time in clocks per iteration 12.956
mats2 time in clocks per iteration 13.4395
mats3 time in clocks per iteration 13.3178
viraptor time in clocks per iteration 2.12914
lemees time in clocks per iteration 13.9267
npe time in clocks per iteration 16.2102
alex time in clocks per iteration 13.8705
Run Code Online (Sandbox Code Playgroud)
这是3.4GHz AMD Athlon2上的时钟周期 - 我没有现代的英特尔机器 - 如果有人希望运行代码,我会有兴趣看看它是什么样的.我相当确定它在缓存中运行良好 - 可能除了获取一些值以进行检查之外.
因此,胜利者显然是viraptor,大约40% - "我的"代码是第二.Alex的代码没有任何跳转/分支,但它似乎比其他替代品运行得慢.不确定为什么npe的结果比我的慢得多 - 它几乎完全相同(当从g ++查看汇编器输出时,代码看起来非常相似).
#include <iostream>
#include <fstream>
#include <cstdint>
using namespace std;
const int SIZE = 1000000;
uint64_t g_val[SIZE];
ofstream nulloutput;
static __inline__ unsigned long long rdtsc(void)
{
unsigned hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}
#define BITA_TO_B(x, a, b) (((x) >> (a-b)) & (1 << b))
unsigned char get_col_mats1(uint64_t val, int col)
{
return BITA_TO_B(val, 56+col, 7) |
BITA_TO_B(val, 48+col, 6) |
BITA_TO_B(val, 40+col, 5) |
BITA_TO_B(val, 32+col, 4) |
BITA_TO_B(val, 24+col, 3) |
BITA_TO_B(val, 16+col, 2) |
BITA_TO_B(val, 8+col, 1) |
BITA_TO_B(val, 0+col, 0);
}
unsigned char get_col_mats2(uint64_t val, int col)
{
return BITA_TO_B(val, 63-col, 7) |
BITA_TO_B(val, 55-col, 6) |
BITA_TO_B(val, 47-col, 5) |
BITA_TO_B(val, 39-col, 4) |
BITA_TO_B(val, 31-col, 3) |
BITA_TO_B(val, 23-col, 2) |
BITA_TO_B(val, 15-col, 1) |
BITA_TO_B(val, 7-col, 0);
}
unsigned char get_col_viraptor(uint64_t board, int col) {
const uint64_t column_mask = 0x8080808080808080ull;
const uint64_t magic = 0x2040810204081ull ;
uint64_t column = board & (column_mask >> col);
column <<= col;
column *= magic;
return (column >> 56) & 0xff;
}
unsigned char get_col_alex(uint64_t bitboard, int col)
{
unsigned char result;
result |= (bitboard & (1ULL << 63-col)) ? 0x80 : 0;
result |= (bitboard & (1ULL << 55-col)) ? 0x40 : 0;
result |= (bitboard & (1ULL << 47-col)) ? 0x20 : 0;
result |= (bitboard & (1ULL << 39-col)) ? 0x10 : 0;
result |= (bitboard & (1ULL << 31-col)) ? 0x08 : 0;
result |= (bitboard & (1ULL << 23-col)) ? 0x04 : 0;
result |= (bitboard & (1ULL << 15-col)) ? 0x02 : 0;
result |= (bitboard & (1ULL << 7-col)) ? 0x01 : 0;
return result;
}
unsigned char get_col_lemees(uint64_t val, int column)
{
int result = 0;
int source_bitpos = 7 - column; // "point" to last entry in this column
for (int target_bitpos = 0; target_bitpos < 8; ++target_bitpos)
{
bool bit = (val >> source_bitpos) & 1; // "extract" bit
result |= bit << target_bitpos; // add bit if it was set
source_bitpos += 8; // move one up in table
}
return result;
}
int get(uint64_t board, int row, int col) {
return (board >> (row * 8 + col)) & 1;
}
uint8_t get_col_npe(uint64_t board, int col) {
uint8_t ret = 0;
for (int i = 0; i < 8; ++i) {
ret = (ret << 1) + get(board, i, col);
}
return ret;
}
#define BITA_TO_B2(x, a, b) (((x) >> (a-b)) & (1 << b))
unsigned char get_col_mats3(uint64_t val, int col)
{
return BITA_TO_B2(val, 63-col, 7) |
BITA_TO_B2(val, 55-col, 6) |
BITA_TO_B2(val, 47-col, 5) |
BITA_TO_B2(val, 39-col, 4) |
BITA_TO_B2(val, 31-col, 3) |
BITA_TO_B2(val, 23-col, 2) |
BITA_TO_B2(val, 15-col, 1) |
BITA_TO_B2(val, 7-col, 0);
}
template<unsigned char (*f)(uint64_t val, int col)>
void runbench(const char *name)
{
unsigned char col[8] = {0};
uint64_t long t = rdtsc();
for(int j = 0; j < SIZE; j++)
{
uint64_t val = g_val[j];
for(int i = 0; i < 8; i++)
{
col[i] += f(val, i);
}
// __asm__ __volatile__("":::"memory");
}
t = rdtsc() - t;
for(int i = 0; i < 8; i++)
{
nulloutput<< "col " << i << " has bits " << hex << (int)col[i] << endl;
}
cout << name << " time in clocks per iteration " << dec << t / (8.0 * SIZE) << endl;
}
#define BM(name) void bench_##name() { runbench<get_col_##name>(#name); }
BM(mats1);
BM(mats2);
BM(mats3);
BM(viraptor);
BM(lemees);
BM(npe);
BM(alex);
struct function
{
void (*func)(void);
const char *name;
};
#define FUNC(f) { bench_##f, #f }
function funcs[] =
{
FUNC(mats1),
FUNC(mats2),
FUNC(mats3),
FUNC(viraptor),
FUNC(lemees),
FUNC(npe),
FUNC(alex),
};
int main()
{
unsigned long long a, b;
int i;
int sum = 0;
nulloutput.open("/dev/nul");
for(i = 0; i < SIZE; i++)
{
g_val[i] = rand() + ((long)rand() << 32L);
}
unsigned char col[8];
for(i = 0; i < sizeof(funcs)/sizeof(funcs[0]); i++)
{
funcs[i].func();
}
}
Run Code Online (Sandbox Code Playgroud)