Cla*_*diu 38 c 32-bit bit-manipulation
我有一个位数组实现,其中第0个索引是数组中第一个字节的MSB,第8个索引是第二个字节的MSB,等等...
找到这个位数组中设置的第一个位的快速方法是什么?我查找的所有相关解决方案都找到了第一个最重要的位,但我需要第一个最重要的解决方案.所以,给定0x00A1,我想要8(因为它是左起第9位).
And*_*ass 42
GCC __builtin_clz
转换为x86/x64上的BSR,ARM上的CLZ等,如果硬件没有实现,则模拟指令.
Visual C++ 2005及以上版本_BitScanReverse
.
joh*_*yrd 25
TL:博士; 对于32位,使用de Bruijn乘法.
这是"最快"的便携式算法.它比这个线程中的所有其他便携式32位MSB算法快得多且更正确.
当输入为零时,de Bruijn算法也返回正确的结果. 当输入为零时,__ builtin_clz和_BitScanReverse指令返回错误的结果.
在x86-64上,de Bruijn乘法的运行速度与等效(有缺陷的)硬件指令相当,性能差异仅为3%左右.
这是代码.
u32 msbDeBruijn32( u32 v )
{
static const int MultiplyDeBruijnBitPosition[32] =
{
0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
};
v |= v >> 1; // first round down to one less than a power of 2
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
return MultiplyDeBruijnBitPosition[( u32 )( v * 0x07C4ACDDU ) >> 27];
}
Run Code Online (Sandbox Code Playgroud)
此主题中的所有其他答案要么比作者建议的要差得多,要么不能正确计算结果,要么两者都不正确.让我们对它们进行基准测试,让我们验证它们是否按照他们的要求进行操作.
这是一个简单的C++ 11工具来测试所有这些实现.它在Visual Studio上编译干净,但应该适用于所有现代编译器.它允许您在性能模式(bVerifyResults = false)和检查模式(bVerifyResults = true)下运行基准测试.
以下是验证模式的结果:
Verification failed for msbNative64: input was 0; output was 818af060; expected 0
Verification failed for msbFfs: input was 22df; output was 0; expected d
Verification failed for msbPerformanceJunkie32: input was 0; output was ffffffff; expected 0
Verification failed for msbNative32: input was 0; output was 9ab07060; expected 0
Run Code Online (Sandbox Code Playgroud)
当输入为零时,"性能迷"和Microsoft本机实现会做不同的事情.msbPerformanceJunkie32产生-1,而Microsoft的_BitScanReverse产生一个随机数,与底层硬件指令一致.此外,msbPerformanceJunkie32实现产生的结果与所有其他答案中的结果相差一个.
以下是性能模式下的结果,在我的i7-4600笔记本电脑上运行,在发布模式下编译:
msbLoop64 took 2.56751 seconds
msbNative64 took 0.222197 seconds
msbLoop32 took 1.43456 seconds
msbFfs took 0.525097 seconds
msbPerformanceJunkie32 took 1.07939 seconds
msbDeBruijn32 took 0.224947 seconds
msbNative32 took 0.218275 seconds
Run Code Online (Sandbox Code Playgroud)
de Bruijn版本完美地击败了其他实现,因为它是无分支的,因此它可以很好地抵抗产生均匀分布的输出集的输入.由于对现代CPU的分支错误预测的惩罚,所有其他版本对任意输入的速度较慢.smbFfs函数产生不正确的结果,因此可以忽略它.
一些实现适用于32位输入,有些适用于64位输入.无论输入大小如何,模板都可以帮助我们比较苹果和苹果.
这是代码.如果您愿意,可以自行下载并运行基准测试.
#include <iostream>
#include <chrono>
#include <random>
#include <cassert>
#include <string>
#include <limits>
#ifdef _MSC_VER
#define MICROSOFT_COMPILER 1
#include <intrin.h>
#endif // _MSC_VER
const int iterations = 100000000;
bool bVerifyResults = false;
std::random_device rd;
std::default_random_engine re(rd());
typedef unsigned int u32;
typedef unsigned long long u64;
class Timer
{
public:
Timer() : beg_(clock_::now()) {}
void reset() {
beg_ = clock_::now();
}
double elapsed() const {
return std::chrono::duration_cast<second_>
(clock_::now() - beg_).count();
}
private:
typedef std::chrono::high_resolution_clock clock_;
typedef std::chrono::duration<double, std::ratio<1> > second_;
std::chrono::time_point<clock_> beg_;
};
unsigned int msbPerformanceJunkie32(u32 x)
{
static const unsigned int bval[] =
{ 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4 };
unsigned int r = 0;
if (x & 0xFFFF0000) {
r += 16 / 1;
x >>= 16 / 1;
}
if (x & 0x0000FF00) {
r += 16 / 2;
x >>= 16 / 2;
}
if (x & 0x000000F0) {
r += 16 / 4;
x >>= 16 / 4;
}
return r + bval[x];
}
#define FFS(t) \
{ \
register int n = 0; \
if (!(0xffff & t)) \
n += 16; \
if (!((0xff << n) & t)) \
n += 8; \
if (!((0xf << n) & t)) \
n += 4; \
if (!((0x3 << n) & t)) \
n += 2; \
if (!((0x1 << n) & t)) \
n += 1; \
return n; \
}
unsigned int msbFfs32(u32 x)
{
FFS(x);
}
unsigned int msbLoop32(u32 x)
{
int r = 0;
if (x < 1) return 0;
while (x >>= 1) r++;
return r;
}
unsigned int msbLoop64(u64 x)
{
int r = 0;
if (x < 1) return 0;
while (x >>= 1) r++;
return r;
}
u32 msbDeBruijn32(u32 v)
{
static const int MultiplyDeBruijnBitPosition[32] =
{
0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
};
v |= v >> 1; // first round down to one less than a power of 2
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
return MultiplyDeBruijnBitPosition[(u32)(v * 0x07C4ACDDU) >> 27];
}
#ifdef MICROSOFT_COMPILER
u32 msbNative32(u32 val)
{
unsigned long result;
_BitScanReverse(&result, val);
return result;
}
u32 msbNative64(u64 val)
{
unsigned long result;
_BitScanReverse64(&result, val);
return result;
}
#endif // MICROSOFT_COMPILER
template <typename InputType>
void test(unsigned int msbFunc(InputType),
const std::string &name,
const std::vector< InputType > &inputs,
std::vector< unsigned int > &results,
bool bIsReference = false
)
{
if (bIsReference)
{
int i = 0;
for (int i = 0; i < iterations; i++)
results[i] = msbFunc(inputs[i]);
}
InputType result;
if (bVerifyResults)
{
bool bNotified = false;
for (int i = 0; i < iterations; i++)
{
result = msbFunc(inputs[i]);
if ((result != results[i]) && !bNotified)
{
std::cout << "Verification failed for " << name << ": "
<< "input was " << std::hex << inputs[i]
<< "; output was " << result
<< "; expected " << results[i]
<< std::endl;
bNotified = true;
}
}
}
else
{
Timer t;
for (int i = 0; i < iterations; i++)
{
result = msbFunc(inputs[i]);
}
double elapsed = t.elapsed();
if ( !bIsReference )
std::cout << name << " took " << elapsed << " seconds" << std::endl;
if (result == -1.0f)
std::cout << "this comparison only exists to keep the compiler from " <<
"optimizing out the benchmark; this branch will never be called";
}
}
void main()
{
std::uniform_int_distribution <u64> dist64(0,
std::numeric_limits< u64 >::max());
std::uniform_int_distribution <u32> shift64(0, 63);
std::vector< u64 > inputs64;
for (int i = 0; i < iterations; i++)
{
inputs64.push_back(dist64(re) >> shift64(re));
}
std::vector< u32 > results64;
results64.resize(iterations);
test< u64 >(msbLoop64, "msbLoop64", inputs64, results64, true);
test< u64 >(msbLoop64, "msbLoop64", inputs64, results64, false);
#ifdef MICROSOFT_COMPILER
test< u64 >(msbNative64, "msbNative64", inputs64, results64, false);
#endif // MICROSOFT_COMPILER
std::cout << std::endl;
std::uniform_int_distribution <u32> dist32(0,
std::numeric_limits< u32 >::max());
std::uniform_int_distribution <u32> shift32(0, 31);
std::vector< u32 > inputs32;
for (int i = 0; i < iterations; i++)
inputs32.push_back(dist32(re) >> shift32(re));
std::vector< u32 > results32;
results32.resize(iterations);
test< u32 >(msbLoop32, "msbLoop32", inputs32, results32, true);
test< u32 >(msbLoop32, "msbLoop32", inputs32, results32, false);
test< u32 >(msbFfs32, "msbFfs", inputs32, results32, false);
test< u32 >(msbPerformanceJunkie32, "msbPerformanceJunkie32",
inputs32, results32, false);
test< u32 >(msbDeBruijn32, "msbDeBruijn32", inputs32, results32, false);
#ifdef MICROSOFT_COMPILER
test< u32 >(msbNative32, "msbNative32", inputs32, results32, false);
#endif // MICROSOFT_COMPILER
}
Run Code Online (Sandbox Code Playgroud)
小智 19
作为一名表演爱好者,我尝试了很多MSB套装的变化,以下是我遇到过的最快的,
unsigned int msb32(unsigned int x)
{
static const unsigned int bval[] =
{0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4};
unsigned int r = 0;
if (x & 0xFFFF0000) { r += 16/1; x >>= 16/1; }
if (x & 0x0000FF00) { r += 16/2; x >>= 16/2; }
if (x & 0x000000F0) { r += 16/4; x >>= 16/4; }
return r + bval[x];
}
Run Code Online (Sandbox Code Playgroud)
查找BSR(位扫描反向)x86 asm指令,以最快的方式执行此操作.来自英特尔的文档:
Searches the source operand (second operand) for the most significant set bit (1 bit).
If a most significant 1 bit is found, its bit index is stored in the destination operand
(first operand).
归档时间: |
|
查看次数: |
74223 次 |
最近记录: |