cni*_*tar 163
我有很好的结果与djb2由丹·伯恩斯坦.
unsigned long
hash(unsigned char *str)
{
unsigned long hash = 5381;
int c;
while (c = *str++)
hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
return hash;
}
Run Code Online (Sandbox Code Playgroud)
And*_*kha 28
我想验证卞晓宁的回答,可惜他没有贴出他的代码。因此,我实现了一个小测试套件,并在466K 英语单词列表上运行不同的小哈希函数,以查看每个单词的冲突数量:
Hash function | Collisions | Time (words) | Time (file) | Avalanching
===============================================================================
CRC32 | 23 (0.005%) | 96.1 ms | 120.9 ms | 99.8%
MurmurOAAT | 26 (0.006%) | 17.2 ms | 19.4 ms | 100.0%
FNV hash | 32 (0.007%) | 16.4 ms | 14.0 ms | 98.6%
Jenkins OAAT | 36 (0.008%) | 19.9 ms | 18.5 ms | 100.0%
DJB2 hash | 344 (0.074%) | 18.2 ms | 12.7 ms | 92.9%
K&R V2 | 356 (0.076%) | 18.0 ms | 11.9 ms | 92.8%
Coffin | 763 (0.164%) | 16.3 ms | 11.7 ms | 91.4%
x17 hash | 2242 (0.481%) | 18.6 ms | 20.0 ms | 91.5%
-------------------------------------------------------------------------------
large_prime | 25 (0.005%) | 16.5 ms | 18.6 ms | 96.6%
MurmurHash3_x86_32 | 19 (0.004%) | 20.5 ms | 4.4 ms | 100.0%
Run Code Online (Sandbox Code Playgroud)
我包括了两者的时间:分别对所有单词进行哈希处理,并对所有英语单词的整个文件进行一次哈希处理。我还在MurmurHash3_x86_32我的测试中加入了一个更复杂的内容以供参考。此外,我还计算了“雪崩”,它是衡量连续单词的哈希值的不可预测性的指标。任何低于 100% 的值都意味着“相当可预测”(这对于哈希表可能没问题,但对于其他用途来说很糟糕)。
结论:
Liz/ MHz、Bon/ COM、Rey/ SEX。hash = 17000069 * hash + s[i]与 DJB2 的 344 次碰撞相比,仅产生 25 次碰撞。测试代码(用 编译gcc -O2):
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#define MODE 1 // 1 = hash each word individually; 2 = hash the entire file
#define MAXLEN 50 // maximum length of a word
#define MAXWORDS 500000 // maximum number of words in a file
#define SEED 0x12345678
#if MODE == 1
char words[MAXWORDS][MAXLEN];
#else
char file[MAXWORDS * MAXLEN];
#endif
uint32_t DJB2_hash(const uint8_t *str)
{
uint32_t hash = 5381;
uint8_t c;
while ((c = *str++))
hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
return hash;
}
uint32_t FNV(const void* key, uint32_t h)
{
// See: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
h ^= 2166136261UL;
const uint8_t* data = (const uint8_t*)key;
for(int i = 0; data[i] != '\0'; i++)
{
h ^= data[i];
h *= 16777619;
}
return h;
}
uint32_t MurmurOAAT_32(const char* str, uint32_t h)
{
// One-byte-at-a-time hash based on Murmur's mix
// Source: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
for (; *str; ++str) {
h ^= *str;
h *= 0x5bd1e995;
h ^= h >> 15;
}
return h;
}
uint32_t KR_v2_hash(const char *s)
{
// Source: /sf/answers/3194870171/
// a.k.a. Java String hashCode()
uint32_t hashval = 0;
for (hashval = 0; *s != '\0'; s++)
hashval = *s + 31*hashval;
return hashval;
}
uint32_t Jenkins_one_at_a_time_hash(const char *str)
{
uint32_t hash, i;
for (hash = i = 0; str[i] != '\0'; ++i)
{
hash += str[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash;
}
uint32_t crc32b(const uint8_t *str) {
// Source: /sf/answers/1470119871/
unsigned int byte, crc, mask;
int i = 0, j;
crc = 0xFFFFFFFF;
while (str[i] != 0) {
byte = str[i];
crc = crc ^ byte;
for (j = 7; j >= 0; j--) {
mask = -(crc & 1);
crc = (crc >> 1) ^ (0xEDB88320 & mask);
}
i = i + 1;
}
return ~crc;
}
inline uint32_t _rotl32(uint32_t x, int32_t bits)
{
return x<<bits | x>>(32-bits); // C idiom: will be optimized to a single operation
}
uint32_t Coffin_hash(char const *input) {
// Source: /sf/answers/536666791/
uint32_t result = 0x55555555;
while (*input) {
result ^= *input++;
result = _rotl32(result, 5);
}
return result;
}
uint32_t x17(const void * key, uint32_t h)
{
// See: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
const uint8_t * data = (const uint8_t*)key;
for (int i = 0; data[i] != '\0'; ++i)
{
h = 17 * h + (data[i] - ' ');
}
return h ^ (h >> 16);
}
uint32_t large_prime(const uint8_t *s, uint32_t hash)
{
// Better alternative to x33: multiply by a large co-prime instead of a small one
while (*s != '\0')
hash = 17000069*hash + *s++;
return hash;
}
void readfile(FILE* fp, char* buffer) {
fseek(fp, 0, SEEK_END);
size_t size = ftell(fp);
rewind(fp);
fread(buffer, size, 1, fp);
buffer[size] = '\0';
}
struct timeval stop, start;
void timing_start() {
gettimeofday(&start, NULL);
}
void timing_end() {
gettimeofday(&stop, NULL);
printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
}
uint32_t apply_hash(int hash, const char* line)
{
uint32_t result;
#if MODE == 2
timing_start();
#endif
switch (hash) {
case 1: result = crc32b((const uint8_t*)line); break;
case 2: result = large_prime((const uint8_t *)line, SEED); break;
case 3: result = MurmurOAAT_32(line, SEED); break;
case 4: result = FNV(line, SEED); break;
case 5: result = Jenkins_one_at_a_time_hash(line); break;
case 6: result = DJB2_hash((const uint8_t*)line); break;
case 7: result = KR_v2_hash(line); break;
case 8: result = Coffin_hash(line); break;
case 9: result = x17(line, SEED); break;
default: break;
}
#if MODE == 2
timing_end();
#endif
return result;
}
int main(int argc, char* argv[])
{
// Read arguments
const int hash_choice = atoi(argv[1]);
char const* const fname = argv[2];
printf("Algorithm: %d\n", hash_choice);
printf("File name: %s\n", fname);
// Open file
FILE* f = fopen(fname, "r");
#if MODE == 1
char line[MAXLEN];
size_t word_count = 0;
uint32_t hashes[MAXWORDS];
// Read file line by line
while (fgets(line, sizeof(line), f)) {
line[strcspn(line, "\n")] = '\0'; // strip newline
strcpy(words[word_count], line);
word_count++;
}
fclose(f);
// Calculate hashes
timing_start();
for (size_t i = 0; i < word_count; i++) {
hashes[i] = apply_hash(hash_choice, words[i]);
}
timing_end();
// Output hashes
for (size_t i = 0; i < word_count; i++) {
printf("%08x\n", hashes[i]);
}
#else
// Read entire file
readfile(f, file);
fclose(f);
printf("Len: %lu\n", strlen(file));
// Calculate hash
uint32_t hash = apply_hash(hash_choice, file);
printf("%08x\n", hash);
#endif
return 0;
}
Run Code Online (Sandbox Code Playgroud)
PS 对现代哈希函数的速度和质量的更全面的回顾可以在 Reini Urban (rurban) 的SMHasher 存储库中找到。请注意表中的“质量问题”列。
Jer*_*fin 20
首先,您通常不希望对哈希表使用加密哈希.根据哈希表标准,加密标准非常快的算法仍然非常慢.
其次,您希望确保输入的每一位都能/将影响结果.一种简单的方法是将当前结果旋转一些位,然后用当前字节对当前哈希码进行异或.重复,直到到达字符串的末尾.请注意,您通常不希望旋转为字节大小的偶数倍.
例如,假设8位字节的常见情况,您可以旋转5位:
int hash(char const *input) {
int result = 0x55555555;
while (*input) {
result ^= *input++;
result = rol(result, 5);
}
}
Run Code Online (Sandbox Code Playgroud)
编辑:另请注意,10000个插槽很少是哈希表大小的好选择.您通常需要以下两种方法之一:您要么使用素数作为大小(需要确保某些类型的散列分辨率的正确性),要么需要2的幂(因此可以通过简单的方法将值减小到正确的范围位掩码).
djb2很好尽管cnicutar 在 stackoverflow 上介绍的djb2几乎肯定更好,但我认为也值得展示K&R哈希值:
unsigned long hash(unsigned char *str)
{
unsigned int hash = 0;
int c;
while (c = *str++)
hash += c;
return hash;
}
Run Code Online (Sandbox Code Playgroud)
% HASHSIZE注意:如果您打算在哈希算法之外对数组长度进行模数调整,请务必从返回语句中删除。另外,我建议您将 return 和“hashval”类型设为unsigned long,甚至更好:uint32_tor uint64_t,而不是简单的unsigned(int) 。这是一个简单的算法,它通过执行以下算法风格来考虑字符串中每个字节的字节顺序hashvalue = new_byte + 31*hashvalue:对于字符串中的所有字节:
unsigned hash(char *s)
{
unsigned hashval;
for (hashval = 0; *s != '\0'; s++)
hashval = *s + 31*hashval;
return hashval % HASHSIZE;
}
Run Code Online (Sandbox Code Playgroud)
请注意,从这两种算法可以清楚地看出,第一版哈希如此糟糕的原因之一是它没有考虑字符串字符顺序,因此hash("ab")会返回与 相同的值hash("ba")。然而,第二版哈希并非如此,它会(更好!)为这些字符串返回两个不同的值。
std::unordered_map<>模板容器哈希表使用的GCC C++11哈希函数非常出色。用于unordered_map(哈希表模板)和unordered_set(哈希集模板)的 GCC C++11 哈希函数如下所示。
代码:
// Implementation of Murmur hash for 32-bit size_t.
size_t _Hash_bytes(const void* ptr, size_t len, size_t seed)
{
const size_t m = 0x5bd1e995;
size_t hash = seed ^ len;
const char* buf = static_cast<const char*>(ptr);
// Mix 4 bytes at a time into the hash.
while (len >= 4)
{
size_t k = unaligned_load(buf);
k *= m;
k ^= k >> 24;
k *= m;
hash *= m;
hash ^= k;
buf += 4;
len -= 4;
}
// Handle the last few bytes of the input array.
switch (len)
{
case 3:
hash ^= static_cast<unsigned char>(buf[2]) << 16;
[[gnu::fallthrough]];
case 2:
hash ^= static_cast<unsigned char>(buf[1]) << 8;
[[gnu::fallthrough]];
case 1:
hash ^= static_cast<unsigned char>(buf[0]);
hash *= m;
};
// Do a few final mixes of the hash.
hash ^= hash >> 13;
hash *= m;
hash ^= hash >> 15;
return hash;
}
Run Code Online (Sandbox Code Playgroud)
std::unordered_map<>。Austin 不仅是所有这些中最好的,而且还将 MurmerHash3 发布到公共领域。请参阅我的其他答案:What is the default hash function using in C++ std::unordered_map? 。
维基百科显示了一个很棒的字符串哈希函数,名为Jenkins One A Time Hash.它还引用了此哈希的改进版本.
uint32_t jenkins_one_at_a_time_hash(char *key, size_t len)
{
uint32_t hash, i;
for(hash = i = 0; i < len; ++i)
{
hash += key[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash;
}
Run Code Online (Sandbox Code Playgroud)
djb2 对于这个 466k 的英语词典有 317 次冲突,而 MurmurHash 没有 64 位哈希的冲突,21 次 32 位哈希(466k 随机 32 位哈希预计大约 25 次)。我的建议是使用MurmurHash(如果可用),它非常快,因为它一次需要几个字节。但是,如果您需要一个简单而简短的哈希函数来复制并粘贴到您的项目中,我建议您使用 murmurs 一次一个字节的版本:
uint32_t inline MurmurOAAT32 ( const char * key)
{
uint32_t h(3323198485ul);
for (;*key;++key) {
h ^= *key;
h *= 0x5bd1e995;
h ^= h >> 15;
}
return h;
}
uint64_t inline MurmurOAAT64 ( const char * key)
{
uint64_t h(525201411107845655ull);
for (;*key;++key) {
h ^= *key;
h *= 0x5bd1e9955bd1e995;
h ^= h >> 47;
}
return h;
}
Run Code Online (Sandbox Code Playgroud)
哈希表的最佳大小 - 简而言之 - 在仍然适合内存的情况下尽可能大。因为我们通常不知道或不想查看我们有多少可用内存,甚至可能会发生变化,所以最佳哈希表大小大约是表中预期存储元素数量的 2 倍。分配比这多得多的值将使您的哈希表更快,但收益会迅速减少,使您的哈希表比这更小将使其速度呈指数级增长。这是因为哈希表的空间复杂度和时间复杂度之间存在非线性权衡,最佳负载因子为 2-sqrt(2) = 0.58 ......显然。
| 归档时间: |
|
| 查看次数: |
193155 次 |
| 最近记录: |