kri*_*iss 23 c c++ sorting algorithm optimization
回答另一个问题,我编写了下面的程序来比较排序数组中的不同搜索方法.基本上我比较了插值搜索和二分搜索的两种实现.我通过计算不同变体所花费的周期(使用相同的数据集)来比较性能.
但是我确信有一些方法可以优化这些功能,使它们更快.有没有人对如何更快地使这个搜索功能有任何想法?使用C或C++的解决方案是可以接受的,但我需要它来处理具有100000个元素的数组.
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <stdint.h>
#include <assert.h>
static __inline__ unsigned long long rdtsc(void)
{
unsigned long long int x;
__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
return x;
}
int interpolationSearch(int sortedArray[], int toFind, int len) {
// Returns index of toFind in sortedArray, or -1 if not found
int64_t low = 0;
int64_t high = len - 1;
int64_t mid;
int l = sortedArray[low];
int h = sortedArray[high];
while (l <= toFind && h >= toFind) {
mid = low + (int64_t)((int64_t)(high - low)*(int64_t)(toFind - l))/((int64_t)(h-l));
int m = sortedArray[mid];
if (m < toFind) {
l = sortedArray[low = mid + 1];
} else if (m > toFind) {
h = sortedArray[high = mid - 1];
} else {
return mid;
}
}
if (sortedArray[low] == toFind)
return low;
else
return -1; // Not found
}
int interpolationSearch2(int sortedArray[], int toFind, int len) {
// Returns index of toFind in sortedArray, or -1 if not found
int low = 0;
int high = len - 1;
int mid;
int l = sortedArray[low];
int h = sortedArray[high];
while (l <= toFind && h >= toFind) {
mid = low + ((float)(high - low)*(float)(toFind - l))/(1+(float)(h-l));
int m = sortedArray[mid];
if (m < toFind) {
l = sortedArray[low = mid + 1];
} else if (m > toFind) {
h = sortedArray[high = mid - 1];
} else {
return mid;
}
}
if (sortedArray[low] == toFind)
return low;
else
return -1; // Not found
}
int binarySearch(int sortedArray[], int toFind, int len)
{
// Returns index of toFind in sortedArray, or -1 if not found
int low = 0;
int high = len - 1;
int mid;
int l = sortedArray[low];
int h = sortedArray[high];
while (l <= toFind && h >= toFind) {
mid = (low + high)/2;
int m = sortedArray[mid];
if (m < toFind) {
l = sortedArray[low = mid + 1];
} else if (m > toFind) {
h = sortedArray[high = mid - 1];
} else {
return mid;
}
}
if (sortedArray[low] == toFind)
return low;
else
return -1; // Not found
}
int order(const void *p1, const void *p2) { return *(int*)p1-*(int*)p2; }
int main(void) {
int i = 0, j = 0, size = 100000, trials = 10000;
int searched[trials];
srand(-time(0));
for (j=0; j<trials; j++) { searched[j] = rand()%size; }
while (size > 10){
int arr[size];
for (i=0; i<size; i++) { arr[i] = rand()%size; }
qsort(arr,size,sizeof(int),order);
unsigned long long totalcycles_bs = 0;
unsigned long long totalcycles_is_64 = 0;
unsigned long long totalcycles_is_float = 0;
unsigned long long totalcycles_new = 0;
int res_bs, res_is_64, res_is_float, res_new;
for (j=0; j<trials; j++) {
unsigned long long tmp, cycles = rdtsc();
res_bs = binarySearch(arr,searched[j],size);
tmp = rdtsc(); totalcycles_bs += tmp - cycles; cycles = tmp;
res_is_64 = interpolationSearch(arr,searched[j],size);
assert(res_is_64 == res_bs || arr[res_is_64] == searched[j]);
tmp = rdtsc(); totalcycles_is_64 += tmp - cycles; cycles = tmp;
res_is_float = interpolationSearch2(arr,searched[j],size);
assert(res_is_float == res_bs || arr[res_is_float] == searched[j]);
tmp = rdtsc(); totalcycles_is_float += tmp - cycles; cycles = tmp;
}
printf("----------------- size = %10d\n", size);
printf("binary search = %10llu\n", totalcycles_bs);
printf("interpolation uint64_t = %10llu\n", totalcycles_is_64);
printf("interpolation float = %10llu\n", totalcycles_is_float);
printf("new = %10llu\n", totalcycles_new);
printf("\n");
size >>= 1;
}
}
Run Code Online (Sandbox Code Playgroud)
Ben*_*igt 16
如果您可以控制数据的内存中布局,则可能需要查看Judy数组.
或者提出一个更简单的想法:二进制搜索总是将搜索空间减少一半.可以通过插值找到最佳切割点(切割点不应该是预期密钥的位置,而是最小化下一步搜索空间的统计期望的点).这最大限度地减少了步骤数量,但并非所有步骤都具有相同的成本.如果可以维持位置,则分层存储器允许在单个测试的同时执行多个测试.由于二进制搜索的前M个步骤仅触及最多2**M个唯一元素,因此将这些存储在一起可以更好地减少每个高速缓存行获取的搜索空间(不是每次比较),这在现实世界中具有更高的性能.
n-ary树在此基础上工作,然后Judy数组添加一些不太重要的优化.
结论:即使"随机存取存储器"(RAM)在顺序访问时比随机访问更快.搜索算法应该利用这一事实.
基于Win32 Core2 Quad Q6600,gcc v4.3 msys进行基准测试.用g ++ -O3编译,没什么特别的.
观察 - 断言,定时和循环开销约为40%,因此下面列出的任何增益应除以0.6,以获得被测算法的实际改进.
答案简单:
在我的机器上,用int替换int64_t为"低",插值搜索中的"高"和"中"使速度提高20%到40%.这是我能找到的最简单的方法.在我的机器上每次查找大约需要150个周期(阵列大小为100000).这与缓存未命中的周期数大致相同.所以在实际应用中,照顾缓存可能是最重要的因素.
用">> 1"替换binarySearch的"/ 2"可以加快4%的速度.
在包含与"arr"相同的数据的向量上使用STL的binary_search算法,与手动编码的binarySearch大致相同.虽然在较小的"尺寸"上,STL要慢得多 - 大约40%.