有效地实现floored/euclidean整数除法

Question

有效地实现floored/euclidean整数除法

Vla*_*eev 21 c c++ math division integer-division

地板划分是指结果始终向下(朝向-∞),而不是0:

分工类型

是否有可能在C/C++中有效地实现floored或euclidean整数除法？

(显而易见的解决方案是检查红利的标志)

Answer 1

Vla*_*eev 8

我写了一个测试程序来对这里提出的想法进行基准测试:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <windows.h>

#define N 10000000
#define M 100

int dividends[N], divisors[N], results[N];

__forceinline int floordiv_signcheck(int a, int b)
{
    return (a<0 ? a-(b-1) : a) / b;
}

__forceinline int floordiv_signcheck2(int a, int b)
{
    return (a - (a<0 ? b-1 : 0)) / b;
}

__forceinline int floordiv_signmultiply(int a, int b)
{
    return (a + (a>>(sizeof(a)*8-1))*(b-1)) / b;
}

__forceinline int floordiv_floatingpoint(int a, int b)
{
    // I imagine that the call to floor can be replaced to a cast
    // if you can get FPU rounding control to work (I couldn't).
    return floor((double)a / b);
}

void main()
{
    for (int i=0; i<N; i++)
    {
        dividends[i] = rand();
        do
            divisors[i] = rand();
        while (divisors[i]==0);
    }

    LARGE_INTEGER t0, t1;

    QueryPerformanceCounter(&t0);
    for (int j=0; j<M; j++)
        for (int i=0; i<N; i++)
            results[i] = floordiv_signcheck(dividends[i], divisors[i]);
    QueryPerformanceCounter(&t1);
    printf("signcheck    : %9llu\n", t1.QuadPart-t0.QuadPart);

    QueryPerformanceCounter(&t0);
    for (int j=0; j<M; j++)
        for (int i=0; i<N; i++)
            results[i] = floordiv_signcheck2(dividends[i], divisors[i]);
    QueryPerformanceCounter(&t1);
    printf("signcheck2   : %9llu\n", t1.QuadPart-t0.QuadPart);

    QueryPerformanceCounter(&t0);
    for (int j=0; j<M; j++)
        for (int i=0; i<N; i++)
            results[i] = floordiv_signmultiply(dividends[i], divisors[i]);
    QueryPerformanceCounter(&t1);
    printf("signmultiply : %9llu\n", t1.QuadPart-t0.QuadPart);

    QueryPerformanceCounter(&t0);
    for (int j=0; j<M; j++)
        for (int i=0; i<N; i++)
            results[i] = floordiv_floatingpoint(dividends[i], divisors[i]);
    QueryPerformanceCounter(&t1);
    printf("floatingpoint: %9llu\n", t1.QuadPart-t0.QuadPart);
}

Run Code Online (Sandbox Code Playgroud)

结果:

signcheck    :  61458768
signcheck2   :  61284370
signmultiply :  61625076
floatingpoint: 287315364

Run Code Online (Sandbox Code Playgroud)

所以,根据我的结果,检查标志是最快的:

(a - (a<0 ? b-1 : 0)) / b

Run Code Online (Sandbox Code Playgroud)

这个程序不是只测试正红利吗？`rand()` 被指定为只返回正整数。 (2认同)

Answer 2

Dol*_*000 5

五年后我重新审视这个问题，因为这也与我相关。我对 x86-64 的两个纯 C 版本和两个内联汇编版本进行了一些性能测量，结果可能很有趣。

经过测试的地板划分变体是：

我已经使用了一段时间的实现；
与上面介绍的略有不同，仅使用一个分区；
前一个，但是在内联汇编中手动实现；和
CMOV在汇编中实现的版本。

以下是我的基准程序：

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#ifndef VARIANT
#define VARIANT 3
#endif

#if VARIANT == 0
#define floordiv(a, b) (((a) < 0)?((((a) + 1) / (b)) - 1):((a) / (b)))
#elif VARIANT == 1
#define floordiv(a, b) ((((a) < 0)?((a) - ((b) - 1)):(a)) / (b))
#elif VARIANT == 2
#define floordiv(a, b) ({                                   \
    int result;                                             \
    asm("test %%eax, %%eax; jns 1f; sub %1, %%eax;"         \
        "add $1, %%eax; 1: cltd; idivl %1;"                 \
        : "=a" (result)                                     \
        : "r" (b),                                          \
          "0" (a)                                           \
        : "rdx");                                           \
    result;})
#elif VARIANT == 3
#define floordiv(a, b) ({                                           \
    int result;                                                     \
    asm("mov %%eax, %%edx; sub %1, %%edx; add $1, %%edx;"           \
        "test %%eax, %%eax; cmovs %%edx, %%eax; cltd;"              \
        "idivl %1;"                                                 \
        : "=a" (result)                                             \
        : "r" (b),                                                  \
          "0" (a)                                                   \
        : "rdx");                                                   \
    result;})
#endif

double ntime(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return(tv.tv_sec + (((double)tv.tv_usec) / 1000000.0));
}

void timediv(int n, int *p, int *q, int *r)
{
    int i;

    for(i = 0; i < n; i++)
        r[i] = floordiv(p[i], q[i]);
}

int main(int argc, char **argv)
{
    int n, i, *q, *p, *r;
    double st;

    n = 10000000;
    p = malloc(sizeof(*p) * n);
    q = malloc(sizeof(*q) * n);
    r = malloc(sizeof(*r) * n);
    for(i = 0; i < n; i++) {
        p[i] = (rand() % 1000000) - 500000;
        q[i] = (rand() % 1000000) + 1;
    }

    st = ntime();
    for(i = 0; i < 100; i++)
        timediv(n, p, q, r);
    printf("%g\n", ntime() - st);
    return(0);
}

Run Code Online (Sandbox Code Playgroud)

我gcc -march=native -Ofast使用 GCC 4.9.2 编译了它，在我的 Core i5-2400 上的结果如下。每次运行的结果都相当可重复——至少它们总是以相同的顺序着陆。

变体 0：7.21 秒
变体 1：7.26 秒
变体 2：6.73 秒
变体 3：4.32 秒

因此，这一CMOV实施至少让其他人大吃一惊。让我惊讶的是，变体 2 远远超过了它的纯 C 版本（变体 1）。我认为编译器应该能够发出至少与我的代码一样高效的代码。

以下是一些其他平台，供比较：

AMD 速龙 64 X2 4200+、GCC 4.7.2：

变体 0：26.33 秒
变体 1：25.38 秒
变体 2：25.19 秒
变体 3：22.39 秒

至强 E3-1271 v3、GCC 4.9.2：

变体 0：5.95 秒
变体 1：5.62 秒
变体 2：5.40 秒
变体 3：3.44 秒

最后一点，我也许应该警告不要过于认真地对待该CMOV版本的明显性能优势，因为在现实世界中，其他版本中的分支可能不会像此基准测试中那样完全随机，并且如果分支预测器如果可以完成合理的工作，分支版本可能会更好。然而，现实情况在很大程度上取决于实践中使用的数据，因此尝试进行任何通用基准测试可能毫无意义。

请注意，变体 0 和 1（未检查其他变体）仅在除数为正时给出楼层除法的数字正确结果。（在这种情况下，结果也与欧几里得除法一致。）例如，如果 a,b = 5,-3，这些公式表示商为 -1，但楼层除法的结果应为 -2。（5,-3 的欧几里得除法是 -1，但当两个操作数均为负数时，这些公式给出的答案与下取整除法和欧几里得除法不同。） (2认同)

归档时间：	15 年，2 月前
查看次数：	1942 次
最近记录：	6 年，11 月前