nju*_*ffa 4 c algorithm floating-point trigonometry
在一些应用中,需要多个角度的正弦和余弦,其中角度是通过将相同大小的增量incr重复添加到起始值基数来导出的。出于性能原因,无需为每个生成的角度调用sin()标准cos()数学库函数(或可能是非标准sincos()函数),只计算一次sin(base)和cos(base),然后导出所有其他角度是非常有利的通过应用角和公式计算正弦和余弦:
\n\n\nsin(基数+增量) = cos(增量) \xc2\xb7 sin(基数) + sin(增量) \xc2\xb7 cos(基数) \
\n
n cos(基数+增量) = cos(增量) \xc2\xb7 cos (基数) - sin(增量) \xc2\xb7 sin(基数)
这只需要一次性预计算比例因子sin(incr)和cos(incr),而不管执行多少次迭代。
\n\n这种方法有几个问题。如果增量很小,cos(incr)将是一个接近单位的数字,在以有限精度浮点格式计算时,会因隐式减法取消而导致精度损失。此外,由于计算没有以数值有利形式sin(base+incr) = sin(base) + adjustment排列,因此会产生比必要的更多的舍入误差,其中计算的数量adjustment 的幅度明显小于sin(base) (类似于余弦)。
\n\n由于通常会应用数十到数百个迭代步骤,因此这些误差将会累积。如何以最有利于保持高精度的方式构建迭代计算?如果融合乘加运算 (FMA) 可用(通过标准数学函数fma()和公开),则应对算法进行哪些更改fmaf()?
应用正弦半角公式可以解决问题中提到的两个影响精度的问题:
\n\n\n\n\nsin(incr/2) = \xe2\x88\x9a((1-cos(incr))/2) \xe2\x87\x92
\n
\n sin\xc2\xb2(incr/2) = (1-cos(incr) ))/2 \xe2\x87\x94
\n 2\xc2\xb7sin\xc2\xb2(incr/2) = 1-cos(incr) \xe2\x87\x94
\n 1-2\xc2\xb7sin\xc2 \xb2(增量/2) = cos(增量)
将其代入原始公式会得到以下中间表示:
\n\n\n\n\nsin(基数+增量) = (1 - 2\xc2\xb7sin\xc2\xb2(incr/2)) \xc2\xb7 sin(基数) + sin(增量) \xc2\xb7 cos(基数)
\n
\n cos(基数+增量) = (1 - 2\xc2\xb7sin\xc2\xb2(增量/2)) \xc2\xb7 cos(基数) - sin(增量) \xc2\xb7 sin(基数)
通过对术语进行简单的重新排序,可以得到所需的公式形式:
\n\n\n\n\nsin(基数+incr) = sin(基数) + (sin(incr) \xc2\xb7 cos(基数) - 2\xc2\xb7sin\xc2\xb2(incr/2) \xc2\xb7 sin(基数))
\n
\ n cos(基数+增量) = cos(基数) - (2\xc2\xb7sin\xc2\xb2(增量/2) \xc2\xb7 cos(基数) + sin(增量) \xc2\xb7 sin(基数))
与原始公式一样,这只需要一次性预计算两个比例因子,即2\xc2\xb7sin\xc2\xb2(incr/2)和sin(incr)。对于小增量,它们都很小:保留完整的精度。
\n\n关于如何将 FMA 应用于此计算有两种选择。人们可以通过废除使用单次调整的方法来最小化运算次数,而是使用两次,希望 FMA 运算减少的舍入误差(两次运算一次舍入)能够补偿精度损失:
\n\n\n\n\nsin(基数+incr) = fma (-2\xc2\xb7sin\xc2\xb2(incr/2), sin(基数), fma ( sin(incr), cos(基数), sin(基数))) \
\n
n cos(基数+增量) = fma (-2\xc2\xb7sin\xc2\xb2(incr/2), cos(基数), fma (-sin(增量), sin(基数), cos(基数)))
另一种选择是将单个 FMA 应用于改进的公式,尽管目前尚不清楚两个乘法中的哪一个应映射到 FMA 内的未舍入乘法:
\n\n\n\n\nsin(基数+incr) = sin(基数) + fma (sin(incr), cos(基数), -2\xc2\xb7sin\xc2\xb2(incr/2) \xc2\xb7 sin(基数)) \
\n
n cos(基数+incr) = cos(基数) - fma (sin(incr), sin(基数), 2\xc2\xb7sin\xc2\xb2(incr/2) \xc2\xb7 cos(基数))
下面的脚手架通过生成许多(基数、增量)对来评估上面讨论的每个计算替代方案,然后对每个对进行迭代一定数量的步骤,同时收集生成的所有正弦和余弦值的误差。由此,它计算每个测试用例的均方根误差,分别计算正弦、余弦。最后报告了在所有生成的测试用例中观察到的最大 RMS 误差。
\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <math.h>\n\n#define NAIVE (1)\n#define ROBUST (2)\n#define FAST (3)\n#define ACCURATE (4)\n#define MODE (ACCURATE)\n\n// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007\nstatic unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;\n#define znew (z=36969*(z&0xffff)+(z>>16))\n#define wnew (w=18000*(w&0xffff)+(w>>16))\n#define MWC ((znew<<16)+wnew)\n#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */\n#define CONG (jcong=69069*jcong+13579) /* 2^32 */\n#define KISS ((MWC^CONG)+SHR3)\n\nint main (void)\n{\n double sumerrsqS, sumerrsqC, rmsS, rmsC, maxrmsS = 0, maxrmsC = 0;\n double refS, refC, errS, errC;\n float base, incr, s0, c0, s1, c1, tt;\n int count, i;\n\n const int N = 100; // # rotation steps per test case\n count = 2000000; // # test cases (a pair of base and increment values)\n\n#if MODE == NAIVE\n printf ("testing: NAIVE (without FMA)\\n");\n#elif MODE == FAST\n printf ("testing: FAST (without FMA)\\n");\n#elif MODE == ACCURATE\n printf ("testing: ACCURATE (with FMA)\\n");\n#elif MODE == ROBUST\n printf ("testing: ROBUST (with FMA)\\n");\n#else\n#error unsupported MODE\n#endif // MODE\n\n do {\n /* generate test case */\n base = (float)(KISS * 1.21e-10); // starting angle, < 30 degrees\n incr = (float)(KISS * 2.43e-10 / N); // increment, < 60/n degrees\n\n /* set up rotation parameters */\n s1 = sinf (incr);\n#if MODE == NAIVE\n c1 = cosf (incr);\n#else\n tt = sinf (incr * 0.5f);\n c1 = 2.0f * tt * tt;\n#endif // MODE\n sumerrsqS = 0;\n sumerrsqC = 0;\n\n s0 = sinf (base); // initial sine\n c0 = cosf (base); // initial cosine\n\n /* run test case through N rotation steps */\n i = 0;\n do { \n\n tt = s0; // old sine\n#if MODE == NAIVE\n /* least accurate, 6 FP ops */\n s0 = c1 * tt + s1 * c0; // new sine\n c0 = c1 * c0 - s1 * tt; // new cosine\n#elif MODE == ROBUST\n /* very accurate, 8 FP ops */\n s0 = ( s1 * c0 - c1 * tt) + tt; // new sine\n c0 = (-s1 * tt - c1 * c0) + c0; // new cosine\n#elif MODE == FAST\n /* accurate and fast, 4 FP ops */\n s0 = fmaf (-c1, tt, fmaf ( s1, c0, tt)); // new sine\n c0 = fmaf (-c1, c0, fmaf (-s1, tt, c0)); // new cosine\n#elif MODE == ACCURATE\n /* most accurate, 6 FP ops */\n s0 = tt + fmaf (s1, c0, -c1 * tt); // new sine\n c0 = c0 - fmaf (s1, tt, c1 * c0); // new cosine\n#endif // MODE\n i++;\n\n refS = sin (fma ((double)i, (double)incr, (double)base));\n refC = cos (fma ((double)i, (double)incr, (double)base));\n errS = ((double)s0 - refS) / refS;\n errC = ((double)c0 - refC) / refC;\n sumerrsqS = fma (errS, errS, sumerrsqS);\n sumerrsqC = fma (errC, errC, sumerrsqC);\n } while (i < N);\n\n rmsS = sqrt (sumerrsqS / N);\n rmsC = sqrt (sumerrsqC / N);\n if (rmsS > maxrmsS) maxrmsS = rmsS;\n if (rmsC > maxrmsC) maxrmsC = rmsC;\n\n } while (--count);\n\n printf ("max rms error sin = % 16.9e\\n", maxrmsS);\n printf ("max rms error cos = % 16.9e\\n", maxrmsC);\n\n return EXIT_SUCCESS;\n}\nRun Code Online (Sandbox Code Playgroud)\n\n测试支架的输出表明,最快的基于 FMA 的替代方案优于问题中的朴素方法,而更准确的基于 FMA 的替代方案是所考虑的替代方案中最准确的:
\n\ntesting: NAIVE (without FMA)\nmax rms error sin = 4.837386842e-006\nmax rms error cos = 6.884047862e-006\n\ntesting: ROBUST (without FMA)\nmax rms error sin = 3.330292645e-006\nmax rms error cos = 4.297631502e-006\n\ntesting: FAST (with FMA)\nmax rms error sin = 3.532624939e-006\nmax rms error cos = 4.763623188e-006\n\ntesting: ACCURATE (with FMA)\nmax rms error sin = 3.330292645e-006\nmax rms error cos = 4.104813533e-006\nRun Code Online (Sandbox Code Playgroud)\n