BPL*_*BPL 7 c++ floating-point optimization fixed-point
在阅读了很多有关定点算术的知识之后,我想我可以说我已经理解了基础知识,但是不幸的是,我还不知道如何转换使用sin / cos / sqrt或任何其他fp函数的例程。
考虑一下这个简单的mcve:
#include <math.h>
#include <stdio.h>
#include <ctime>
#include <fstream>
#include <iostream>
typedef char S8;
typedef short S16;
typedef int S32;
typedef unsigned char U8;
typedef unsigned short U16;
typedef unsigned int U32;
typedef float F32;
typedef double F64;
// -------- Fixed point helpers QM.N(32bits) --------
typedef S32 FP32;
#define LUT_SIZE_BITS 9 // 0xffffffff>>(32-9)=511; 32-23=9; 2^9=512
#define LUT_SIZE 512
#define FRACT_BITS 28 // Number fractional bits
#define M (1 << FRACT_BITS) // Scaling factor
inline F32 Q2F(FP32 X) { return ((F32)X / (F32)(M)); }
inline FP32 F2Q(F32 X) { return (FP32)(X * (M)); }
const F32 PI = 3.141592653589793f;
const F32 pi = 3.141592653589793f;
const U32 WIDTH = 256;
const U32 HEIGHT = 256;
FP32 cos_table[LUT_SIZE];
FP32 sin_table[LUT_SIZE];
void init_luts() {
const F32 deg_to_rad = PI / 180.f;
const F32 sample_to_deg = 1 / LUT_SIZE * 360.f;
for (S32 i = 0; i < LUT_SIZE; i++) {
F32 rad = ((F32)i * sample_to_deg) * deg_to_rad;
F32 c = cos(rad);
F32 s = sin(rad);
cos_table[i] = F2Q(c);
sin_table[i] = F2Q(s);
}
}
// -------- Image processing --------
U8 clamp(F32 valor) { return valor > 255 ? 255 : (valor < 0 ? 0 : (U8)valor); }
struct Pbits {
U32 width;
U32 height;
U8 *data;
Pbits(U32 width, U32 height, U8 *data) {
this->width = width;
this->height = height;
this->data = data;
}
Pbits(Pbits *src) {
this->width = src->width;
this->height = src->height;
this->data = new U8[src->width * src->height * 3];
memcpy(this->data, src->data, width * height * 3);
}
~Pbits() { delete this->data; }
void to_bgr() {
U8 r, g, b;
for (S32 y = 0; y < height; y++) {
for (S32 x = 0; x < width; x++) {
get_pixel(y, x, r, g, b);
set_pixel(y, x, b, g, r);
}
}
}
void get_pixel(U32 y, U32 x, U8 &r, U8 &g, U8 &b) {
U32 offset = (y * height * 3) + (x * 3);
r = this->data[offset + 0];
g = this->data[offset + 1];
b = this->data[offset + 2];
}
void set_pixel(U32 y, U32 x, U8 c1, U8 c2, U8 c3) {
U32 offset = (y * height * 3) + (x * 3);
data[offset] = c1;
data[offset + 1] = c2;
data[offset + 2] = c3;
}
};
void fx1_plasma(Pbits *dst, F32 t, F32 k1, F32 k2, F32 k3, F32 k4, F32 k5, F32 k6) {
U32 height = dst->height;
U32 width = dst->width;
for (U32 y = 0; y < height; y++) {
F32 uv_y = (F32)y / height;
for (U32 x = 0; x < width; x++) {
F32 uv_x = (F32)x / width;
F32 v1 = sin(uv_x * k1 + t);
F32 v2 = sin(k1 * (uv_x * sin(t) + uv_y * cos(t / k2)) + t);
F32 cx = uv_x + sin(t / k1) * k1;
F32 cy = uv_y + sin(t / k2) * k1;
F32 v3 = sin(sqrt(k3 * (cx * cx + cy * cy)) + t);
F32 vf = v1 + v2 + v3;
U8 r = (U8)clamp(255 * cos(vf * pi));
U8 g = (U8)clamp(255 * sin(vf * pi + k4 * pi / k2));
U8 b = (U8)clamp(255 * cos(vf * pi + k5 * pi / k2));
dst->set_pixel(y, x, r, g, b);
}
}
}
// -------- Image helpers --------
inline void _write_s32(U8 *dst, S32 offset, S32 v) {
dst[offset] = (U8)(v);
dst[offset + 1] = (U8)(v >> 8);
dst[offset + 2] = (U8)(v >> 16);
dst[offset + 3] = (U8)(v >> 24);
}
void write_bmp(Pbits *src, S8 *filename) {
Pbits *dst = new Pbits(src);
dst->to_bgr();
S32 w = dst->width;
S32 h = dst->height;
U8 *img = dst->data;
S32 filesize = 54 + 3 * w * h;
U8 bmpfileheader[14] = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
U8 bmpinfoheader[40] = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
U8 bmppad[3] = {0, 0, 0};
_write_s32(bmpfileheader, 2, filesize);
_write_s32(bmpinfoheader, 4, w);
_write_s32(bmpinfoheader, 8, h);
FILE *f = fopen(filename, "wb");
fwrite(bmpfileheader, 1, 14, f);
fwrite(bmpinfoheader, 1, 40, f);
for (S32 i = 0; i < h; i++) {
fwrite(img + (w * (h - i - 1) * 3), 3, w, f);
fwrite(bmppad, 1, (4 - (w * 3) % 4) % 4, f);
}
delete dst;
}
void write_ppm(Pbits *dst, S8 *filename) {
std::ofstream file(filename, std::ofstream::trunc);
if (!file.is_open()) {
std::cout << "yep! file is not open" << std::endl;
}
file << "P3\n" << dst->width << " " << dst->height << "\n255\n";
U8 r, g, b, a;
for (U32 y = 0; y < dst->height; y++) {
for (U32 x = 0; x < dst->width; x++) {
dst->get_pixel(y, x, r, g, b);
file << (S32)r << " " << (S32)g << " " << (S32)b << "\n";
}
}
}
S32 main() {
Pbits *dst = new Pbits(WIDTH, HEIGHT, new U8[WIDTH * HEIGHT * 3]);
init_luts();
clock_t begin = clock();
fx1_plasma(dst, 0, 8, 36, 54, 51, 48, 4);
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
std::cout << "Generated plasma in " << elapsed_secs << "s" << std::endl;
write_ppm(dst, "plasma.ppm");
write_bmp(dst, "plasma.bmp");
delete dst;
}
Run Code Online (Sandbox Code Playgroud)
此代码将生成此图像:
问题:如何将这种浮点算法转换为快速的定点算法?现在,浮点算术的基础知识对我来说是+/-清楚的,例如:
fa,fb=floating point values; a,b=fixed_point ones; M=scaling factor
fa = a*M
fb = b*M
fa+fb = (a+b)*M
fa-fb = (a-b)*M
fa*fb = (a*b)*M^2
fa/fb = (a/b)
Run Code Online (Sandbox Code Playgroud)
但是如何在定点使用sin / cos / sqrt等仍然使我难以理解。我已经找到了这个相关的线程,但是我仍然不明白如何使用带有随机fp值的三角函数。
#ifdef _MSC_VER\n#pragma comment(lib,"opengl32.lib")\n#pragma comment(lib,"glu32.lib")\n#pragma warning(disable : 4996)\n#pragma warning(disable : 26495) //varsay\xc4\xb1lan de\xc4\x9fer atam\xc4\xb1yorum\n#pragma warning(disable :6031)//d\xc3\xb6n\xc3\xbc\xc5\x9f de\xc4\x9feri yok say\xc4\xb1ld\xc4\xb1\n#endif\n\n#include <Windows.h>\n#include <gl/gl.h> //-lopengl32 \n//#include <gl/glu.h> //-lglu32\n\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <time.h>\n#include <string.h>\n\ntypedef unsigned char U8;\ntypedef unsigned int U32;\n\n#define LUT_SIZE 1024 //1Kb 10 bit sin cos\n#define LUT_SIZE2 0x000fffff //1Mb 20 bit sqrt\n\nfloat cos_tab[LUT_SIZE];\nfloat sin_tab[LUT_SIZE];\nU8 clamp_cos_tab[LUT_SIZE];\nU8 clamp_sin_tab[LUT_SIZE];\nfloat sqrt_tab[LUT_SIZE2];\n\n\nconst float pi = 3.141592;\nconst float pi_k = LUT_SIZE / (2 * pi);\n\nconst U32 WIDTH = 640; //256\nconst U32 HEIGHT = 480; //256\n\n\n\nstruct Pbits;\nPbits* pdst;\n\n\nU8 clamp(float f) { return f > 255 ? 255 : (f < 0 ? 0 : (U8)f); }\n\n#define sin2(f) sin_tab [ (int)(pi_k * (f)) & 0x000003ff]//LUT_SIZE-1\n#define cos2(f) cos_tab [ (int)(pi_k * (f)) & 0x000003ff]\n#define clamp_sin(f) clamp_sin_tab [ (int)(pi_k * (f)) & 0x000003ff]\n#define clamp_cos(f) clamp_cos_tab [ (int)(pi_k * (f)) & 0x000003ff]\n#define sqrt2(f) sqrt_tab [*(int*)&(f)>>12] //32-20 bit\n\nvoid init_luts()\n{\n for (int i = 0; i < LUT_SIZE; i++)\n {\n cos_tab[i] = cos(i / pi_k);\n sin_tab[i] = sin(i / pi_k);\n\n clamp_cos_tab[i] = clamp(255 * cos(i / pi_k));\n clamp_sin_tab[i] = clamp(255 * sin(i / pi_k));\n }\n\n for (int i = 0; i < LUT_SIZE2; i++)//init_luts\n {\n int ii=i<<12; //32-20 bit\n float f = *(float *)ⅈ //i to float\n sqrt_tab[i] = sqrt(f);\n }\n\n}\n\n\n\nfloat sqrt3(float x)\n{\n //https ://www.codeproject.com/Articles/69941/Best-Square-Root-Method-Algorithm-Function-Precisi\n unsigned int i = *(unsigned int*)& x;\n\n i += (127 << 23);\n i >>= 1;\n return *(float*)&i;\n}\nfloat sqrt4(float x)\n{\n //https: //stackoverflow.com/questions/1349542/john-carmacks-unusual-fast-inverse-square-root-quake-iii\n float xhalf = 0.5f * x;\n int i = *(int*)&x; // get bits for floating value\n i = 0x5f375a86 - (i >> 1); // gives initial guess y0\n x = *(float*)&i; // convert bits back to float\n x = x * (1.5f - xhalf * x * x); // Newton step, repeating increases accuracy\n return x;\n}\n\n\n\nstruct Pbits\n{\n int width;\n int height;\n U8* data;\n\n Pbits(int _width, int _height, U8* _data = 0)\n {\n width = _width;\n height = _height;\n if (!_data)\n _data = (U8*)calloc(width * height * 3, 1);\n data = _data;\n }\n\n ~Pbits() { free(data); }\n void set_pixel(int y, int x, U8 c1, U8 c2, U8 c3)\n {\n int offset = (y * width * 3) + (x * 3);\n\n data[offset] = c1;\n data[offset + 1] = c2;\n data[offset + 2] = c3;\n }\n void save(const char* filename)\n {\n\n U8 pp[54] = { \'B\', \'M\', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0 ,\n 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0 };\n *(int*)(pp + 2) = 54 + 3 * width * height;\n *(int*)(pp + 18) = width;\n *(int*)(pp + 22) = height;\n\n int size = height * width * 3;\n U8* p = data;\n for (int i = 0; i < size; i += 3)//to_bgr()\n {\n U8 tmp = p[i];\n p[i] = p[i + 2];\n p[i + 2] = tmp;\n }\n\n FILE* f = fopen(filename, "wb");\n fwrite(pp, 1, 54, f);\n fwrite(data, size, 1, f);\n fclose(f);\n\n for (int i = 0; i < size; i += 3)//to_rgb()\n {\n U8 tmp = p[i];\n p[i] = p[i + 2];\n p[i + 2] = tmp;\n }\n\n }\n\n};\n\nvoid fn_plasma_slow(Pbits& dst, float t,\n float k1, float k2, float k3, float k4, float k5, float k6)\n{\n int height = dst.height;\n int width = dst.width;\n\n for (int y = 0; y < height; y++)\n {\n float uv_y = (float)y / height;\n for (int x = 0; x < width; x++)\n {\n float uv_x = (float)x / width;\n\n float v1 = sin(uv_x * k1 + t);\n float v2 = sin(k1 * (uv_x * sin(t) + uv_y * cos(t / k2)) + t);\n float cx = uv_x + sin(t / k1) * k1;\n float cy = uv_y + sin(t / k2) * k1;\n float v3 = sin(sqrt(k3 * (cx * cx + cy * cy)) + t);\n float vf = v1 + v2 + v3;\n\n U8 r = (U8)clamp(255 * cos(vf * pi));\n U8 g = (U8)clamp(255 * sin(vf * pi + k4 * pi / k2));\n U8 b = (U8)clamp(255 * cos(vf * pi + k5 * pi / k2));\n\n dst.set_pixel(y, x, r, g, b);\n }\n }\n}\n\n\nvoid fn_plasma_fast(Pbits& dst, float t,\n float k1, float k2, float k3,\n float k4, float k5, float k6)\n{\n\n U8* p = dst.data;\n\n float\n height = dst.height,\n width = dst.width,\n\n _k42 = pi * k4 / k2,\n _k52 = pi * k5 / k2,\n _cx = sin2(t / k1) * k1,\n _cy = sin2(t / k2) * k1,\n _x = sin2(t),\n _y = cos2(t / k2);\n\n for (float j = 0; j < height; j++)\n for (float i = 0; i < width; i++)\n {\n float\n x = i / width,\n y = j / height,\n\n v1 = sin2(k1 * x + t),\n v2 = sin2(k1 * (x * _x + y * _y) + t),\n\n cx = x + _cx,\n cy = y + _cy,\n aa1 = k3 * (cx * cx + cy * cy),\n\n v3 = sin2(sqrt2(aa1) + t),\n vf = pi * (v1 + v2 + v3);\n\n *p++ = clamp_cos(vf); //red\n *p++ = clamp_sin(vf + _k42); //green\n *p++ = clamp_cos(vf + _k52); //blue\n\n }\n}\n\n\n\nvoid fn_plasma_fast2(Pbits& dst, float t,\n float k1, float k2, float k3,\n float k4, float k5, float k6)\n{\n\n U8* p = dst.data;\n\n static float data_v1[1024];\n static float data_cx[1024];\n static float data_cy[1024];\n static float data_xx3[1024];\n static float data_yy3[1024];\n\n float\n height = dst.height,\n width = dst.width,\n\n _k42 = pi * k4 / k2,\n _k52 = pi * k5 / k2,\n _cx = sin2(t / k1) * k1,\n _cy = sin2(t / k2) * k1,\n _x = sin2(t)/width*k1 ,\n _y = cos2(t/k2)/height*k1;\n\n\n for (int x = 0; x < width; x++)\n {\n data_v1[x] = sin2(k1 * x /width+ t);\n\n float f = x / width + _cx;\n data_cx[x] =k3* f*f;\n data_xx3[x] = x * _x;\n }\n for (int y = 0; y < height; y++)\n {\n float f = y / height + _cy;\n data_cy[y] = k3*f * f;\n data_yy3[y] = y*_y ;\n };\n\n\n for (int y = 0; y < height; y++)\n for (int x = 0; x < width; x++)\n {\n\n //float v1 = data_v1[x];\n //float v2 = sin2(data_xx3[x] + data_yy3[y]);\n\n float aa1 = data_cx[x] + data_cy[y];\n\n //float v3 = sin2(sqrt2(aa1) + t);\n //float vf = pi * (v1 + v2 + v3);\n float vf = pi * (data_v1[x]+ sin2(data_xx3[x] + data_yy3[y])+ sin2(sqrt2(aa1) + t));\n\n *p++ = clamp_cos(vf); //red\n *p++ = clamp_sin(vf + _k42); //green\n *p++ = clamp_cos(vf + _k52); //blue\n\n }\n}\n\n\n\n\nstruct window\n{\n int x, y, width, height; //i\xc3\xa7 x y +en boy\n\n HINSTANCE hist; // program kayd\xc4\xb1\n HWND hwnd; // window\n HDC hdc; // device context \n HGLRC hrc; // opengl context \n //WNDPROC fn_pencere; // pencere fonksiyonu\n WNDCLASS wc; // pencere s\xc4\xb1n\xc4\xb1f\xc4\xb1\n PIXELFORMATDESCRIPTOR pfd;\n\n window(int _width = 256, int _height = 256)\n {\n memset(this, 0, sizeof(*this));\n x = 100;\n y = 100;\n width = _width;\n height = _height;\n\n //HINSTANCE\n hist = GetModuleHandle(NULL);\n\n //WNDCLASS\n wc.lpfnWndProc = (WNDPROC)fn_window;\n wc.hInstance = hist;\n wc.hIcon = LoadIcon(0, IDI_WINLOGO);\n wc.hCursor = LoadCursor(0, IDC_ARROW);\n wc.lpszClassName = "opengl";\n RegisterClass(&wc);\n\n //HWND\n hwnd = CreateWindow("opengl", "test",\n WS_OVERLAPPEDWINDOW,\n x, y, width + 16, height + 39,\n NULL, NULL, hist, NULL);\n //HDC\n hdc = GetDC(hwnd);\n\n\n //PFD\n pfd.nSize = sizeof(pfd);\n pfd.nVersion = 1;\n pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;\n pfd.iPixelType = PFD_TYPE_RGBA;\n pfd.cColorBits = 32;\n\n int pf = ChoosePixelFormat(hdc, &pfd);\n SetPixelFormat(hdc, pf, &pfd);\n DescribePixelFormat(hdc, pf, sizeof(PIXELFORMATDESCRIPTOR), &pfd);\n\n //HRC\n hrc = wglCreateContext(hdc);\n wglMakeCurrent(hdc, hrc);\n\n ShowWindow(hwnd, SW_SHOW);\n SetFocus(hwnd);\n\n\n }\n ~window()\n {\n if (hrc)\n wglMakeCurrent(NULL, NULL),\n wglDeleteContext(hrc);\n if (hdc) ReleaseDC(hwnd, hdc);\n if (hwnd) DestroyWindow(hwnd);\n if (hist) UnregisterClass("opengl", hist);\n }\n\n void run()\n {\n MSG msg;\n BOOL dongu = true;\n\n while (dongu)\n {\n if (PeekMessage(&msg, NULL, 0, 0, PM_REMOVE))\n {\n if (msg.message == WM_QUIT) dongu = 0;\n else\n {\n TranslateMessage(&msg);\n DispatchMessage(&msg);\n }\n }\n else\n {\n render();\n SwapBuffers(hdc);\n }\n }\n }\n\n static int __stdcall fn_window(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam)\n {\n switch (msg)\n {\n //case WM_CREATE: {} break;\n //case WM_COMMAND: {} break;\n //case WM_PAINT: {} break;\n case WM_CLOSE: { DestroyWindow(hwnd); }break;\n case WM_DESTROY: {PostQuitMessage(0); }break;\n }\n return DefWindowProc(hwnd, msg, wParam, lParam);\n }\n static void render()\n {\n //OPENGL 1.0\n //glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); \n //glMatrixMode(GL_PROJECTION); glLoadIdentity();\n //glMatrixMode(GL_MODELVIEW); glLoadIdentity();\n\n static float t; t += 0.02;\n fn_plasma_fast2(*pdst, t, 8, 36, 54, 51, 48, 4);//FAST\n\n glRasterPos3f(-1, -1, 0);\n glDrawPixels(WIDTH, HEIGHT, GL_RGB, GL_UNSIGNED_BYTE, pdst->data);\n\n }\n};\n\n\n\nint main()\n{\n\n\n Pbits dst(WIDTH, HEIGHT);\n pdst = &dst;\n init_luts();\n\n\n int begin;\n\n begin = clock();\n fn_plasma_slow(dst, 0, 8, 36, 54, 51, 48, 4);\n printf("fn_plasma_slow: %4d\\n", clock() - begin );\n dst.save("plasma_slow.bmp");\n\n begin = clock();\n fn_plasma_fast(dst, 0, 8, 36, 54, 51, 48, 4);\n printf("fn_plasma_fast: %4d\\n", clock() - begin);\n dst.save("plasma_fast.bmp");\n\n\n begin = clock();\n fn_plasma_fast2(dst, 0, 8, 36, 54, 51, 48, 4);\n printf("fn_plasma_fast2: %4d\\n", clock() - begin );\n dst.save("plasma_fast2.bmp");\n\n\n\n window win(WIDTH, HEIGHT);\n win.run();\n\n\n return 0;\n}\nRun Code Online (Sandbox Code Playgroud)\n
| 归档时间: |
|
| 查看次数: |
491 次 |
| 最近记录: |