如何将浮点算法转换为定点?

BPL*_*BPL 7 c++ floating-point optimization fixed-point

在阅读了很多有关定点算术的知识之后,我想我可以说我已经理解了基础知识,但是不幸的是,我还不知道如何转换使用sin / cos / sqrt或任何其他fp函数的例程。

考虑一下这个简单的mcve:

#include <math.h>
#include <stdio.h>
#include <ctime>
#include <fstream>
#include <iostream>

typedef char S8;
typedef short S16;
typedef int S32;
typedef unsigned char U8;
typedef unsigned short U16;
typedef unsigned int U32;
typedef float F32;
typedef double F64;

// -------- Fixed point helpers QM.N(32bits) --------
typedef S32 FP32;

#define LUT_SIZE_BITS 9  // 0xffffffff>>(32-9)=511; 32-23=9; 2^9=512
#define LUT_SIZE 512

#define FRACT_BITS 28        // Number fractional bits
#define M (1 << FRACT_BITS)  // Scaling factor

inline F32 Q2F(FP32 X) { return ((F32)X / (F32)(M)); }
inline FP32 F2Q(F32 X) { return (FP32)(X * (M)); }

const F32 PI = 3.141592653589793f;
const F32 pi = 3.141592653589793f;
const U32 WIDTH = 256;
const U32 HEIGHT = 256;

FP32 cos_table[LUT_SIZE];
FP32 sin_table[LUT_SIZE];

void init_luts() {
    const F32 deg_to_rad = PI / 180.f;
    const F32 sample_to_deg = 1 / LUT_SIZE * 360.f;
    for (S32 i = 0; i < LUT_SIZE; i++) {
        F32 rad = ((F32)i * sample_to_deg) * deg_to_rad;
        F32 c = cos(rad);
        F32 s = sin(rad);
        cos_table[i] = F2Q(c);
        sin_table[i] = F2Q(s);
    }
}

// -------- Image processing --------
U8 clamp(F32 valor) { return valor > 255 ? 255 : (valor < 0 ? 0 : (U8)valor); }

struct Pbits {
    U32 width;
    U32 height;
    U8 *data;

    Pbits(U32 width, U32 height, U8 *data) {
        this->width = width;
        this->height = height;
        this->data = data;
    }

    Pbits(Pbits *src) {
        this->width = src->width;
        this->height = src->height;
        this->data = new U8[src->width * src->height * 3];
        memcpy(this->data, src->data, width * height * 3);
    }

    ~Pbits() { delete this->data; }

    void to_bgr() {
        U8 r, g, b;
        for (S32 y = 0; y < height; y++) {
            for (S32 x = 0; x < width; x++) {
                get_pixel(y, x, r, g, b);
                set_pixel(y, x, b, g, r);
            }
        }
    }

    void get_pixel(U32 y, U32 x, U8 &r, U8 &g, U8 &b) {
        U32 offset = (y * height * 3) + (x * 3);
        r = this->data[offset + 0];
        g = this->data[offset + 1];
        b = this->data[offset + 2];
    }

    void set_pixel(U32 y, U32 x, U8 c1, U8 c2, U8 c3) {
        U32 offset = (y * height * 3) + (x * 3);

        data[offset] = c1;
        data[offset + 1] = c2;
        data[offset + 2] = c3;
    }
};

void fx1_plasma(Pbits *dst, F32 t, F32 k1, F32 k2, F32 k3, F32 k4, F32 k5, F32 k6) {
    U32 height = dst->height;
    U32 width = dst->width;

    for (U32 y = 0; y < height; y++) {
        F32 uv_y = (F32)y / height;
        for (U32 x = 0; x < width; x++) {
            F32 uv_x = (F32)x / width;

            F32 v1 = sin(uv_x * k1 + t);
            F32 v2 = sin(k1 * (uv_x * sin(t) + uv_y * cos(t / k2)) + t);
            F32 cx = uv_x + sin(t / k1) * k1;
            F32 cy = uv_y + sin(t / k2) * k1;
            F32 v3 = sin(sqrt(k3 * (cx * cx + cy * cy)) + t);
            F32 vf = v1 + v2 + v3;

            U8 r = (U8)clamp(255 * cos(vf * pi));
            U8 g = (U8)clamp(255 * sin(vf * pi + k4 * pi / k2));
            U8 b = (U8)clamp(255 * cos(vf * pi + k5 * pi / k2));

            dst->set_pixel(y, x, r, g, b);
        }
    }
}

// -------- Image helpers --------
inline void _write_s32(U8 *dst, S32 offset, S32 v) {
    dst[offset] = (U8)(v);
    dst[offset + 1] = (U8)(v >> 8);
    dst[offset + 2] = (U8)(v >> 16);
    dst[offset + 3] = (U8)(v >> 24);
}

void write_bmp(Pbits *src, S8 *filename) {
    Pbits *dst = new Pbits(src);
    dst->to_bgr();

    S32 w = dst->width;
    S32 h = dst->height;
    U8 *img = dst->data;

    S32 filesize = 54 + 3 * w * h;

    U8 bmpfileheader[14] = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
    U8 bmpinfoheader[40] = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
    U8 bmppad[3] = {0, 0, 0};

    _write_s32(bmpfileheader, 2, filesize);
    _write_s32(bmpinfoheader, 4, w);
    _write_s32(bmpinfoheader, 8, h);

    FILE *f = fopen(filename, "wb");
    fwrite(bmpfileheader, 1, 14, f);
    fwrite(bmpinfoheader, 1, 40, f);
    for (S32 i = 0; i < h; i++) {
        fwrite(img + (w * (h - i - 1) * 3), 3, w, f);
        fwrite(bmppad, 1, (4 - (w * 3) % 4) % 4, f);
    }

    delete dst;
}

void write_ppm(Pbits *dst, S8 *filename) {
    std::ofstream file(filename, std::ofstream::trunc);
    if (!file.is_open()) {
        std::cout << "yep! file is not open" << std::endl;
    }
    file << "P3\n" << dst->width << " " << dst->height << "\n255\n";

    U8 r, g, b, a;
    for (U32 y = 0; y < dst->height; y++) {
        for (U32 x = 0; x < dst->width; x++) {
            dst->get_pixel(y, x, r, g, b);
            file << (S32)r << " " << (S32)g << " " << (S32)b << "\n";
        }
    }
}

S32 main() {
    Pbits *dst = new Pbits(WIDTH, HEIGHT, new U8[WIDTH * HEIGHT * 3]);

    init_luts();

    clock_t begin = clock();
    fx1_plasma(dst, 0, 8, 36, 54, 51, 48, 4);
    clock_t end = clock();
    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;

    std::cout << "Generated plasma in " << elapsed_secs << "s" << std::endl;

    write_ppm(dst, "plasma.ppm");
    write_bmp(dst, "plasma.bmp");

    delete dst;
}
Run Code Online (Sandbox Code Playgroud)

此代码将生成此图像:

在此处输入图片说明

问题:如何将这种浮点算法转换为快速的定点算法?现在,浮点算术的基础知识对我来说是+/-清楚的,例如:

fa,fb=floating point values; a,b=fixed_point ones; M=scaling factor

fa = a*M
fb = b*M
fa+fb = (a+b)*M
fa-fb = (a-b)*M
fa*fb = (a*b)*M^2
fa/fb = (a/b)
Run Code Online (Sandbox Code Playgroud)

但是如何在定点使用sin / cos / sqrt等仍然使我难以理解。我已经找到了这个相关的线程,但是我仍然不明白如何使用带有随机fp值的三角函数。

ism*_*lik 2

#ifdef _MSC_VER\n#pragma comment(lib,"opengl32.lib")\n#pragma comment(lib,"glu32.lib")\n#pragma warning(disable : 4996)\n#pragma warning(disable : 26495)  //varsay\xc4\xb1lan de\xc4\x9fer atam\xc4\xb1yorum\n#pragma warning(disable :6031)//d\xc3\xb6n\xc3\xbc\xc5\x9f de\xc4\x9feri yok say\xc4\xb1ld\xc4\xb1\n#endif\n\n#include <Windows.h>\n#include <gl/gl.h>      //-lopengl32 \n//#include <gl/glu.h>   //-lglu32\n\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <time.h>\n#include <string.h>\n\ntypedef unsigned char   U8;\ntypedef unsigned int    U32;\n\n#define LUT_SIZE 1024           //1Kb  10 bit sin cos\n#define LUT_SIZE2 0x000fffff    //1Mb  20 bit sqrt\n\nfloat cos_tab[LUT_SIZE];\nfloat sin_tab[LUT_SIZE];\nU8    clamp_cos_tab[LUT_SIZE];\nU8    clamp_sin_tab[LUT_SIZE];\nfloat sqrt_tab[LUT_SIZE2];\n\n\nconst float pi = 3.141592;\nconst float pi_k = LUT_SIZE / (2 * pi);\n\nconst U32 WIDTH = 640;  //256\nconst U32 HEIGHT = 480; //256\n\n\n\nstruct Pbits;\nPbits* pdst;\n\n\nU8 clamp(float f) { return f > 255 ? 255 : (f < 0 ? 0 : (U8)f); }\n\n#define sin2(f)      sin_tab        [ (int)(pi_k * (f)) & 0x000003ff]//LUT_SIZE-1\n#define cos2(f)      cos_tab        [ (int)(pi_k * (f)) & 0x000003ff]\n#define clamp_sin(f) clamp_sin_tab  [ (int)(pi_k * (f)) & 0x000003ff]\n#define clamp_cos(f) clamp_cos_tab  [ (int)(pi_k * (f)) & 0x000003ff]\n#define sqrt2(f)     sqrt_tab       [*(int*)&(f)>>12]   //32-20 bit\n\nvoid init_luts()\n{\n    for (int i = 0; i < LUT_SIZE; i++)\n    {\n        cos_tab[i] = cos(i / pi_k);\n        sin_tab[i] = sin(i / pi_k);\n\n        clamp_cos_tab[i] = clamp(255 * cos(i / pi_k));\n        clamp_sin_tab[i] = clamp(255 * sin(i / pi_k));\n    }\n\n    for (int i = 0; i < LUT_SIZE2; i++)//init_luts\n    {\n        int ii=i<<12;       //32-20 bit\n        float f = *(float *)&ii;    //i to float\n        sqrt_tab[i] = sqrt(f);\n    }\n\n}\n\n\n\nfloat sqrt3(float x)\n{\n    //https ://www.codeproject.com/Articles/69941/Best-Square-Root-Method-Algorithm-Function-Precisi\n    unsigned int i = *(unsigned int*)& x;\n\n    i += (127 << 23);\n    i >>= 1;\n    return *(float*)&i;\n}\nfloat sqrt4(float x)\n{\n    //https: //stackoverflow.com/questions/1349542/john-carmacks-unusual-fast-inverse-square-root-quake-iii\n    float xhalf = 0.5f * x;\n    int i = *(int*)&x;              // get bits for floating value\n    i = 0x5f375a86 - (i >> 1);      // gives initial guess y0\n    x = *(float*)&i;                // convert bits back to float\n    x = x * (1.5f - xhalf * x * x); // Newton step, repeating increases accuracy\n    return x;\n}\n\n\n\nstruct Pbits\n{\n    int width;\n    int height;\n    U8* data;\n\n    Pbits(int _width, int _height, U8* _data = 0)\n    {\n        width = _width;\n        height = _height;\n        if (!_data)\n            _data = (U8*)calloc(width * height * 3, 1);\n        data = _data;\n    }\n\n    ~Pbits() { free(data); }\n    void set_pixel(int y, int x, U8 c1, U8 c2, U8 c3)\n    {\n        int offset = (y * width * 3) + (x * 3);\n\n        data[offset] = c1;\n        data[offset + 1] = c2;\n        data[offset + 2] = c3;\n    }\n    void save(const char* filename)\n    {\n\n        U8 pp[54] = { \'B\', \'M\', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0 ,\n                         40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0 };\n        *(int*)(pp + 2) = 54 + 3 * width * height;\n        *(int*)(pp + 18) = width;\n        *(int*)(pp + 22) = height;\n\n        int size = height * width * 3;\n        U8* p = data;\n        for (int i = 0; i < size; i += 3)//to_bgr()\n        {\n            U8 tmp = p[i];\n            p[i] = p[i + 2];\n            p[i + 2] = tmp;\n        }\n\n        FILE* f = fopen(filename, "wb");\n        fwrite(pp, 1, 54, f);\n        fwrite(data, size, 1, f);\n        fclose(f);\n\n        for (int i = 0; i < size; i += 3)//to_rgb()\n        {\n            U8 tmp = p[i];\n            p[i] = p[i + 2];\n            p[i + 2] = tmp;\n        }\n\n    }\n\n};\n\nvoid fn_plasma_slow(Pbits& dst, float t,\n    float k1, float k2, float k3, float k4, float k5, float k6)\n{\n    int height = dst.height;\n    int width = dst.width;\n\n    for (int y = 0; y < height; y++)\n    {\n        float uv_y = (float)y / height;\n        for (int x = 0; x < width; x++)\n        {\n            float uv_x = (float)x / width;\n\n            float v1 = sin(uv_x * k1 + t);\n            float v2 = sin(k1 * (uv_x * sin(t) + uv_y * cos(t / k2)) + t);\n            float cx = uv_x + sin(t / k1) * k1;\n            float cy = uv_y + sin(t / k2) * k1;\n            float v3 = sin(sqrt(k3 * (cx * cx + cy * cy)) + t);\n            float vf = v1 + v2 + v3;\n\n            U8 r = (U8)clamp(255 * cos(vf * pi));\n            U8 g = (U8)clamp(255 * sin(vf * pi + k4 * pi / k2));\n            U8 b = (U8)clamp(255 * cos(vf * pi + k5 * pi / k2));\n\n            dst.set_pixel(y, x, r, g, b);\n        }\n    }\n}\n\n\nvoid fn_plasma_fast(Pbits& dst, float t,\n    float k1, float k2, float k3,\n    float k4, float k5, float k6)\n{\n\n    U8* p = dst.data;\n\n    float\n        height = dst.height,\n        width = dst.width,\n\n        _k42 = pi * k4 / k2,\n        _k52 = pi * k5 / k2,\n        _cx = sin2(t / k1) * k1,\n        _cy = sin2(t / k2) * k1,\n        _x = sin2(t),\n        _y = cos2(t / k2);\n\n    for (float j = 0; j < height; j++)\n        for (float i = 0; i < width; i++)\n        {\n            float\n                x = i / width,\n                y = j / height,\n\n                v1 = sin2(k1 * x + t),\n                v2 = sin2(k1 * (x * _x + y * _y) + t),\n\n                cx = x + _cx,\n                cy = y + _cy,\n                aa1 = k3 * (cx * cx + cy * cy),\n\n                v3 = sin2(sqrt2(aa1) + t),\n                vf = pi * (v1 + v2 + v3);\n\n            *p++ = clamp_cos(vf);           //red\n            *p++ = clamp_sin(vf + _k42);    //green\n            *p++ = clamp_cos(vf + _k52);    //blue\n\n        }\n}\n\n\n\nvoid fn_plasma_fast2(Pbits& dst, float t,\n    float k1, float k2, float k3,\n    float k4, float k5, float k6)\n{\n\n    U8* p = dst.data;\n\n    static float data_v1[1024];\n    static float data_cx[1024];\n    static float data_cy[1024];\n    static float data_xx3[1024];\n    static float data_yy3[1024];\n\n    float\n        height = dst.height,\n        width = dst.width,\n\n        _k42 = pi * k4 / k2,\n        _k52 = pi * k5 / k2,\n        _cx = sin2(t / k1) * k1,\n        _cy = sin2(t / k2) * k1,\n        _x = sin2(t)/width*k1 ,\n        _y = cos2(t/k2)/height*k1;\n\n\n    for (int x = 0; x < width; x++)\n    {\n        data_v1[x] = sin2(k1 * x /width+ t);\n\n        float f = x / width + _cx;\n        data_cx[x] =k3* f*f;\n        data_xx3[x] = x * _x;\n    }\n    for (int y = 0; y < height; y++)\n    {\n        float f = y / height + _cy;\n        data_cy[y] = k3*f * f;\n        data_yy3[y] = y*_y ;\n    };\n\n\n    for (int y = 0; y < height; y++)\n    for (int x = 0; x < width; x++)\n    {\n\n        //float v1 = data_v1[x];\n        //float v2 = sin2(data_xx3[x] + data_yy3[y]);\n\n        float aa1 = data_cx[x] + data_cy[y];\n\n        //float     v3 = sin2(sqrt2(aa1) + t);\n        //float     vf = pi * (v1 + v2 + v3);\n        float vf = pi * (data_v1[x]+ sin2(data_xx3[x] + data_yy3[y])+ sin2(sqrt2(aa1) + t));\n\n        *p++ = clamp_cos(vf);           //red\n        *p++ = clamp_sin(vf + _k42);    //green\n        *p++ = clamp_cos(vf + _k52);    //blue\n\n    }\n}\n\n\n\n\nstruct window\n{\n    int  x, y, width, height;       //i\xc3\xa7 x y +en boy\n\n    HINSTANCE   hist;       //  program kayd\xc4\xb1\n    HWND        hwnd;       //  window\n    HDC         hdc;        //  device context \n    HGLRC       hrc;        //  opengl context \n    //WNDPROC       fn_pencere; //  pencere fonksiyonu\n    WNDCLASS    wc;         //  pencere s\xc4\xb1n\xc4\xb1f\xc4\xb1\n    PIXELFORMATDESCRIPTOR pfd;\n\n    window(int _width = 256, int _height = 256)\n    {\n        memset(this, 0, sizeof(*this));\n        x = 100;\n        y = 100;\n        width = _width;\n        height = _height;\n\n        //HINSTANCE\n        hist = GetModuleHandle(NULL);\n\n        //WNDCLASS\n        wc.lpfnWndProc = (WNDPROC)fn_window;\n        wc.hInstance = hist;\n        wc.hIcon = LoadIcon(0, IDI_WINLOGO);\n        wc.hCursor = LoadCursor(0, IDC_ARROW);\n        wc.lpszClassName = "opengl";\n        RegisterClass(&wc);\n\n        //HWND\n        hwnd = CreateWindow("opengl", "test",\n            WS_OVERLAPPEDWINDOW,\n            x, y, width + 16, height + 39,\n            NULL, NULL, hist, NULL);\n        //HDC\n        hdc = GetDC(hwnd);\n\n\n        //PFD\n        pfd.nSize = sizeof(pfd);\n        pfd.nVersion = 1;\n        pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;\n        pfd.iPixelType = PFD_TYPE_RGBA;\n        pfd.cColorBits = 32;\n\n        int  pf = ChoosePixelFormat(hdc, &pfd);\n        SetPixelFormat(hdc, pf, &pfd);\n        DescribePixelFormat(hdc, pf, sizeof(PIXELFORMATDESCRIPTOR), &pfd);\n\n        //HRC\n        hrc = wglCreateContext(hdc);\n        wglMakeCurrent(hdc, hrc);\n\n        ShowWindow(hwnd, SW_SHOW);\n        SetFocus(hwnd);\n\n\n    }\n    ~window()\n    {\n        if (hrc)\n            wglMakeCurrent(NULL, NULL),\n            wglDeleteContext(hrc);\n        if (hdc)    ReleaseDC(hwnd, hdc);\n        if (hwnd)   DestroyWindow(hwnd);\n        if (hist)   UnregisterClass("opengl", hist);\n    }\n\n    void run()\n    {\n        MSG         msg;\n        BOOL dongu = true;\n\n        while (dongu)\n        {\n            if (PeekMessage(&msg, NULL, 0, 0, PM_REMOVE))\n            {\n                if (msg.message == WM_QUIT) dongu = 0;\n                else\n                {\n                    TranslateMessage(&msg);\n                    DispatchMessage(&msg);\n                }\n            }\n            else\n            {\n                render();\n                SwapBuffers(hdc);\n            }\n        }\n    }\n\n    static int __stdcall fn_window(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam)\n    {\n        switch (msg)\n        {\n            //case WM_CREATE:   {}  break;\n            //case WM_COMMAND:  {}  break;\n            //case WM_PAINT:    {}  break;\n        case WM_CLOSE: {    DestroyWindow(hwnd); }break;\n        case WM_DESTROY: {PostQuitMessage(0); }break;\n        }\n        return DefWindowProc(hwnd, msg, wParam, lParam);\n    }\n    static void render()\n    {\n        //OPENGL 1.0\n        //glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);   \n        //glMatrixMode(GL_PROJECTION);  glLoadIdentity();\n        //glMatrixMode(GL_MODELVIEW);   glLoadIdentity();\n\n        static float t; t += 0.02;\n        fn_plasma_fast2(*pdst, t, 8, 36, 54, 51, 48, 4);//FAST\n\n        glRasterPos3f(-1, -1, 0);\n        glDrawPixels(WIDTH, HEIGHT, GL_RGB, GL_UNSIGNED_BYTE, pdst->data);\n\n    }\n};\n\n\n\nint main()\n{\n\n\n    Pbits dst(WIDTH, HEIGHT);\n    pdst = &dst;\n    init_luts();\n\n\n    int begin;\n\n    begin = clock();\n    fn_plasma_slow(dst, 0, 8, 36, 54, 51, 48, 4);\n    printf("fn_plasma_slow:  %4d\\n", clock() - begin );\n    dst.save("plasma_slow.bmp");\n\n    begin = clock();\n    fn_plasma_fast(dst, 0, 8, 36, 54, 51, 48, 4);\n    printf("fn_plasma_fast: %4d\\n", clock() - begin);\n    dst.save("plasma_fast.bmp");\n\n\n    begin = clock();\n    fn_plasma_fast2(dst, 0, 8, 36, 54, 51, 48, 4);\n    printf("fn_plasma_fast2: %4d\\n", clock() - begin );\n    dst.save("plasma_fast2.bmp");\n\n\n\n    window win(WIDTH, HEIGHT);\n    win.run();\n\n\n    return 0;\n}\n
Run Code Online (Sandbox Code Playgroud)\n

  • 巨大的代码转储并不能给出好的答案。添加一些解释,说明此代码如何回答问题,以及您为提高性能所做的工作。 (2认同)