在 GLSL 中优化光线追踪着色器

Mak*_*gan 5 c++ opengl optimization gpu glsl

我编写了一个基于体素化的光线追踪器,它按预期工作,但速度非常慢。

目前光线追踪器代码如下:

#version 430 
//normalized positon from (-1, -1) to (1, 1)
in vec2 f_coord;

out vec4 fragment_color;

struct Voxel
{
    vec4 position;
    vec4 normal;
    vec4 color;
};

struct Node
{
    //children of the current node
    int children[8];
};

layout(std430, binding = 0) buffer voxel_buffer
{
    //last layer of the tree, the leafs
    Voxel voxels[];
};
layout(std430, binding = 1) buffer buffer_index
{
    uint index;
};
layout(std430, binding = 2) buffer tree_buffer
{
    //tree structure       
    Node tree[];
};
layout(std430, binding = 3) buffer tree_index
{
    uint t_index;
};

uniform vec3 camera_pos; //position of the camera
uniform float aspect_ratio; // aspect ratio of the window
uniform float cube_dim; //Dimenions of the voxelization cube
uniform int voxel_resolution; //Side length of the cube in voxels

#define EPSILON 0.01
// Detect whether a position is inside of the voxel with size size located at corner
bool inBoxBounds(vec3 corner, float size, vec3 position)
{
    bool inside = true;
    position-=corner;//coordinate of the position relative to the box coordinate system
    //Test that all coordinates are inside the box, if any is outisde, the point is out the box
    for(int i=0; i<3; i++)
    {
        inside = inside && (position[i] > -EPSILON);
        inside = inside && (position[i] < size+EPSILON);
    }

    return inside;
}

//Get the distance to a box or infinity if the box cannot be hit
float boxIntersection(vec3 origin, vec3 dir, vec3 corner0, float size)
{
    dir = normalize(dir);
    vec3 corner1 = corner0 + vec3(size,size,size);//Oposite corner of the box

    float coeffs[6];
    //Calculate the intersaction coefficients with te 6 bonding planes 
    coeffs[0] = (corner0.x - origin.x)/(dir.x);
    coeffs[1] = (corner0.y - origin.y)/(dir.y);
    coeffs[2] = (corner0.z - origin.z)/(dir.z);

    coeffs[3] = (corner1.x - origin.x)/(dir.x);
    coeffs[4] = (corner1.y - origin.y)/(dir.y);
    coeffs[5] = (corner1.z - origin.z)/(dir.z);
    //by default the distance to the box is infinity
    float t = 1.f/0.f;

    for(uint i=0; i<6; i++){
        //if the distance to a boxis negative, we set it to infinity as we cannot travel in the negative direction
        coeffs[i] = coeffs[i] < 0 ? 1.f/0.f : coeffs[i];
        //The distance is the minumum of the previous calculated distance and the current distance
        t = inBoxBounds(corner0,size,origin+dir*coeffs[i]) ? min(coeffs[i],t) : t;
    }

    return t;
}

#define MAX_TREE_HEIGHT 11
int nodes[MAX_TREE_HEIGHT];
int levels[MAX_TREE_HEIGHT];
vec3 positions[MAX_TREE_HEIGHT];
int sp=0;

void push(int node, int level, vec3 corner)
{
    nodes[sp] = node;
    levels[sp] = level;
    positions[sp] = corner;
    sp++;
}

void main()
{   
    int count = 0; //count the iterations of the algorithm
    vec3 r = vec3(f_coord.x, f_coord.y, 1.f/tan(radians(40))); //direction of the ray
    r.y/=aspect_ratio; //modify the direction based on the windows aspect ratio
    vec3 dir = r;
    r += vec3(0,0,-1.f/tan(radians(40))) + camera_pos; //put the ray at the camera position

    fragment_color = vec4(0);
    int max_level = int(log2(voxel_resolution));//height of the tree
    push(0,0,vec3(-cube_dim));//set the stack
    float tc = 1.f; //initial color value, to be decreased whenever a voxel is hit
    //tree variables
    int level=0;
    int node=0;
    vec3 corner;

    do
    {
        //pop from stack
        sp--;
        node = nodes[sp];
        level = levels[sp];
        corner = positions[sp];

        //set the size of the current voxel 
        float size = cube_dim / pow(2,level);
        //set the corners of the children
        vec3 corners[] =
            {corner,                        corner+vec3(0,0,size),
            corner+vec3(0, size,0),         corner+vec3(0,size,size),
            corner+vec3(size,0,0),          corner+vec3(size,0,size),
            corner+vec3(size,size,0),       corner+vec3(size,size,size)};

        float coeffs[8];
        for(int child=0; child<8; child++)
        {
            //Test non zero childs, zero childs are empty and thus should be discarded
            coeffs[child] = tree[node].children[child]>0?
                //Get the distance to your child if it's not empty or infinity if it's empty
                boxIntersection(r, dir, corners[child], size) : 1.f/0.f;
        }
        int indices[8] = {0,1,2,3,4,5,6,7};
        //sort the children from closest to farthest
        for(uint i=0; i<8; i++)
        {
            for(uint j=i; j<8; j++)
            {
                if((coeffs[j] < coeffs[i]))
                {
                    float swap = coeffs[i];
                    coeffs[i] = coeffs[j];
                    coeffs[j] = swap;

                    int iSwap = indices[i];
                    indices[i] = indices[j];
                    indices[j] = iSwap;

                    vec3 vSwap = corners[i];
                    corners[i] = corners[j];
                    corners[j] = vSwap;
                }
            }
        }
        //push to stack
        for(uint i=7; i>=0; i--)
        {
            if(!isinf(coeffs[i]))
            {
                push(tree[node].children[indices[i]],
                    level+1, corners[i]);
            }
        }
        count++;
    }while(level < (max_level-1) && sp>0);
    //set color
    fragment_color = vec4(count)/100;
}
Run Code Online (Sandbox Code Playgroud)

由于可能不完全清楚这是做什么的,让我解释一下。

我们从一个大立方体开始检查光线盒交叉点。如果我们击中它,我们会测试与组成它的 8 个立方体的交集。

如果我们击中任何一个,我们就会检查与组成该立方体的 8 个立方体的交集。

在 2D 中,这将如下所示:

在此输入图像描述

在这种情况下,我们有 4 层,我们首先检查大框,然后检查红色的,然后检查绿色的,最后检查蓝色的。

打印出以颜色形式执行光线跟踪步骤的次数(这就是我提供的代码片段的作用)

结果如下图:

在此输入图像描述

正如您所看到的,大多数时候着色器不会执行超过 100 次迭代。

然而,该着色器在 gtx 1070 中平均执行需要 200 000 微秒。

由于问题不是执行次数,所以我的问题可能出在线程执行上。

有谁知道我如何优化这段代码?最大的瓶颈似乎是堆栈的使用。

如果我运行相同的代码而不压入堆栈(生成错误的输出),则运行时会提高 10 倍

Dar*_*ter 1

GPU 上的线程执行可能是大规模并行的,但这并不意味着所有线程都彼此独立运行。线程组执行完全相同的指令,唯一的区别是输入数据。这意味着分支和循环不能\xe2\x80\x99 使线程在执行中发散,因此也不会让它们提前终止。

\n\n

您的示例显示了最极端的边缘情况:当一组线程中 xe2x80x99 完成的所有工作很可能仅与一个线程相关时。

\n\n

为了缓解这种情况,您应该尝试减少组中(或总数)线程的执行长度(在您的情况下为迭代)的差异。这可以通过设置每个着色器通道的迭代次数限制并仅重新调度那些需要更多迭代的线程/像素来完成。

\n