Bilinear interpolation in C/C++ and CUDA

use*_*868 11 c++ cuda

I want to mimic the behavior of CUDA bilinear interpolation on the CPU, but I found that the value returned by tex2D does not seem to fit the bilinear formula.

I suppose that converting the interpolation coefficients from float to the 9-bit fixed-point format with 8 bits of fractional value [1] leads to the different values.

According to the conversion formula [2, line 106], the result of the conversion should be the same as the input float when the coefficient is 1/2^n, with n = 0, 1, ..., 8, but I still (though not always) receive weird values.
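To illustrate what such a conversion could look like, here is a minimal sketch (my own illustration, not the formula referenced above); round-to-nearest behaviour is an assumption:

// Hedged sketch: quantize an interpolation coefficient 0 <= c <= 1 to 9-bit fixed point
// (8 fractional bits) and convert it back to float for comparison.
static inline float quantize_coeff_1p8(float c)
{
    const unsigned int fx = (unsigned int)(c * 256.0f + 0.5f); // 256 = 2^8, round to nearest (assumed)
    return (float)fx / 256.0f;
}

For c = 1/2^n with n = 0, 1, ..., 8 this returns c unchanged, consistent with the observation above.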

Below I report an example of the weird values. In this case, the weird values always occur at id = 2*n+1. Can anybody tell me why?

Src array:

Src[0][0] =  38;  
Src[1][0] =  39;  
Src[0][1] = 118;  
Src[1][1] =  13;  

Texture definition:

static texture<float4, 2, cudaReadModeElementType> texElnt;
texElnt.addressMode[0] = cudaAddressModeClamp;
texElnt.addressMode[1] = cudaAddressModeClamp;
texElnt.filterMode = cudaFilterModeLinear;
texElnt.normalized = false;

Kernel function:

static __global__ void kernel_texElnt(float* pdata, int w, int h, int c, float stride/*0.03125f*/) {
    const int gx = blockIdx.x*blockDim.x + threadIdx.x;
    const int gy = blockIdx.y*blockDim.y + threadIdx.y;
    const int gw = gridDim.x * blockDim.x;
    const int gid = gy*gw + gx;
    if (gx >= w || gy >= h) {
        return;
    }

    float2 pnt;
    pnt.x = (gx)*(stride)/*1/32*/;
    pnt.y = 0.0625f/*1/16*/;

    float4 result = tex2D( texElnt, pnt.x + 0.5, pnt.y + 0.5f);
    pdata[gid*3 + 0] = pnt.x;
    pdata[gid*3 + 1] = pnt.y;
    pdata[gid*3 + 2] = result.x;

}

Bilinear result from CUDA:

id  pnt.x   pnt.y   tex2D
 0  0.00000 0.0625  43.0000000  
 1  0.03125 0.0625  42.6171875  
 2  0.06250 0.0625  42.6484375  
 3  0.09375 0.0625  42.2656250  
 4  0.12500 0.0625  42.2968750  
 5  0.15625 0.0625  41.9140625  
 6  0.18750 0.0625  41.9453125  
 7  0.21875 0.0625  41.5625000  
 8  0.25000 0.0625  41.5937500  
 9  0.28125 0.0625  41.2109375  
10  0.31250 0.0625  41.2421875  
11  0.34375 0.0625  40.8593750  
12  0.37500 0.0625  40.8906250  
13  0.40625 0.0625  40.5078125  
14  0.43750 0.0625  40.5390625  
15  0.46875 0.0625  40.1562500  
16  0.50000 0.0625  40.1875000  
17  0.53125 0.0625  39.8046875  
18  0.56250 0.0625  39.8359375  
19  0.59375 0.0625  39.4531250  
20  0.62500 0.0625  39.4843750  
21  0.65625 0.0625  39.1015625  
22  0.68750 0.0625  39.1328125  
23  0.71875 0.0625  38.7500000  
24  0.75000 0.0625  38.7812500  
25  0.78125 0.0625  38.3984375  
26  0.81250 0.0625  38.4296875  
27  0.84375 0.0625  38.0468750  
28  0.87500 0.0625  38.0781250  
29  0.90625 0.0625  37.6953125  
30  0.93750 0.0625  37.7265625  
31  0.96875 0.0625  37.3437500  
32  1.00000 0.0625  37.3750000

CPU result:

// convert the coefficients ((1-α)*(1-β)), (α*(1-β)), ((1-α)*β), (α*β) to fixed-point format  

id  pnt.x   pnt.y   tex2D
 0  0.00000 0.0625 43.00000000  
 1  0.03125 0.0625 43.23046875  
 2  0.06250 0.0625 42.64843750  
 3  0.09375 0.0625 42.87890625  
 4  0.12500 0.0625 42.29687500  
 5  0.15625 0.0625 42.52734375  
 6  0.18750 0.0625 41.94531250  
 7  0.21875 0.0625 42.17578125  
 8  0.25000 0.0625 41.59375000  
 9  0.28125 0.0625 41.82421875  
10  0.31250 0.0625 41.24218750  
11  0.34375 0.0625 41.47265625  
12  0.37500 0.0625 40.89062500  
13  0.40625 0.0625 41.12109375  
14  0.43750 0.0625 40.53906250  
15  0.46875 0.0625 40.76953125  
16  0.50000 0.0625 40.18750000  
17  0.53125 0.0625 40.41796875  
18  0.56250 0.0625 39.83593750  
19  0.59375 0.0625 40.06640625  
20  0.62500 0.0625 39.48437500  
21  0.65625 0.0625 39.71484375  
22  0.68750 0.0625 39.13281250  
23  0.71875 0.0625 39.36328125  
24  0.75000 0.0625 38.78125000  
25  0.78125 0.0625 39.01171875  
26  0.81250 0.0625 38.42968750  
27  0.84375 0.0625 38.66015625  
28  0.87500 0.0625 38.07812500  
29  0.90625 0.0625 38.30859375  
30  0.93750 0.0625 37.72656250  
31  0.96875 0.0625 37.95703125  
32  1.00000 0.0625 37.37500000

I left a simple code on my github [3]; after running the program you will get two files in D:\.

Edit 2014/01/20

I ran the program with different increments and found the rule that "when alpha multiplied by beta is smaller than 0.00390625, the value returned by tex2D does not match the bilinear interpolation formula".
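Purely as an illustration of the comparison being attempted (hypothetical code, not the referenced implementation; the texture unit's exact rounding is assumed rather than known), a CPU bilinear formula with the weights quantized to 8 fractional bits could look like:

// Hedged sketch: 2x2 bilinear interpolation with the fractional weights a and b
// quantized to 8 fractional bits before use (rounding behaviour assumed).
static float bilinear_2x2_quantized(float T00, float T10, float T01, float T11,
                                    float a, float b)
{
    const float aq = (float)((unsigned int)(a * 256.0f + 0.5f)) / 256.0f;
    const float bq = (float)((unsigned int)(b * 256.0f + 0.5f)) / 256.0f;
    return (1.0f - aq) * (1.0f - bq) * T00 + aq * (1.0f - bq) * T10
         + (1.0f - aq) * bq          * T01 + aq * bq          * T11;
}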

Jac*_*ern 13

A satisfactory answer to this question has already been provided, so here I just want to give a compendium of useful information on bilinear interpolation, how it can be implemented in C++, and the different ways it can be implemented in CUDA.

The math behind bilinear interpolation

Suppose that the original function T(x, y) is sampled on a regular Cartesian grid of points (i, j), with 0 <= i < M1, 0 <= j < M2, and i and j integers. For each value of y, one can first use 0 <= a < 1 to represent an arbitrary point i + a comprised between i and i + 1. Then a linear interpolation along the y = j line (parallel to the x axis) can be performed at that point, obtaining

$$r(i + a, j) = (1 - a)\,T(i, j) + a\,T(i + 1, j)$$

where r(x, y) is the function interpolating the samples of T(x, y). The same can be done for the y = j + 1 line, obtaining

$$r(i + a, j + 1) = (1 - a)\,T(i, j + 1) + a\,T(i + 1, j + 1)$$

Now, for each i + a, the samples r(i + a, j) and r(i + a, j + 1) can be interpolated along the y direction. Accordingly, if one uses 0 <= b < 1 to represent an arbitrary point j + b located between j and j + 1, a linear interpolation along the x = i + a line (parallel to the y axis) can be worked out, yielding the final result

$$T(x, y) \simeq r(i + a, j + b) = (1 - a)(1 - b)\,T(i, j) + a(1 - b)\,T(i + 1, j) + (1 - a)\,b\,T(i, j + 1) + a\,b\,T(i + 1, j + 1)$$

Note that the relations between i, j, a, b and x, y are the following:

$$i = \lfloor x \rfloor,\quad a = x - i,\qquad j = \lfloor y \rfloor,\quad b = y - j$$
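As a quick sanity check, these relations can be applied to the Src array of the question, identifying T(0,0) = 38, T(1,0) = 39, T(0,1) = 118, T(1,1) = 13 (the layout that reproduces the tables above). At the first sample point (x, y) = (0, 0.0625) one has i = j = 0, a = 0 and b = 0.0625, so

$$r(0, 0.0625) = (1 - 0.0625)\cdot 38 + 0.0625 \cdot 118 = 35.625 + 7.375 = 43,$$

which is exactly the value reported in the first row of both the CUDA and the CPU tables.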

C/C++ implementation

Let me stress that this implementation, as well as the CUDA ones below, assumes, as stated at the beginning, that the samples of T are located on a regular Cartesian grid of points (i, j) with 0 <= i < M1, 0 <= j < M2, and i and j integers (i.e., unit sampling steps). Also, the routine is given in single-precision, complex (float2) arithmetic, but it can easily be cast to other arithmetics of interest.

void bilinear_interpolation_function_CPU(float2 * __restrict__ h_result, float2 * __restrict__ h_data, 
                                         float * __restrict__ h_xout, float * __restrict__ h_yout, 
                                         const int M1, const int M2, const int N1, const int N2){

    float2 result_temp1, result_temp2;
    for(int k=0; k<N2; k++){
        for(int l=0; l<N1; l++){

            const int   ind_x = floor(h_xout[k*N1+l]); 
            const float a     = h_xout[k*N1+l]-ind_x; 

            const int   ind_y = floor(h_yout[k*N1+l]); 
            const float b     = h_yout[k*N1+l]-ind_y; 

            float2 h00, h01, h10, h11;
            if (((ind_x)   < M1)&&((ind_y)   < M2)) h00 = h_data[ind_y*M1+ind_x];       else    h00 = make_float2(0.f, 0.f);
            if (((ind_x+1) < M1)&&((ind_y)   < M2)) h10 = h_data[ind_y*M1+ind_x+1];     else    h10 = make_float2(0.f, 0.f);
            if (((ind_x)   < M1)&&((ind_y+1) < M2)) h01 = h_data[(ind_y+1)*M1+ind_x];   else    h01 = make_float2(0.f, 0.f);
            if (((ind_x+1) < M1)&&((ind_y+1) < M2)) h11 = h_data[(ind_y+1)*M1+ind_x+1]; else    h11 = make_float2(0.f, 0.f);

            result_temp1.x = a * h10.x + (-h00.x * a + h00.x); 
            result_temp1.y = a * h10.y + (-h00.y * a + h00.y);

            result_temp2.x = a * h11.x + (-h01.x * a + h01.x);
            result_temp2.y = a * h11.y + (-h01.y * a + h01.y);

            h_result[k*N1+l].x = b * result_temp2.x + (-result_temp1.x * b + result_temp1.x);
            h_result[k*N1+l].y = b * result_temp2.y + (-result_temp1.y * b + result_temp1.y);

        }   
    }
}

The if/else statements in the above code are just boundary checks. If a sample falls outside the [0, M1-1] x [0, M2-1] region, it is set to 0.
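To show how the routine above can be called, here is a minimal, hypothetical driver (array contents and output points are arbitrary; the routine is assumed to be defined in the same translation unit):

#include <cstdio>
#include <cuda_runtime.h>   // float2 / make_float2 are used on the host here

int main() {
    const int M1 = 2, M2 = 2;   // size of the input grid
    const int N1 = 4, N2 = 1;   // number of output points

    // Input samples, stored row-wise as h_data[ind_y * M1 + ind_x]
    float2 h_data[M1 * M2];
    h_data[0] = make_float2( 38.f, 0.f);   // T(0, 0)
    h_data[1] = make_float2( 39.f, 0.f);   // T(1, 0)
    h_data[2] = make_float2(118.f, 0.f);   // T(0, 1)
    h_data[3] = make_float2( 13.f, 0.f);   // T(1, 1)

    // Output coordinates
    float h_xout[N1 * N2] = { 0.f, 0.25f, 0.5f, 1.f };
    float h_yout[N1 * N2] = { 0.0625f, 0.0625f, 0.0625f, 0.0625f };
    float2 h_result[N1 * N2];

    bilinear_interpolation_function_CPU(h_result, h_data, h_xout, h_yout, M1, M2, N1, N2);

    for (int l = 0; l < N1 * N2; l++)
        printf("x = %f  y = %f  T = %f\n", h_xout[l], h_yout[l], h_result[l].x);

    return 0;
}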

Standard CUDA implementation

This is a "standard" CUDA implementation tracing the CPU one above. No use is made of texture memory.

__global__ void bilinear_interpolation_kernel_GPU(float2 * __restrict__ d_result, const float2 * __restrict__ d_data, 
                                                  const float * __restrict__ d_xout, const float * __restrict__ d_yout, 
                                                  const int M1, const int M2, const int N1, const int N2)
{
   const int l = threadIdx.x + blockDim.x * blockIdx.x;
   const int k = threadIdx.y + blockDim.y * blockIdx.y;

   if ((l<N1)&&(k<N2)) {

       float2 result_temp1, result_temp2;

       const int    ind_x = floor(d_xout[k*N1+l]); 
       const float  a     = d_xout[k*N1+l]-ind_x; 

       const int    ind_y = floor(d_yout[k*N1+l]); 
       const float  b     = d_yout[k*N1+l]-ind_y; 

       float2 d00, d01, d10, d11;
       if (((ind_x)   < M1)&&((ind_y)   < M2))  d00 = d_data[ind_y*M1+ind_x];       else    d00 = make_float2(0.f, 0.f);
       if (((ind_x+1) < M1)&&((ind_y)   < M2))  d10 = d_data[ind_y*M1+ind_x+1];     else    d10 = make_float2(0.f, 0.f);
       if (((ind_x)   < M1)&&((ind_y+1) < M2))  d01 = d_data[(ind_y+1)*M1+ind_x];   else    d01 = make_float2(0.f, 0.f);
       if (((ind_x+1) < M1)&&((ind_y+1) < M2))  d11 = d_data[(ind_y+1)*M1+ind_x+1]; else    d11 = make_float2(0.f, 0.f);

       result_temp1.x = a * d10.x + (-d00.x * a + d00.x); 
       result_temp1.y = a * d10.y + (-d00.y * a + d00.y);

       result_temp2.x = a * d11.x + (-d01.x * a + d01.x);
       result_temp2.y = a * d11.y + (-d01.y * a + d01.y);

       d_result[k*N1+l].x = b * result_temp2.x + (-result_temp1.x * b + result_temp1.x);
       d_result[k*N1+l].y = b * result_temp2.y + (-result_temp1.y * b + result_temp1.y);

   } 
}
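A possible host-side sequence allocating the device buffers and launching the kernel above is sketched below (error checking omitted; the block size and the host arrays h_data, h_xout, h_yout, h_result are assumptions carried over from the CPU driver above):

// Hypothetical launch sequence for bilinear_interpolation_kernel_GPU
float2 *d_result, *d_data;
float  *d_xout, *d_yout;
cudaMalloc((void**)&d_result, N1 * N2 * sizeof(float2));
cudaMalloc((void**)&d_data,   M1 * M2 * sizeof(float2));
cudaMalloc((void**)&d_xout,   N1 * N2 * sizeof(float));
cudaMalloc((void**)&d_yout,   N1 * N2 * sizeof(float));

cudaMemcpy(d_data, h_data, M1 * M2 * sizeof(float2), cudaMemcpyHostToDevice);
cudaMemcpy(d_xout, h_xout, N1 * N2 * sizeof(float),  cudaMemcpyHostToDevice);
cudaMemcpy(d_yout, h_yout, N1 * N2 * sizeof(float),  cudaMemcpyHostToDevice);

dim3 block(16, 16);
dim3 grid((N1 + block.x - 1) / block.x, (N2 + block.y - 1) / block.y);
bilinear_interpolation_kernel_GPU<<<grid, block>>>(d_result, d_data, d_xout, d_yout, M1, M2, N1, N2);

cudaMemcpy(h_result, d_result, N1 * N2 * sizeof(float2), cudaMemcpyDeviceToHost);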

CUDA implementation with texture fetch

This is the same as the implementation above, but global memory is now accessed through the texture cache. For example, T[i,j] is accessed as

tex2D(d_texture_fetch_float,ind_x,ind_y);

(where, of course, ind_x = i and ind_y = j, and d_texture_fetch_float is assumed to be a global-scope variable) instead of

d_data[ind_y*M1+ind_x];
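For completeness, with the legacy texture reference API used throughout this answer, such a global-scope variable would be declared along the lines of:

static texture<float2, 2, cudaReadModeElementType> d_texture_fetch_float;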

Note that the hard-wired texture filtering capability is not exploited here. The routine below has the same precision as the one above and could turn out to be somewhat faster on old CUDA architectures.

__global__ void bilinear_interpolation_kernel_GPU_texture_fetch(float2 * __restrict__ d_result, 
                                                                const float * __restrict__ d_xout, const float * __restrict__ d_yout, 
                                                                const int M1, const int M2, const int N1, const int N2)
{
   const int l = threadIdx.x + blockDim.x * blockIdx.x;
   const int k = threadIdx.y + blockDim.y * blockIdx.y;

   if ((l<N1)&&(k<N2)) {

       float2 result_temp1, result_temp2;

       const int    ind_x = floor(d_xout[k*N1+l]); 
       const float  a     = d_xout[k*N1+l]-ind_x; 

       const int    ind_y = floor(d_yout[k*N1+l]); 
       const float  b     = d_yout[k*N1+l]-ind_y; 

       const float2 d00  = tex2D(d_texture_fetch_float,ind_x,ind_y);    
       const float2 d10  = tex2D(d_texture_fetch_float,ind_x+1,ind_y);
       const float2 d11  = tex2D(d_texture_fetch_float,ind_x+1,ind_y+1);
       const float2 d01  = tex2D(d_texture_fetch_float,ind_x,ind_y+1);

       result_temp1.x = a * d10.x + (-d00.x * a + d00.x); 
       result_temp1.y = a * d10.y + (-d00.y * a + d00.y);

       result_temp2.x = a * d11.x + (-d01.x * a + d01.x);
       result_temp2.y = a * d11.y + (-d01.y * a + d01.y);

       d_result[k*N1+l].x = b * result_temp2.x + (-result_temp1.x * b + result_temp1.x);
       d_result[k*N1+l].y = b * result_temp2.y + (-result_temp1.y * b + result_temp1.y);

   } 
}

The texture binding can be done according to

void TextureBindingBilinearFetch(const float2 * __restrict__ data, const int M1, const int M2)
{
    size_t pitch; 
    float* data_d;
    gpuErrchk(cudaMallocPitch((void**)&data_d,&pitch, M1 * sizeof(float2), M2));
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float2>();
    gpuErrchk(cudaBindTexture2D(0,&d_texture_fetch_float,data_d,&desc,M1,M2,pitch));
    d_texture_fetch_float.addressMode[0] = cudaAddressModeClamp;
    d_texture_fetch_float.addressMode[1] = cudaAddressModeClamp;
    gpuErrchk(cudaMemcpy2D(data_d,pitch,data,sizeof(float2)*M1,sizeof(float2)*M1,M2,cudaMemcpyHostToDevice));

}

Note that no if/else boundary check is needed anymore: out-of-range accesses are handled by the texture hardware, which clamps the coordinates to the [0, M1-1] x [0, M2-1] sampling region (returning the nearest border sample), thanks to the instructions

    d_texture_fetch_float.addressMode[0] = cudaAddressModeClamp;
    d_texture_fetch_float.addressMode[1] = cudaAddressModeClamp;

CUDA implementation with texture interpolation

This is the last implementation, and it uses the hard-wired texture filtering capability.

__global__ void bilinear_interpolation_kernel_GPU_texture_interp(float2 * __restrict__ d_result, 
                                                                 const float * __restrict__ d_xout, const float * __restrict__ d_yout, 
                                                                 const int M1, const int M2, const int N1, const int N2)
{
   const int l = threadIdx.x + blockDim.x * blockIdx.x;
   const int k = threadIdx.y + blockDim.y * blockIdx.y;

   if ((l<N1)&&(k<N2)) { d_result[k*N1+l] = tex2D(d_texture_interp_float, d_xout[k*N1+l] + 0.5f, d_yout[k*N1+l] + 0.5f); }
}

Note that the interpolation formula implemented by this feature is the same as the one derived above, but now

$$\mathrm{tex}(x, y) = (1 - \alpha)(1 - \beta)\,T[i, j] + \alpha(1 - \beta)\,T[i + 1, j] + (1 - \alpha)\,\beta\,T[i, j + 1] + \alpha\,\beta\,T[i + 1, j + 1],$$
$$i = \lfloor x_B \rfloor,\quad \alpha = x_B - i,\qquad j = \lfloor y_B \rfloor,\quad \beta = y_B - j,$$

where x_B = x - 0.5 and y_B = y - 0.5. This explains the 0.5 offset in the instruction

tex2D(d_texture_interp_float, d_xout[k*N1+l] + 0.5f, d_yout[k*N1+l] + 0.5f)

In this case, the texture binding should be done as follows:

void TextureBindingBilinearInterp(const float2 * __restrict__ data, const int M1, const int M2)
{
    size_t pitch; 
    float* data_d;
    gpuErrchk(cudaMallocPitch((void**)&data_d,&pitch, M1 * sizeof(float2), M2));
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float2>();
    gpuErrchk(cudaBindTexture2D(0,&d_texture_interp_float,data_d,&desc,M1,M2,pitch));
    d_texture_interp_float.addressMode[0] = cudaAddressModeClamp;
    d_texture_interp_float.addressMode[1] = cudaAddressModeClamp;
    d_texture_interp_float.filterMode = cudaFilterModeLinear;   // --- Enable linear filtering
    d_texture_interp_float.normalized = false;                  // --- Texture coordinates will NOT be normalized
    gpuErrchk(cudaMemcpy2D(data_d,pitch,data,sizeof(float2)*M1,sizeof(float2)*M1,M2,cudaMemcpyHostToDevice));

}
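A hypothetical call sequence for this version (reusing the grid, block and device buffers from the earlier launch sketch, with d_texture_interp_float assumed to be declared at global scope like d_texture_fetch_float) would then be:

// Bind the input data to the filtering texture, then launch the hardware-filtered kernel
TextureBindingBilinearInterp(h_data, M1, M2);
bilinear_interpolation_kernel_GPU_texture_interp<<<grid, block>>>(d_result, d_xout, d_yout, M1, M2, N1, N2);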

Note that, as already remarked in the other answer, α and β (i.e., the fractional parts a and b) are stored in a 9-bit fixed-point format with 8 bits of fractional value, so this approach is very fast, but less accurate than the ones above.