CUDA：如何创建2D纹理对象？

Question

CUDA：如何创建2D纹理对象？

我正在尝试创建一个 4x4 的 uint8_t 2D 纹理对象。这是代码：

// Debug kernel: each thread fetches one texel from `tex` at the integer
// coordinates (threadIdx.x, threadIdx.y) and prints it with device-side printf.
// Only threadIdx is used for indexing, so every block reads the same texels.
__global__ void kernel(cudaTextureObject_t tex)
{
    int x = threadIdx.x;
    int y = threadIdx.y;
    // Fetch the texel as an 8-bit unsigned value.
    uint8_t val = tex2D<uint8_t>(tex, x, y);
    // Output order across threads is whatever device printf produces.
    printf("%d, ", val);
    return;
}

// Uploads 16 bytes (0..15) to the device, describes them as a 4x4 pitch2D
// texture resource, creates a texture object over it and launches one 4x4
// thread block that prints every texel. None of the CUDA runtime return
// codes are inspected here.
int main(int argc, char **argv)
{
    cudaTextureObject_t tex;
    uint8_t dataIn[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    uint8_t* dataDev = 0;
    // Plain linear allocation of 16 bytes (not a pitched allocation).
    cudaMalloc((void**)&dataDev, 16);
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = dataDev;
    // Channel format is filled in by hand: 8 bits in x ...
    resDesc.res.pitch2D.desc.x = 8;
    // NOTE(review): ... and 8 bits in y as well. This looks like it declares a
    // second channel although dataIn holds one byte per element — confirm
    // whether y should be 0 for a single-channel uint8_t texture.
    resDesc.res.pitch2D.desc.y = 8;
    resDesc.res.pitch2D.desc.f = cudaChannelFormatKindUnsigned;
    resDesc.res.pitch2D.width = 4;
    resDesc.res.pitch2D.height = 4;
    // NOTE(review): pitchInBytes has to be aligned to
    // cudaDeviceProp::texturePitchAlignment; a 4-byte pitch is unlikely to
    // satisfy that — verify against the target GPU.
    resDesc.res.pitch2D.pitchInBytes = 4;
    struct cudaTextureDesc texDesc;
    // All texture settings are left zeroed.
    memset(&texDesc, 0, sizeof(texDesc));
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    // The data is copied after the texture object is created but before the
    // kernel launch.
    cudaMemcpy(dataDev, &dataIn[0], 16, cudaMemcpyHostToDevice);
    dim3 threads(4, 4);
    kernel<<<1, threads>>>(tex);
    // Wait for the kernel so its printf output is flushed before exit.
    cudaDeviceSynchronize();
    return 0;
}

Run Code Online (Sandbox Code Playgroud)

我希望结果将是这样的：

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,

Run Code Online (Sandbox Code Playgroud)

即纹理对象的所有值（顺序无关紧要）。

但是实际结果是：

0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6, 0, 2, 4, 6,

Run Code Online (Sandbox Code Playgroud)

我究竟做错了什么？

Answer 1

Rob*_*lla 5

当您将 pitch2D 变体用于纹理操作时，底层分配应该是一个正确的 pitched（带行距的）分配。我认为人们通常会用 cudaMallocPitch 来创建它。不过，文档中规定的要求是：

cudaResourceDesc :: res :: pitch2D :: pitchInBytes指定两行之间的间距（以字节为单位），并且必须与cudaDeviceProp :: texturePitchAlignment对齐。

在我的 GPU 上，后一个属性（texturePitchAlignment）的值是 32。我不了解您的 GPU，但我敢打赌它的这个属性不是 4。然而，您在这里指定的是 4：

resDesc.res.pitch2D.pitchInBytes = 4;

Run Code Online (Sandbox Code Playgroud)

Again, I think people would typically use a pitched allocation created with cudaMallocPitch for this. However it does appear to be possible to me to pass an ordinary linear allocation if the row-to-row dimension (in bytes) is divisible by texturePitchAlignment (32 in my case).

Another change I made is to use cudaCreateChannelDesc<>() instead of manually setting the parameters like you did. This creates a different set of desc parameters and seems to affect the result also. It should not be difficult to study the differences.

When I adjust your code to address those issues, I get results that seem sensible to me:

$ cat t30.cu
#include <stdio.h>
#include <stdint.h>

typedef uint8_t mt;  // use an integer type

// Prints one texel per thread: the thread at (threadIdx.x, threadIdx.y) in
// its block fetches the texel with those coordinates from `tex` and writes
// it out with device-side printf. Indexing uses threadIdx only.
__global__ void kernel(cudaTextureObject_t tex)
{
    const int col = threadIdx.x;
    const int row = threadIdx.y;
    const mt texel = tex2D<mt>(tex, col, row);
    printf("%d, ", texel);
}

// Wraps a CUDA runtime call made from main(): if the call does not return
// cudaSuccess, the error string is printed to stderr and main() returns 1.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Creates a pitch2D texture object on top of an ordinary cudaMalloc buffer
// whose row width in bytes equals the device's texturePitchAlignment, then
// launches a single 4x4 block that prints the top-left 4x4 texels.
// Returns 0 on success and 1 if any CUDA runtime call fails.
int main(int argc, char **argv)
{
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
    // texturePitchAlignment is a size_t, so %zu is the portable conversion.
    printf("texturePitchAlignment: %zu\n", prop.texturePitchAlignment);
    cudaTextureObject_t tex;
    const int num_rows = 4;
    const int num_cols = (int)prop.texturePitchAlignment*1; // should be able to use a different multiplier here
    const int ts = num_cols*num_rows;       // total number of texels
    const size_t ds = ts*sizeof(mt);        // total size in bytes
    // ts is only known at run time, so the host buffer lives on the heap
    // rather than in a (non-standard) variable-length array.
    mt* dataIn = new mt[ts];
    for (int i = 0; i < ts; i++) dataIn[i] = (mt)i;
    mt* dataDev = 0;
    CUDA_CHECK(cudaMalloc((void**)&dataDev, ds));
    CUDA_CHECK(cudaMemcpy(dataDev, dataIn, ds, cudaMemcpyHostToDevice));
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = dataDev;
    resDesc.res.pitch2D.width = num_cols;
    resDesc.res.pitch2D.height = num_rows;
    // Let the runtime derive the channel format from the element type.
    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<mt>();
    // Rows are densely packed, so the pitch is exactly one row of elements;
    // num_cols was chosen so this is a multiple of texturePitchAlignment.
    resDesc.res.pitch2D.pitchInBytes = num_cols*sizeof(mt);
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    CUDA_CHECK(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
    dim3 threads(4, 4);
    kernel<<<1, threads>>>(tex);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran
    printf("\n");
    // Release everything that was acquired above. (On the error path main()
    // returns immediately and the process ends without this cleanup.)
    CUDA_CHECK(cudaDestroyTextureObject(tex));
    CUDA_CHECK(cudaFree(dataDev));
    delete[] dataIn;
    return 0;
}
$ nvcc -o t30 t30.cu
$ cuda-memcheck ./t30
========= CUDA-MEMCHECK
texturePitchAlignment: 32
0, 1, 2, 3, 32, 33, 34, 35, 64, 65, 66, 67, 96, 97, 98, 99,
========= ERROR SUMMARY: 0 errors
$

Run Code Online (Sandbox Code Playgroud)

As asked in the comments, if I were going to do something similar to this but using cudaMallocPitch and cudaMemcpy2D, it could look something like this:

$ cat t1421.cu
#include <stdio.h>
#include <stdint.h>

typedef uint8_t mt;  // use an integer type

// Prints one texel per thread: the thread at (threadIdx.x, threadIdx.y) in
// its block fetches the texel with those coordinates from `tex` and writes
// it out with device-side printf. Indexing uses threadIdx only.
__global__ void kernel(cudaTextureObject_t tex)
{
    const int col = threadIdx.x;
    const int row = threadIdx.y;
    const mt texel = tex2D<mt>(tex, col, row);
    printf("%d, ", texel);
}

// Wraps a CUDA runtime call made from main(): if the call does not return
// cudaSuccess, the error string is printed to stderr and main() returns 1.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Creates a pitch2D texture object on top of a cudaMallocPitch allocation,
// filling it with cudaMemcpy2D from a densely packed host buffer, then
// launches a single 4x4 block that prints the top-left 4x4 texels.
// Returns 0 on success and 1 if any CUDA runtime call fails.
int main(int argc, char **argv)
{
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
    // texturePitchAlignment is a size_t, so %zu is the portable conversion.
    printf("texturePitchAlignment: %zu\n", prop.texturePitchAlignment);
    cudaTextureObject_t tex;
    const int num_rows = 4;
    const int num_cols = (int)prop.texturePitchAlignment*1; // should be able to use a different multiplier here
    const int ts = num_cols*num_rows;            // total number of texels
    const size_t row_bytes = num_cols*sizeof(mt); // bytes per host row
    // ts is only known at run time, so the host buffer lives on the heap
    // rather than in a (non-standard) variable-length array.
    mt* dataIn = new mt[ts];
    for (int i = 0; i < ts; i++) dataIn[i] = (mt)i;
    mt* dataDev = 0;
    size_t pitch;
    // The runtime chooses the device row pitch and reports it in `pitch`.
    CUDA_CHECK(cudaMallocPitch((void**)&dataDev, &pitch, row_bytes, num_rows));
    // Host rows are row_bytes apart; device rows are `pitch` bytes apart.
    CUDA_CHECK(cudaMemcpy2D(dataDev, pitch, dataIn, row_bytes, row_bytes, num_rows, cudaMemcpyHostToDevice));
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = dataDev;
    resDesc.res.pitch2D.width = num_cols;
    resDesc.res.pitch2D.height = num_rows;
    // Let the runtime derive the channel format from the element type.
    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<mt>();
    // Use the pitch returned by cudaMallocPitch, not the logical row width.
    resDesc.res.pitch2D.pitchInBytes = pitch;
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    CUDA_CHECK(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
    dim3 threads(4, 4);
    kernel<<<1, threads>>>(tex);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran
    printf("\n");
    // Release everything that was acquired above. (On the error path main()
    // returns immediately and the process ends without this cleanup.)
    CUDA_CHECK(cudaDestroyTextureObject(tex));
    CUDA_CHECK(cudaFree(dataDev));
    delete[] dataIn;
    return 0;
}
$ nvcc -o t1421 t1421.cu
$ cuda-memcheck ./t1421
========= CUDA-MEMCHECK
texturePitchAlignment: 32
0, 1, 2, 3, 32, 33, 34, 35, 64, 65, 66, 67, 96, 97, 98, 99,
========= ERROR SUMMARY: 0 errors
$

Run Code Online (Sandbox Code Playgroud)

尽管这里演示的是纹理对象，但很容易证明纹理引用也会出现类似的问题。您无法创建任意小的 2D 纹理引用，就像无法创建任意小的 2D 纹理对象一样。我不会对此另行演示，因为它在很大程度上只是重复上述内容，而且人们不应该再将纹理引用用于新的开发工作——纹理对象是更好的方法。

使用 `cudaMallocPitch` 无法做到完全相同的事情，因为它分配出的行距会更宽（可能是 512 个字节而不是 32 个字节）。我可以做类似但不完全相同的事情。我已经用一个示例更新了我的答案。 (2认同)

归档时间：	7 年前
查看次数：	329 次
最近记录：	6 年，10 月前