如何从2D纹理成功读取

Mar*_*m0t 6 c++ textures cuda

我怎么能够:

  1. 将cudaMallocPitch浮动内存绑定到2D纹理参考
  2. 将一些主机数据复制到设备上的2D阵列
  3. 将一个添加到纹理参考并写入a.)Pitch 2D数组或b.)写入线性存储器阵列
  4. 阅读答案并显示出来.

下面是一个应该完成此任务的代码.请注意,对于NxN数组大小,我的代码可以正常工作.对于N!M,其中N!= M,我的代码咬了灰尘(不是正确的结果).如果你能解决这个问题,我将奖励你1个互联网(供应有限).也许我很疯狂,但根据文档,这应该工作(它确实适用于方阵!).附加的代码应该与'nvcc whateveryoucallit.cu -o runit'一起运行.

感谢帮助!

#include<stdio.h>
#include<cuda.h>
#include<iostream>
#define height 16
#define width 11
#define BLOCKSIZE 16

using namespace std;

// Device Kernels

//Texture reference Declaration
texture<float,2> texRefEx;


__global__ void kernel_w_textures(float* devMPPtr, float * devMPtr, int pitch)
{
 // Thread indexes
        unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
        unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;

 // Texutre Coordinates
 float u=(idx)/float(width);
 float v=(idy)/float(height);
 devMPtr[idy*width+idx]=devMPPtr[idy*pitch/sizeof(float)+idx];
 // Write Texture Contents to malloc array +1
 devMPtr[idy*width+idx]= tex2D(texRefEx,u,v);//+1.0f;
}
int main()
{
 // memory size
 size_t memsize=height*width;
 size_t offset;
 float * data,  // input from host
  *h_out,  // host space for output
  *devMPPtr, // malloc Pitch ptr
  *devMPtr; // malloc ptr

 size_t pitch;

 // Allocate space on the host
 data=(float *)malloc(sizeof(float)*memsize);
 h_out=(float *)malloc(sizeof(float)*memsize);


// Define data
for (int i = 0; i <  height; i++)
 for (int j=0; j < width; j++)
  data[i*width+j]=float(j);

// Define the grid
dim3 grid((int)(width/BLOCKSIZE)+1,(int)(height/BLOCKSIZE)+1), threads(BLOCKSIZE,BLOCKSIZE);

// allocate Malloc Pitch
cudaMallocPitch((void**)&devMPPtr,&pitch, width * sizeof(float), height);

// Print the pitch
printf("The pitch is %d \n",pitch/sizeof(float));

// Texture Channel Description
//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);

// Bind texture to pitch mem:
cudaBindTexture2D(&offset,&texRefEx,devMPPtr,&channelDesc,width,height,pitch);
cout << "My Description x is " << channelDesc.x << endl;
cout << "My Description y is " << channelDesc.y << endl;
cout << "My Description z is " << channelDesc.z << endl;
cout << "My Description w is " << channelDesc.w << endl;
cout << "My Description kind is " << channelDesc.f << endl;
cout << "Offset is " << offset << endl;

// Set mutable properties:
texRefEx.normalized=true;
texRefEx.addressMode[0]=cudaAddressModeWrap;
texRefEx.addressMode[1]=cudaAddressModeWrap;
texRefEx.filterMode= cudaFilterModePoint;

// Allocate cudaMalloc memory
cudaMalloc((void**)&devMPtr,memsize*sizeof(float));

// Read data from host to device
cudaMemcpy2D((void*)devMPPtr,pitch,(void*)data,sizeof(float)*width,
  sizeof(float)*width,height,cudaMemcpyHostToDevice);

//Read back and check this memory
cudaMemcpy2D((void*)h_out,width*sizeof(float),(void*)devMPPtr,pitch,
  sizeof(float)*width,height,cudaMemcpyDeviceToHost);

// Print the memory
 for (int i=0; i<height; i++){
  for (int j=0; j<width; j++){
   printf("%2.2f ",h_out[i*width+j]);
  }
 cout << endl;
 }

 cout << "Done" << endl;
// Memory is fine... 

kernel_w_textures<<<grid,threads>>>(devMPPtr, devMPtr, pitch);

// Copy back data to host
cudaMemcpy((void*)h_out,(void*)devMPtr,width*height*sizeof(float),cudaMemcpyDeviceToHost);


// Print the Result
 cout << endl;
 for (int i=0; i<height; i++){
  for (int j=0; j<width; j++){
   printf("%2.2f ",h_out[i*width+j]);
  }
 cout << endl;
 }
 cout << "Done" << endl;

return(0);
}
Run Code Online (Sandbox Code Playgroud)

编辑10月17日:所以我还没有找到解决这个问题的方法.Nvidia在这方面相当沉默似乎世界也是如此.我找到了使用共享内存的解决方法,但如果有人有纹理解决方案,我会非常高兴.

编辑 Octoboer 26:仍然没有解决,但如果有人知道,仍然对一个感兴趣.

编辑 7月26日:哇它已经9个月了 - 我一直忽略了正确的答案.诀窍是:

if ( idx < width  && idy < height){//.... code }
Run Code Online (Sandbox Code Playgroud)

正如之前所指出的那样.感谢所有贡献者!

tke*_*win 3

这可能与你的块大小有关。在此代码中,您尝试将 16x16 线程块写入 11x16 内存块。这意味着您的某些线程正在写入未分配的内存。这也解释了为什么您的 (16*M x 32*N) 测试有效:没有线程写入未分配的内存,因为您的尺寸是 16 的倍数。

解决这个问题的一个简单方法是这样的:

if ((x < width) && (y < height)) {
   // write output 
  devMPtr[idy*width+idx]= tex2D(texRefEx,u,v); 
}
Run Code Online (Sandbox Code Playgroud)

在调用内核之前,您需要将高度和宽度传递给内核函数或将常量复制到卡。