the*_*ush 6 c++ memory textures cuda
我正在尝试使用Cuda和C++在GPU上执行两个任务(分成2个内核).作为输入,我采用NxM矩阵(作为浮点数组存储在主机的内存中).然后,我将使用对此矩阵执行某些操作的内核使其成为NxMxD矩阵.然后我有一个第二个内核,它对这个3D矩阵执行一些操作(我只是读取值,我不必为其写入值).
在纹理内存中运行似乎对我的任务来说要快得多,所以我的问题是:是否可以在内核1运行之后,将设备全局内存中的数据直接复制到纹理内存供内核2使用,而无需先将其传回主机?
UPDATE
我添加了一些代码来更好地说明我的问题.
这是两个内核.第一个只是现在的占位符,并将2D矩阵复制为3D.
/**
 * Placeholder feature kernel: replicates a 2D image into every slice of a
 * 3D volume.
 *
 * Each thread handles one (x, y) pixel. It reads that pixel from the 2D
 * texture reference texImIp and stores it at the same (x, y) position in
 * each of the imZ slices of imData3D_dev.
 *
 * Output layout: index = x + y*imX + z*imX*imY.
 *
 * Launch layout: 2D grid of 2D blocks covering at least imX x imY threads.
 * The grid may be rounded up to whole blocks; surplus threads exit early.
 *
 * @param imData3D_dev  output buffer holding imX*imY*imZ floats
 * @param imX           image width in pixels
 * @param imY           image height in pixels
 * @param imZ           number of slices to write
 */
__global__ void computeFeatureVector(float* imData3D_dev, int imX, int imY, int imZ) {
    // calculate each thread's global index
    const int xindex = blockIdx.x * blockDim.x + threadIdx.x;
    const int yindex = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads past the image edge (grid rounded up to whole blocks) must not
    // write: they would land in a neighbouring row or beyond the buffer.
    if (xindex >= imX || yindex >= imY) {
        return;
    }

    // The source pixel does not depend on z, so fetch it once.
    const float pixel = tex2D(texImIp, xindex, yindex);

    // size_t arithmetic keeps the flat index from overflowing int on large volumes.
    const size_t sliceSize = (size_t)imX * imY;
    const size_t pixelOffset = (size_t)yindex * imX + xindex;

    #pragma unroll
    for (int z = 0; z < imZ; z++) {
        imData3D_dev[z * sliceSize + pixelOffset] = pixel;
    }
}
Run Code Online (Sandbox Code Playgroud)
第二个将采用这个3D矩阵,现在表示为纹理并对其执行一些操作.现在空白.
/**
 * Placeholder second-stage kernel: copies slice z = 0 of the 3D texture
 * texImIp3D into a 2D result buffer.
 *
 * Each thread handles one (x, y) position and writes
 * resData_dev[x + y*imX] = texImIp3D(x, y, 0).
 *
 * Launch layout: 2D grid of 2D blocks. The x extent may be rounded up to
 * whole blocks; surplus threads in x exit early.
 *
 * NOTE(review): the signature carries no image height, so the y extent
 * cannot be checked here. The caller must launch exactly as many rows of
 * threads as resData_dev has rows.
 *
 * @param resData_dev  output buffer, imX floats per row
 * @param imX          image width in pixels (row stride of resData_dev)
 */
__global__ void kernel2(float* resData_dev, int imX) {
    // calculate each thread's global index
    const int xindex = blockIdx.x * blockDim.x + threadIdx.x;
    const int yindex = blockIdx.y * blockDim.y + threadIdx.y;

    // Without this guard a thread with xindex >= imX would compute an index
    // that belongs to the next row and overwrite that row's data.
    if (xindex >= imX) {
        return;
    }

    resData_dev[xindex + yindex * imX] = tex3D(texImIp3D, xindex, yindex, 0);
}
Run Code Online (Sandbox Code Playgroud)
然后代码的主体如下:
// declare textures
// File-scope texture references read by the two kernels:
//   texImIp   - 2D float input image
//   texImIp3D - 3D float volume produced from the first kernel's output
texture<float,2,cudaReadModeElementType> texImIp;
texture<float,3,cudaReadModeElementType> texImIp3D;

// Stops the program with a readable message when a CUDA runtime call fails,
// so a failure is reported where it happens rather than as garbage output.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

/**
 * Runs the two-kernel pipeline entirely on the device:
 *   1. upload a random imX x imY image and bind it to the 2D texture texImIp;
 *   2. run computeFeatureVector to build an imX x imY x imZ volume in
 *      global memory;
 *   3. copy that volume device-to-device into a 3D cudaArray, bind it to
 *      texImIp3D and run kernel2;
 *   4. print the first 10 values of each stage as a sanity check.
 */
void main_fun() {
    // constants
    const int imX = 1024;
    const int imY = 768;
    const int imZ = 16;
    const size_t numPixels = (size_t)imX * imY;
    const size_t numVoxels = numPixels * imZ;
    const size_t rowBytes  = sizeof(float) * imX;

    // input data (new[] takes an element count, not a byte count)
    float* imData2D = new float[numPixels];
    for (size_t i = 0; i < numPixels; i++)
        imData2D[i] = (float) rand()/RAND_MAX;

    // create channel to describe data type
    cudaArray* carrayImIp;
    cudaChannelFormatDesc channel = cudaCreateChannelDesc<float>();

    // allocate device memory for cuda array
    checkCuda(cudaMallocArray(&carrayImIp, &channel, imX, imY), "cudaMallocArray");

    // copy matrix from host to device memory; the 2D copy replaces the
    // deprecated cudaMemcpyToArray (source pitch and width are in bytes)
    checkCuda(cudaMemcpy2DToArray(carrayImIp, 0, 0, imData2D, rowBytes, rowBytes, imY,
                                  cudaMemcpyHostToDevice),
              "cudaMemcpy2DToArray");

    // set texture properties
    texImIp.filterMode = cudaFilterModePoint;
    texImIp.addressMode[0] = cudaAddressModeClamp;
    texImIp.addressMode[1] = cudaAddressModeClamp;

    // bind texture reference with cuda array
    checkCuda(cudaBindTextureToArray(texImIp, carrayImIp), "cudaBindTextureToArray(2D)");

    // kernel params: 16x16 threads per block, grid rounded up with integer ceil-div
    const dim3 blocksize(16, 16, 1);
    const dim3 blocknum((imX + blocksize.x - 1) / blocksize.x,
                        (imY + blocksize.y - 1) / blocksize.y,
                        1);

    // store output here
    float* imData3D_dev;
    checkCuda(cudaMalloc((void**)&imData3D_dev, sizeof(float) * numVoxels), "cudaMalloc(imData3D_dev)");

    // execute kernel 1
    computeFeatureVector<<<blocknum, blocksize>>>(imData3D_dev, imX, imY, imZ);
    checkCuda(cudaGetLastError(), "computeFeatureVector launch");
    // launches are asynchronous: wait before releasing the texture the kernel reads
    checkCuda(cudaDeviceSynchronize(), "computeFeatureVector execution");

    // unbind texture reference to free resource
    checkCuda(cudaUnbindTexture(texImIp), "cudaUnbindTexture(2D)");

    // check copied ok
    float* imData3D = new float[numVoxels];
    checkCuda(cudaMemcpy(imData3D, imData3D_dev, sizeof(float) * numVoxels, cudaMemcpyDeviceToHost),
              "cudaMemcpy(imData3D)");
    cout << " kernel 1" << endl;
    for (int x = 0; x < 10; x++)
        cout << imData3D[x] << " ";
    cout << endl;
    delete [] imData3D;

    //
    // kernel 2
    //

    // copy data on device to 3d array
    cudaArray* carrayImIp3D;
    cudaExtent volumesize = make_cudaExtent(imX, imY, imZ);
    checkCuda(cudaMalloc3DArray(&carrayImIp3D, &channel, volumesize), "cudaMalloc3DArray");

    // A 3D cudaArray has to be filled with cudaMemcpy3D; cudaMemcpyToArray
    // does not populate it correctly. The source is the tightly packed
    // linear buffer written by kernel 1: row pitch = imX floats.
    cudaMemcpy3DParms copyparms = {0};
    copyparms.srcPtr   = make_cudaPitchedPtr((void*)imData3D_dev, rowBytes, imX, imY);
    copyparms.dstArray = carrayImIp3D;
    copyparms.extent   = volumesize;
    copyparms.kind     = cudaMemcpyDeviceToDevice;
    checkCuda(cudaMemcpy3D(&copyparms), "cudaMemcpy3D");

    // texture params and bind
    texImIp3D.filterMode = cudaFilterModePoint;
    texImIp3D.addressMode[0] = cudaAddressModeClamp;
    texImIp3D.addressMode[1] = cudaAddressModeClamp;
    texImIp3D.addressMode[2] = cudaAddressModeClamp;
    checkCuda(cudaBindTextureToArray(texImIp3D, carrayImIp3D, channel), "cudaBindTextureToArray(3D)");

    // store output here
    float* resData_dev;
    checkCuda(cudaMalloc((void**)&resData_dev, sizeof(float) * numPixels), "cudaMalloc(resData_dev)");

    // kernel 2
    kernel2<<<blocknum, blocksize>>>(resData_dev, imX);
    checkCuda(cudaGetLastError(), "kernel2 launch");
    checkCuda(cudaDeviceSynchronize(), "kernel2 execution");
    checkCuda(cudaUnbindTexture(texImIp3D), "cudaUnbindTexture(3D)");

    // copy result matrix from device to host memory
    float* resData = new float[numPixels];
    checkCuda(cudaMemcpy(resData, resData_dev, sizeof(float) * numPixels, cudaMemcpyDeviceToHost),
              "cudaMemcpy(resData)");

    // check copied ok
    cout << " kernel 2" << endl;
    for (int x = 0; x < 10; x++)
        cout << resData[x] << " ";
    cout << endl;

    // release host and device resources
    delete [] imData2D;
    delete [] resData;
    checkCuda(cudaFree(imData3D_dev), "cudaFree(imData3D_dev)");
    checkCuda(cudaFree(resData_dev), "cudaFree(resData_dev)");
    checkCuda(cudaFreeArray(carrayImIp), "cudaFreeArray(2D)");
    checkCuda(cudaFreeArray(carrayImIp3D), "cudaFreeArray(3D)");
}
Run Code Online (Sandbox Code Playgroud)
我很高兴第一个内核正常工作,但3D矩阵imData3D_dev似乎没有正确绑定到纹理texImIp3D.
回答
我用cudaMemcpy3D解决了我的问题.这是主函数第二部分的修订代码.imData3D_dev包含来自第一个内核的全局内存中的3D矩阵.
// Revised second half of main_fun: move the volume produced by kernel 1
// (imData3D_dev, linear global memory) into a 3D cudaArray without going
// through the host, then bind it to the 3D texture and run kernel 2.
cudaArray* carrayImIp3D;
cudaExtent volumesize;
volumesize = make_cudaExtent(imX, imY, imZ);
cudaMalloc3DArray(&carrayImIp3D,&channel,volumesize);

// describe the device-to-device copy from linear memory into the 3D array
cudaMemcpy3DParms copyparms={0};
copyparms.extent = volumesize;
copyparms.dstArray = carrayImIp3D;
copyparms.kind = cudaMemcpyDeviceToDevice;
// source rows are tightly packed: pitch = imX floats, imX x imY elements per slice
copyparms.srcPtr = make_cudaPitchedPtr((void*)imData3D_dev, sizeof(float)*imX,imX,imY);
// cudaMemcpy3D takes the address of the parameter struct
cudaMemcpy3D(&copyparms);

// texture params and bind
texImIp3D.filterMode=cudaFilterModePoint;
texImIp3D.addressMode[0]=cudaAddressModeClamp;
texImIp3D.addressMode[1]=cudaAddressModeClamp;
texImIp3D.addressMode[2]=cudaAddressModeClamp;
cudaBindTextureToArray(texImIp3D,carrayImIp3D,channel);

// store output here
float* resData_dev;
cudaMalloc((void**)&resData_dev,sizeof(float)*imX*imY);
kernel2<<<blocknum,blocksize>>>(resData_dev, imX);
// ... clean up
Run Code Online (Sandbox Code Playgroud)
当这个问题第一次被问到时,各种 cudaMemcpy 例程的命名曾经有些复杂,但自那以后已经被 Nvidia 清理了。
为了在 3D 数组上进行操作,您需要使用 cudaMemcpy3D(),
它(除其他功能外)能够将线性内存中的 3D 数据复制到 3D 数组中。
cudaMemcpyToArray() 曾经是将线性数据复制到二维数组所需的函数,但已被弃用,取而代之的是命名更一致的 cudaMemcpy2D()。
但是,如果您使用的设备的计算能力为 2.0 或更高,则您根本不需要使用任何 cudaMemcpy*() 函数。
相反,请使用表面(surface),它允许您直接写入纹理,而不需要在内核之间进行任何数据复制。(您仍然需要像现在这样将读取和写入分离到两个不同的内核中,因为纹理缓存与表面写入之间不保持一致性,并且只在内核启动时才会失效)。
归档时间: |
|
查看次数: |
4799 次 |
最近记录: |