我正在测量标准和1Dtexture访问内存之间的差异.为此,我创建了两个内核
__global__ void texture1D(float* doarray,int size)
{
int index;
//calculate each thread global index
index=blockIdx.x*blockDim.x+threadIdx.x;
//fetch global memory through texture reference
doarray[index]=tex1Dfetch(texreference,index);
return;
}
__global__ void standard1D(float* diarray, float* doarray, int size)
{
int index;
//calculate each thread global index
index=blockIdx.x*blockDim.x+threadIdx.x;
//fetch global memory through texture reference
doarray[index]= diarray[index];
return;
}
Run Code Online (Sandbox Code Playgroud)
然后,我调用eache内核来测量它所花费的时间:
//copy array from host to device memory
cudaMemcpy(diarray,harray,sizeof(float)*size,cudaMemcpyHostToDevice);
checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
checkCuda( cudaEventRecord(startEvent, 0) );
//bind texture reference with linear memory
cudaBindTexture(0,texreference,diarray,sizeof(float)*size);
//execute device kernel …Run Code Online (Sandbox Code Playgroud) cuda ×1