我学习 CUDA 已经有一段时间了,遇到了下面这个问题。
请看我下面的做法:
复制到 GPU:
// Host-side buffer. Its allocation is elided from this snippet
// (presumably Nel*Nface ints, matching the copies below -- TODO confirm).
int* B;
// ...
// Device-side counterpart of B.
int *dev_B;
// initialize B=0 (the zero-fill itself is not shown in this snippet)
// Allocate Nel*Nface ints on the device, then upload the contents of B.
// NOTE(review): the cudaError_t returned by both calls is discarded, so an
// allocation or copy failure would go unnoticed at this point.
cudaMalloc((void**)&dev_B, Nel*Nface*sizeof(int));
cudaMemcpy(dev_B, B, Nel*Nface*sizeof(int),cudaMemcpyHostToDevice);
//...
// Execute on the GPU the following kernel, which is supposed to fill in
// the dev_B matrix with integers.
// The launch uses Nblocks blocks of Nthreads threads and passes no stream
// argument. The setup of dev_MSH, dev_Nel, dev_Npel, dev_Nface and dev_FC is
// elided; they are assumed to be device pointers -- TODO confirm.
// NOTE(review): no cudaGetLastError() follows the launch, so a launch
// failure (e.g. an invalid configuration) would not be detected here.
findNeiborElem <<< Nblocks, Nthreads >>>(dev_B, dev_MSH, dev_Nel, dev_Npel, dev_Nface, dev_FC);
Run Code Online (Sandbox Code Playgroud)
再复制回 CPU:
// Copy Nel*Nface ints from the device buffer dev_B back into the host
// buffer B.
// NOTE(review): the returned cudaError_t is discarded; checking it here
// would also surface errors raised by earlier asynchronous work -- confirm
// against the CUDA runtime documentation.
cudaMemcpy(B, dev_B, Nel*Nface*sizeof(int),cudaMemcpyDeviceToHost);
Run Code Online (Sandbox Code Playgroud)
findNeiborElem 函数中,每个线程都要执行一个循环,大致如下:
__ global __ void findNeiborElem(int *dev_B, int *dev_MSH, int *dev_Nel, int *dev_Npel, int *dev_Nface, int *dev_FC){
int tid=threadIdx.x + blockIdx.x * blockDim.x;
while (tid<dev_Nel[0]){
for (int j=1;j<=Nel;j++){
// do some …Run Code Online (Sandbox Code Playgroud)