在编写 CUDA 代码时,我遇到了一个非常奇怪的问题:在对同一个子程序的不同次迭代调用中,同一段从 GPU 内存到 CPU 内存的 cudaMemcpy 完成所需的时间相差悬殊:约 60 ms 对比约 0.02 ms。
代码如下:
float calc_formation_obj( int formationNo, bool calcObj )
{
int i;
int prev = prevCP[aperIndex];
int next = nextCP[aperIndex];
float ll = formations_l[formationNo];
float rl = formations_r[formationNo];
float obj = 0.0;
float *f_grid = new float[grid_size_voxe];
// use ll and rl
thrust::device_ptr<float> dll(d_leafpos_l);
thrust::device_ptr<float> drl(d_leafpos_r);
dll[rows_per_beam*aperIndex+ rowIndex] = ll;
drl[rows_per_beam*aperIndex+ rowIndex] = rl;
// set all leaf positions between prev/next
set_leafpos<<<grid_size_ncps,BLOCK_SIZE>>> (aperIndex, rowIndex, prev, next, ncps, d_leafpos_l,
d_leafpos_r, ll, rl, rows_per_beam, d_cp_angles);
// copy dose to …