我正在尝试使用流异步启动多个CUDA FFT内核.为此,我正在创建我的流,cuFFT前向和反向计划如下:
// Allocate one stream handle plus one forward and one inverse cuFFT plan handle
// per stream (streamNum of each).
// NOTE(review): the malloc results are used without a NULL check.
streams = (cudaStream_t*) malloc(sizeof(cudaStream_t)*streamNum);
plansF = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
plansI = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
// CHECK_ERROR is defined outside this snippet.
// NOTE(review): the return values of the cuFFT calls below are discarded; cuFFT
// reports failures through its own cufftResult return code, so confirm that
// CHECK_ERROR can actually observe them.
for(int i=0; i<streamNum; i++)
{
// Create stream i.
cudaStreamCreate(&streams[i]);
CHECK_ERROR(5)
// Forward plan: 1D real-to-complex transform of length `ticks`, batch count 1.
cufftPlan1d(&plansF[i], ticks, CUFFT_R2C,1);
CHECK_ERROR(5)
// Inverse plan: 1D complex-to-real transform of length `ticks`, batch count 1.
cufftPlan1d(&plansI[i], ticks, CUFFT_C2R,1);
CHECK_ERROR(5)
// Bind both plans to stream i so their executions are issued into that stream.
cufftSetStream(plansF[i],streams[i]);
CHECK_ERROR(5)
cufftSetStream(plansI[i],streams[i]);
CHECK_ERROR(5)
}
Run Code Online (Sandbox Code Playgroud)
在main函数中,我正在启动正向FFT,如下所示:
// Issue one forward real-to-complex transform per iteration, each through a
// different plan handle plansF[w].
// The loop starts at w=1, so plansF[0] is not used here.
// NOTE(review): q is declared outside this snippet; it presumably must not
// exceed the number of plans that were created -- confirm against the caller.
for(w=1;w<q;w++)
{
// Input is read at gpuMem1+k and output is written at gpuMem2+j.
cufftExecR2C(plansF[w], gpuMem1+k,gpuMem2+j);
CHECK_ERROR(8)
// Advance the input and output offsets for the next transform.
// NOTE(review): rect_small_real / rect_small_complex look like per-transform
// element counts for the real and complex buffers -- confirm.
k += rect_small_real;
j += rect_small_complex;
}
Run Code Online (Sandbox Code Playgroud)
我还有其他内核,我使用相同的流异步启动.
当我使用Visual Profiler 5.0分析我的应用程序时,我发现除了CUDA FFT(正向和反向)之外的所有内核并行运行并重叠.FFT内核确实在不同的流中运行,但它们不重叠,因为它们实际上是顺序运行的.谁能告诉我我的问题是什么?
我的环境是VS 2008,64位,Windows 7.
谢谢.
下面是一个在 Kepler 架构上使用 CUDA 流来执行 cuFFT 变换和内存拷贝的可运行示例。
这是代码:
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
// Number of CUDA streams (and cuFFT plans, one per stream) created in main().
#define NUM_STREAMS 3
/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports a failure together with its call site.
// The do { } while (0) wrapper makes the macro expand to exactly one statement,
// so `if (c) gpuErrchk(x); else ...` and un-braced loop bodies parse as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime API call
//   file  - source file of the call site (normally __FILE__)
//   line  - source line of the call site (normally __LINE__)
//   abort - when true (the default), terminate the process using `code` as the
//           exit status; when false, return to the caller after printing
// `file` is a pointer to const because __FILE__ expands to a string literal, and
// binding a string literal to a non-const char* is ill-formed in C++11 and later.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/********/
/* MAIN */
/********/
// Aborts with a diagnostic when a cuFFT call fails.
// cuFFT reports errors through its own cufftResult type, so the cudaError_t
// based gpuErrchk macro cannot be used for these calls.
//   status - value returned by the cuFFT call
//   what   - short description of the call, printed in the diagnostic
static void cufftCheck(cufftResult status, const char *what)
{
    if (status != CUFFT_SUCCESS)
    {
        fprintf(stderr, "cuFFT error %d in %s\n", (int)status, what);
        exit(EXIT_FAILURE);
    }
}

// Runs one forward complex-to-complex FFT of length N per stream, with the
// host-to-device copy, the transform and the device-to-host copy of each data
// set all issued into that data set's own stream.
// Every per-stream resource is sized by NUM_STREAMS, so changing the macro
// changes the number of concurrent pipelines without touching the code below.
// Returns 0 on success; any CUDA or cuFFT failure terminates the process.
int main()
{
    const int N = 5000;
    const size_t bytes = N * sizeof(float2);

    float2 *h_in[NUM_STREAMS];
    float2 *h_out[NUM_STREAMS];
    float2 *d_in[NUM_STREAMS];
    float2 *d_out[NUM_STREAMS];
    cudaStream_t streams[NUM_STREAMS];
    cufftHandle plans[NUM_STREAMS];

    // --- Host input/output data initialization
    for (int s = 0; s < NUM_STREAMS; s++) {
        h_in[s] = new float2[N];
        h_out[s] = new float2[N];
        for (int i = 0; i < N; i++) {
            h_in[s][i].x = 1.f;
            h_in[s][i].y = 0.f;
            h_out[s][i].x = 0.f;
            h_out[s][i].y = 0.f;
        }
    }

    // --- Registers host memory as page-locked (required for asynch cudaMemcpyAsync)
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaHostRegister(h_in[s], bytes, cudaHostRegisterPortable));
        gpuErrchk(cudaHostRegister(h_out[s], bytes, cudaHostRegisterPortable));
    }

    // --- Device input/output data allocation
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaMalloc((void**)&d_in[s], bytes));
        gpuErrchk(cudaMalloc((void**)&d_out[s], bytes));
    }

    // --- Creates CUDA streams
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaStreamCreate(&streams[s]));
    }

    // --- Creates cuFFT plans and sets them in streams
    for (int s = 0; s < NUM_STREAMS; s++) {
        cufftCheck(cufftPlan1d(&plans[s], N, CUFFT_C2C, 1), "cufftPlan1d");
        cufftCheck(cufftSetStream(plans[s], streams[s]), "cufftSetStream");
    }

    // --- Async memcopyes and computations
    // The issue order of the original example is kept: all host-to-device
    // copies first, then all transforms, then all device-to-host copies.
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaMemcpyAsync(d_in[s], h_in[s], bytes, cudaMemcpyHostToDevice, streams[s]));
    }
    for (int s = 0; s < NUM_STREAMS; s++) {
        cufftCheck(cufftExecC2C(plans[s], (cufftComplex*)d_in[s], (cufftComplex*)d_out[s], CUFFT_FORWARD),
                   "cufftExecC2C");
    }
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaMemcpyAsync(h_out[s], d_out[s], bytes, cudaMemcpyDeviceToHost, streams[s]));
    }

    // Wait for every stream to drain before any resource is released.
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaStreamSynchronize(streams[s]));
    }

    // --- Releases resources
    // Plans are destroyed before the streams they were bound to.
    for (int s = 0; s < NUM_STREAMS; s++) {
        cufftCheck(cufftDestroy(plans[s]), "cufftDestroy");
    }
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaHostUnregister(h_in[s]));
        gpuErrchk(cudaHostUnregister(h_out[s]));
    }
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaFree(d_in[s]));
        gpuErrchk(cudaFree(d_out[s]));
    }
    for (int s = 0; s < NUM_STREAMS; s++) {
        gpuErrchk(cudaStreamDestroy(streams[s]));
    }
    for (int s = 0; s < NUM_STREAMS; s++) {
        delete[] h_in[s];
        delete[] h_out[s];
    }

    gpuErrchk(cudaDeviceReset());
    return 0;
}
Run Code Online (Sandbox Code Playgroud)
请根据CUFFT错误处理添加cuFFT错误检查.
下面提供了在 Kepler K20c 卡上测试上述代码时的一些性能分析信息。正如您将看到的,只有当 N 足够大时,才能在计算和内存传输之间实现真正的重叠。
N = 5000

N = 50000

N = 500000

小智 2
问题出在您使用的硬件上。
所有支持 CUDA 的 GPU 都能够在执行内核的同时进行双向的数据复制。但是,只有计算能力为 3.5 的设备才具有名为 Hyper-Q 的功能。
简而言之,在这些 GPU 中实现了多个(我想是 16 个)硬件内核队列。而在此前的 GPU 中,只有一个硬件队列可用。
这意味着 cudaStream 只是虚拟的,并且仅在重叠计算和内存复制的情况下,它们对旧硬件的使用才有意义。当然,这不仅适用于 cuFFT,也适用于您自己的内核!
请更仔细地查看 Visual Profiler 的输出。您可能会无意中把时间线视图当作 GPU 执行情况的精确记录,然而事情并没有那么简单。时间线中有若干行:其中一些行显示的是内核启动调用被执行的时间点(通常是橙色的),而与之对应的另一行才是特定内核在 GPU 上的实际执行(蓝色矩形)。内存传输也是如此(确切的时间显示为浅棕色矩形)。
希望,我帮助您解决了您的问题。