The*_*ere 1 c++ cuda visual-studio-2010 visual-studio-2013 cufft
这个简单的CUFFT代码在两个IDE上运行 -
我发现使用Cuda 7.0的VS 2013 1000大约慢了一倍.该代码0.6 ms在VS 2010中执行,并且520 ms在平均值上都采用了VS 2013.
#include "stdafx.h"
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "cufft.h"
typedef cuComplex Complex;
#include <iostream>
using namespace std;
int _tmain(int argc, _TCHAR* argv[])
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
const int SIZE = 10000;
Complex *h_col = (Complex*)malloc(SIZE*sizeof(Complex));
for (int i = 0; i < SIZE; i++)
{
h_col[i].x = i;
h_col[i].y = i;
}
Complex *d_col;
cudaMalloc((void**)&d_col, SIZE*sizeof(Complex));
cudaMemcpy(d_col, h_col, SIZE*sizeof(Complex), cudaMemcpyHostToDevice);
cufftHandle plan;
const int BATCH = 1;
cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCH);
cufftExecC2C(plan, d_col, d_col, CUFFT_FORWARD);
cudaMemcpy(h_col, d_col, SIZE*sizeof(Complex), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
cufftDestroy(plan);
cout << milliseconds;
return 0;
}
Run Code Online (Sandbox Code Playgroud)
代码在同一台计算机上运行,具有相同的操作系统,相同的显卡,并且一个接一个地运行.两种情况下的配置都是x64 Release.您可以选择是使用C++编译器还是使用CUDA C/C++编译文件.我在两个项目上尝试了两个选项并没有任何区别.
有什么想法解决这个问题?
FWIW,我在VS 2013上的Cuda 6.5和Cuda 7得到了相同的结果
袖带库从4.2到7.0变得相当大,并且导致更多的初始化时间.如果您将此初始化时间作为一个因素删除,我认为您会发现执行时间差异将远远小于1000倍.
这是一个修改过的代码,证明了这一点:
$ cat t807.cu
#include <cufft.h>
#include <cuComplex.h>
typedef cuComplex Complex;
#include <iostream>
using namespace std;
int main(int argc, char* argv[])
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
const int SIZE = 10000;
Complex *h_col = (Complex*)malloc(SIZE*sizeof(Complex));
for (int i = 0; i < SIZE; i++)
{
h_col[i].x = i;
h_col[i].y = i;
}
Complex *d_col;
cudaMalloc((void**)&d_col, SIZE*sizeof(Complex));
cudaMemcpy(d_col, h_col, SIZE*sizeof(Complex), cudaMemcpyHostToDevice);
cufftHandle plan;
const int BATCH = 1;
cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCH);
cufftExecC2C(plan, d_col, d_col, CUFFT_FORWARD);
cudaMemcpy(h_col, d_col, SIZE*sizeof(Complex), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
cufftDestroy(plan);
cout << milliseconds << endl;
cudaEventRecord(start);
for (int i = 0; i < SIZE; i++)
{
h_col[i].x = i;
h_col[i].y = i;
}
cudaMemcpy(d_col, h_col, SIZE*sizeof(Complex), cudaMemcpyHostToDevice);
cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCH);
cufftExecC2C(plan, d_col, d_col, CUFFT_FORWARD);
cudaMemcpy(h_col, d_col, SIZE*sizeof(Complex), cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
cufftDestroy(plan);
cout << milliseconds << endl;
return 0;
}
$ nvcc -o t807 t807.cu -lcufft
$ ./t807
94.8298
1.44778
$
Run Code Online (Sandbox Code Playgroud)
上面的第二个数字表示基本上相同的代码,删除了袖带初始化(因为它是在第一次传递时完成的).