我试图在CUDA C++代码上运行向量步骤添加功能,但对于大小为5,000,000的大型浮点数组,它运行速度比我的CPU版本慢.以下是我所说的相关CUDA和cpu代码:
#define THREADS_PER_BLOCK 1024
typedef float real;
__global__ void vectorStepAddKernel2(real*x, real*y, real*z, real alpha, real beta, int size, int xstep, int ystep, int zstep)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
{
x[i*xstep] = alpha* y[i*ystep] + beta*z[i*zstep];
}
}
cudaError_t vectorStepAdd2(real *x, real*y, real* z, real alpha, real beta, int size, int xstep, int ystep, int zstep)
{
cudaError_t cudaStatus;
int threadsPerBlock = THREADS_PER_BLOCK;
int blocksPerGrid = (size + threadsPerBlock -1)/threadsPerBlock;
vectorStepAddKernel2<<<blocksPerGrid, …Run Code Online (Sandbox Code Playgroud)