我正在测试新的CUDA 8以及Pascal Titan X GPU,并期待我的代码加速,但由于某种原因它最终会变慢.我在Ubuntu 16.04上.
以下是可以重现结果的最小代码:
CUDASample.cuh
class CUDASample{
public:
void AddOneToVector(std::vector<int> &in);
};
Run Code Online (Sandbox Code Playgroud)
CUDASample.cu
__global__ static void CUDAKernelAddOneToVector(int *data)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int mx = gridDim.x * blockDim.x;
data[y * mx + x] = data[y * mx + x] + 1.0f;
}
void CUDASample::AddOneToVector(std::vector<int> &in){
int *data;
cudaMallocManaged(reinterpret_cast<void **>(&data),
in.size() * sizeof(int),
cudaMemAttachGlobal);
for (std::size_t i = 0; i < in.size(); …
Run Code Online (Sandbox Code Playgroud)