Pau*_*aul 14 struct cuda alignment
我是CUDA C的新手,我正在尝试将typedef'd结构传递给内核.当我尝试使用仅包含整数的结构时,我的方法工作得很好,但是当我切换到浮点数时,我得到了无意义的数字作为结果.我认为这与对齐有关,我尝试包括__align__我的类型声明,但无济于事.有人能举例说明这是如何完成的,还是提供了另一种方法?我正在尝试设置它,以便我可以轻松地添加或删除字段,而不会更改除结构和内核之外的任何内容.我的代码:
typedef struct __align__(8)
{
float a, b;
} point;
__global__ void testKernel(point *p)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
p[i].a = 1.1;
p[i].b = 2.2;
}
int main(void)
{
// set number of points
int numPoints = 16,
gpuBlockSize = 4,
pointSize = sizeof(point),
numBytes = numPoints * pointSize,
gpuGridSize = numPoints / gpuBlockSize;
// allocate memory
point *cpuPointArray = new point[numPoints],
*gpuPointArray = new point[numPoints];
cpuPointArray = (point*)malloc(numBytes);
cudaMalloc((void**)&gpuPointArray, numBytes);
// launch kernel
testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);
// retrieve the results
cudaMemcpy(cpuPointArray, gpuPointArray, numBytes, cudaMemcpyDeviceToHost);
printf("testKernel results:\n");
for(int i = 0; i < numPoints; ++i)
{
printf("point.a: %d, point.b: %d\n",cpuPointArray[i].a,cpuPointArray[i].b);
}
// deallocate memory
free(cpuPointArray);
cudaFree(gpuPointArray);
return 0;
}
Run Code Online (Sandbox Code Playgroud)
Pau*_*aul 19
由于似乎没有关于如何做到这一点的任何体面的文档,我想我会在这里发布最终的修订代码.事实证明,该__align__部分也是不必要的,实际问题是在尝试打印浮动时在printf中使用%d.
#include <stdlib.h>
#include <stdio.h>
typedef struct
{
float a, b;
} point;
__global__ void testKernel(point *p)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
p[i].a = 1.1;
p[i].b = 2.2;
}
int main(void)
{
// set number of points
int numPoints = 16,
gpuBlockSize = 4,
pointSize = sizeof(point),
numBytes = numPoints * pointSize,
gpuGridSize = numPoints / gpuBlockSize;
// allocate memory
point *cpuPointArray,
*gpuPointArray;
cpuPointArray = (point*)malloc(numBytes);
cudaMalloc((void**)&gpuPointArray, numBytes);
// launch kernel
testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);
// retrieve the results
cudaMemcpy(cpuPointArray, gpuPointArray, numBytes, cudaMemcpyDeviceToHost);
printf("testKernel results:\n");
for(int i = 0; i < numPoints; ++i)
{
printf("point.a: %f, point.b: %f\n",cpuPointArray[i].a,cpuPointArray[i].b);
}
// deallocate memory
free(cpuPointArray);
cudaFree(gpuPointArray);
return 0;
}
Run Code Online (Sandbox Code Playgroud)
看看它是如何在 CUDA 包含目录中的 vector_types.h 头文件中完成的。那应该已经给了你一些指示。
但是,这里的主要问题是%d在您的printf通话中。您现在正在尝试打印浮点数,而不是整数。所以那些真的应该%f改用。