Phi*_*hil 4 c++ scope cuda qualifiers device
我目前正在尝试编写一段 CUDA 代码，其中一个类将仅用于设备端（即主机不需要知道它的存在）。但是我无法确定该类所需的正确限定符（deviceclass 定义如下）:
// Device-only helper: returns the sum of the first two elements of v.
__device__ float devicefunction(float *v)
{
    return v[0] + v[1];
}
// Device-side value holder: caches devicefunction(x) at construction time.
// Fix for the question's compile error: the execution-space qualifier goes
// on each member function (constructor and getter), not on the class itself
// -- CUDA C++ has no class-level __device__ qualifier.
class deviceclass {
private:
float _a;  // cached result of devicefunction(x)
public:
// Precondition: x points to at least two valid floats.
__device__ deviceclass(float *x) {_a = devicefunction(x);}
__device__ float getvalue () {return _a;}
};
// Device code
// Kernel: each thread with global index i < N writes one initialized value
// into A[i]. Expects a 1-D grid of 1-D blocks covering at least N threads.
__global__ void VecInit(float* A, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) {
// Construct on the stack instead of the device heap: the original
// `new deviceclass(1.0, 2.0)` neither matched the (float*) constructor
// nor was ever delete'd, leaking device-heap memory per thread.
float args[2] = {1.0f, 2.0f};
deviceclass test(args);
A[i] = test.getvalue();
}
}
// Standard CUDA guff below: Variables
float *h_A, *d_A;  // h_A: host result buffer; d_A: device result buffer
// Host code
// Allocates host/device buffers, launches VecInit, and copies the result
// back. Returns nonzero on allocation failure.
int main(int argc, char** argv)
{
printf("Vector initialization...\n");
int N = 10000;
size_t size = N * sizeof(float);
// Allocate -- check both allocations so the kernel never sees a null pointer.
h_A = (float*)malloc(size);
if (h_A == NULL) {
    fprintf(stderr, "host malloc failed\n");
    return 1;
}
cudaError_t err = cudaMalloc(&d_A, size);
if (err != cudaSuccess) {
    fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    free(h_A);
    return 1;
}
printf("Computing...\n");
// Invoke kernel: ceil-divide so the final partial block is still covered.
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);
// Kernel launches are asynchronous and return no status; poll explicitly.
err = cudaGetLastError();
if (err != cudaSuccess)
    fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(err));
// Copy result from device memory to host memory (cudaMemcpy blocks until
// the kernel has finished, so no separate synchronize is required).
err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
    fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
//...etc
cudaFree(d_A);
free(h_A);
}
Run Code Online (Sandbox Code Playgroud)
将 deviceclass 的成员仅设为 __device__ 时，从 __global__ 核函数调用它会抛出错误；但将其设为 __device__ __host__ 或 __global__ 似乎又没有必要。有人能给我指出正确的方向吗？
Phi*_*hil 11
事实证明,限定符必须在类的成员函数上,下面是一个完全工作的版本:
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
using namespace std;
void Cleanup(void);  // forward declaration: frees buffers and exits (defined below)
// Functions to be pointed to
// Device-only addition helper used by deviceclass.
__device__ float Plus(float lhs, float rhs)
{
    return lhs + rhs;
}
// Device-side value holder: stores Plus(a, b), computed at construction.
// Note the __device__ qualifier sits on each member function rather than
// on the class itself -- that is what makes this version compile.
class deviceclass {
private:
float sum_;  // cached result of Plus(a, b)
public:
__device__ deviceclass(float a, float b) : sum_(Plus(a, b)) {}
__device__ float getvalue() { return sum_; }
};
// Device code
// Kernel: each thread with global index i < N writes Plus(1, 2) into A[i].
// Expects a 1-D grid of 1-D blocks covering at least N threads.
__global__ void VecInit(float* A, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) {  // guard the grid tail: N is rarely a multiple of blockDim.x
// Use float literals (1.0f) -- bare 1.0 is a double and forces a
// silent conversion at the constructor call.
deviceclass test(1.0f, 2.0f);
A[i] = test.getvalue();
}
}
// Standard CUDA guff below: Variables
float *h_A, *d_A;  // h_A: host result buffer; d_A: device result buffer
// Host code
// Allocates host/device buffers, launches VecInit, copies the result back,
// prints it, and releases everything via Cleanup() (which exits).
int main(int argc, char** argv)
{
printf("Vector initialization...\n");
int N = 10000;
size_t size = N * sizeof(float);
// Allocate -- check both allocations before touching the buffers.
h_A = (float*)malloc(size);
if (h_A == NULL) {
    fprintf(stderr, "host malloc failed\n");
    exit(1);
}
cudaError_t err = cudaMalloc(&d_A, size);
if (err != cudaSuccess) {
    fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    Cleanup();
}
printf("Computing...\n");
// Invoke kernel: ceil-divide so the final partial block is still covered.
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);
// Kernel launches are asynchronous and return no status; poll explicitly.
err = cudaGetLastError();
if (err != cudaSuccess)
    fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(err));
// Copy result from device memory to host memory (cudaMemcpy blocks until
// the kernel has finished, so no separate synchronize is required).
err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
    fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
// Verify result
int i;
for (i = 0; i < N; ++i) {
cout << endl << h_A[i];
}
cout << endl;
Cleanup();
}
// Releases the global host/device buffers, tears down the CUDA context,
// and terminates the process. Never returns.
void Cleanup(void)
{
// Free device memory
if (d_A)
cudaFree(d_A);
// Free host memory
if (h_A)
free(h_A);
// cudaThreadExit() is deprecated; cudaDeviceReset() is the supported call
// that destroys the current device's primary context.
cudaDeviceReset();
exit(0);
}
Run Code Online (Sandbox Code Playgroud)