CUDA：将 CPU 上动态创建的函数指针数组复制到 GPU 内存

Question

CUDA：将 CPU 上动态创建的函数指针数组复制到 GPU 内存

我想在 CPU 上动态创建函数指针列表（使用push_back()从调用的某种方法main()）并将其复制到 GPU__constant__或__device__数组，而不需要诉诸静态__device__函数指针。我相信这个问题与我的问题有关；但是，我的目标是迭代地创建__host__函数指针数组，然后将其复制到__constant__函数指针数组，而不是在声明时初始化后者。

具有静态函数指针的工作代码示例（如此处或此处所示）将是：

常见的.h：

#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <iostream>

#define num_functions 3

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code),     file, line);
      if (abort) exit(code);
   }
}

// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);

// some examples of void(int&, int&) functions...
__device__ void Add(int &a, int &b) {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}

// List of function pointers in device memory
__constant__ fptr_t constant_fList[num_functions];

// Kernel called from main(): choose the function to apply whose index is equal to thread ID
__global__ void kernel(int a, int b) {
  fptr_t f;
  if (threadIdx.x < num_functions) {
    f = constant_fList[threadIdx.x];
    f(a,b);
  }
}

#endif

Run Code Online (Sandbox Code Playgroud)

主.cu:

#include "common.h"

// Static device function pointers
__device__ fptr_t p_Add = Add;
__device__ fptr_t p_Sub = Subtract;
__device__ fptr_t p_Mul = Multiply;

// Load function list to constant memory
void loadList_staticpointers() {
  fptr_t h_fList[num_functions];
  gpuErrchk( cudaMemcpyFromSymbol(&h_fList[0], p_Add, sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyFromSymbol(&h_fList[1], p_Sub, sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyFromSymbol(&h_fList[2], p_Mul, sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_fList, num_functions * sizeof(fptr_t)) );
}

int main() {

  loadList_staticpointers();
  int a = 12, b = 15;
  kernel<<<1,3>>>(a, b);
  gpuErrchk(cudaGetLastError());
  gpuErrchk(cudaDeviceSynchronize());

  return 0;
}

Run Code Online (Sandbox Code Playgroud)

-arch=sm_30规格：GeForce GTX 670，针对CUDA 6.5、Ubuntu 14.04编译

我希望避免使用静态设备函数指针，因为附加每个函数需要在用户端进行代码维护 - 声明新的静态指针，例如p_Addor p_Mul，操作void loadList_functionpointers()等。为了清楚起见，我正在尝试类似的方法以下（崩溃）代码：

main_wrong.cu：

#include "common.h"
#include <vector>

// Global variable: list of function pointers in host memory
std::vector<fptr_t> vec_fList;

// Add function to functions list
void addFunc(fptr_t f) {vec_fList.push_back(f);}

// Upload the functions in the std::vector<fptr_t> to GPU memory
// Copies CPU-side pointers to constant_fList, therefore crashes on kernel call 
void UploadVector() {
  fptr_t* h_vpointer = vec_fList.data();
  gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_vpointer, vec_fList.size() * sizeof(fptr_t)) );
}

int main() {

  addFunc(Add);
  addFunc(Subtract);
  addFunc(Multiply);
  int a = 12, b = 15;

  UploadVector();

  kernel<<<1,3>>>(a, b); // Wrong to call a host-side function pointer from a kernel
  gpuErrchk(cudaGetLastError());
  gpuErrchk(cudaDeviceSynchronize());

  return 0;
}

Run Code Online (Sandbox Code Playgroud)

我的理解是，指向主机地址的函数指针被复制到GPU，内核无法使用，调用函数时需要指向GPU地址的指针。f(a,b)使用设备端指针填充主机端数组对我来说适用于原始数据（请参阅此问题），但不适用于函数指针。使用统一内存的简单尝试也失败了......到目前为止，我只发现静态设备端指针可以工作。是否没有其他方法可以将动态创建的 CPU 函数指针数组复制到 GPU 上？

Answer 1

m.s*_*.s. 5

如果您可以使用 C++11（自 CUDA 7 起支持），您可以使用以下命令自动生成函数表：

template <fptr_t... Functions>
__global__ void kernel(int a, int b)
{
  constexpr auto num_f = sizeof...(Functions);

  constexpr fptr_t table[] = { Functions... };

  if (threadIdx.x < num_f)
  {
    fptr_t f = table[threadIdx.x];
    f(a,b);
  }
}

Run Code Online (Sandbox Code Playgroud)

然后您可以使用以下命令调用该内核

kernel<Add, Subtract, Multiply><<<1,3>>>(a, b);

Run Code Online (Sandbox Code Playgroud)

归档时间：	10 年，3 月前
查看次数：	2242 次
最近记录：	9 年，6 月前