How can we generate random numbers in CUDA C with a different seed on each run?

use*_*901 3 cuda nvidia gpu-programming

I am working on a stochastic process, and I want the random number generation in the CUDA kernel to produce a different sequence each time the program runs, similar to what we do in C++ by setting seed = time(NULL) followed by srand(seed) and rand().

I can pass a seed from the host to the device through the kernel, but then I would have to pass an entire array of seeds into the kernel so that each thread has a different random seed each time. Is there a way to generate a random seed / process id / machine time or something similar outside the kernel and pass it in as the seed?

Jac*_*ern 5

You do not need to pass an array of random seeds; when you use the cuRAND library, you can instead set the sequence-number parameter of curand_init appropriately. For example [disclaimer: this function is untested]:

__global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        // Same seed for every thread, but a per-thread sequence number, so each
        // thread draws from a statistically independent subsequence
        curandState state;
        curand_init(seed, i, 0, &state);
        numbers[i] = curand_uniform(&state);
    }
}

You can also avoid passing the seed from outside entirely if you change the curand_init call to

curand_init(clock64(), i, 0, &state);
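
If you do want to generate the seed on the host, as asked in the question, a single scalar derived from the machine time is enough. Below is a minimal, untested host-side sketch (array size, block size, and variable names are arbitrary) that launches the generate_random_numbers kernel above with time(NULL) as the seed:

#include <time.h>

int main() {
    const int N = 131072;                          // arbitrary example size
    float *d_numbers;
    cudaMalloc(&d_numbers, N * sizeof(float));

    // One scalar seed per run; time(NULL) changes between runs of the program
    unsigned long seed = (unsigned long)time(NULL);
    generate_random_numbers<<<(N + 255) / 256, 256>>>(d_numbers, seed, N);
    cudaDeviceSynchronize();

    cudaFree(d_numbers);
    return 0;
}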

EDIT

Following Roger Dahl's comment, I compared four different ways of generating an array of 131072 elements (on a Kepler K20c):

  1. single random number generation: separate kernels for initialization and random number generation;
  2. single random number generation: a single kernel for both initialization and random number generation;
  3. multiple random number generation: separate kernels for initialization and random number generation;
  4. multiple random number generation: a single kernel for both initialization and random number generation;

The code is below. The resulting timings were:

  1. 861ms;
  2. 852ms;
  3. 866ms;
  4. 2556ms;

I hope I have correctly understood the performance concern raised by Roger Dahl.

#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>

#define DSIZE 8192*16
#define nTPB 256

/***********************/
/* CUDA ERROR CHECKING */
/***********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
        if (abort) exit(code);
    }
}

/*************************/
/* CURAND INITIALIZATION */
/*************************/
__global__ void initCurand(curandState *state, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}

__global__ void testrand1(curandState *state, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    a[idx] = curand_uniform(&state[idx]);
}

__global__ void testrand2(unsigned long seed, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curandState state;
    curand_init(seed, idx, 0, &state);
    a[idx] = curand_uniform(&state);
}

/********/
/* MAIN */
/********/
int main() {

    int n_iter = 20;

    curandState *devState;  gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));

    float *d_a;             gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));

    float time;
    cudaEvent_t start, stop;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels:  %3.1f ms \n", time);

    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels:  %3.1f ms \n", time);

    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels with multiple random number generation:  %3.1f ms \n", time);

    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels for multiple random number generation:  %3.1f ms \n", time);

    getchar();
}

Output on a GTX660:

Elapsed time for separate kernels:  1960.3 ms
Elapsed time for single kernels:  1536.9 ms
Elapsed time for separate kernels with multiple random number generation:  1576.0 ms
Elapsed time for single kernels for multiple random number generation:  4612.2 ms

Output on a GTX570:

Elapsed time for separate kernels:  957.2 ms 
Elapsed time for single kernels:  947.7 ms 
Elapsed time for separate kernels with multiple random number generation:  964.6 ms 
Elapsed time for single kernels for multiple random number generation:  2839.0 ms 

Roughly the same performance as the K20c.


Tom*_*Tom 1

Using a different seed on each run should be straightforward. The exact method depends on the generator you are using, but if you are using one of the cuRAND generators then you can cast the time_t from time(NULL) to a 64-bit integer and pass it to the seed function.
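
For the cuRAND host API, a minimal, untested sketch of that idea might look like the following (the generator type is arbitrary, and d_a / DSIZE are reused from the code above):

#include <time.h>
#include <curand.h>

// Host API: seed the generator once per run from the machine time,
// then generate DSIZE uniform floats directly into device memory.
curandGenerator_t gen;
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, (unsigned long long)time(NULL));
curandGenerateUniform(gen, d_a, DSIZE);
curandDestroyGenerator(gen);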

If you are calling the generator from within a kernel, you will need to pass this seed in, either as a kernel argument or via a __device__ variable. You can then use an offset in curand_init(), or use skipahead(), to obtain distinct subsequences.
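
A minimal, untested device-side sketch of the skipahead() variant (the kernel name and the per_thread parameter are illustrative; each thread jumps to its own block of the stream instead of using a per-thread sequence number):

#include <curand_kernel.h>

__global__ void generate_with_skipahead(float *out, unsigned long long seed, int n, int per_thread) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) {
        curandState state;
        // Same seed and sequence for all threads...
        curand_init(seed, 0, 0, &state);
        // ...then each thread skips ahead to its own stretch of the stream.
        skipahead((unsigned long long)idx * per_thread, &state);
        // out must hold n * per_thread floats
        for (int j = 0; j < per_thread; j++)
            out[idx * per_thread + j] = curand_uniform(&state);
    }
}

The per-thread sequence-number argument of curand_init(), as in the answer above, achieves the same separation and is usually the simpler choice.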

If you have a specific generator for which this does not work, please post more information.