use*_*901 3 cuda nvidia gpu-programming
我正在研究一个随机过程,我希望每次运行程序时,CUDA内核中的随机数都能生成不同的序列。这类似于我们在C++中先声明 seed = time(NULL),再调用 srand(seed) 和 rand() 的做法。
我可以通过内核参数将种子从主机传递到设备,但这样做的问题是:我必须为所有线程准备一整个种子数组并传入内核,才能让每个线程每次都得到不同的随机种子。有没有办法生成随机种子(例如利用进程ID、机器时间或类似内核之外的东西)并将其作为种子传递?
您不需要传递随机种子数组;使用cuRAND库时,只需正确设置curand_init的序列号(sequence)参数即可。例如[免责声明:这是一个未经测试的函数]:
// Writes one curand_uniform() sample per thread into numbers[].
//
// numbers : output array; element t is written by the thread whose flat index is t.
// seed    : value forwarded unchanged as the first argument of curand_init.
// Np      : number of elements to produce; threads with index >= Np do nothing.
//
// Expects a 1D launch; the flat thread index is also used as the second
// (sequence) argument of curand_init, so every thread initialises its own state.
__global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail threads of the last block have no element to fill.
    if (tid >= Np) return;

    curandState rng;
    curand_init(seed, tid, 0, &rng);
    numbers[tid] = curand_uniform(&rng);
}
Run Code Online (Sandbox Code Playgroud)
如果将curand_init调用更改为如下形式,也可以避免从外部传递种子:
// Variant of the call above: the seed argument is taken from clock64() on the
// device instead of a host-supplied value; the thread index i is still passed
// as the second (sequence) argument.
curand_init(clock64(), i, 0, &state);
Run Code Online (Sandbox Code Playgroud)
编辑
根据Roger Dahl的评论,我针对生成131072个元素的数组,对以下四种不同方案进行了比较(Kepler K20c):
1. 初始化与生成分别使用单独的内核;2. 初始化与生成合并在同一个内核中;3. 使用单独的内核,初始化一次后多次生成随机数;4. 使用单一内核多次生成随机数(每次调用都重新初始化)。
下面是代码。四种方案的耗时依次为:
861ms;852ms;866ms;2556ms。我希望我已正确理解Roger Dahl提出的性能问题。
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
// Number of array elements / RNG states used by the benchmark (8192*16 = 131072).
// Parenthesised so the macro expands as a single value inside larger
// expressions such as `x % DSIZE` or `x / DSIZE`.
#define DSIZE (8192*16)
// Threads per block used for every kernel launch.
#define nTPB 256
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
// Wraps an expression yielding a cudaError_t and reports the call site
// (file and line) through gpuAssert when it is not cudaSuccess.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//
// code  : status returned by a CUDA runtime call.
// file  : source file name of the call site (normally __FILE__). Declared
//         const because __FILE__ is a string literal, which must not bind to
//         a non-const char* in C++11 and later.
// line  : source line of the call site (normally __LINE__).
// abort : when true (the default) the process exits with `code` as its status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}
/*************************/
/* CURAND INITIALIZATION */
/*************************/
// Initialises one curandState per thread.
//
// state : device array of generator states; entry idx is written by the
//         thread whose flat index is idx.
// seed  : forwarded unchanged as the first argument of curand_init.
// n     : number of entries in `state`; defaults to DSIZE so existing
//         two-argument launches keep working. Threads with idx >= n do
//         nothing, which keeps the kernel safe when the grid is rounded up
//         past the array length.
//
// Expects a 1D launch; the flat thread index doubles as the sequence argument.
__global__ void initCurand(curandState *state, unsigned long seed, int n = DSIZE){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) {
        curand_init(seed, idx, 0, &state[idx]);
    }
}
// Draws one curand_uniform() sample per thread from a previously initialised
// state and stores it in a[].
//
// state : device array of generator states; state[idx] is read and advanced
//         by the thread whose flat index is idx.
// a     : output array; a[idx] receives the sample.
// n     : number of entries in `state` and `a`; defaults to DSIZE so existing
//         two-argument launches keep working. Threads with idx >= n do nothing.
//
// Expects a 1D launch.
__global__ void testrand1(curandState *state, float *a, int n = DSIZE){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) {
        a[idx] = curand_uniform(&state[idx]);
    }
}
// Initialises a thread-local curandState and immediately draws one
// curand_uniform() sample from it, all inside a single kernel.
//
// seed : forwarded unchanged as the first argument of curand_init.
// a    : output array; a[idx] receives the sample of the thread whose flat
//        index is idx.
// n    : number of entries in `a`; defaults to DSIZE so existing two-argument
//        launches keep working. Threads with idx >= n do nothing.
//
// Expects a 1D launch; the flat thread index doubles as the sequence argument.
__global__ void testrand2(unsigned long seed, float *a, int n = DSIZE){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) {
        curandState state;
        curand_init(seed, idx, 0, &state);
        a[idx] = curand_uniform(&state);
    }
}
/********/
/* MAIN */
/********/
// Benchmark driver: times four ways of producing DSIZE uniform random numbers
// with cuRAND, each repeated n_iter times, and prints the elapsed time of each
// variant measured with CUDA events:
//   1. initCurand followed by one testrand1 launch (separate kernels),
//   2. one testrand2 launch (init + generate in a single kernel),
//   3. initCurand followed by three testrand1 launches,
//   4. three testrand2 launches.
// Every CUDA runtime call is checked through gpuErrchk, and all device
// buffers and events are released before the program waits for a key press.
int main() {
    const int n_iter = 20;
    // Ceil-div so the grid covers all DSIZE elements.
    const int n_blocks = (DSIZE+nTPB-1)/nTPB;

    curandState *devState = NULL;
    float *d_a = NULL;
    gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));
    gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));

    float elapsed_ms = 0.0f;
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    /* 1. Separate init and generation kernels. */
    gpuErrchk(cudaEventRecord(start, 0));
    for (int i=0; i<n_iter; i++) {
        initCurand<<<n_blocks,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<n_blocks,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsed_ms, start, stop));
    printf("Elapsed time for separate kernels: %3.1f ms \n", elapsed_ms);

    /* 2. Init and generation fused in a single kernel. */
    gpuErrchk(cudaEventRecord(start, 0));
    for (int i=0; i<n_iter; i++) {
        testrand2<<<n_blocks,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsed_ms, start, stop));
    printf("Elapsed time for single kernels: %3.1f ms \n", elapsed_ms);

    /* 3. One init, then three generation launches reusing the states. */
    gpuErrchk(cudaEventRecord(start, 0));
    for (int i=0; i<n_iter; i++) {
        initCurand<<<n_blocks,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        for (int k=0; k<3; k++) {
            testrand1<<<n_blocks,nTPB>>>(devState, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
        }
    }
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsed_ms, start, stop));
    printf("Elapsed time for separate kernels with multiple random number generation: %3.1f ms \n", elapsed_ms);

    /* 4. Three fused launches, each re-initialising its state. */
    gpuErrchk(cudaEventRecord(start, 0));
    for (int i=0; i<n_iter; i++) {
        for (int k=0; k<3; k++) {
            testrand2<<<n_blocks,nTPB>>>(1, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
        }
    }
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsed_ms, start, stop));
    printf("Elapsed time for single kernels for multiple random number generation: %3.1f ms \n", elapsed_ms);

    /* Release events and device memory. */
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(d_a));
    gpuErrchk(cudaFree(devState));

    /* Keep the console window open until a key is pressed. */
    getchar();
    return 0;
}
Run Code Online (Sandbox Code Playgroud)
GTX660的输出:
Elapsed time for separate kernels: 1960.3 ms
Elapsed time for single kernels: 1536.9 ms
Elapsed time for separate kernels with multiple random number generation: 1576.0 ms
Elapsed time for single kernels for multiple random number generation: 4612.2 ms
Run Code Online (Sandbox Code Playgroud)
GTX570的输出:
Elapsed time for separate kernels: 957.2 ms
Elapsed time for single kernels: 947.7 ms
Elapsed time for separate kernels with multiple random number generation: 964.6 ms
Elapsed time for single kernels for multiple random number generation: 2839.0 ms
Run Code Online (Sandbox Code Playgroud)
与K20c大致相同的性能.
| 归档时间: |
|
| 查看次数: |
7523 次 |
| 最近记录: |