Bes*_*ssa 3 cuda particle-system
我一直在尝试编写一个内核,用于计算N个给定点之间距离的倒数之和.C中的串行尾声就像
average = 0;
for(int i = 0; i < Np; i++){
for(int j = i + 1; j < Np; j++){
average += 1.0e0f/sqrtf((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(float)N;
Run Code Online (Sandbox Code Playgroud)
其中rx和ry分别是x和y坐标.
我通过使用随机数生成器的内核生成点.对于内核,我使用每块128(256)个线程来获得4k(8k)点.在它上面,每个线程执行内部的内部循环,然后将结果传递给reduce sum函数,如下所示
生成点数:
__global__ void InitRNG ( curandState * state, const int seed ){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
curand_init (seed, tIdx, 0, &state[tIdx]);
}
__global__
void SortPoints(float* X, float* Y,const int N, curandState *state){
float rdmn1, rdmn2;
unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
float range;
if(tIdx < N){
rdmn1 = curand_uniform(&state[tIdx]);
rdmn2 = curand_uniform(&state[tIdx]);
range = sqrtf(0.25e0f*N*rdmn1);
X[tIdx] = range*cosf(2.0e0f*pi*rdmn2);
Y[tIdx] = range*sinf(2.0e0f*pi*rdmn2);
}
}
Run Code Online (Sandbox Code Playgroud)
减少:
__device__
float ReduceSum2(float In){
__shared__ float data[BlockSize];
unsigned int tIdx = threadIdx.x;
data[tIdx] = In;
__syncthreads();
for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
if(tIdx < i){
data[tIdx] += data[tIdx + i];
}
__syncthreads();
}
return data[0];
}
Run Code Online (Sandbox Code Playgroud)
核心:
__global__
void AvgDistance(float *X, float *Y, float *Avg, const int N){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
int bIdx = blockIdx.x;
float x , y;
float d = 0.0f;
if(tIdx < N){
for(int i = tIdx + 1; i < N ; i++){
x = X[tIdx] - X[i];
y = Y[tIdx] - Y[i];
d += 1.0e0f/(sqrtf(x*x + y*y));
}
__syncthreads();
Avg[bIdx] = ReduceSum2(d);
}
}
Run Code Online (Sandbox Code Playgroud)
内核配置和启动如下:
dim3 threads(BlockSize,BlockSize);
dim3 blocks(ceil(Np/threads.x),ceil(Np/threads.y));
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(float)>>>(d_rx,d_ry,d_Avg,Np);
Run Code Online (Sandbox Code Playgroud)
最后,我将数据复制回主机,然后执行剩余的总和:
Avg = new float[blocks.x];
CHECK(cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(float),cudaMemcpyDeviceToHost),ERROR_CPY_DEVTOH);
float average = 0;
for(int i = 0; i < blocks.x; i++){
average += Avg[i];
}
average = average/(float)Np;
Run Code Online (Sandbox Code Playgroud)
4k积分,好的!结果是:
Average distance between points (via Kernel) = 108.615
Average distance between points (via CPU) = 110.191
Run Code Online (Sandbox Code Playgroud)
在这种情况下,总和可能以不同的顺序执行,导致两个结果彼此分开,我不知道......
但是当谈到8k时,结果却是不同的:
Average distance between points (via Kernel) = 153.63
Average distance between points (via CPU) = 131.471
Run Code Online (Sandbox Code Playgroud)
对我而言,似乎内核和串行代码都以相同的方式编写.是什么让我不相信CUDA计算浮点数的精度.这有意义吗?或者当某些线程同时从X和Y加载相同的数据时,对全局内存的访问会导致一些冲突吗?或者我编写内核的方式在某种程度上是"错误的"(我的意思是,我做的是什么导致两个结果彼此分开?).
实际上,从我所知道的,问题似乎是在CPU方面.我根据您的代码创建了一个示例代码.
我能够重现你的结果.
首先,我打开的所有实例sinf,cosf以及sqrtf他们相应的双版本.这对结果没有影响.
接下来,我包括一个typedef所以我可以很容易地从切换精确float到double和背部,更换的每一个相关的实例float与代码mytype这是我的typedef.
当我运行的typedef的代码float和4096的数据大小,我得到这些结果:
GPU average = 108.294922
CPU average = 109.925285
Run Code Online (Sandbox Code Playgroud)
当我运行的typedef的代码double和4096的数据大小,我得到这些结果:
GPU average = 108.294903
CPU average = 108.294903
Run Code Online (Sandbox Code Playgroud)
当我运行的typedef的代码float和8192的数据大小,我得到这些结果:
GPU average = 153.447327
CPU average = 131.473526
Run Code Online (Sandbox Code Playgroud)
当我运行的typedef的代码double和8192的数据大小,我得到这些结果:
GPU average = 153.447380
CPU average = 153.447380
Run Code Online (Sandbox Code Playgroud)
至少有2个观察结果:
基于此,我相信CPU正在提供可变的,可疑的行为.
这是我的代码供参考:
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192
#define BlockSize 32
#define pi 3.14159f
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef double mytype;
__global__ void InitRNG ( curandState * state, const int seed ){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
curand_init (seed, tIdx, 0, &state[tIdx]);
}
__global__
void SortPoints(mytype* X, mytype* Y,const int N, curandState *state){
mytype rdmn1, rdmn2;
unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
mytype range;
if(tIdx < N){
rdmn1 = curand_uniform(&state[tIdx]);
rdmn2 = curand_uniform(&state[tIdx]);
range = sqrt(0.25e0f*N*rdmn1);
X[tIdx] = range*cos(2.0e0f*pi*rdmn2);
Y[tIdx] = range*sin(2.0e0f*pi*rdmn2);
}
}
__device__
mytype ReduceSum2(mytype In){
__shared__ mytype data[BlockSize];
unsigned int tIdx = threadIdx.x;
data[tIdx] = In;
__syncthreads();
for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
if(tIdx < i){
data[tIdx] += data[tIdx + i];
}
__syncthreads();
}
return data[0];
}
__global__
void AvgDistance(mytype *X, mytype *Y, mytype *Avg, const int N){
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
int bIdx = blockIdx.x;
mytype x , y;
mytype d = 0.0f;
if(tIdx < N){
for(int i = tIdx + 1; i < N ; i++){
x = X[tIdx] - X[i];
y = Y[tIdx] - Y[i];
d += 1.0e0f/(sqrt(x*x + y*y));
}
__syncthreads();
Avg[bIdx] = ReduceSum2(d);
}
}
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
mytype average = 0.0f;
for(int i = 0; i < size; i++){
for(int j = i + 1; j < size; j++){
average += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(mytype)size;
return average;
}
int main() {
int Np = DSIZE;
mytype *rx, *ry, *d_rx, *d_ry, *d_Avg, *Avg;
curandState *d_state;
int seed = 1;
dim3 threads(BlockSize,BlockSize);
dim3 blocks((int)ceilf(Np/(float)threads.x),(int)ceilf(Np/(float)threads.y));
printf("number of blocks = %d\n", blocks.x);
printf("number of threads= %d\n", threads.x);
rx = (mytype *)malloc(DSIZE*sizeof(mytype));
if (rx == 0) {printf("malloc fail\n"); return 1;}
ry = (mytype *)malloc(DSIZE*sizeof(mytype));
if (ry == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void**)&d_rx, DSIZE * sizeof(mytype));
cudaMalloc((void**)&d_ry, DSIZE * sizeof(mytype));
cudaMalloc((void**)&d_Avg, blocks.x * sizeof(mytype));
cudaMalloc((void**)&d_state, DSIZE * sizeof(curandState));
cudaCheckErrors("cudamalloc");
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(mytype)>>>(d_rx,d_ry,d_Avg,Np);
cudaCheckErrors("kernels");
Avg = new mytype[blocks.x];
cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaMemcpy(rx, d_rx, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaMemcpy(ry, d_ry, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
cudaCheckErrors("cudamemcpy");
mytype average = 0;
for(int i = 0; i < blocks.x; i++){
average += Avg[i];
}
average = average/(mytype)Np;
printf("GPU average = %f\n", average);
average = cpu_avg(rx, ry, DSIZE);
printf("CPU average = %f\n", average);
return 0;
}
Run Code Online (Sandbox Code Playgroud)
我在RHEL 5.5,CUDA 5.0,Intel Xeon X5560上运行
编译:
nvcc -O3 -arch=sm_20 -lcurand -lm -o t93 t93.cu
Run Code Online (Sandbox Code Playgroud)
编辑: 在观察到CPU的可变性之后,我发现通过修改CPU平均代码可以消除大部分CPU的可变性,如下所示:
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
mytype average = 0.0f;
mytype temp = 0.0f;
for(int i = 0; i < size; i++){
for(int j = i + 1; j < size; j++){
temp += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
average += temp/(mytype)size;
temp = 0.0f;
}
return average;
}
Run Code Online (Sandbox Code Playgroud)
所以我想说CPU端的中间结果有问题.有趣的是,它没有出现在GPU结果上.我怀疑其原因是GPU平均值的最终总和是在CPU上完成的(因此每个单独的GPU块结果按大小缩小,例如8192),并且这些可能具有足以存活的中间精度.最后的师.如果您内联CPU平均值计算,您可能会再次观察到不同的内容.