我正在尝试使用Simpson在CUDA中的方法对集成进行编码.
这是辛普森统治的公式
哪里x_k = a + k*h
.
这是我的代码
__device__ void initThreadBounds(int *n_start, int *n_end, int n,
int totalBlocks, int blockWidth)
{
int threadId = blockWidth * blockIdx.x + threadIdx.x;
int nextThreadId = threadId + 1;
int threads = blockWidth * totalBlocks;
*n_start = (threadId * n)/ threads;
*n_end = (nextThreadId * n)/ threads;
}
__device__ float reg_func (float x)
{
return x;
}
typedef float (*p_func) (float);
__device__ p_func integrale_f = reg_func;
__device__ void integralSimpsonMethod(int totalBlocks, int totalThreads,
double a, double b, int n, float p_function(float), float* result)
{
*result = 0;
float h = (b - a)/n;
//*result = p_function(a)+p_function(a + h * n);
//parallel
int idx_start;
int idx_end;
initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
//parallel_ends
for (int i = idx_start; i < idx_end; i+=2) {
*result += ( p_function(a + h*(i-1)) +
4 * p_function(a + h*(i)) +
p_function(a + h*(i+1)) ) * h/3;
}
}
__global__ void integralSimpson(int totalBlocks, int totalThreads, float* result)
{
float res = 0;
integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
result[(blockIdx.x*totalThreads + threadIdx.x)] = res;
//printf ("Simpson method\n");
}
__host__ void inttest()
{
const int blocksNum = 32;
const int threadNum = 32;
float *device_resultf;
float host_resultf[threadNum*blocksNum]={0};
cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
cudaThreadSynchronize();
cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum,
cudaMemcpyDeviceToHost);
float sum = 0;
for (int i = 0; i != blocksNum*threadNum; ++i) {
sum += host_resultf[i];
// printf ("result in %i cell = %f \n", i, host_resultf[i]);
}
printf ("sum = %f \n", sum);
cudaFree(device_resultf);
}
int main(int argc, char* argv[])
{
inttest();
int i;
scanf ("%d",&i);
}
Run Code Online (Sandbox Code Playgroud)
问题是:当n
它低于100000 时它工作错误.对于从中0
得到的积分10
,结果是~99
,但是当n = 100000
它或更大时它工作正常,结果是~50
.
怎么了,伙计们?
这里的基本问题是你不了解自己的算法.
您的integralSimpsonMethod()
函数的设计使得每个线程在积分域中每个子区间至少采样3个正交点.因此,如果选择n使其小于内核调用中线程数的四倍,则每个子区间将不可避免地重叠,并且结果积分将不正确.您需要确保代码检查并缩放线程计数或n,以便在计算积分时它们不会产生重叠.
如果您正在为自我启发以外的任何事情做这件事,那么我建议您查看Simpson规则的复合版本.这更适合并行实现,如果正确实现,性能会更高.