fac*_*hpc 5 c algorithm parallel-processing opencl
我刚开始使用 OpenCL,已经看过并理解了向量加法的示例。但我现在在考虑梯形法。下面是用 C 计算 $x^2$ 在区间 $[a,b]$ 上的积分的代码。
/* Integrand under study: returns the square of x. */
double f(double x)
{
    const double squared = x * x;
    return squared;
}
/*
 * Area of the single trapezoid spanning [a, b] under f:
 * (f(a) + f(b)) * (b - a) / 2.
 */
double Simple_Trap(double a, double b)
{
    const double edge_sum = f(a) + f(b);
    const double width = b - a;
    return (edge_sum * width) / 2;
}
/*
 * Composite trapezoidal rule for f over [a, b] with panels of width INC.
 *
 * Walks contiguous panels [left, right] starting at a. The final panel is
 * clamped to b so the sum never integrates past the upper bound, and no
 * panel is skipped between consecutive steps.
 *
 * Returns 0 when a >= b (empty interval).
 */
double Comp_Trap( double a, double b)
{
    double Suma = 0;
    double left = a;

    while (left < b) {
        double right = left + INC;

        /*
         * Clamp the last panel to b. Also fall back to b when adding INC
         * does not move forward (INC <= 0 or too small to change `left`),
         * so the loop always terminates.
         */
        if (right > b || !(right > left)) {
            right = b;
        }

        Suma += Simple_Trap(left, right);
        left = right;
    }
    return Suma;
}
Run Code Online (Sandbox Code Playgroud)
问题是如何使用梯形方法获得用于积分计算的内核?
所以,我的想法是:先计算 partials[i] = integration(a, a + offset),然后再写一个内核对这些部分和求总和,正如 Patrick87 所说的那样。
但是,这是最好的方法吗?
这就是我的想法。我没有对该内核进行端到端测试。当我有更多时间时,我会进行更新。
comp_trap 是基于上面代码的基本分而治之方法。comp_trap_multi 让每个工作项把自己负责的子区间进一步细分,从而提高精度。
您只需在主机中分配一组双精度数,以便每个工作组都有一个双精度数来返回其结果。这应该有助于减少您想要避免的向量分配。
如果有任何问题,请告诉我。
更新:
1) 将所有 double 引用更改为 float,因为 double 在 opencl 中是可选的
2) 将工作组大小硬编码为 64。该值在我的系统上是最佳值,应通过实验确定。我更喜欢对该值进行硬编码,而不是传递本地浮点数组来使用,因为无论如何,主机程序最终将只使用目标系统上的最佳值。
3)修正了一个错误的计算(a1是错误的,现在应该更好了)
/*
numerical-integration.cl
*/
/* Integrand: returns x squared in single precision. */
float f(float x)
{
    const float squared = x * x;
    return squared;
}
/*
 * Area of the single trapezoid spanning [a, b] under f:
 * (f(a) + f(b)) * (b - a) / 2.
 */
float simple_trap(float a, float b)
{
    const float edge_sum = f(a) + f(b);
    const float width = b - a;
    return (edge_sum * width) / 2;
}
/*
 * comp_trap: one trapezoid per work-item, one partial sum per work-group.
 *
 * - Expects a 1D NDRange (only dimension 0 is queried).
 * - [a, b] is cut into get_global_size(0) equal slices; work-item g
 *   integrates slice g with a single trapezoid.
 * - After a local barrier, work-item 0 of each group adds up the group's
 *   values in index order and writes the total to sums[get_group_id(0)].
 * - sums must hold get_num_groups(0) floats.
 * - The local scratch array is fixed at 64 floats (one per work-item), so
 *   the local work size must not exceed 64.
 */
__kernel void comp_trap(
float a,
float b,
__global float* sums)
{
    __local float partials[64];

    const size_t lid = get_local_id(0);

    /* Bounds of the slice owned by this work-item. */
    const float lo = a + ((b - a) / get_global_size(0)) * get_global_id(0);
    const float hi = lo + (b - a) / get_global_size(0);

    partials[lid] = simple_trap(lo, hi);

    /* Every work-item's value must be visible before the group sum. */
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid != 0) {
        return;
    }

    /* Serial reduction by the group's first work-item, in index order. */
    float group_total = partials[0];
    for (size_t k = 1; k < get_local_size(0); k++) {
        group_total += partials[k];
    }
    sums[get_group_id(0)] = group_total;
}
/*
 * comp_trap_multi: like comp_trap, but each work-item splits its slice into
 * `divisions` equal panels and sums one trapezoid per panel.
 *
 * - Expects a 1D NDRange (only dimension 0 is queried).
 * - [a, b] is cut into get_global_size(0) equal slices, one per work-item.
 * - divisions <= 0 is treated as 1, so a work-item always contributes at
 *   least one trapezoid over its whole slice instead of silently adding 0.
 * - After a local barrier, work-item 0 of each group adds up the group's
 *   values in index order and writes the total to sums[get_group_id(0)].
 * - sums must hold get_num_groups(0) floats.
 * - The local scratch array is fixed at 64 floats (one per work-item), so
 *   the local work size must not exceed 64.
 */
__kernel void comp_trap_multi(
float a,
float b,
__global float* sums,
int divisions)
{
    __local float wiSums[64];

    const size_t l_id = get_local_id(0);

    /* Bounds of the slice owned by this work-item. */
    const float a1 = a + ((b - a) / get_global_size(0)) * get_global_id(0);
    const float b1 = a1 + (b - a) / get_global_size(0);

    /* Clamp so the panel loop below runs at least once. */
    const int parts = (divisions > 0) ? divisions : 1;
    const float range = (b1 - a1) / parts;

    /* Accumulate privately; touch local memory once per work-item. */
    float slice_sum = 0.0f;
    for (int i = 0; i < parts; i++) {
        slice_sum += simple_trap(a1 + range * i, a1 + range * (i + 1));
    }
    wiSums[l_id] = slice_sum;

    /* Every work-item's value must be visible before the group sum. */
    barrier(CLK_LOCAL_MEM_FENCE);

    if (l_id == 0) {
        float group_total = wiSums[0];
        for (size_t k = 1; k < get_local_size(0); k++) {
            group_total += wiSums[k];
        }
        sums[get_group_id(0)] = group_total;
    }
}
Run Code Online (Sandbox Code Playgroud)