Seb*_*anP 5 concurrency cuda opencl
我正在使用OpenCL计算n维点之间的欧氏距离.我得到两个n维点列表,我应该返回一个数组,其中只包含第一个表中每个点到第二个表中每个点的距离.
我的方法是做常规的双重循环(对 Table1 中的每个点 { 对 Table2 中的每个点 {...} }),然后并行地对每一对点进行计算.
然后将欧氏距离的计算分成以下几个步骤:1.逐维取两个点之间的差值;2.对每个维度的差值求平方;3.将第2步得到的所有值求和;4.对第3步得到的值取平方根.(此示例中省略了第4步.)
在尝试累加所有差值平方的总和之前,一切都运行得很好(问题出在上述过程的第3步,即下面代码中 C[loop] += dif_acum[featA]*dif_acum[featA]; 这一行).
作为测试数据,我使用的DescriptorLists各有2个点:DescriptorList1:001,002,003,...,127,128; (p1)129,130,131,...,255,256; (P2)
DescriptorList2:000,001,002,...,126,127; (p1)128,129,130,...,254,255; (P2)
因此,结果向量应该具有值:128, 2064512, 2130048, 128. 但现在我得到的是每次运行都会变化的随机数.
如果有人能提供帮助,或指出我哪里做错了,我将不胜感激.希望我已经把所处理的场景描述清楚了.
// Compile-time size used by the kernel for its scratch arrays and as the
// stride when indexing into the descriptor data.
#define BLOCK_SIZE 128
// A list of n-dimensional points ("descriptors") backed by one flat float
// array in global memory.
// NOTE(review): the kernel iterates num_elements/BLOCK_SIZE times, which
// suggests num_elements may count floats rather than points -- confirm
// against the host code.
typedef struct
{
//How large each point is
int length;
//How many points in every list
int num_elements;
//Pointer to the elements of the descriptor (stored as a raw array)
__global float *elements;
} DescriptorList;
// Question's version of the kernel: for every descriptor in A and every
// descriptor in B it computes the per-dimension difference and tries to
// accumulate the squared differences into C[loop].
//   C        - output buffer, one slot per (A descriptor, B descriptor) pair
//   A, B     - the two descriptor lists being compared
//   elements - not referenced in the body
//   As       - local scratch buffer holding the current descriptor of A
// NOTE(review): every work-item executes the unguarded writes to C[loop]
// below, so the value left in C[loop] depends on execution order.
__kernel void CompareDescriptors_deb(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float As[BLOCK_SIZE])
{
// global index of this work-item (not used further in the body)
int gpidA = get_global_id(0);
// index of this work-item inside its work-group; also used as the
// dimension index within a descriptor
int featA = get_local_id(0);
//temporary array to store the difference between each dimension of 2 points
//(declared without an address-space qualifier, so each work-item has its own copy)
float dif_acum[BLOCK_SIZE];
//counter to track the iterations of the inner loop; doubles as the output index into C
int loop = 0;
//loop over all descriptors in A
for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){
//take the i-th descriptor. According to the author, GetDescriptor returns a
//DescriptorList with just the i-th descriptor in DescriptorList A
//(helper is defined outside this excerpt)
DescriptorList tmpA = GetDescriptor(A, i);
//copy the current descriptor to local memory.
//According to the author, GetElement returns one element of the only
//descriptor in DescriptorList tmpA at index featA
As[featA] = GetElement(tmpA, 0, featA);
//wait for all the work-items in the group to finish copying before continuing
barrier(CLK_LOCAL_MEM_FENCE);
//loop over all the descriptors in B
for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
//take the difference of both current points in this work-item's dimension
dif_acum[featA] = As[featA]-B.elements[k*BLOCK_SIZE + featA];
//wait again
barrier(CLK_LOCAL_MEM_FENCE);
//square value of the difference in dif_acum and store in C
//which is where the results should be stored at the end.
//Each work-item first resets C[loop] and then adds its own squared difference.
C[loop] = 0;
C[loop] += dif_acum[featA]*dif_acum[featA];
//advance to the output slot of the next (A, B) pair
loop += 1;
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
Run Code Online (Sandbox Code Playgroud)
您的问题在于以下几行代码:
// Both statements are executed by every work-item on the same C[loop] slot.
C[loop] = 0;
C[loop] += dif_acum[featA]*dif_acum[featA];
Run Code Online (Sandbox Code Playgroud)
工作组中的所有线程(实际上是你的所有线程,这一点稍后再说)都试图在没有任何同步的情况下同时修改这个数组元素.有几个因素使这成为问题:
现在让我们来解决这个问题:虽然我们也许可以通过对全局内存使用原子操作让它工作,但那样不会很快,所以让我们改为在本地内存中累加:
// Local-memory scratch buffer with one slot per work-item
// (must point to at least BLOCK_SIZE floats).
local float* accum;
...
// Each work-item stores its own squared difference in its own slot.
accum[featA] = dif_acum[featA]*dif_acum[featA];
barrier(CLK_LOCAL_MEM_FENCE);
// Pairwise tree reduction: after the pass with a given stride, every slot
// whose index is a multiple of 2*stride holds the sum of 2*stride inputs.
// Assumes the work-group size equals BLOCK_SIZE and is a power of two.
for(unsigned int stride = 1; stride < BLOCK_SIZE; stride *= 2)
{
if ((featA % (2*stride)) == 0)
accum[featA] += accum[featA + stride];
// all partial sums of this pass must be visible before the next pass
barrier(CLK_LOCAL_MEM_FENCE);
}
// Only one work-item writes the final sum to global memory.
if(featA == 0)
C[loop] = accum[0];
Run Code Online (Sandbox Code Playgroud)
当然,你也可以为此重用其他本地缓冲区,但我想要点已经很清楚了(顺便问一下:你确定 dif_acum 会被创建在本地内存中吗?我记得在某处读到过,这样的数组不会被放在本地内存中;如果是那样,把 A 预加载到本地内存就毫无意义了).
关于此代码的其他一些观点:
使用 get_local_size(0) 获取工作组大小,而不是使用 #define(因为你可能在主机代码中更改了它,却没有意识到也应该同步修改 OpenCL 代码). 考虑到最后一个要点,你可以简单地这样做:
// Each work-item keeps only its own element of the A descriptor in a scalar
// instead of staging the whole descriptor in a shared array.
float As = GetElement(tmpA, 0, featA);
...
// The per-dimension difference likewise becomes a scalar per work-item.
float dif_acum = As-B.elements[k*BLOCK_SIZE + featA];
Run Code Online (Sandbox Code Playgroud)
这样代码就变成了(未考虑前两个要点):
/*
 * For every (descriptor of A, descriptor of B) pair, sums the squared
 * per-dimension differences and stores the sum in C, one slot per pair in
 * iteration order (A outer, B inner). The final square root is not taken.
 *
 *   C        - output buffer; needs one float per (A, B) pair
 *   A, B     - descriptor lists to compare
 *   elements - unused; kept so the kernel signature stays unchanged
 *   accum    - local scratch buffer, one slot per work-item
 *
 * Assumes the work-group size equals BLOCK_SIZE and is a power of two;
 * the tree reduction below reads accum[featA + stride] on that basis.
 * NOTE(review): every work-group writes the same C[loop] slots, so this is
 * presumably meant to be launched with a single work-group -- confirm.
 */
__kernel void CompareDescriptors_deb(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float accum[BLOCK_SIZE])
{
    /* Index inside the work-group; doubles as the dimension index. */
    int featA = get_local_id(0);
    /* Output slot of the current (A, B) pair. */
    int loop = 0;

    for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++) {
        DescriptorList tmpA = GetDescriptor(A, i);
        /* Each work-item only ever needs its own dimension of the A descriptor. */
        float As = GetElement(tmpA, 0, featA);

        for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++) {
            float dif_acum = As - B.elements[k*BLOCK_SIZE + featA];

            /* dif_acum is a scalar: square it directly into this work-item's slot. */
            accum[featA] = dif_acum*dif_acum;
            barrier(CLK_LOCAL_MEM_FENCE);

            /*
             * Pairwise tree reduction. The index is named `stride` so it
             * does not shadow the outer loop's `i`.
             */
            for (unsigned int stride = 1; stride < BLOCK_SIZE; stride *= 2) {
                if ((featA % (2*stride)) == 0) {
                    accum[featA] += accum[featA + stride];
                }
                /* Partial sums of this pass must be visible before the next one. */
                barrier(CLK_LOCAL_MEM_FENCE);
            }

            /* A single work-item publishes the result, avoiding the write race. */
            if (featA == 0) {
                C[loop] = accum[0];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            loop += 1;
        }
    }
}
Run Code Online (Sandbox Code Playgroud)