Pro*_*mer 2 parallel-processing cuda gpu gpgpu
我在文件上有几个数字列表.例如,
.333, .324, .123 , .543, .00054
.2243, .333, .53343 , .4434
Run Code Online (Sandbox Code Playgroud)
现在,我想使用GPU统计每个数字出现的次数.我相信这在GPU上比在CPU上更快,因为每个线程可以处理一个列表.我应该在GPU上使用什么数据结构,才能方便地得到上述计数?例如,对于上面的数据,答案如下:
.333 = 2 times in entire file
.324 = 1 time
Run Code Online (Sandbox Code Playgroud)
等等..
我在寻找一般的解决方案.不适用于仅具有特定计算能力的设备
我刚按照Pavan的建议编写了内核,想请大家看看我的实现是否高效:
// Number of unique values = distance between the start of the key range and
// the end-of-keys iterator returned by the unique-by-key step.
// NOTE(review): assumes `newend` is the thrust::pair returned by an in-place
// thrust::unique_by_key over d_A (keys) and d_I (values) -- confirm.
int uniqueEle = newend.first - d_A.begin();
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int)); // stores the count of each unique element
int TPB = 256;
// Ceiling division. The parentheses matter: without them `1 / TPB` is
// evaluated first (and is 0), so the block count became uniqueEle + TPB.
int blocks = (uniqueEle + TPB - 1) / TPB;
//Cast d_I to raw pointer called d_rawI
// A grid of zero blocks is an invalid launch configuration, so skip the
// launch when there is nothing to count.
if (blocks > 0) {
    launch<<<blocks,TPB>>>(d_rawI,count,uniqueEle);
}
/*
 * Adjacent-difference kernel: count[k] = i[k + 1] - i[k] for every k < n - 1.
 *
 * i     : device array of n integers. Presumably the position of the first
 *         occurrence of each unique value in the sorted input -- confirm
 *         against the caller.
 * count : device array with room for at least n ints.
 * n     : number of entries in i.
 *
 * count[n - 1] is NOT written: the last count needs the total number of input
 * elements, which is not a parameter here. The caller has to compute it as
 * (total element count) - i[n - 1].
 *
 * Launch layout: any 1-D grid and any block size. Each thread walks the array
 * with a grid-wide stride, so the result is correct even when the total
 * number of threads is smaller than n. No shared memory is used, so nothing
 * in the kernel depends on the block size being 256.
 */
__global__ void launch(int *i, int* count, int n){
    // Fewer than two entries means there is no adjacent pair to subtract.
    if (n < 2) return;

    // 64-bit indices so that `id += stride` cannot overflow for large n.
    const size_t last   = (size_t)n - 1;
    const size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t id = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
         id < last;
         id += stride) {
        // id < n - 1 guarantees that i[id + 1] is inside the array.
        count[id] = i[id + 1] - i[id];
    }
}
Run Code Online (Sandbox Code Playgroud)
问题:如何修改此内核以便它处理任意大小的数组.即,当线程总数<元素数时处理条件
以下是我将如何在matlab中执行代码
% Count how many times each distinct value occurs in A.
A = [.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434];
% Sort first so that equal values are adjacent. 'last' makes unique return,
% for each distinct value, the index of its LAST occurrence in the sorted
% array, i.e. the running total of elements up to and including that value.
[values, locations] = unique(sort(A), 'last');
% Differences of the running totals are the per-value counts. locations(:)
% forces a column so this works whether unique returned a row or a column.
counts = diff([0; locations(:)]).';
Run Code Online (Sandbox Code Playgroud)
在普通cuda中没有简单的方法可以做到这一点,但您可以使用现有的库来执行此操作.
1) Thrust
从CUDA 4.0开始,它随CUDA工具包一起提供.
通过使用以下函数,可以将matlab代码粗略地转换为Thrust代码.我不太精通Thrust,只是想让你知道应该查看哪些函数.
// Host-side input values and their positions 0 .. num-1.
float _A[] = {.333f, .324f, .123f, .543f, .00054f, .2243f, .333f, .53343f, .4434f};
int _I[] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
int num = 9;
// Values vector. Constructing a device_vector from a host range copies the
// data to the device, so no manual cudaMalloc / cudaMemcpy is needed.
thrust::device_vector<float> d_A(_A, _A + num);
// Need to sort to get same values together
thrust::stable_sort(d_A.begin(), d_A.end());
// Vector containing 0 to num-1. It is NOT permuted by the sort, so entry k is
// simply position k of the sorted array.
thrust::device_vector<int> d_I(_I, _I + num);
// Outputs: the unique values, the position of each value's first occurrence
// in the sorted array, and room for the per-value counts.
thrust::device_vector<float> d_Values(num);
thrust::device_vector<int> d_Locations(num), d_counts(num);
// Iterator types of the two output ranges.
typedef thrust::device_vector<float>::iterator valiter;
typedef thrust::device_vector<int>::iterator idxiter;
// Find unique elements. For every run of equal keys, unique_by_key_copy
// copies the first key to d_Values and its paired value (the position) to
// d_Locations. The returned pair holds the ends of both output ranges, so
// new_end.first - d_Values.begin() is the number of unique values.
thrust::pair<valiter, idxiter> new_end =
    thrust::unique_by_key_copy(d_A.begin(), d_A.end(), d_I.begin(),
                               d_Values.begin(), d_Locations.begin());
Run Code Online (Sandbox Code Playgroud)
您现在拥有每个唯一值的第一个实例的位置.接下来可以启动一个内核,计算d_Locations中从0到new_end的相邻元素之间的差值,这些差值就是各个值的出现次数.最后一个唯一值的计数等于num减去d_Locations中的最后一个位置.
编辑(添加通过聊天提供的代码)
以下是完成差值计算的代码
// Largest grid extent used per dimension when splitting the launch into 2-D.
#define MAX_BLOCKS 65535
// Integer ceiling division for A >= 0, B > 0. (The original definition had a
// stray '=' that was pasted into every expansion and broke compilation.)
#define roundup(A, B) (((A) + (B) - 1) / (B))

/*
 * Turns the positions of the first occurrence of each unique value into
 * per-value occurrence counts:
 *
 *   count[k]     = i[k + 1] - i[k]     for k < n - 1
 *   count[n - 1] = total - i[n - 1]
 *
 * i     : device array of n positions (any arithmetic element type; values
 *         are converted to int).
 * count : device array with room for at least n ints.
 * n     : number of unique values (entries in i).
 * total : total number of elements in the original, non-unique input. It is
 *         needed for the last count. When omitted (negative) it falls back
 *         to n, which reproduces the old three-argument behaviour; that is
 *         only correct when the input had no duplicates.
 *
 * Launch layout: 1-D blocks of any size on a 1-D or 2-D grid. Each thread
 * walks the array with a grid-wide stride, so any grid size gives a complete
 * result. No shared memory is used, so nothing depends on the block size
 * being exactly 256.
 */
template <typename IndexT>
__global__ void kernel(const IndexT *i, int* count, int n, int total = -1)
{
    if (n <= 0) return;
    if (total < 0) total = n;

    // 64-bit indices so the flattened thread id cannot overflow.
    const size_t limit  = (size_t)n;
    const size_t stride = (size_t)gridDim.x * gridDim.y * blockDim.x;
    const size_t bid    = (size_t)blockIdx.y * gridDim.x + blockIdx.x;

    for (size_t id = bid * blockDim.x + threadIdx.x; id < limit; id += stride) {
        const int here = (int)i[id];
        // The last unique value has no successor; its run ends at `total`.
        const int next = (id + 1 < limit) ? (int)i[id + 1] : total;
        count[id] = next - here;
    }
}

// Number of unique values = distance from the start of the unique-value
// output to the end iterator returned by the unique-by-key step.
int uniqueEle = new_end.first - d_Values.begin();
int* count = 0;
// Nothing to count (and roundup(num_blocks, 0) would divide by zero) when
// there are no unique elements.
if (uniqueEle > 0) {
    cudaMalloc((void**)&count, uniqueEle * sizeof(int));
    int TPB = 256;
    int num_blocks = roundup(uniqueEle, TPB);
    // Fold the block count into a 2-D grid so that neither dimension exceeds
    // MAX_BLOCKS.
    int blocks_y = roundup(num_blocks, MAX_BLOCKS);
    int blocks_x = roundup(num_blocks, blocks_y);
    dim3 blocks(blocks_x, blocks_y);
    // d_rawI: raw device pointer to the first-occurrence positions, e.g.
    // thrust::raw_pointer_cast(d_Locations.data()).
    // num: total number of input elements, needed for the last count.
    kernel<<<blocks,TPB>>>(d_rawI, count, uniqueEle, num);
}
Run Code Online (Sandbox Code Playgroud)
2) ArrayFire
这是一个易于使用、免费的基于数组的库.
您可以在ArrayFire中执行以下操作.
using namespace af;
// Host input. (The original initializer was missing the comma between
// .00054 and .2243, which is a syntax error.)
float h_A[] = {.333, .324, .123 , .543, .00054, .2243, .333, .53343 , .4434};
int num = 9;
// Transfer data to device
array A(num, 1, h_A);
array values, locations, original;
// Find the unique values and locations
setunique(values, locations, original, A);
// Locations are 0 based, add 1.
// Add *num* at the end to find count of last value.
array counts = diff1(join(locations + 1, num));
Run Code Online (Sandbox Code Playgroud)
披露:我为AccelerEyes工作,开发此软件.