在CUDA中按键排序3个数组(也许使用Thrust)

Kia*_*ash 7 sorting cuda gpu thrust

我有 3 个大小相同的数组(每个超过 300,000 个元素):一个浮点数数组和两个索引数组。也就是说,每个浮点数都对应 2 个 ID。

这 3 个数组都已经在 GPU 全局内存中。我想对所有浮点数排序,并让它们的 ID 随之相应地重新排列。

有什么方法可以使用 Thrust 库来完成这项任务吗?有没有比 Thrust 库更好的方法?

当然,我不希望在主机内存和设备内存之间来回复制它们。顺便说一句,它们是原始数组,而不是向量(vector)。

感谢您的帮助.


我目前有一个临时解决方案,但它非常慢:大约需要 4 秒,而我的数组大小在 300,000 这个数量级。

// Wrap the raw pointers so that Thrust treats them as device iterators.
thrust::device_ptr<float> keys = thrust::device_pointer_cast(afterSum);
thrust::device_ptr<int> vals0 = thrust::device_pointer_cast(d_index);
thrust::device_ptr<int> vals1 = thrust::device_pointer_cast(blockId);

// Permutation vector, seeded with 0, 1, ..., numElements - 1.
thrust::counting_iterator<int> iter(0);
thrust::device_vector<int> indices(iter, iter + numElements);

// Sort the keys in place; `indices` is permuted alongside them and ends up
// holding, for each sorted position, the original position of that key.
thrust::sort_by_key(keys, keys + numElements, indices.begin());

// Apply the permutation to each ID array, writing into fresh device storage.
thrust::device_vector<int> sortedIndex(numElements);
thrust::gather(indices.begin(), indices.end(), vals0, sortedIndex.begin());

thrust::device_vector<int> sortedBlockId(numElements);
thrust::gather(indices.begin(), indices.end(), vals1, sortedBlockId.begin());

// Copy the reordered IDs back to the host.
thrust::host_vector<int> h_sortedIndex(sortedIndex);
thrust::host_vector<int> h_sortedBlockId(sortedBlockId);
Run Code Online (Sandbox Code Playgroud)

har*_*ism 11

当然可以使用 Thrust。首先,您需要用 thrust::device_ptr 包装原始 CUDA 设备指针。假设浮点值在数组 pkeys 中,ID 在数组 pvals0 和 pvals1 中,并且 numElements 是这些数组的长度,那么类似下面的代码应该可行:

#include <cstdio>

#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>

// Sort a float key array and reorder two companion int ID arrays to match,
// timing the whole sequence with CUDA events.
// Inputs (defined elsewhere): pkeys, pvals0, pvals1 -- raw pointers, assumed
// to refer to device memory; numElements -- the length of each array.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);

// Wrap the raw pointers so that Thrust treats them as device iterators.
thrust::device_ptr<float> keys(pkeys);
thrust::device_ptr<int> vals0(pvals0);
thrust::device_ptr<int> vals1(pvals1);

// allocate space for the output
thrust::device_vector<int> sortedVals0(numElements);
thrust::device_vector<int> sortedVals1(numElements);

// initialize indices vector to [0,1,2,..]
thrust::counting_iterator<int> iter(0);
thrust::device_vector<int> indices(numElements);
thrust::copy(iter, iter + indices.size(), indices.begin());

// first sort the keys and indices by the keys
// NOTE: device_ptr is a pointer-like iterator, not a container -- it has no
// begin()/end() -- so the key range is spelled [keys, keys + numElements).
thrust::sort_by_key(keys, keys + numElements, indices.begin());

// Now reorder the ID arrays using the sorted indices
// (vals0 / vals1 are themselves the input iterators for the gather source).
thrust::gather(indices.begin(), indices.end(), vals0, sortedVals0.begin());
thrust::gather(indices.begin(), indices.end(), vals1, sortedVals1.begin());

cudaEventRecord(stop);
// Block until the stop event has completed so the elapsed time is valid.
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("Took %f milliseconds for %d elements\n", milliseconds, numElements);

// Release the timing events; they are not needed after the measurement.
cudaEventDestroy(start);
cudaEventDestroy(stop);
Run Code Online (Sandbox Code Playgroud)