CUDA second-order recurrence with thrust inclusive_scan

Question by def*_*use (tags: cuda, thrust):

I am trying to understand how to parallelize a recursive computation. Serially, the calculation takes this form:

for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }

For the i-1 index case, my previous question had a solution: CUDA force instruction execution order

I would like to modify it to use i-2, but I cannot work out how to apply the same approach to a second-order computation. It should be possible to do this with thrust::inclusive_scan, but I cannot figure out how. Does anyone know a solution?

Answer by Rob*_*lla:

Picking up from the previous question/answer, we shift our attention to equation 1.11 in the referenced Blelloch paper. We observe that your problem formulation:

for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }

appears to match the case in equation 1.11 if we set m = 2, in which case we can also observe that, for your formulation, all the a_{i,1} are zero (and, as before, all the a_{i,2} are k).
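For reference, a short worked restatement (the general m-th order form written here is an assumption about how eq. 1.11 reads, inferred from the surrounding discussion rather than quoted from the paper):

% assumed general form of Blelloch's eq. 1.11 (m-th order linear recurrence)
x_i = b_i + \sum_{j=1}^{m} a_{i,j}\, x_{i-j}
% with m = 2, a_{i,1} = 0 and a_{i,2} = k, this collapses to the loop body above
x_i = b_i + a_{i,1}\, x_{i-1} + a_{i,2}\, x_{i-2} = b_i + k\, x_{i-2}

i.e. result[i] = oldArray[i] + k * result[i-2].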

As per equation 1.12 in that paper, our state variable s_i now becomes a 2-tuple:

s_i = | x_i  x_{i-1} |

Taking note of these things, we can observe the "correctness" of equation 1.13:

s_i = | x_{i-1}  x_{i-2} | . | 0  1, k  0 | + | b_i  0 |

Rewriting that:

s_{i,1} = x_i = k*x_{i-2} + b_i

s_{i,2} = x_{i-1} = x_{i-1}

(In my view, this is where the other answer leaves you. That realization, i.e. result.data[0] = right + k * left.data[1];, is sufficient for a serial scan but not for a parallel scan. It is also evident that that functor/scan op is not associative.)
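(For readers without the previous answer at hand: the first-order machinery being extended here is, to the best of my recollection of Blelloch (1.5)/(1.7) and therefore stated as an assumption rather than a quotation, the pair state c_i = [a_i, b_i] with the associative combining rule

c_i \bullet c_j = [\, a_i \cdot a_j,\;\; b_i \cdot a_j + b_j \,]

which computes x_i = a_i x_{i-1} + b_i by an inclusive scan. The construction below is the same idea with the scalar a_i replaced by a 2x2 matrix and b_i by a 2-element row vector.)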

We now need to come up with a binary operator, bop, that extends the definition in (1.7) to this case. Referring to the previous definition in equation 1.7, we extend it, based on the treatment in 1.13, as follows:

C_i = | A_i , B_i |

where:

A_i = | 0  1, k  0 |

and:

B_i = | b_i  0 |

We then have:

C_i bop C_j = | A_i . A_j , B_i . A_j + B_j |

This then becomes the formula for our functor/scan operator. We will need to carry 6 scalar "state" quantities throughout: 2 for the B vector and 4 for the A matrix.
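Written out element-wise (a sketch meant only to connect the formula to the code; the row-major slot assignment is mine, chosen to match the scan_op functor below):

A_i A_j = \begin{pmatrix}
  A_i^{00}A_j^{00} + A_i^{01}A_j^{10} & A_i^{00}A_j^{01} + A_i^{01}A_j^{11} \\
  A_i^{10}A_j^{00} + A_i^{11}A_j^{10} & A_i^{10}A_j^{01} + A_i^{11}A_j^{11}
\end{pmatrix}

B_i A_j + B_j = \begin{pmatrix}
  B_i^{0}A_j^{00} + B_i^{1}A_j^{10} + B_j^{0} & B_i^{0}A_j^{01} + B_i^{1}A_j^{11} + B_j^{1}
\end{pmatrix}

These six expressions are exactly the six assignments in the scan_op functor: tuple slots 0 and 1 carry the B row vector, and slots 2 through 5 carry the A matrix in row-major order.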

What follows is a realization of the above:

$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
  for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }
}
struct scan_op // as per blelloch (1.7), extended to the second-order case
{
  // tuple layout: <0>,<1> = B row vector; <2>,<3>,<4>,<5> = A matrix, row-major
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    // B_ret = B_i . A_j + B_j
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
    // A_ret = A_i . A_j
    thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
    thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
    thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
    thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
    return ret;
  }
};

typedef float mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){

  mt *b1  = new mt[ds]; // b as in blelloch (1.5)
  mt *cr = new mt[ds]; // cpu result
  for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
  cr[0] = b1[0];
  cr[1] = b1[1];
  cpufunction(cr, b1, ds, k);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
  thrust::device_vector<mt> db(b1, b1+ds);
  auto b0 = thrust::constant_iterator<mt>(0); // second component of B_i (always 0)
  auto a0 = thrust::constant_iterator<mt>(0); // A (row-major): A00 = 0
  auto a1 = thrust::constant_iterator<mt>(1); //                A01 = 1
  auto a2 = thrust::constant_iterator<mt>(k); //                A10 = k
  auto a3 = thrust::constant_iterator<mt>(0); //                A11 = 0
  thrust::device_vector<mt> dx1(ds);
  thrust::device_vector<mt> dx0(ds);
  thrust::device_vector<mt> dy0(ds);
  thrust::device_vector<mt> dy1(ds);
  thrust::device_vector<mt> dy2(ds);
  thrust::device_vector<mt> dy3(ds);
  auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(db.begin(), b0, a0, a1, a2, a3));
  auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
  thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
  thrust::host_vector<mt> hx1 = dx1;
  thrust::copy_n(hx1.begin(), snip, std::ostream_iterator<mt>(std::cout, ","));
  thrust::copy_n(hx1.begin()+ds-snip, snip, std::ostream_iterator<mt>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.218,601.275,576.315,607.993,582.947,614.621,589.516,621.699,595.644,628.843,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.621,589.516,621.7,595.644,628.843,
========= ERROR SUMMARY: 0 errors
$

Yes, some of the results above differ in the 6th significant digit. I attribute this to the limits of float resolution, in view of the considerably different order of operations between the serial and parallel methods. If you change the typedef to double, the results appear to match exactly.
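As an aside, the parallel scan is only valid because this operator is associative: associativity holds exactly in real arithmetic, while in floating point the two possible groupings can differ by rounding, which is the same effect behind the 6th-digit differences above. The following host-side spot-check is my own sketch, not part of the original answer; it reuses the scan_op functor from the listing above and compares (a op b) op c against a op (b op c) on random states (build with nvcc as before):

#include <iostream>
#include <cstdlib>
#include <cmath>
#include <algorithm>
#include <thrust/tuple.h>

struct scan_op // same functor as in the listing above
{
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
    thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
    thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
    thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
    thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
    return ret;
  }
};

// (B0, B1, A00, A01, A10, A11) -- same slot layout as the zip iterators above
typedef thrust::tuple<double,double,double,double,double,double> state_t;

state_t random_state(){
  auto r = [](){ return rand()/(double)RAND_MAX; };
  return thrust::make_tuple(r(), r(), r(), r(), r(), r());
}

double max_abs_diff(const state_t &a, const state_t &b){
  double d = 0;
  d = std::max(d, std::fabs(thrust::get<0>(a)-thrust::get<0>(b)));
  d = std::max(d, std::fabs(thrust::get<1>(a)-thrust::get<1>(b)));
  d = std::max(d, std::fabs(thrust::get<2>(a)-thrust::get<2>(b)));
  d = std::max(d, std::fabs(thrust::get<3>(a)-thrust::get<3>(b)));
  d = std::max(d, std::fabs(thrust::get<4>(a)-thrust::get<4>(b)));
  d = std::max(d, std::fabs(thrust::get<5>(a)-thrust::get<5>(b)));
  return d;
}

int main(){
  scan_op op;
  double worst = 0;
  for (int trial = 0; trial < 1000; trial++){
    state_t a = random_state(), b = random_state(), c = random_state();
    // compare the two groupings; they agree exactly in real arithmetic,
    // and to within rounding in floating point
    worst = std::max(worst, max_abs_diff(op(op(a,b),c), op(a,op(b,c))));
  }
  std::cout << "max |(a op b) op c - a op (b op c)| over 1000 trials: " << worst << std::endl;
}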

Since you asked about it, here is an equivalent realization, demonstrated using device data that was previously allocated with cudaMalloc:

$ cat t1930.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>
#include <cstdio>
template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
  for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }
}
struct scan_op // as per blelloch (1.7)
{
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
    thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
    thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
    thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
    thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
    return ret;
    }
};

typedef double mt;
const size_t ds = 512;
const mt k = 1.01;
const int snip = 10;
int main(){

  mt *b1  = new mt[ds]; // b as in blelloch (1.5)
  mt *cr = new mt[ds]; // cpu result
  for (int i = 0; i < ds; i++) { b1[i] = rand()/(float)RAND_MAX;}
  cr[0] = b1[0];
  cr[1] = b1[1];
  cpufunction(cr, b1, ds, k);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
  mt *db;
  cudaMalloc(&db, ds*sizeof(db[0]));
  cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
  thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
  auto b0 = thrust::constant_iterator<mt>(0);
  auto a0 = thrust::constant_iterator<mt>(0);
  auto a1 = thrust::constant_iterator<mt>(1);
  auto a2 = thrust::constant_iterator<mt>(k);
  auto a3 = thrust::constant_iterator<mt>(0);
  thrust::device_vector<mt> dx1(ds);
  thrust::device_vector<mt> dx0(ds);
  thrust::device_vector<mt> dy0(ds);
  thrust::device_vector<mt> dy1(ds);
  thrust::device_vector<mt> dy2(ds);
  thrust::device_vector<mt> dy3(ds);
  auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
  auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1.begin(), dx0.begin(), dy0.begin(), dy1.begin(), dy2.begin(), dy3.begin()));
  thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
  cudaMemcpy(cr, thrust::raw_pointer_cast(dx1.data()), ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
}
$ nvcc -std=c++14 t1930.cu -o t1930
$ cuda-memcheck ./t1930
========= CUDA-MEMCHECK
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
0.840188,0.394383,1.63169,1.19677,2.55965,1.40629,2.92047,2.18858,3.22745,2.76443,570.219,601.275,576.316,607.994,582.948,614.622,589.516,621.7,595.645,628.844,
========= ERROR SUMMARY: 0 errors

There should not be any significant performance difference between these two approaches. (However, I happened to switch the typedef to double in this example, so that will make a difference.) Using cudaMalloc in place of device_vector for the various state vectors (dx0, dx1, dy0, dy1 ...) may be slightly faster, because device_vector first does a cudaMalloc-style allocation and then launches a kernel to zero out the allocation. This zeroing step is unnecessary for the state vectors. The pattern given here should demonstrate how you could do that, if you are interested.

Here is a version that completely eliminates the use of thrust::device_vector and thrust::host_vector:

#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/scan.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <cstdlib>

template <typename T>
void cpufunction(T *result, T *oldArray, size_t size, T k){
  for (int i = 2; i<size; i++)
  {
    result[i] = oldArray[i] + k * result[i-2];
  }
}
struct scan_op // as per blelloch (1.7)
{
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<2>(t2) + thrust::get<1>(t1)*thrust::get<4>(t2)+thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<0>(t1)*thrust::get<3>(t2) + thrust::get<1>(t1)*thrust::get<5>(t2)+thrust::get<1>(t2);
    thrust::get<2>(ret) = thrust::get<2>(t1)*thrust::get<2>(t2) + thrust::get<3>(t1)*thrust::get<4>(t2);
    thrust::get<3>(ret) = thrust::get<2>(t1)*thrust::get<3>(t2) + thrust::get<3>(t1)*thrust::get<5>(t2);
    thrust::get<4>(ret) = thrust::get<4>(t1)*thrust::get<2>(t2) + thrust::get<5>(t1)*thrust::get<4>(t2);
    thrust::get<5>(ret) = thrust::get<4>(t1)*thrust::get<3>(t2) + thrust::get<5>(t1)*thrust::get<5>(t2);
    return ret;
    }
};

typedef float mt;
const size_t ds = 32768*4;
const mt k = 1.001;
const int snip = 10;
int main(){

  mt *b1  = new mt[ds]; // b as in blelloch (1.5)
  mt *cr = new mt[ds]; // result
  for (int i = 0; i < ds; i++) { b1[i] = (rand()/(float)RAND_MAX)-0.5;}
  cr[0] = b1[0];
  cr[1] = b1[1];
  cpufunction(cr, b1, ds, k);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
  mt *db, *dstate;
  cudaMalloc(&db, ds*sizeof(db[0]));
  cudaMalloc(&dstate, 6*ds*sizeof(dstate[0]));
  cudaMemcpy(db, b1, ds*sizeof(db[0]), cudaMemcpyHostToDevice);
  thrust::device_ptr<mt> dp_db = thrust::device_pointer_cast(db);
  auto b0 = thrust::constant_iterator<mt>(0);
  auto a0 = thrust::constant_iterator<mt>(0);
  auto a1 = thrust::constant_iterator<mt>(1);
  auto a2 = thrust::constant_iterator<mt>(k);
  auto a3 = thrust::constant_iterator<mt>(0);
  thrust::device_ptr<mt> dx1 = thrust::device_pointer_cast(dstate);
  thrust::device_ptr<mt> dx0 = thrust::device_pointer_cast(dstate+ds);
  thrust::device_ptr<mt> dy0 = thrust::device_pointer_cast(dstate+2*ds);
  thrust::device_ptr<mt> dy1 = thrust::device_pointer_cast(dstate+3*ds);
  thrust::device_ptr<mt> dy2 = thrust::device_pointer_cast(dstate+4*ds);
  thrust::device_ptr<mt> dy3 = thrust::device_pointer_cast(dstate+5*ds);
  auto my_i_zip = thrust::make_zip_iterator(thrust::make_tuple(dp_db, b0, a0, a1, a2, a3));
  auto my_o_zip = thrust::make_zip_iterator(thrust::make_tuple(dx1, dx0, dy0, dy1, dy2, dy3));
  thrust::inclusive_scan(my_i_zip, my_i_zip+ds, my_o_zip, scan_op());
  cudaMemcpy(cr, dstate, ds*sizeof(cr[0]), cudaMemcpyDeviceToHost);
  for (int i = 0; i < snip; i++) std::cout << cr[i] << ",";
  for (int i = ds-snip; i < ds; i++) std::cout << cr[i] << ",";
  std::cout << std::endl;
}