在matlab上的CUDA循环

phd*_*ent 8 matlab openmp openacc

我一直在使用Fortran中的ACC和OpenMP进行并行化.我现在正在尝试在matlab中做同样的事情.我觉得非常有趣的是,在matlab中使用GPU来对一个循环进行并行化是非常困难的.显然,唯一的方法是使用arrayfun函数.但我可能错了.

从概念上讲,我想知道为什么matlab中的GPU使用不比fortran更直接.在更实际的层面上,我想知道如何在下面的简单代码中使用GPU.

下面,我将分享三个代码和基准:

  1. Fortran OpenMP代码
  2. Fortran ACC代码
  3. Matlab parfor代码
  4. Matlab CUDA(?)这是我不知道怎么做的.

Fortran OpenMP:

program rbc

 use omp_lib     ! For timing
 use tools
 implicit none

 real, parameter :: beta = 0.984, eta = 2, alpha = 0.35, delta = 0.01, &
                     rho = 0.95, sigma = 0.005, zmin=-0.0480384, zmax=0.0480384;
integer, parameter :: nz = 4, nk=4800;
real :: zgrid(nz), kgrid(nk), t_tran_z(nz,nz), tran_z(nz,nz);
real :: kmax, kmin, tol, dif, c(nk), r(nk), w(nk);
real, dimension(nk,nz) :: v=0., v0=0., ev=0., c0=0.;
integer :: i, iz, ik, cnt;
logical :: ind(nk);
real(kind=8) :: start, finish   ! For timing
real :: tmpmax, c1  


call omp_set_num_threads(12)


!Grid for productivity z

! [1 x 4] grid of values for z
call linspace(zmin,zmax,nz,zgrid)
zgrid = exp(zgrid)
! [4 x 4] Markov transition matrix of z
tran_z(1,1) = 0.996757
tran_z(1,2) = 0.00324265
tran_z(1,3) = 0
tran_z(1,4) = 0
tran_z(2,1) = 0.000385933
tran_z(2,2) = 0.998441
tran_z(2,3) = 0.00117336
tran_z(2,4) = 0
tran_z(3,1) = 0
tran_z(3,2) = 0.00117336
tran_z(3,3) = 0.998441
tran_z(3,4) = 0.000385933
tran_z(4,1) = 0
tran_z(4,2) = 0
tran_z(4,3) = 0.00324265
tran_z(4,4) = 0.996757

! Grid for capital k

kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)**(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)**(1/(alpha-1));

! [1 x 4800] grid of possible values of k
call linspace(kmin, kmax, nk, kgrid)


! Compute initial wealth c0(k,z)
do iz=1,nz
  c0(:,iz) = zgrid(iz)*kgrid**alpha + (1-delta)*kgrid;
end do

dif = 10000
tol = 1e-8
cnt = 1

do while(dif>tol)
    !$omp parallel do default(shared) private(ik,iz,i,tmpmax,c1)    
    do ik=1,nk;        
          do iz = 1,nz;
          tmpmax = -huge(0.)

          do i = 1,nk
             c1 = c0(ik,iz) - kgrid(i)
             if(c1<0) exit
             c1 = c1**(1-eta)/(1-eta)+ev(i,iz)
             if(tmpmax<c1) tmpmax = c1
          end do
          v(ik,iz) = tmpmax
       end do

    end do
    !$omp end parallel do
    ev = beta*matmul(v,tran_z)
    dif = maxval(abs(v-v0))
    v0 = v
    if(mod(cnt,1)==0) write(*,*) cnt, ':', dif
        cnt = cnt+1
end do


end program
Run Code Online (Sandbox Code Playgroud)

Fortran ACC:

只需将上面代码中的mainloop语法替换为:

do while(dif>tol)
    !$acc kernels
    !$acc loop gang
        do ik=1,nk;        
         !$acc loop gang
          do iz = 1,nz;
          tmpmax = -huge(0.)

          do i = 1,nk
             c1 = c0(ik,iz) - kgrid(i)
             if(c1<0) exit
             c1 = c1**(1-eta)/(1-eta)+ev(i,iz)
             if(tmpmax<c1) tmpmax = c1
          end do
          v(ik,iz) = tmpmax
       end do

    end do

    !$acc end kernels
    ev = beta*matmul(v,tran_z)
    dif = maxval(abs(v-v0))
    v0 = v
    if(mod(cnt,1)==0) write(*,*) cnt, ':', dif
        cnt = cnt+1
end do
Run Code Online (Sandbox Code Playgroud)

Matlab parfor :( 我知道使用矢量化语法可以使下面的代码更快,但练习的重点是比较循环速度).

tic;
beta = 0.984; 
eta = 2; 
alpha = 0.35; 
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;

v=zeros(nk,nz); 
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);

%Grid for productivity z

%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;

% Grid for capital k

kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));

% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);

% Compute initial wealth c0(k,z)
for iz=1:nz
  c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end 

dif = 10000;
tol = 1e-8;
cnt = 1;

while dif>tol

    parfor ik=1:nk
          for iz = 1:nz
          tmpmax = -intmax;

          for i = 1:nk
             c1 = c0(ik,iz) - kgrid(i);
             if (c1<0) 
                 continue
             end 
             c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
             if tmpmax<c1 
                 tmpmax = c1;
             end
          end 
          v(ik,iz) = tmpmax;
          end 

    end 
    ev = beta*v*tran_z;
    dif = max(max(abs(v-v0)));
    v0 = v;
    if mod(cnt,1)==0 
        fprintf('%1.5f :  %1.5f \n', [cnt dif])
    end
        cnt = cnt+1;
end 


toc
Run Code Online (Sandbox Code Playgroud)

Matlab CUDA:

这就是我不知道如何编码的原因.使用arrayfun这种方法的唯一方法是什么?在Fortran中,从OpenMP迁移到OpenACC非常简单.在Matlab中从parfor到GPU循环是否有一种简单的方法?

代码之间的时间比较:

Fortran OpenMP: 83.1 seconds 
Fortran ACC:    2.4 seconds
Matlab parfor:  1182 seconds
Run Code Online (Sandbox Code Playgroud)

最后一点,我应该说上面的代码解决了一个简单的真实商业周期模型,并在基础上编写.

Igo*_*gor 0

Matlab编码器

首先,正如Dev-iL 已经提到的,您可以使用 GPU 编码器。

它(我使用 R2019a)只需要对代码进行少量更改:

function cdapted()
beta = 0.984; 
eta = 2; 
alpha = 0.35; 
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;

v=zeros(nk,nz); 
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);

%Grid for productivity z

%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;

% Grid for capital k

kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));

% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);

% Compute initial wealth c0(k,z)
for iz=1:nz
  c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end 

dif = 10000;
tol = 1e-8;
cnt = 1;

while dif>tol

    for ik=1:nk
        for iz = 1:nz
            tmpmax = double(intmin);

            for i = 1:nk
                c1 = c0(ik,iz) - kgrid(i);
                if (c1<0)
                    continue
                end
                c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
                if tmpmax<c1
                    tmpmax = c1;
                end
            end
            v(ik,iz) = tmpmax;
        end

    end
    ev = beta*v*tran_z;
    dif = max(max(abs(v-v0)));
    v0 = v;
    % I've commented out fprintf because double2single cannot handle it 
    % (could be manually uncommented in the converted version if needed)
    % ------------
    % if mod(cnt,1)==0       
    %     fprintf('%1.5f :  %1.5f \n', cnt, dif);
    % end
    cnt = cnt+1;
end
end
Run Code Online (Sandbox Code Playgroud)

构建这个的脚本是:

% unload mex files
clear mex


%% Build for gpu, float64

% Produces ".\codegen\mex\cdapted" folder and "cdapted_mex.mexw64"
cfg = coder.gpuConfig('mex');
codegen -config cfg cdapted

% benchmark it (~7.14s on my GTX1080Ti)
timeit(@() cdapted_mex,0)


%% Build for gpu, float32:

% Produces ".\codegen\cdapted\single" folder
scfg = coder.config('single');
codegen -double2single scfg cdapted

% Produces ".\codegen\mex\cdapted_single" folder and "cdapted_single_mex.mexw64"
cfg = coder.gpuConfig('mex');
codegen -config cfg .\codegen\cdapted\single\cdapted_single.m

% benchmark it (~2.09s on my GTX1080Ti)
timeit(@() cdapted_single_mex,0)
Run Code Online (Sandbox Code Playgroud)

因此,如果您的 Fortran 二进制文件使用 float32 精度(我怀疑是这样),则此 Matlab Coder 结果与之相当。但这并不意味着两者都非常高效。Matlab Coder 生成的代码还远未达到高效的程度。而且它没有充分利用 GPU(即使 TDP 约为 50%)。


矢量化和 gpuArray

接下来,我同意user10597469Nicky Mattsson 的观点,即您的 Matlab 代码看起来不像正常的“本机”矢量化 Matlab 代码。

有很多事情需要调整。(但arrayfun几乎不比for)。首先,让我们删除for循环:

function vertorized1()
t_tot = tic();
beta = 0.984; 
eta = 2; 
alpha = 0.35; 
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;

v=zeros(nk,nz); 
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);

%Grid for productivity z

%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;

% Grid for capital k

kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));

% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);

% Compute initial wealth c0(k,z)
for iz=1:nz
  c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end 

dif = 10000;
tol = 0.4; 
tol = 1e-8;
cnt = 1;

t_acc=zeros([1,2]);

while dif>tol

    %% orig-noparfor:
    t=tic();
    for ik=1:nk
          for iz = 1:nz
          tmpmax = -intmax;

          for i = 1:nk
             c1 = c0(ik,iz) - kgrid(i);
             if (c1<0) 
                 continue
             end 
             c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
             if tmpmax<c1 
                 tmpmax = c1;
             end
          end 
          v(ik,iz) = tmpmax;
          end 

    end 
    t_acc(1) = t_acc(1) + toc(t);    

    %% better:
    t=tic();          

    kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
    c1_ = c0 - kgrid_;    
    c1_x = c1_.^(1-eta)/(1-eta);

    c2 = c1_x + reshape(ev', [1 nz nk]);
    c2(c1_<0) = -Inf;
    v_ = max(c2,[],3);        
    t_acc(2) = t_acc(2) + toc(t);    

    %% compare
    assert(isequal(v_,v));   
    v=v_;

    %% other
    ev = beta*v*tran_z;
    dif = max(max(abs(v-v0)));
    v0 = v;
    if mod(cnt,1)==0
        fprintf('%1.5f :  %1.5f \n', cnt, dif);
    end
    cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end

% toc result:
%   tol = 0.4  ->   12 iterations :: t_acc = [  17.7       9.8]
%   tol = 1e-8 -> 1124 iterations :: t_acc = [1758.6     972.0]
% 
% (all 1124 iterations) with commented-out orig :: t_tot = 931.7443
Run Code Online (Sandbox Code Playgroud)

现在,非常明显的是,循环内大多数计算密集型计算while(例如^(1-eta)/(1-eta))实际上会产生可以预先计算的常数。一旦我们修复了这个问题,结果就会比基于原始parfor版本(在我的 2xE5-2630v3 上)快一点:

function vertorized2()
t_tot = tic();
beta = 0.984; 
eta = 2; 
alpha = 0.35; 
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;

v=zeros(nk,nz); 
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);

%Grid for productivity z

%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;

% Grid for capital k

kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));

% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);

% Compute initial wealth c0(k,z)
for iz=1:nz
  c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end 

dif = 10000;
tol = 0.4; 
tol = 1e-8;
cnt = 1;

t_acc=zeros([1,2]);

%% constants:
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
mask=zeros(size(c1_));
mask(c1_<0)=-Inf;
c1_x = c1_.^(1-eta)/(1-eta);

while dif>tol

    %% orig:
    t=tic();
    parfor ik=1:nk
          for iz = 1:nz
          tmpmax = -intmax;

          for i = 1:nk
             c1 = c0(ik,iz) - kgrid(i);
             if (c1<0) 
                 continue
             end 
             c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
             if tmpmax<c1 
                 tmpmax = c1;
             end
          end 
          v(ik,iz) = tmpmax;
          end 

    end 
    t_acc(1) = t_acc(1) + toc(t);

    %% better:
    t=tic();       
    c2 = c1_x + reshape(ev', [1 nz nk]);
    c2 = c2 + mask;
    v_ = max(c2,[],3);        
    t_acc(2) = t_acc(2) + toc(t);    

    %% compare
    assert(isequal(v_,v));
    v=v_;

    %% other
    ev = beta*v*tran_z;
    dif = max(max(abs(v-v0)));
    v0 = v;
    if mod(cnt,1)==0
        fprintf('%1.5f :  %1.5f \n', cnt, dif);
    end
    cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end

% toc result:
%   tol = 0.4  ->   12 iterations :: t_acc = [  2.4    1.7] 
%   tol = 1e-8 -> 1124 iterations :: t_acc = [188.3  115.9]
% 
% (all 1124 iterations) with commented-out orig :: t_tot = 117.6217
Run Code Online (Sandbox Code Playgroud)

这种矢量化代码仍然效率低下(例如reshape(ev',...),它消耗了约 60% 的时间,可以通过重新排序维度轻松避免),但它在某种程度上适合gpuArray()

function vertorized2()
t_tot = tic();
beta = 0.984; 
eta = 2; 
alpha = 0.35; 
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;

v=zeros(nk,nz); 
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);

%Grid for productivity z

%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;

% Grid for capital k

kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));

% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);

% Compute initial wealth c0(k,z)
for iz=1:nz
  c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end 

dif = 10000;
tol = 0.4; 
tol = 1e-8;
cnt = 1;

t_acc=zeros([1,2]);

%% constants:
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
mask=zeros(size(c1_));
mask(c1_<0)=-Inf;
c1_x = c1_.^(1-eta)/(1-eta);

while dif>tol

    %% orig:
    t=tic();
    parfor ik=1:nk
          for iz = 1:nz
          tmpmax = -intmax;

          for i = 1:nk
             c1 = c0(ik,iz) - kgrid(i);
             if (c1<0) 
                 continue
             end 
             c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
             if tmpmax<c1 
                 tmpmax = c1;
             end
          end 
          v(ik,iz) = tmpmax;
          end 

    end 
    t_acc(1) = t_acc(1) + toc(t);

    %% better:
    t=tic();       
    c2 = c1_x + reshape(ev', [1 nz nk]);
    c2 = c2 + mask;
    v_ = max(c2,[],3);        
    t_acc(2) = t_acc(2) + toc(t);    

    %% compare
    assert(isequal(v_,v));
    v=v_;

    %% other
    ev = beta*v*tran_z;
    dif = max(max(abs(v-v0)));
    v0 = v;
    if mod(cnt,1)==0
        fprintf('%1.5f :  %1.5f \n', cnt, dif);
    end
    cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end

% toc result:
%   tol = 0.4  ->   12 iterations :: t_acc = [  2.4    1.7] 
%   tol = 1e-8 -> 1124 iterations :: t_acc = [188.3  115.9]
% 
% (all 1124 iterations) with commented-out orig :: t_tot = 117.6217
Run Code Online (Sandbox Code Playgroud)

这个约 15 秒的结果比我们从 Matlab Coder 获得的结果(约 2 秒)差约 7 倍。但此选项需要的工具箱较少。在实践中,gpuArray当您从编写“本机 Matlab 代码”开始时,这是最方便的。包括互动使用。

最后,如果您使用 Matlab Coder 构建最终的矢量化版本(您必须进行一些琐碎的调整),它不会比第一个版本更快。速度会慢 2-3 倍。