phd*_*ent 8 matlab openmp openacc
我一直在使用Fortran中的ACC和OpenMP进行并行化.我现在正在尝试在matlab中做同样的事情.我觉得非常有趣的是,在matlab中使用GPU来对一个循环进行并行化是非常困难的.显然,唯一的方法是使用arrayfun函数.但我可能错了.
从概念上讲,我想知道为什么matlab中的GPU使用不比fortran更直接.在更实际的层面上,我想知道如何在下面的简单代码中使用GPU.
下面,我将分享三个代码和基准:
Fortran OpenMP:
program rbc
use omp_lib ! For timing
use tools
implicit none
real, parameter :: beta = 0.984, eta = 2, alpha = 0.35, delta = 0.01, &
rho = 0.95, sigma = 0.005, zmin=-0.0480384, zmax=0.0480384;
integer, parameter :: nz = 4, nk=4800;
real :: zgrid(nz), kgrid(nk), t_tran_z(nz,nz), tran_z(nz,nz);
real :: kmax, kmin, tol, dif, c(nk), r(nk), w(nk);
real, dimension(nk,nz) :: v=0., v0=0., ev=0., c0=0.;
integer :: i, iz, ik, cnt;
logical :: ind(nk);
real(kind=8) :: start, finish ! For timing
real :: tmpmax, c1
call omp_set_num_threads(12)
!Grid for productivity z
! [1 x 4] grid of values for z
call linspace(zmin,zmax,nz,zgrid)
zgrid = exp(zgrid)
! [4 x 4] Markov transition matrix of z
tran_z(1,1) = 0.996757
tran_z(1,2) = 0.00324265
tran_z(1,3) = 0
tran_z(1,4) = 0
tran_z(2,1) = 0.000385933
tran_z(2,2) = 0.998441
tran_z(2,3) = 0.00117336
tran_z(2,4) = 0
tran_z(3,1) = 0
tran_z(3,2) = 0.00117336
tran_z(3,3) = 0.998441
tran_z(3,4) = 0.000385933
tran_z(4,1) = 0
tran_z(4,2) = 0
tran_z(4,3) = 0.00324265
tran_z(4,4) = 0.996757
! Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)**(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)**(1/(alpha-1));
! [1 x 4800] grid of possible values of k
call linspace(kmin, kmax, nk, kgrid)
! Compute initial wealth c0(k,z)
do iz=1,nz
c0(:,iz) = zgrid(iz)*kgrid**alpha + (1-delta)*kgrid;
end do
dif = 10000
tol = 1e-8
cnt = 1
do while(dif>tol)
!$omp parallel do default(shared) private(ik,iz,i,tmpmax,c1)
do ik=1,nk;
do iz = 1,nz;
tmpmax = -huge(0.)
do i = 1,nk
c1 = c0(ik,iz) - kgrid(i)
if(c1<0) exit
c1 = c1**(1-eta)/(1-eta)+ev(i,iz)
if(tmpmax<c1) tmpmax = c1
end do
v(ik,iz) = tmpmax
end do
end do
!$omp end parallel do
ev = beta*matmul(v,tran_z)
dif = maxval(abs(v-v0))
v0 = v
if(mod(cnt,1)==0) write(*,*) cnt, ':', dif
cnt = cnt+1
end do
end program
Run Code Online (Sandbox Code Playgroud)
Fortran ACC:
只需将上面代码中的mainloop语法替换为:
do while(dif>tol)
!$acc kernels
!$acc loop gang
do ik=1,nk;
!$acc loop gang
do iz = 1,nz;
tmpmax = -huge(0.)
do i = 1,nk
c1 = c0(ik,iz) - kgrid(i)
if(c1<0) exit
c1 = c1**(1-eta)/(1-eta)+ev(i,iz)
if(tmpmax<c1) tmpmax = c1
end do
v(ik,iz) = tmpmax
end do
end do
!$acc end kernels
ev = beta*matmul(v,tran_z)
dif = maxval(abs(v-v0))
v0 = v
if(mod(cnt,1)==0) write(*,*) cnt, ':', dif
cnt = cnt+1
end do
Run Code Online (Sandbox Code Playgroud)
Matlab parfor :( 我知道使用矢量化语法可以使下面的代码更快,但练习的重点是比较循环速度).
tic;
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 1e-8;
cnt = 1;
while dif>tol
parfor ik=1:nk
for iz = 1:nz
tmpmax = -intmax;
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', [cnt dif])
end
cnt = cnt+1;
end
toc
Run Code Online (Sandbox Code Playgroud)
Matlab CUDA:
这就是我不知道如何编码的原因.使用arrayfun这种方法的唯一方法是什么?在Fortran中,从OpenMP迁移到OpenACC非常简单.在Matlab中从parfor到GPU循环是否有一种简单的方法?
代码之间的时间比较:
Fortran OpenMP: 83.1 seconds
Fortran ACC: 2.4 seconds
Matlab parfor: 1182 seconds
Run Code Online (Sandbox Code Playgroud)
最后一点,我应该说上面的代码解决了一个简单的真实商业周期模型,并在此基础上编写.
首先,正如Dev-iL 已经提到的,您可以使用 GPU 编码器。
它(我使用 R2019a)只需要对代码进行少量更改:
function cdapted()
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 1e-8;
cnt = 1;
while dif>tol
for ik=1:nk
for iz = 1:nz
tmpmax = double(intmin);
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
% I've commented out fprintf because double2single cannot handle it
% (could be manually uncommented in the converted version if needed)
% ------------
% if mod(cnt,1)==0
% fprintf('%1.5f : %1.5f \n', cnt, dif);
% end
cnt = cnt+1;
end
end
Run Code Online (Sandbox Code Playgroud)
构建这个的脚本是:
% unload mex files
clear mex
%% Build for gpu, float64
% Produces ".\codegen\mex\cdapted" folder and "cdapted_mex.mexw64"
cfg = coder.gpuConfig('mex');
codegen -config cfg cdapted
% benchmark it (~7.14s on my GTX1080Ti)
timeit(@() cdapted_mex,0)
%% Build for gpu, float32:
% Produces ".\codegen\cdapted\single" folder
scfg = coder.config('single');
codegen -double2single scfg cdapted
% Produces ".\codegen\mex\cdapted_single" folder and "cdapted_single_mex.mexw64"
cfg = coder.gpuConfig('mex');
codegen -config cfg .\codegen\cdapted\single\cdapted_single.m
% benchmark it (~2.09s on my GTX1080Ti)
timeit(@() cdapted_single_mex,0)
Run Code Online (Sandbox Code Playgroud)
因此,如果您的 Fortran 二进制文件使用 float32 精度(我怀疑是这样),则此 Matlab Coder 结果与之相当。但这并不意味着两者都非常高效。Matlab Coder 生成的代码还远未达到高效的程度。而且它没有充分利用 GPU(即使 TDP 约为 50%)。
接下来,我同意user10597469和Nicky Mattsson 的观点,即您的 Matlab 代码看起来不像正常的“本机”矢量化 Matlab 代码。
有很多事情需要调整。(但arrayfun几乎不比for)。首先,让我们删除for循环:
function vertorized1()
t_tot = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 0.4;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
while dif>tol
%% orig-noparfor:
t=tic();
for ik=1:nk
for iz = 1:nz
tmpmax = -intmax;
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
t_acc(1) = t_acc(1) + toc(t);
%% better:
t=tic();
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
c1_x = c1_.^(1-eta)/(1-eta);
c2 = c1_x + reshape(ev', [1 nz nk]);
c2(c1_<0) = -Inf;
v_ = max(c2,[],3);
t_acc(2) = t_acc(2) + toc(t);
%% compare
assert(isequal(v_,v));
v=v_;
%% other
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', cnt, dif);
end
cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end
% toc result:
% tol = 0.4 -> 12 iterations :: t_acc = [ 17.7 9.8]
% tol = 1e-8 -> 1124 iterations :: t_acc = [1758.6 972.0]
%
% (all 1124 iterations) with commented-out orig :: t_tot = 931.7443
Run Code Online (Sandbox Code Playgroud)
现在,非常明显的是,循环内大多数计算密集型计算while(例如^(1-eta)/(1-eta))实际上会产生可以预先计算的常数。一旦我们修复了这个问题,结果就会比基于原始parfor版本(在我的 2xE5-2630v3 上)快一点:
function vertorized2()
t_tot = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 0.4;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
%% constants:
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
mask=zeros(size(c1_));
mask(c1_<0)=-Inf;
c1_x = c1_.^(1-eta)/(1-eta);
while dif>tol
%% orig:
t=tic();
parfor ik=1:nk
for iz = 1:nz
tmpmax = -intmax;
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
t_acc(1) = t_acc(1) + toc(t);
%% better:
t=tic();
c2 = c1_x + reshape(ev', [1 nz nk]);
c2 = c2 + mask;
v_ = max(c2,[],3);
t_acc(2) = t_acc(2) + toc(t);
%% compare
assert(isequal(v_,v));
v=v_;
%% other
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', cnt, dif);
end
cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end
% toc result:
% tol = 0.4 -> 12 iterations :: t_acc = [ 2.4 1.7]
% tol = 1e-8 -> 1124 iterations :: t_acc = [188.3 115.9]
%
% (all 1124 iterations) with commented-out orig :: t_tot = 117.6217
Run Code Online (Sandbox Code Playgroud)
这种矢量化代码仍然效率低下(例如reshape(ev',...),它消耗了约 60% 的时间,可以通过重新排序维度轻松避免),但它在某种程度上适合gpuArray():
function vertorized2()
t_tot = tic();
beta = 0.984;
eta = 2;
alpha = 0.35;
delta = 0.01;
rho = 0.95;
sigma = 0.005;
zmin=-0.0480384;
zmax=0.0480384;
nz = 4;
nk=4800;
v=zeros(nk,nz);
v0=zeros(nk,nz);
ev=zeros(nk,nz);
c0=zeros(nk,nz);
%Grid for productivity z
%[1 x 4] grid of values for z
zgrid = linspace(zmin,zmax,nz);
zgrid = exp(zgrid);
% [4 x 4] Markov transition matrix of z
tran_z = zeros([4,4]);
tran_z(1,1) = 0.996757;
tran_z(1,2) = 0.00324265;
tran_z(1,3) = 0;
tran_z(1,4) = 0;
tran_z(2,1) = 0.000385933;
tran_z(2,2) = 0.998441;
tran_z(2,3) = 0.00117336;
tran_z(2,4) = 0;
tran_z(3,1) = 0;
tran_z(3,2) = 0.00117336;
tran_z(3,3) = 0.998441;
tran_z(3,4) = 0.000385933;
tran_z(4,1) = 0;
tran_z(4,2) = 0;
tran_z(4,3) = 0.00324265;
tran_z(4,4) = 0.996757;
% Grid for capital k
kmin = 0.95*(1/(alpha*zgrid(1)))*((1/beta)-1+delta)^(1/(alpha-1));
kmax = 1.05*(1/(alpha*zgrid(nz)))*((1/beta)-1+delta)^(1/(alpha-1));
% [1 x 4800] grid of possible values of k
kgrid = linspace(kmin, kmax, nk);
% Compute initial wealth c0(k,z)
for iz=1:nz
c0(:,iz) = zgrid(iz)*kgrid.^alpha + (1-delta)*kgrid;
end
dif = 10000;
tol = 0.4;
tol = 1e-8;
cnt = 1;
t_acc=zeros([1,2]);
%% constants:
kgrid_ = reshape(kgrid,[1 1 numel(kgrid)]);
c1_ = c0 - kgrid_;
mask=zeros(size(c1_));
mask(c1_<0)=-Inf;
c1_x = c1_.^(1-eta)/(1-eta);
while dif>tol
%% orig:
t=tic();
parfor ik=1:nk
for iz = 1:nz
tmpmax = -intmax;
for i = 1:nk
c1 = c0(ik,iz) - kgrid(i);
if (c1<0)
continue
end
c1 = c1^(1-eta)/(1-eta)+ev(i,iz);
if tmpmax<c1
tmpmax = c1;
end
end
v(ik,iz) = tmpmax;
end
end
t_acc(1) = t_acc(1) + toc(t);
%% better:
t=tic();
c2 = c1_x + reshape(ev', [1 nz nk]);
c2 = c2 + mask;
v_ = max(c2,[],3);
t_acc(2) = t_acc(2) + toc(t);
%% compare
assert(isequal(v_,v));
v=v_;
%% other
ev = beta*v*tran_z;
dif = max(max(abs(v-v0)));
v0 = v;
if mod(cnt,1)==0
fprintf('%1.5f : %1.5f \n', cnt, dif);
end
cnt = cnt+1;
end
disp(t_acc);
disp(toc(t_tot));
end
% toc result:
% tol = 0.4 -> 12 iterations :: t_acc = [ 2.4 1.7]
% tol = 1e-8 -> 1124 iterations :: t_acc = [188.3 115.9]
%
% (all 1124 iterations) with commented-out orig :: t_tot = 117.6217
Run Code Online (Sandbox Code Playgroud)
这个约 15 秒的结果比我们从 Matlab Coder 获得的结果(约 2 秒)差约 7 倍。但此选项需要的工具箱较少。在实践中,gpuArray当您从编写“本机 Matlab 代码”开始时,这是最方便的。包括互动使用。
最后,如果您使用 Matlab Coder 构建最终的矢量化版本(您必须进行一些琐碎的调整),它不会比第一个版本更快。速度会慢 2-3 倍。