cgx*_*cgx 10 matlab fortran julia
我正在使用不同的语言来解决一个简单的值函数迭代问题,在该问题中我循环遍历状态空间网格。我试图了解性能差异以及如何调整每个代码。为了后代,我在下面发布了每种语言的完整工作示例。但是,我相信大部分调整都是在while循环中完成的。我有点困惑我在 Fortran 中做错了什么,因为速度似乎低于标准。
Matlab ~2.7secs :我暂时避免使用该repmat函数的更有效的解决方案,以保持代码的可比性。代码似乎自动多线程到 4 个线程
beta = 0.98;
sigma = 0.5;
R = 1/beta;
a_grid = linspace(0,100,1001);
tic
[V_mat, next_mat] = valfun(beta, sigma, R ,a_grid);
toc
Run Code Online (Sandbox Code Playgroud)
凡 valfun()
function [V_mat, next_mat] = valfun(beta, sigma, R, a_grid)
zeta = 1-1/sigma;
len = length(a_grid);
V_mat = zeros(2,len);
next_mat = zeros(2,len);
u = zeros(2,len,len);
c = zeros(2,len,len);
for i = 1:len
c(1,:,i) = a_grid(i) - a_grid/R + 20.0;
c(2,:,i) = a_grid(i) - a_grid/R;
end
u = c.^zeta * zeta^(-1);
u(c<=0) = -1e8;
tol = 1e-4;
outeriter = 0;
diff = 1000.0;
while (diff>tol) %&& (outeriter<20000)
outeriter = outeriter + 1;
V_last = V_mat;
for i = 1:len
[V_mat(1,i), next_mat(1,i)] = max( u(1,:,i) + beta*V_last(2,:));
[V_mat(2,i), next_mat(2,i)] = max( u(2,:,i) + beta*V_last(1,:));
end
diff = max(abs(V_mat - V_last));
end
fprintf("\n Value Function converged in %i steps. \n", outeriter)
end
Run Code Online (Sandbox Code Playgroud)
Julia(编译后)~5.4 秒(4 线程(9425469 分配:22.43 GiB)),~7.8 秒(1 线程(2912564 分配:22.29 GiB))
[编辑:添加正确的广播后,@views 现在只有 1.8-2.1 秒,见下文!]
using LinearAlgebra, UnPack, BenchmarkTools
struct paramsnew
?::Float64
?::Float64
R::Float64
end
function valfun(params, a_grid)
@unpack ?,?, R = params
? = 1-1/?
len = length(a_grid)
V_mat = zeros(2,len)
next_mat = zeros(2,len)
u = zeros(2,len,len)
c = zeros(2,len,len)
@inbounds for i in 1:len
c[1,:,i] = @. a_grid[i] - a_grid/R .+ 20.0
c[2,:,i] = @. a_grid[i] - a_grid/R
end
u = c.^? * ?^(-1)
u[c.<=0] .= typemin(Float64)
tol = 1e-4
outeriter = 0
test = 1000.0
while test>tol
outeriter += 1
V_last = deepcopy(V_mat)
@inbounds Threads.@threads for i in 1:len # loop over grid points
V_mat[1,i], next_mat[1,i] = findmax( u[1,:,i] .+ ?*V_last[2,:])
V_mat[2,i], next_mat[2,i] = findmax( u[2,:,i] .+ ?*V_last[1,:])
end
test = maximum( abs.(V_mat - V_last)[.!isnan.( V_mat - V_last )])
end
print("\n Value Function converged in ", outeriter, " steps.")
return V_mat, next_mat
end
a_grid = collect(0:0.1:100)
p1 = paramsnew(0.98, 1/2, 1/0.98);
@time valfun(p1,a_grid)
print("\n should be compiled now \n")
@btime valfun(p1,a_grid)
Run Code Online (Sandbox Code Playgroud)
Fortran (O3, mkl, qopenmp) ~9.2secs:我在声明openmp变量时也一定做错了什么,因为在使用时编译会因某些网格大小而崩溃openmp(SIGSEGV 错误)。
module mod_calc
use omp_lib
implicit none
integer, parameter :: dp = selected_real_kind(33,4931), len = 1001
public :: dp, len
contains
subroutine linspace(from, to, array)
real(dp), intent(in) :: from, to
real(dp), intent(out) :: array(:)
real(dp) :: range
integer :: n, i
n = size(array)
range = to - from
if (n == 0) return
if (n == 1) then
array(1) = from
return
end if
do i=1, n
array(i) = from + range * (i - 1) / (n - 1)
end do
end subroutine
subroutine calc_val()
real(dp):: bbeta, sigma, R, zeta, tol, test
real(dp):: a_grid(len), V_mat(2,len), V_last(2,len), &
u(len,len,2), c(len,len,2)
integer :: outeriter, i, sss, next_mat(2,len), fu
character(len=*), parameter :: FILE_NAME = 'data.txt' ! File name.
call linspace(from=0._dp, to=100._dp, array=a_grid)
bbeta = 0.98
sigma = 0.5
R = 1.0/0.98
zeta = 1.0 - 1.0/sigma
tol = 1e-4
test = 1000.0
outeriter = 0
do i = 1,len
c(:,i,1) = a_grid(i) - a_grid/R + 20.0
c(:,i,2) = a_grid(i) - a_grid/R
end do
u = c**zeta * 1.0/zeta
where (c<=0)
u = -1e6
end where
V_mat = 0.0
next_mat = 0.0
do while (test>tol .and. outeriter<20000)
outeriter = outeriter+1
V_last = V_mat
!$OMP PARALLEL DEFAULT(NONE) &
!$OMP SHARED(V_mat, next_mat,V_last, u, bbeta) &
!$OMP PRIVATE(i)
!$OMP DO SCHEDULE(static)
do i=1,len
V_mat(1,i) = maxval(u(:,i,1) + bbeta*V_last(2,:))
next_mat(1,i) = maxloc(u(:,i,1) + bbeta*V_last(2,:),1)
V_mat(2,i) = maxval(u(:,i,2) + bbeta*V_last(1,:))
next_mat(2,i) = maxloc(u(:,i,2) + bbeta*V_last(1,:),1)
end do
!$OMP END DO
!$OMP END PARALLEL
test = maxval(abs(log(V_last/V_mat)))
end do
end subroutine
end module mod_calc
program main
use mod_calc
implicit none
integer:: clck_counts_beg,clck_rate,clck_counts_end
call omp_set_num_threads(4)
call system_clock ( clck_counts_beg, clck_rate )
call calc_val()
call system_clock ( clck_counts_end, clck_rate )
write (*, '("Time = ",f6.3," seconds.")') (clck_counts_end - clck_counts_beg) / real(clck_rate)
end program main
Run Code Online (Sandbox Code Playgroud)
应该有办法减少分配量(Julia 报告 32-45% 的 gc 时间!)但现在我太新手看不到它们,所以欢迎任何评论和提示。
向@viewswhile 循环添加和正确广播大大提高了 Julia 的速度(我猜是预期的),因此现在击败了 Matlab 循环。有了 4 个线程,代码现在只需要 1.97 秒。具体来说,
@inbounds for i in 1:len
c[1,:,i] = @views @. a_grid[i] - a_grid/R .+ 20.0
c[2,:,i] = @views @. a_grid[i] - a_grid/R
end
u = @. c^? * ?^(-1)
@. u[c<=0] = typemin(Float64)
while test>tol && outeriter<20000
outeriter += 1
V_last = deepcopy(V_mat)
@inbounds Threads.@threads for i in 1:len # loop over grid points
V_mat[1,i], next_mat[1,i] = @views findmax( @. u[1,:,i] + ?*V_last[2,:])
V_mat[2,i], next_mat[2,i] = @views findmax( @. u[2,:,i] + ?*V_last[1,:])
end
test = @views maximum( @. abs(V_mat - V_last)[!isnan( V_mat - V_last )])
end
Run Code Online (Sandbox Code Playgroud)
Ian*_*ush 11
fortran 如此缓慢的原因是它使用四倍精度 - 我不知道 Julia 或 Matlab,但看起来好像在这种情况下使用了双精度。此外,如评论中所述,某些循环顺序对于 Fortran 是不正确的,而且您在 Fortran 代码中使用精度的方式不一致,您的大多数常量都是单精度的。纠正所有这些导致以下结果:
注意我已经关闭了这些并行化,所有结果都是单线程的。代码如下:
module mod_calc
!!$ use omp_lib
implicit none
!!$ integer, parameter :: dp = selected_real_kind(33,4931), len = 1001
integer, parameter :: dp = selected_real_kind(15), len = 1001
public :: dp, len
contains
subroutine linspace(from, to, array)
real(dp), intent(in) :: from, to
real(dp), intent(out) :: array(:)
real(dp) :: range
integer :: n, i
n = size(array)
range = to - from
if (n == 0) return
if (n == 1) then
array(1) = from
return
end if
do i=1, n
array(i) = from + range * (i - 1) / (n - 1)
end do
end subroutine
subroutine calc_val()
real(dp):: bbeta, sigma, R, zeta, tol, test
real(dp):: a_grid(len), V_mat(len,2), V_last(len,2), &
u(len,len,2), c(len,len,2)
integer :: outeriter, i, sss, next_mat(2,len), fu
character(len=*), parameter :: FILE_NAME = 'data.txt' ! File name.
call linspace(from=0._dp, to=100._dp, array=a_grid)
bbeta = 0.98_dp
sigma = 0.5_dp
R = 1.0_dp/0.98_dp
zeta = 1.0_dp - 1.0_dp/sigma
tol = 1e-4_dp
test = 1000.0_dp
outeriter = 0
do i = 1,len
c(:,i,1) = a_grid(i) - a_grid/R + 20.0_dp
c(:,i,2) = a_grid(i) - a_grid/R
end do
u = c**zeta * 1.0_dp/zeta
where (c<=0)
u = -1e6_dp
end where
V_mat = 0.0_dp
next_mat = 0.0_dp
do while (test>tol .and. outeriter<20000)
outeriter = outeriter+1
V_last = V_mat
!$OMP PARALLEL DEFAULT(NONE) &
!$OMP SHARED(V_mat, next_mat,V_last, u, bbeta) &
!$OMP PRIVATE(i)
!$OMP DO SCHEDULE(static)
do i=1,len
V_mat(i,1) = maxval(u(:,i,1) + bbeta*V_last(:, 2))
next_mat(i,1) = maxloc(u(:,i,1) + bbeta*V_last(:, 2),1)
V_mat(i,2) = maxval(u(:,i,2) + bbeta*V_last(:, 1))
next_mat(i,2) = maxloc(u(:,i,2) + bbeta*V_last(:, 1),1)
end do
!$OMP END DO
!$OMP END PARALLEL
test = maxval(abs(log(V_last/V_mat)))
end do
Write( *, * ) test
end subroutine
end module mod_calc
program main
use mod_calc
implicit none
integer:: clck_counts_beg,clck_rate,clck_counts_end
!!$ call omp_set_num_threads(2)
call system_clock ( clck_counts_beg, clck_rate )
call calc_val()
call system_clock ( clck_counts_end, clck_rate )
write (*, '("Time = ",f6.3," seconds.")') (clck_counts_end - clck_counts_beg) / real(clck_rate)
end program main
Run Code Online (Sandbox Code Playgroud)
编译/链接:
ian@eris:~/work/stack$ gfortran --version
GNU Fortran (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
ian@eris:~/work/stack$ gfortran -Wall -Wextra -O3 jul.f90
jul.f90:36:48:
character(len=*), parameter :: FILE_NAME = 'data.txt' ! File name.
1
Warning: Unused parameter ‘file_name’ declared at (1) [-Wunused-parameter]
jul.f90:35:57:
integer :: outeriter, i, sss, next_mat(2,len), fu
1
Warning: Unused variable ‘fu’ declared at (1) [-Wunused-variable]
jul.f90:35:36:
integer :: outeriter, i, sss, next_mat(2,len), fu
1
Warning: Unused variable ‘sss’ declared at (1) [-Wunused-variable]
Run Code Online (Sandbox Code Playgroud)
跑步:
ian@eris:~/work/stack$ ./a.out
9.8343643237979391E-005
Time = 0.908 seconds.
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
1380 次 |
| 最近记录: |