Matlab 与 Julia 与 Fortran 中的速度

cgx*_*cgx 10 matlab fortran julia

我正在使用不同的语言来解决一个简单的值函数迭代问题,在该问题中我循环遍历状态空间网格。我试图了解性能差异以及如何调整每个代码。为了后代,我在下面发布了每种语言的完整工作示例。但是,我相信大部分调整都是在while循环中完成的。我有点困惑我在 Fortran 中做错了什么,因为速度似乎低于标准。

Matlab ~2.7secs :暂时避免使用该repmat函数的更有效的解决方案,以保持代码的可比性。代码似乎自动多线程到 4 个线程

beta = 0.98;
sigma = 0.5;
R = 1/beta;

a_grid = linspace(0,100,1001);
tic
[V_mat, next_mat] = valfun(beta, sigma, R ,a_grid);
toc
Run Code Online (Sandbox Code Playgroud)

凡 valfun()

function [V_mat, next_mat] = valfun(beta, sigma, R, a_grid)
    zeta = 1-1/sigma;
    len = length(a_grid);
    V_mat = zeros(2,len);
    next_mat = zeros(2,len);

    u = zeros(2,len,len);
    c = zeros(2,len,len);

    for i = 1:len
        c(1,:,i) = a_grid(i) - a_grid/R + 20.0;
        c(2,:,i) = a_grid(i) - a_grid/R;
    end

    u = c.^zeta * zeta^(-1);
    u(c<=0) = -1e8;

    tol = 1e-4;
    outeriter = 0;
    diff = 1000.0;

     while (diff>tol)  %&& (outeriter<20000)
        outeriter = outeriter + 1;
        V_last = V_mat;

        for i = 1:len 
            [V_mat(1,i), next_mat(1,i)] = max( u(1,:,i) + beta*V_last(2,:));
            [V_mat(2,i), next_mat(2,i)] = max( u(2,:,i) + beta*V_last(1,:));
        end
        diff = max(abs(V_mat - V_last));
    end

    fprintf("\n Value Function converged in %i steps. \n", outeriter)  
end
Run Code Online (Sandbox Code Playgroud)

Julia(编译后)~5.4 秒(4 线程(9425469 分配:22.43 GiB)),~7.8 秒(1 线程(2912564 分配:22.29 GiB))

[编辑:添加正确的广播后,@views 现在只有 1.8-2.1 秒,见下文!]

using LinearAlgebra, UnPack, BenchmarkTools

struct paramsnew
    ?::Float64
    ?::Float64
    R::Float64
end

function valfun(params, a_grid)
    @unpack ?,?, R = params
    ? = 1-1/?

    len = length(a_grid)
    V_mat = zeros(2,len)
    next_mat = zeros(2,len)
    u = zeros(2,len,len)
    c = zeros(2,len,len)

    @inbounds for i in 1:len
        c[1,:,i] = @. a_grid[i] - a_grid/R .+ 20.0
        c[2,:,i] = @. a_grid[i] - a_grid/R
    end

    u = c.^? * ?^(-1)
    u[c.<=0] .= typemin(Float64)

    tol = 1e-4
    outeriter = 0
    test = 1000.0

    while test>tol 
        outeriter += 1
        V_last = deepcopy(V_mat)

        @inbounds Threads.@threads for i in 1:len # loop over grid points
            V_mat[1,i], next_mat[1,i] = findmax( u[1,:,i] .+ ?*V_last[2,:])
            V_mat[2,i], next_mat[2,i] = findmax( u[2,:,i] .+ ?*V_last[1,:])
        end

        test = maximum( abs.(V_mat - V_last)[.!isnan.( V_mat - V_last )])
    end

    print("\n Value Function converged in ", outeriter, " steps.")
    return V_mat, next_mat
end

a_grid = collect(0:0.1:100)
p1 = paramsnew(0.98, 1/2, 1/0.98);

@time valfun(p1,a_grid)
print("\n should be compiled now \n")

@btime valfun(p1,a_grid)
Run Code Online (Sandbox Code Playgroud)

Fortran (O3, mkl, qopenmp) ~9.2secs:我在声明openmp变量时也一定做错了什么,因为在使用时编译会因某些网格大小而崩溃openmp(SIGSEGV 错误)。

module mod_calc
    use omp_lib
    implicit none
    integer, parameter :: dp = selected_real_kind(33,4931), len = 1001
    public :: dp, len

    contains

    subroutine linspace(from, to, array)
        real(dp), intent(in) :: from, to
        real(dp), intent(out) :: array(:)
        real(dp) :: range
        integer :: n, i
        n = size(array)
        range = to - from

        if (n == 0) return

        if (n == 1) then
            array(1) = from
            return
        end if

        do i=1, n
            array(i) = from + range * (i - 1) / (n - 1)
        end do
    end subroutine

    subroutine calc_val() 
        real(dp)::  bbeta, sigma, R, zeta, tol, test
        real(dp)::  a_grid(len), V_mat(2,len), V_last(2,len), &
                    u(len,len,2), c(len,len,2)
        
        integer :: outeriter, i, sss, next_mat(2,len), fu
        character(len=*), parameter :: FILE_NAME = 'data.txt'   ! File name.

        call linspace(from=0._dp, to=100._dp, array=a_grid)

        bbeta = 0.98
        sigma = 0.5
        R = 1.0/0.98
        zeta = 1.0 - 1.0/sigma

        tol = 1e-4
        test = 1000.0
        outeriter = 0

        do i = 1,len
            c(:,i,1) = a_grid(i) - a_grid/R + 20.0
            c(:,i,2) = a_grid(i) - a_grid/R
        end do

        u = c**zeta * 1.0/zeta

        where (c<=0)
            u = -1e6
        end where

        V_mat = 0.0
        next_mat = 0.0

        do while (test>tol .and. outeriter<20000)
            outeriter = outeriter+1
            V_last = V_mat

            !$OMP PARALLEL DEFAULT(NONE)          &
            !$OMP SHARED(V_mat, next_mat,V_last, u, bbeta)  &
            !$OMP PRIVATE(i)                      
            !$OMP DO SCHEDULE(static)
            do i=1,len 
                V_mat(1,i)    = maxval(u(:,i,1) + bbeta*V_last(2,:))
                next_mat(1,i) = maxloc(u(:,i,1) + bbeta*V_last(2,:),1)
                V_mat(2,i)    = maxval(u(:,i,2) + bbeta*V_last(1,:))
                next_mat(2,i) = maxloc(u(:,i,2) + bbeta*V_last(1,:),1)
            end do
            !$OMP END DO
            !$OMP END PARALLEL

            test = maxval(abs(log(V_last/V_mat)))
        end do
    end subroutine
end module mod_calc

program main
    use mod_calc

    implicit none
    integer:: clck_counts_beg,clck_rate,clck_counts_end
    
    call omp_set_num_threads(4)

    call system_clock ( clck_counts_beg, clck_rate )
    call calc_val()
    call system_clock ( clck_counts_end, clck_rate )
    write (*, '("Time = ",f6.3," seconds.")')  (clck_counts_end - clck_counts_beg) / real(clck_rate)
end program main
Run Code Online (Sandbox Code Playgroud)

应该有办法减少分配量(Julia 报告 32-45% 的 gc 时间!)但现在我太新手看不到它们,所以欢迎任何评论和提示。

编辑:

@viewswhile 循环添加和正确广播大大提高了 Julia 的速度(我猜是预期的),因此现在击败了 Matlab 循环。有了 4 个线程,代码现在只需要 1.97 秒。具体来说,

    @inbounds for i in 1:len
        c[1,:,i] = @views @. a_grid[i] - a_grid/R .+ 20.0
        c[2,:,i] = @views @. a_grid[i] - a_grid/R
    end

    u = @. c^? * ?^(-1)
    @. u[c<=0] = typemin(Float64)

    while test>tol && outeriter<20000
        outeriter += 1
        V_last = deepcopy(V_mat)

        @inbounds Threads.@threads for i in 1:len # loop over grid points
            V_mat[1,i], next_mat[1,i] = @views findmax( @. u[1,:,i] + ?*V_last[2,:])
            V_mat[2,i], next_mat[2,i] = @views findmax( @. u[2,:,i] + ?*V_last[1,:])
        end
        test = @views maximum( @. abs(V_mat - V_last)[!isnan( V_mat - V_last )])
    end
Run Code Online (Sandbox Code Playgroud)

Ian*_*ush 11

fortran 如此缓慢的原因是它使用四倍精度 - 我不知道 Julia 或 Matlab,但看起来好像在这种情况下使用了双精度。此外,如评论中所述,某些循环顺序对于 Fortran 是不正确的,而且您在 Fortran 代码中使用精度的方式不一致,您的大多数常量都是单精度的。纠正所有这些导致以下结果:

  • 原文:test = 9.83440674663232047922921588613472439E-0005 时间 = 31.413 秒。
  • 优化:测试 = 9.8343643237979391E-005 时间 = 0.912 秒。

注意我已经关闭了这些并行化,所有结果都是单线程的。代码如下:

module mod_calc
!!$    use omp_lib
    implicit none
!!$    integer, parameter :: dp = selected_real_kind(33,4931), len = 1001
    integer, parameter :: dp = selected_real_kind(15), len = 1001
    public :: dp, len

    contains

    subroutine linspace(from, to, array)
        real(dp), intent(in) :: from, to
        real(dp), intent(out) :: array(:)
        real(dp) :: range
        integer :: n, i
        n = size(array)
        range = to - from

        if (n == 0) return

        if (n == 1) then
            array(1) = from
            return
        end if

        do i=1, n
            array(i) = from + range * (i - 1) / (n - 1)
        end do
    end subroutine

    subroutine calc_val() 
        real(dp)::  bbeta, sigma, R, zeta, tol, test
        real(dp)::  a_grid(len), V_mat(len,2), V_last(len,2), &
                    u(len,len,2), c(len,len,2)
        
        integer :: outeriter, i, sss, next_mat(2,len), fu
        character(len=*), parameter :: FILE_NAME = 'data.txt'   ! File name.

        call linspace(from=0._dp, to=100._dp, array=a_grid)

        bbeta = 0.98_dp
        sigma = 0.5_dp
        R = 1.0_dp/0.98_dp
        zeta = 1.0_dp - 1.0_dp/sigma

        tol = 1e-4_dp
        test = 1000.0_dp
        outeriter = 0

        do i = 1,len
            c(:,i,1) = a_grid(i) - a_grid/R + 20.0_dp
            c(:,i,2) = a_grid(i) - a_grid/R
        end do

        u = c**zeta * 1.0_dp/zeta

        where (c<=0)
            u = -1e6_dp
        end where

        V_mat = 0.0_dp
        next_mat = 0.0_dp

        do while (test>tol .and. outeriter<20000)
            outeriter = outeriter+1
            V_last = V_mat

            !$OMP PARALLEL DEFAULT(NONE)          &
            !$OMP SHARED(V_mat, next_mat,V_last, u, bbeta)  &
            !$OMP PRIVATE(i)                      
            !$OMP DO SCHEDULE(static)
            do i=1,len 
                V_mat(i,1)    = maxval(u(:,i,1) + bbeta*V_last(:, 2))
                next_mat(i,1) = maxloc(u(:,i,1) + bbeta*V_last(:, 2),1)
                V_mat(i,2)    = maxval(u(:,i,2) + bbeta*V_last(:, 1))
                next_mat(i,2) = maxloc(u(:,i,2) + bbeta*V_last(:, 1),1)
            end do
            !$OMP END DO
            !$OMP END PARALLEL

            test = maxval(abs(log(V_last/V_mat)))
         end do
         Write( *, * ) test
    end subroutine
end module mod_calc

program main
    use mod_calc

    implicit none
    integer:: clck_counts_beg,clck_rate,clck_counts_end
    
!!$    call omp_set_num_threads(2)

    call system_clock ( clck_counts_beg, clck_rate )
    call calc_val()
    call system_clock ( clck_counts_end, clck_rate )
    write (*, '("Time = ",f6.3," seconds.")')  (clck_counts_end - clck_counts_beg) / real(clck_rate)
end program main
Run Code Online (Sandbox Code Playgroud)

编译/链接:

ian@eris:~/work/stack$ gfortran --version
GNU Fortran (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

ian@eris:~/work/stack$ gfortran -Wall -Wextra -O3   jul.f90 
jul.f90:36:48:

         character(len=*), parameter :: FILE_NAME = 'data.txt'   ! File name.
                                                1
Warning: Unused parameter ‘file_name’ declared at (1) [-Wunused-parameter]
jul.f90:35:57:

         integer :: outeriter, i, sss, next_mat(2,len), fu
                                                         1
Warning: Unused variable ‘fu’ declared at (1) [-Wunused-variable]
jul.f90:35:36:

         integer :: outeriter, i, sss, next_mat(2,len), fu
                                    1
Warning: Unused variable ‘sss’ declared at (1) [-Wunused-variable]
Run Code Online (Sandbox Code Playgroud)

跑步:

ian@eris:~/work/stack$ ./a.out
   9.8343643237979391E-005
Time =  0.908 seconds.
Run Code Online (Sandbox Code Playgroud)