使用 OMP 加速对称矩阵的计算

nvh*_*h10 0 fortran transpose openmp matrix-multiplication intel-fortran

我的矩阵计算是:C=CA*B

这里 C 是一个对称矩阵,所以我想通过只考虑上三角形然后取相反的 elelement 来加速这个计算。我使用了 OMP,发现我的实现比整个矩阵 C 的正常计算慢。

我还看到 C=C-AxB 的计算比 C=C+AxB 慢。

附上我的程序。请建议我!

    Program testspeed
implicit none
integer nstate,nmeas,i,j,l
integer(kind=8) :: tclock1, tclock2, clock_rate
real(kind=8) :: elapsed_time
double precision, allocatable, dimension(:,:):: B,C,A
nstate =20000
nmeas=10000
allocate (B(nmeas,nstate),C(nstate,nstate),A(nstate,nmeas))
A=1d0
B=1d0
call system_clock(tclock1)
write(*,*) "1"
!$omp parallel do
do j = 1, nstate
    do l = 1,nmeas
        do i = 1, j
            C(j,i) = C(j,i) - A(j,l)*B(l,i)
            C(i,j)=C(j,i)
        end do
    end do
end do
!$omp end parallel do
write(*,*) "2"
call system_clock(tclock2, clock_rate)
elapsed_time = float(tclock2 - tclock1) / float(clock_rate)
write(*,*) elapsed_time
end Program testspeed
Run Code Online (Sandbox Code Playgroud)

Ian*_*ush 5

我教给学生的基本规则之一是,现在没有人应该自己编写密集矩阵乘法 - 并且不应该这样做 30 年以上。您应该改用 BLAS 库。下面我将使用 BLAS 库与您的循环排序和更好的循环排序进行比较,并与matmul我用作参考以检查结果是否正确的 Fortran 内在函数进行比较。BLAS 并且matmul不利用 C 的对称性,但它们仍然是最快的例程 - BLAS 比您编写的循环排序快 200-300 倍。注意我还稍微缩小了矩阵的大小,因为我厌倦了等待原始文件在更大的情况下运行:

ijb@ijb-Latitude-5410:~/work/stack$ cat mm.f90
Program testspeed

  Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64

  Use omp_lib, Only : omp_get_max_threads
  
  Implicit None
  Integer nstate,nmeas,i,j,l
  Integer(li) :: tclock1, tclock2, clock_rate
  Real(wp) :: elapsed_time
  Real( wp ), Allocatable, Dimension(:,:):: B,C,A
  Real( wp ), Allocatable, Dimension(:,:):: C_test
  Real( wp ), Allocatable, Dimension(:,:):: C_start

  Write( *, * ) 'Using ', omp_get_max_threads(), ' threads'
  
!!$  nstate =2000
!!$  nmeas=1000
  nstate = 5000
  nmeas  = 2500
  Allocate (B(nmeas,nstate),C(nstate,nstate),A(nstate,nmeas))
  Allocate( C_test, Mold = C )
  Allocate( C_start, Mold = C )
!!$  A=1.0_wp
!!$  B=1.0_wp
  ! Random numbers are a much better test
  Call Random_number( A )
  B = Transpose( A ) ! make sure result is symmetric
  Call Random_number( C_start )
  ! Make Initial C Symmetric
  C_start = 0.5_wp * ( C_start + Transpose( C_start ) )

  Write( *, * ) 'Matix sizes ', Shape( A ), Shape( B ), Shape( C )
  
  C_test = C_start
  Call system_Clock(tclock1)
  C_test = C_test -  Matmul( A, B )
  Call system_Clock(tclock2, clock_rate)
  elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
  Write( *,'( a, t20, f8.3 )' ) 'Matmul', elapsed_time

  C = C_start
  Call system_Clock(tclock1)
  !$omp parallel do
  Do j = 1, nstate
     Do l = 1,nmeas
        Do i = 1, j
           C(j,i) = C(j,i) - A(j,l)*B(l,i)
           C(i,j)=C(j,i)
        End Do
     End Do
  End Do
  !$omp end parallel do
  Call system_Clock(tclock2, clock_rate)
  elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
  Write(*,'( a, t20, f8.3, t30, "Max error ", g20.12 )' ) &
       'Orig loops', elapsed_time, Maxval( Abs( C_test - C ) )

  C = C_start
  Call system_Clock(tclock1)
  !$omp parallel default( none ) shared ( nstate, nmeas, A, B, C ), private( i, j, l )
  !$omp do 
  Do i = 1, nstate
     Do l = 1,nmeas
        Do j = 1, i
           C(j,i) = C(j,i) - A(j,l)*B(l,i)
        End Do
     End Do
  End Do
  !$omp end do
  !$omp do
  Do i = 1, nstate
     Do j = 1, i
        C( i, j ) = C( j, i )
     End Do
  End Do
  !$omp end do
  !$omp end parallel
  Call system_Clock(tclock2, clock_rate)
  elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
  Write(*,'( a, t20, f8.3, t30, "Max error ", g20.12 )' ) &
       'Sensible loops', elapsed_time, Maxval( Abs( C_test - C ) )

  C = C_start
  Call system_Clock(tclock1)
  Call dgemm( 'N', 'N', nstate, nstate, nmeas, -1.0_wp, A, Size( A, Dim = 1 ), &
                                                        B, Size( B, Dim = 1 ), &
                                               +1.0_wp, C, Size( C, Dim = 1 ) )
  Call system_Clock(tclock2, clock_rate)
  elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
  Write(*,'( a, t20, f8.3, t30, "Max error ", g20.12 )' ) &
       'BLAS ', elapsed_time, Maxval( Abs( C_test - C ) )
  
End Program testspeed
ijb@ijb-Latitude-5410:~/work/stack$ gfortran --version
GNU Fortran (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

ijb@ijb-Latitude-5410:~/work/stack$ gfortran -fopenmp -Wall -Wextra -std=f2018 -O3  mm.f90 -lopenblas
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=1
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
 Using            1  threads
 Matix sizes         5000        2500        2500        5000        5000        5000
Matmul                4.793
Orig loops          421.564  Max error   0.488853402203E-11
Sensible loops       20.742  Max error   0.488853402203E-11
BLAS                  2.185  Max error   0.682121026330E-12
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=2
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
 Using            2  threads
 Matix sizes         5000        2500        2500        5000        5000        5000
Matmul                4.968
Orig loops          324.319  Max error   0.466116034659E-11
Sensible loops       17.656  Max error   0.466116034659E-11
BLAS                  1.161  Max error   0.682121026330E-12
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=3
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
 Using            3  threads
 Matix sizes         5000        2500        2500        5000        5000        5000
Matmul                4.852
Orig loops          243.268  Max error   0.500222085975E-11
Sensible loops       15.802  Max error   0.500222085975E-11
BLAS                  0.852  Max error   0.682121026330E-12
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=4
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
 Using            4  threads
 Matix sizes         5000        2500        2500        5000        5000        5000
Matmul                4.994
Orig loops          189.189  Max error   0.477484718431E-11
Sensible loops       14.245  Max error   0.477484718431E-11
BLAS                  0.707  Max error   0.682121026330E-12
Run Code Online (Sandbox Code Playgroud)

对于 BLAS,我使用了 openblas - 这是免费的。在 Linux 系统上,一个简单的 apt get 或类似的就足够了。

另请注意

  1. 如果您必须编写自己的循环,则您的最内层循环应该尽可能地遍历数组的第一个索引。这是因为 Fortran 将其数组排序为主要列。这就是我在“合理”循环排序中所做的
  2. Real( 8 )不可移植,不能保证编译器支持,也不能保证做你期望的,不应该使用。类似的Integer( 8 )。请查看我为更好的方法所做的工作,该方法应该适用于所有编译器。
  3. Float不是标准的内在 -Real像我一样使用
  4. 如果结果不正确,则基准测试毫无意义,因此您应该始终包含测试结果的方法。这里我使用 Fortran 内在函数matmul来提供参考版本。您的原始代码未初始化 C,因此结果不可信 - 但由于您不检查您获得了 C 的正确值,因此您无法知道这一点。
  5. 我个人!$omp parallel do非常不喜欢,我认为 OpenMP 中有这样的捷径是错误的。而是将它们分成!$omp parallel!$omp do- 了解线程创建和工作共享是不同的事情非常重要,将它们缠绕在一行中并不是学习 OpenMP 的好方法。