Mic*_*ael 9 python numpy cython f2py numba
我将Python加速器(Numba,Cython,f2py)与简单的For循环和Numpy的einsum进行比较以解决特定问题(见下文).到目前为止,Numpy是这个问题最快的因素(速度提高了6倍),但是如果我应该尝试额外的优化,或者我做错了什么,我想要一些反馈.这个简单的代码基于一个更大的代码,它有许多这些einsum调用,但没有明确的for循环.我正在检查这些加速器是否可以做得更好.
在Mac OS X Yosemite上使用Python 2.7.9完成计时,安装了gcc-5.3.0(--with-fortran --without-multilib)来自Homebrew.也做了%timeit电话; 这些单一的通话时间相当准确.
In [1]: %run -i test_numba.py
test_numpy: 0.0805640220642
Matches Numpy output: True
test_dumb: 1.43043899536
Matches Numpy output: True
test_numba: 0.464295864105
Matches Numpy output: True
test_cython: 0.627640008926
Matches Numpy output: True
test_f2py: 5.01890516281
Matches Numpy output: True
test_f2py_order: 2.31424307823
Matches Numpy output: True
test_f2py_reorder: 0.507861852646
Matches Numpy output: True
Run Code Online (Sandbox Code Playgroud)
主要代码:
import numpy as np
import numba
import time
import test_f2py as tf2py
import pyximport
pyximport.install(setup_args={'include_dirs':np.get_include()})
import test_cython as tcyth
def test_dumb(f,b):
fnew = np.empty((f.shape[1],f.shape[2]))
for i in range(f.shape[0]):
for l in range(f.shape[3]):
fnew += f[i,:,:,l] * b[i,l]
return fnew
def test_dumber(f,b):
fnew = np.empty((f.shape[1],f.shape[2]))
for i in range(f.shape[0]):
for j in range(f.shape[1]):
for k in range(f.shape[2]):
for l in range(f.shape[3]):
fnew[j,k] += f[i,j,k,l] * b[i,l]
return fnew
@numba.jit(nopython=True)
def test_numba(f,b):
fnew = np.zeros((f.shape[1],f.shape[2])) #NOTE: can't be empty, gives errors
for i in range(f.shape[0]):
for j in range(f.shape[1]):
for k in range(f.shape[2]):
for l in range(f.shape[3]):
fnew[j,k] += f[i,j,k,l] * b[i,l]
return fnew
def test_numpy(f,b):
return np.einsum('i...k,ik->...',f,b)
def test_f2py(f,b):
return tf2py.test_f2py(f,b)
def test_f2py_order(f,b):
return tf2py.test_f2py(f,b)
def test_f2py_reorder(f,b):
return tf2py.test_f2py_reorder(f,b)
def test_cython(f,b):
return tcyth.test_cython(f,b)
if __name__ == '__main__':
#goal is to create: fnew = sum f*b over dim 0 and 3.
f = np.random.rand(32,33,2000,64)
b = np.random.rand(32,64)
f1 = np.asfortranarray(f)
b1 = np.asfortranarray(b)
f2 = np.asfortranarray(np.transpose(f,[1,2,0,3]))
funcs = [test_dumb,test_numba, test_cython, \
test_f2py,test_f2py_order,test_f2py_reorder]
tstart = time.time()
fnew_numpy= test_numpy(f,b)
tstop = time.time()
print test_numpy.__name__+': '+str(tstop-tstart)
print 'Matches Numpy output: '+str(np.allclose(fnew_numpy,fnew_numpy))
print ''
for func in funcs:
tstart = time.time()
if func.__name__ == 'test_f2py_order':
fnew = func(f1,b1)
elif func.__name__ == 'test_f2py_reorder':
fnew = func(f2,b1)
else:
fnew = func(f,b)
tstop = time.time()
print func.__name__+': '+str(tstop-tstart)
print 'Matches Numpy output: '+str(np.allclose(fnew,fnew_numpy))
print ''
Run Code Online (Sandbox Code Playgroud)
f2py文件(使用f2py -c -m test_f2py test_f2py.F90编译):
!file: test_f2py
subroutine test_f2py(f,b,fnew,n1,n2,n3,n4)
integer :: n1,n2,n3,n4
real(8), dimension(n1,n2,n3,n4) :: f
real(8), dimension(n1,n4) :: b
real(8), dimension(n2,n3) :: fnew
!f2py intent(in) f
!f2py intent(in) b
!f2py intent(out) fnew
!f2py intent(in) n1
!f2py intent(in) n2
!f2py intent(in) n3
!f2py intent(in) n4
integer :: i1,i2,i3,i4
do i1=1,n1
do i2=1,n2
do i3=1,n3
do i4=1,n4
fnew(i2,i3) = fnew(i2,i3) + f(i1,i2,i3,i4)*b(i1,i4)
enddo
enddo
enddo
enddo
end subroutine test_f2py
subroutine test_f2py_reorder(f,b,fnew,n1,n2,n3,n4)
integer :: n1,n2,n3,n4
real(8), dimension(n1,n2,n3,n4) :: f
real(8), dimension(n3,n4) :: b
real(8), dimension(n1,n2) :: fnew
!f2py intent(in) f
!f2py intent(in) b
!f2py intent(out) fnew
!f2py intent(in) n1
!f2py intent(in) n2
!f2py intent(in) n3
!f2py intent(in) n4
integer :: i1,i2,i3,i4
do i3=1,n3
do i4=1,n4
do i1=1,n1
do i2=1,n2
fnew(i1,i2) = fnew(i1,i2) + f(i1,i2,i3,i4)*b(i3,i4)
enddo
enddo
enddo
enddo
end subroutine test_f2py_reorder
Run Code Online (Sandbox Code Playgroud)
和Cython .pyx文件(在主程序中使用pyximport编译):
#/usr/bin python
import numpy as np
cimport numpy as np
def test_cython(np.ndarray[np.float64_t,ndim=4] f, np.ndarray[np.float64_t,ndim=2] b):
# cdef np.ndarray[np.float64_t,ndim=4] f
# cdef np.ndarray[np.float64_t,ndim=2] b
cdef np.ndarray[np.float64_t,ndim=2] fnew = np.empty((f.shape[1],f.shape[2]),dtype=np.float64)
cdef int i,j,k,l
cdef int Ni = f.shape[0]
cdef int Nj = f.shape[1]
cdef int Nk = f.shape[2]
cdef int Nl = f.shape[3]
for i in range(Ni):
for j in range(Nj):
for k in range(Nk):
for l in range(Nl):
fnew[j,k] += f[i,j,k,l] * b[i,l]
return fnew
Run Code Online (Sandbox Code Playgroud)
小智 8
通常这些加速器用于通过Python循环或许多中间结果来加速代码,而einsum
已经很好地优化了(参见源代码).你不应该期望他们轻易击败einsum
,但你可能会在性能上接近它.
对于Numba,重要的是将编译时间排除在基准之外.这可以通过两次运行jitted函数(使用相同类型的输入)来完成.例如,通过IPython我得到:
f = np.random.rand(32,33,500,64)
b = np.random.rand(32,64)
%time _ = test_numba(f,b) # First invocation
# Wall time: 466 ms
%time _ = test_numba(f,b)
# Wall time: 73 ms
%timeit test_numba(f, b)
# 10 loops, best of 3: 72.7 ms per loop
%timeit test_numpy(f, b)
# 10 loops, best of 3: 62.8 ms per loop
Run Code Online (Sandbox Code Playgroud)
对于您的Cython代码,可以进行一些改进:
就像是:
cimport cython
import numpy as np
@cython.boundscheck(False)
@cython.wraparound(False)
def test_cython(double[:,:,:,::1] f, double[:,::1] b):
cdef int i, j, k, l, Ni, Nj, Nk, Nl
Ni = f.shape[0]
Nj = f.shape[1]
Nk = f.shape[2]
Nl = f.shape[3]
fnew = np.empty((Nj, Nk))
cdef double[:,::1] fnew_v = fnew
for i in range(Ni):
for j in range(Nj):
for k in range(Nk):
for l in range(Nl):
fnew_v[j,k] += f[i,j,k,l] * b[i,l]
return fnew
Run Code Online (Sandbox Code Playgroud)
在最新的Ubuntu 15.10(x86)上,这给了我同样的速度einsum
.但是,在与Anaconda发行版相同的PC上的Windows(x86)上,这个Cython代码的速度大约是其速度的一半einsum
.我认为这可能与gcc版本(5.2.1 vs 4.7.0)和插入SSE指令的能力有关(einsum
用SSE2内在函数编码).也许提供不同的编译器选项会有所帮助,但我不确定.
我几乎不知道任何Fortran所以我不能对此发表评论.
既然你的目标是击败einsum
我认为明显的下一步是看待增加的并行性.生成一些线程应该相当容易cython.parallel
.如果这还没有使系统内存带宽饱和,那么您可以尝试明确包含最新的CPU指令,如AVX2和Fused Multiply-Add.
您可以尝试的另一件事是重新排序和重塑f
并执行操作np.dot
.如果你的Numpy带有一个很好的BLAS库,这应该可以实现你能想到的几乎所有优化,不过代价是失去一般性,可能是一个非常昂贵的f
数组副本.
解析完字符串参数后,einsum
使用编译版本nditer
在所有轴上执行乘积和计算。源代码很容易在 numpy github 上找到。
不久前,我做了一个einsum
类似于编写补丁的工作。作为其中的一部分,我编写了一个cython
执行乘积和的脚本。您可以在以下位置查看此代码:
https://github.com/hpaulj/numpy-einsum
我没有尝试让我的代码快速运行einsum
。我只是想了解它是如何工作的。
归档时间: |
|
查看次数: |
2267 次 |
最近记录: |