Ond*_*ian 5 python performance openmp cython
我使用 cython 使用自定义指标来计算成对距离矩阵,作为scipy.spatial.distance.pdist的更快替代方案。
我的指标具有以下形式
def mymetric(u,v,w):
np.sum(w * (1 - np.abs(np.abs(u - v) / np.pi - 1))**2)
Run Code Online (Sandbox Code Playgroud)
使用 scipy 的成对距离可以计算为
x = sp.spatial.distance.pdist(r, metric=lambda u, v: mymetric(u, v, w))
Run Code Online (Sandbox Code Playgroud)
这里,r是维度为 的m向量n矩阵,是维度为 的“权重”因子。mnwn
由于我的问题m相当高,计算速度非常慢。m = 2000这大约n = 10需要 20 秒。
我在 cython 中实现了一个简单的函数来计算成对距离,并立即得到了非常有希望的结果——加速超过 500 倍。
import numpy as np
cimport numpy as np
import cython
from libc.math cimport fabs, M_PI
@cython.wraparound(False)
@cython.boundscheck(False)
def pairwise_distance(np.ndarray[np.double_t, ndim=2] r, np.ndarray[np.double_t, ndim=1] w):
cdef int i, j, k, c, size
cdef np.ndarray[np.double_t, ndim=1] ans
size = r.shape[0] * (r.shape[0] - 1) / 2
ans = np.zeros(size, dtype=r.dtype)
c = -1
for i in range(r.shape[0]):
for j in range(i + 1, r.shape[0]):
c += 1
for k in range(r.shape[1]):
ans[c] += w[k] * (1.0 - fabs(fabs(r[i, k] - r[j, k]) / M_PI - 1.0))**2.0
return ans
Run Code Online (Sandbox Code Playgroud)
我想使用 OpenMP 进一步加快计算速度,但是,以下解决方案比串行版本慢大约 3 倍。
import numpy as np
cimport numpy as np
import cython
from cython.parallel import prange, parallel
cimport openmp
from libc.math cimport fabs, M_PI
@cython.wraparound(False)
@cython.boundscheck(False)
def pairwise_distance_omp(np.ndarray[np.double_t, ndim=2] r, np.ndarray[np.double_t, ndim=1] w):
cdef int i, j, k, c, size, m, n
cdef np.double_t a
cdef np.ndarray[np.double_t, ndim=1] ans
m = r.shape[0]
n = r.shape[1]
size = m * (m - 1) / 2
ans = np.zeros(size, dtype=r.dtype)
with nogil, parallel(num_threads=8):
for i in prange(m, schedule='dynamic'):
for j in range(i + 1, m):
c = i * (m - 1) - i * (i + 1) / 2 + j - 1
for k in range(n):
ans[c] += w[k] * (1.0 - fabs(fabs(r[i, k] - r[j, k]) / M_PI - 1.0))**2.0
return ans
Run Code Online (Sandbox Code Playgroud)
我不知道为什么它实际上更慢,但我尝试引入以下更改。这不仅导致性能稍差,而且通过此实现的加速可以忽略不计。ans仅在数组的开头正确计算结果距离,其余部分只是零。
import numpy as np
cimport numpy as np
import cython
from cython.parallel import prange, parallel
cimport openmp
from libc.math cimport fabs, M_PI
from libc.stdlib cimport malloc, free
@cython.wraparound(False)
@cython.boundscheck(False)
def pairwise_distance_omp_2(np.ndarray[np.double_t, ndim=2] r, np.ndarray[np.double_t, ndim=1] w):
cdef int k, l, c, m, n
cdef Py_ssize_t i, j, d
cdef size_t size
cdef int *ci, *cj
cdef np.ndarray[np.double_t, ndim=1, mode="c"] ans
cdef np.ndarray[np.double_t, ndim=2, mode="c"] data
cdef np.ndarray[np.double_t, ndim=1, mode="c"] weight
data = np.ascontiguousarray(r, dtype=np.float64)
weight = np.ascontiguousarray(w, dtype=np.float64)
m = r.shape[0]
n = r.shape[1]
size = m * (m - 1) / 2
ans = np.zeros(size, dtype=r.dtype)
cj = <int*> malloc(size * sizeof(int))
ci = <int*> malloc(size * sizeof(int))
c = -1
for i in range(m):
for j in range(i + 1, m):
c += 1
ci[c] = i
cj[c] = j
with nogil, parallel(num_threads=8):
for d in prange(size, schedule='guided'):
for k in range(n):
ans[d] += weight[k] * (1.0 - fabs(fabs(data[ci[d], k] - data[cj[d], k]) / M_PI - 1.0))**2.0
return ans
Run Code Online (Sandbox Code Playgroud)
对于所有功能,我使用以下.pyxbld文件
def make_ext(modname, pyxfilename):
from distutils.extension import Extension
return Extension(name=modname,
sources=[pyxfilename],
extra_compile_args=['-O3', '-march=native', '-ffast-math', '-fopenmp'],
extra_link_args=['-fopenmp'],
)
Run Code Online (Sandbox Code Playgroud)
我对 cython 的经验为零,只了解 C 的基础知识。我将不胜感激任何关于可能导致这种意外行为的原因的建议,甚至如何更好地重新表述我的问题。
@cython.cdivision(True)
@cython.wraparound(False)
@cython.boundscheck(False)
def pairwise_distance_2(np.ndarray[np.double_t, ndim=2] r, np.ndarray[np.double_t, ndim=1] w):
cdef int i, j, k, c, size
cdef np.ndarray[np.double_t, ndim=1] ans
cdef np.double_t accumulator, tmp
size = r.shape[0] * (r.shape[0] - 1) / 2
ans = np.zeros(size, dtype=r.dtype)
c = -1
for i in range(r.shape[0]):
for j in range(i + 1, r.shape[0]):
c += 1
accumulator = 0
for k in range(r.shape[1]):
tmp = (1.0 - fabs(fabs(r[i, k] - r[j, k]) / M_PI - 1.0))
accumulator += w[k] * (tmp*tmp)
ans[c] = accumulator
return ans
Run Code Online (Sandbox Code Playgroud)
@cython.cdivision(True)
@cython.wraparound(False)
@cython.boundscheck(False)
def pairwise_distance_omp_2d(np.ndarray[np.double_t, ndim=2] r, np.ndarray[np.double_t, ndim=1] w):
cdef int i, j, k, c, size, m, n
cdef np.ndarray[np.double_t, ndim=1] ans
cdef np.double_t accumulator, tmp
m = r.shape[0]
n = r.shape[1]
size = m * (m - 1) / 2
ans = np.zeros(size, dtype=r.dtype)
with nogil, parallel(num_threads=8):
for i in prange(m, schedule='dynamic'):
for j in range(i + 1, m):
c = i * (m - 1) - i * (i + 1) / 2 + j - 1
accumulator = 0
for k in range(n):
tmp = (1.0 - fabs(fabs(r[i, k] - r[j, k]) / M_PI - 1.0))
ans[c] += w[k] * (tmp*tmp)
return ans
Run Code Online (Sandbox Code Playgroud)
当我尝试应用accumulator答案中提出的解决方案时,出现以下错误:
Error compiling Cython file:
------------------------------------------------------------
...
c = i * (m - 1) - i * (i + 1) / 2 + j - 1
accumulator = 0
for k in range(n):
tmp = (1.0 - fabs(fabs(r[i, k] - r[j, k]) / M_PI - 1.0))
accumulator += w[k] * (tmp*tmp)
ans[c] = accumulator
^
------------------------------------------------------------
pdist.pyx:207:36: Cannot read reduction variable in loop body
Run Code Online (Sandbox Code Playgroud)
完整代码:
@cython.cdivision(True)
@cython.wraparound(False)
@cython.boundscheck(False)
def pairwise_distance_omp(np.ndarray[np.double_t, ndim=2] r, np.ndarray[np.double_t, ndim=1] w):
cdef int i, j, k, c, size, m, n
cdef np.ndarray[np.double_t, ndim=1] ans
cdef np.double_t accumulator, tmp
m = r.shape[0]
n = r.shape[1]
size = m * (m - 1) / 2
ans = np.zeros(size, dtype=r.dtype)
with nogil, parallel(num_threads=8):
for i in prange(m, schedule='dynamic'):
for j in range(i + 1, m):
c = i * (m - 1) - i * (i + 1) / 2 + j - 1
accumulator = 0
for k in range(n):
tmp = (1.0 - fabs(fabs(r[i, k] - r[j, k]) / M_PI - 1.0))
accumulator += w[k] * (tmp*tmp)
ans[c] = accumulator
return ans
Run Code Online (Sandbox Code Playgroud)
我自己没有计时,所以这可能没有太大帮助,但是:
如果您运行cython -a以获取初始尝试的带注释版本 ( pairwise_distance_omp),您会发现该ans[c] += ...行是黄色的,表明它有 Python 开销。查看与该行对应的 C 表明它正在检查是否被零除。其中一个关键部分是这样开始的:
if (unlikely(M_PI == 0)) {
Run Code Online (Sandbox Code Playgroud)
你知道这永远不会是真的(无论如何,你可能会接受 NaN 值,而不是异常)。您可以通过向函数添加以下额外的装饰器来避免此检查:
@cython.cdivision(True)
# other decorators
def pairwise_distance_omp # etc...
Run Code Online (Sandbox Code Playgroud)
这删除了相当多的 C 代码,包括必须在单线程中运行的代码。另一方面,大多数代码永远不应该运行,编译器应该能够解决这个问题,因此尚不清楚这会产生多大的差异。
第二个建议:
# at the top
cdef np.double_t accumulator, tmp
# further down later in the loop:
c = i * (m - 1) - i * (i + 1) / 2 + j - 1
accumulator = 0
for k in range(r.shape[1]):
tmp = (1.0 - fabs(fabs(r[i, k] - r[j, k]) / M_PI - 1.0))
accumulator = accumulator + w[k] * (tmp*tmp)
ans[c] = accumulator
Run Code Online (Sandbox Code Playgroud)
希望这有两个优点:1)tmp*tmp可能比浮点指数 2 的幂更快。2)您可以避免从数组中读取ans,这可能会有点慢,因为编译器总是必须小心其他线程没有读取数据。没有改变它(即使你知道它不应该改变)。
| 归档时间: |
|
| 查看次数: |
3876 次 |
| 最近记录: |