I am trying to implement image convolution using only NumPy, similar to what cv2.filter2D(...) does.
import numpy as np
import time
# kernel
H = np.array([[0, 1, 0], [1, -4, 1], [0, 1, 0]])
# image
SEED = 23
img = np.random.RandomState(SEED).randint(10, size=(4, 4))
# shapes
Hi, Wi = img.shape
Hk, Wk = H.shape
hk = Hk // 2
wk = Wk // 2
# zero padding: hk rows on top/bottom, wk columns on left/right
new_img = np.pad(img, ((hk, hk), (wk, wk)), 'constant', constant_values=0)
pHi, pWi = new_img.shape
print('img: ')
print(new_img)
print('kernel: ')
print(H)
print('\n')
# image convolution
##################################################
# Method 1
st = time.time()
out = np.zeros((Hi, Wi))
for i in range(hk, pHi - hk):
    for j in range(wk, pWi - wk):
        batch = new_img[i - hk:i + hk + 1, j - wk:j + wk + 1]
        out[i - hk][j - wk] = np.sum(batch * H)
print('out: ')
print(out)
print("[*] process time : %f" % (time.time()- st))
print('\n')
##################################################
# Method 2
st = time.time()
Hi, Wi = img.shape
out = np.zeros((Hi, Wi))
H_1d = H.ravel()
hl = len(H_1d)
for i in range(Wi):
    img_slice = new_img[:, i:Hk + i]
    for j in range(Hi):
        out[j][i] = np.sum(img_slice[j:j + Hk, :] * H)
    # same result computed over the flattened column slice
    img_slice = img_slice.ravel()
    il = len(img_slice)
    h1 = 0
    h2 = hl
    while h2 <= il:
        index = h1 // Hk
        # print(index)
        out[index][i] = np.sum(img_slice[h1:h2] * H_1d)
        h1 = h1 + Hk
        h2 = h2 + Hk
print('out: ')
print(out)
print("[*] process time : %f" % (time.time()- st))
print('\n')
##################################################
It works, but I need a faster and more efficient implementation. To speed the algorithm up I tried to find a pattern in how the convolution walks over the image, but I still cannot figure out what to do.
Can someone help me improve my current code so that it runs faster, using NumPy only?
I have implemented several very fast solutions. Depending on the input sizes they give speedups of up to 2650x compared to the canonical naive convolution implementation!
Note: to skip reading the whole post and just use an improved function directly, scroll down to the code below, copy only the first function, conv2d_fast(...), and use it in your code; it contains everything it needs inside. In short, just copy-paste one of the best implementations from this code.
For some of my inputs the implementations achieved their largest speedups: 1) conv2d_fast - 755x, 2) conv2d_medium - 1830x, 3) conv2d_fast_numba - 2650x.
For a 460x512 image and a 3x3 kernel (the kernel size most commonly used in computer vision and convolutional neural networks) the speedups are: 1) conv2d_fast - 52.5x, 2) conv2d_medium - 126x, 3) conv2d_fast_numba - 319x. In the plots shown below, this reference image and kernel size is marked with a red dot; the other points are blue.
Speedup means how many times faster a fast solution is compared to the base slow solution (the canonical, simplest convolution implementation). For example, a speedup of 20x means that where the base slow solution spends 200 ms inside the function, the fast solution spends 10 ms.
The slow base reference function in my code is called conv2d_slow(...). All timings/speedups and the correctness of results are measured against this function. It implements the canonical, simplest algorithm for computing the convolution, as shown by the following formula, taken from the OpenCV filter2D(...) doc:
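In essence (paraphrasing the OpenCV documentation, with the anchor at the kernel center) it computes the correlation

dst(x, y) = sum over 0 <= x' < kernel.cols, 0 <= y' < kernel.rows of kernel(x', y') * src(x + x' - anchor.x, y + y' - anchor.y)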

The conv2d_fast algorithm is pure NumPy only, with no Python loops, and it is the fastest of the pure-NumPy variants.
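For intuition only, here is a short sketch of the same windowed sum written with np.lib.stride_tricks.sliding_window_view (requires NumPy >= 1.20). It is not one of the benchmarked functions below, just an illustration of what conv2d_fast computes without Python loops:

def conv2d_windows(img, krn):
    import numpy as np
    # all (ks0, ks1) windows of img, shape (rs0, rs1, ks0, ks1)
    win = np.lib.stride_tricks.sliding_window_view(img, krn.shape)
    # multiply every window by the kernel and sum over the two window axes
    return np.einsum('ijkl,kl->ij', win, krn)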
conv2d_medium contains Python loops but is also very fast. For some inputs (when the kernel size is large, close to the image size) it is slower than the previous algorithm, but for other inputs it is faster, on average about 2x faster than conv2d_fast for common kernel sizes. The only reasons not to pick this solution are that it is not pure NumPy (it contains Python loops) and that for large kernels it is slower than conv2d_fast.
conv2d_fast_numba is based on Numba. Numba is a JIT compiler that translates Python code into machine code. Although the question did not ask for a non-NumPy solution, I implemented it anyway, because Numba was created specifically to speed up NumPy-based code, so it is closely related to NumPy. The Numba solution is the fastest of all, on average 3-8x faster than the previously fastest solution, conv2d_medium.
None of the functions mentioned above needs any extra preparation (such as importing extra modules); just copy-paste a single chosen function into your code and use it.
My full code needs the following pip modules, installed once with python -m pip install numpy numba matplotlib timerit. The main convolution implementations conv2d_fast / conv2d_slow / conv2d_medium only need the numpy module installed.
# Needs: python -m pip install numpy numba matplotlib timerit
# ---------- Fastest NumPy-only solution ----------
def conv2d_fast(img, krn):
    import numpy as np
    is0, is1, ks0, ks1 = *img.shape, *krn.shape
    rs0, rs1 = is0 - ks0 + 1, is1 - ks1 + 1
    # sliding-window index matrices: ix0[k, i] = k + i, ix1[l, j] = l + j
    ix0 = np.arange(ks0)[:, None] + np.arange(rs0)[None, :]
    ix1 = np.arange(ks1)[:, None] + np.arange(rs1)[None, :]
    # gather all windows into a (ks0, rs0, ks1, rs1) array and multiply by the kernel
    res = krn[:, None, :, None] * img[(ix0.ravel()[:, None], ix1.ravel()[None, :])].reshape(ks0, rs0, ks1, rs1)
    # sum over the kernel axes to get the (rs0, rs1) result
    res = res.transpose(1, 3, 0, 2).reshape(rs0, rs1, -1).sum(axis = -1)
    return res

# ---------- Slowest NumPy+Python solution for reference (base of time measurement) ----------

def conv2d_slow(img, krn):
    import numpy as np
    is0, is1, ks0, ks1 = *img.shape, *krn.shape
    rs0, rs1 = is0 - ks0 + 1, is1 - ks1 + 1
    res = np.zeros((rs0, rs1), dtype = krn.dtype)
    for i in range(rs0):
        for j in range(rs1):
            res[i, j] = (krn * img[i : i + ks0, j : j + ks1]).sum()
    return res

# ---------- Medium NumPy+Python solution ----------

def conv2d_medium(img, krn):
    import numpy as np
    is0, is1, ks0, ks1 = *img.shape, *krn.shape
    rs0, rs1 = is0 - ks0 + 1, is1 - ks1 + 1
    res = np.zeros((rs0, rs1), dtype = krn.dtype)
    # accumulate one shifted image slice per kernel element
    for k in range(ks0):
        for l in range(ks1):
            res[...] += krn[k, l] * img[k : k + rs0, l : l + rs1]
    return res

# ---------- Fastest of all, NumPy+Numba solution ----------

def conv2d_fast_numba(img, krn, *, state = {}):
    # compile the inner function once and cache it in the default dict
    if 'f' not in state:
        import numpy as np
        def conv2d_fast_nm(img, krn):
            is0, is1, ks0, ks1 = *img.shape, *krn.shape
            rs0, rs1 = is0 - ks0 + 1, is1 - ks1 + 1
            res = np.zeros((rs0, rs1), dtype = krn.dtype)
            for k in range(ks0):
                for l in range(ks1):
                    for i in range(rs0):
                        for j in range(rs1):
                            res[i, j] += krn[k, l] * img[i + k, j + l]
            return res
        import numba
        state['f'] = numba.njit(cache = True, parallel = True, fastmath = True)(conv2d_fast_nm)
    return state['f'](img, krn)
# ---------- Testing correctness, measuring speed, drawing plots ----------
def test():
    import math, matplotlib, matplotlib.pyplot as plt, numpy as np
    from timerit import Timerit
    Timerit._default_asciimode = True
    np.random.seed(0)
    for ifuncs, (fname, funcs) in enumerate([
        ('fast', (conv2d_slow, conv2d_fast)),
        ('medium', (conv2d_slow, conv2d_medium)),
        ('numba', (conv2d_slow, conv2d_fast_numba)),
        ('fast_bmed', (conv2d_medium, conv2d_fast)),
        ('numba_bmed', (conv2d_medium, conv2d_fast_numba)),
    ]):
        stats = []
        for krn_size in [1, 2, 3, 4, 6, 8, 12, 16, 24]:
            for img_size in [32, 64, 128, 256, 512]:
                ih, iw = max(1, math.floor(img_size * 0.9)), img_size
                kh, kw = max(1, math.floor(krn_size * 0.9)), krn_size
                if krn_size == 3:
                    kh = 3
                print(f'krn_size ({kh}, {kw}) img_size ({ih}, {iw})', flush = True)
                krn = np.random.uniform(-1., 1., (kh, kw))
                img = np.random.uniform(0., 255., (ih, iw))
                for ifn, f in enumerate(funcs):
                    print(f'{f.__name__}: ', end = '', flush = True)
                    work = ih * iw * kh * kw
                    tim = Timerit(num = min(20, math.ceil(2 ** 15 / work)) * 3, verbose = 1)
                    for i, t in enumerate(tim):
                        r = f(img, krn)
                    rt = tim.mean()
                    if ifn == 0:
                        bt, ba = rt, r
                    else:
                        assert np.allclose(ba, r)
                        print(f'speedup {round(bt / rt, 2)}x')
                        stats.append({
                            'img_size': img_size,
                            'krn_size': krn_size,
                            'speedup': bt / rt,
                        })
        stats = sorted(stats, key = lambda e: e['speedup'])
        x = np.arange(len(stats))
        y = np.array([e['speedup'] for e in stats])
        plt.rcParams['figure.figsize'] = (12.8, 7.2)
        for scale in ['linear', 'log']:
            plt.clf()
            plt.xlabel('iteration')
            plt.ylabel(f'speedup_{fname}_{scale}')
            plt.yscale(scale)
            plt.scatter(x, y, marker = '.', color = [('blue', 'red')[stats[ii]['krn_size'] == 3 and stats[ii]['img_size'] == 512] for ii in range(x.size)])
            for i in range(x.size):
                plt.annotate(
                    (f"k{str(stats[i]['krn_size']).zfill(2)}\ni{str(stats[i]['img_size']).zfill(3)}\n" +
                        f"x{round(stats[i]['speedup'], 2 if stats[i]['speedup'] < 10 else 1 if stats[i]['speedup'] < 100 else None)}"),
                    (x[i], y[i]), fontsize = 'xx-small',
                )
            plt.subplots_adjust(left = 0.07, right = 0.99, bottom = 0.08, top = 0.99)
            plt.xlim(left = -0.1)
            ymin, ymax = np.amin(y), np.amax(y)
            if scale == 'log':
                plt.ylim((ymin / 1.05, ymax * 1.8))
            elif scale == 'linear':
                plt.ylim((ymin - (ymax - ymin) * 0.02, ymax + (ymax - ymin) * 0.08))
            plt.yticks([ymin] + [e for e in plt.yticks()[0] if ymin + 10 ** -6 < e < ymax - 10 ** -6] + [ymax])
            #plt.gca().get_yaxis().set_major_formatter(matplotlib.ticker.FormatStrFormatter('%.1f'))
            plt.savefig(f'conv2d_numpy_{fname}_{scale}.png', dpi = 150)
            plt.show()

if __name__ == '__main__':
    test()
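Usage is a one-liner. For example, to reproduce the zero-padded, filter2D-style output from the question with conv2d_fast (a minimal sketch; the functions above compute a 'valid'-size result, so pad the image first):

import numpy as np
H = np.array([[0, 1, 0], [1, -4, 1], [0, 1, 0]])
img = np.random.RandomState(23).randint(10, size=(4, 4))
# pad by half the kernel size so the output keeps the input shape
padded = np.pad(img, ((1, 1), (1, 1)), 'constant', constant_values=0)
out = conv2d_fast(padded, H)   # same 4x4 result as Method 1 in the question
print(out)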
The plots below show the speedups of the algorithms for different input sizes. The x axis shows the iteration index and the y axis the speedup; points are sorted in ascending order of speedup. Each point is labeled kXX iYYY xZZZ, where XX is the kernel width, YYY is the image width and ZZZ is the speedup of the current algorithm for those kernel and image sizes. Each plot comes in two versions: linear scale (regular linear y axis) and log scale (logarithmically scaled y axis). Each plot contains exactly one red point (the rest are blue); it shows the most common case, a 3x3 kernel and a 460x512 image, and should be considered the most representative point of the plot.
The labels near the points may look too small because StackOverflow shows the plots at reduced resolution; open an image in a new browser tab and zoom in to view it at the full 1920x1080 resolution.
conv2d_fast linear:
conv2d_fast log:
conv2d_medium linear:
conv2d_medium log:
conv2d_fast_numba linear:
conv2d_fast_numba log:
conv2d_fast vs conv2d_medium, linear:
conv2d_fast vs conv2d_medium, log:
conv2d_fast_numba vs conv2d_medium, linear:
conv2d_fast_numba vs conv2d_medium, log:
Console output of the code above:
krn_size (1, 1) img_size (28, 32)
conv2d_slow: Timed best=15.037 ms, mean=15.409 +- 0.2 ms
conv2d_fast: Timed best=100.701 us, mean=102.412 +- 1.4 us
speedup 150.46x
krn_size (1, 1) img_size (57, 64)
conv2d_slow: Timed best=61.679 ms, mean=65.878 +- 6.0 ms
conv2d_fast: Timed best=151.541 us, mean=192.332 +- 28.5 us
speedup 342.52x
krn_size (1, 1) img_size (115, 128)
conv2d_slow: Timed best=249.408 ms, mean=252.172 +- 2.4 ms
conv2d_fast: Timed best=328.012 us, mean=333.878 +- 6.6 us
speedup 755.28x
krn_size (1, 1) img_size (230, 256)
conv2d_slow: Timed best=1.013 s, mean=1.013 +- 0.0 s
conv2d_fast: Timed best=1.706 ms, mean=1.706 +- 0.0 ms
speedup 593.72x
krn_size (1, 1) img_size (460, 512)
conv2d_slow: Timed best=4.101 s, mean=4.101 +- 0.0 s
conv2d_fast: Timed best=10.182 ms, mean=10.182 +- 0.0 ms
speedup 402.72x
krn_size (1, 2) img_size (28, 32)
conv2d_slow: Timed best=14.418 ms, mean=15.877 +- 1.8 ms
conv2d_fast: Timed best=132.476 us, mean=135.178 +- 2.1 us
speedup 117.45x
krn_size (1, 2) img_size (57, 64)
conv2d_slow: Timed best=60.057 ms, mean=61.136 +- 1.3 ms
conv2d_fast: Timed best=231.222 us, mean=238.359 +- 11.6 us
speedup 256.49x
krn_size (1, 2) img_size (115, 128)
conv2d_slow: Timed best=264.794 ms, mean=268.452 +- 3.7 ms
conv2d_fast: Timed best=674.112 us, mean=729.595 +- 55.5 us
speedup 367.95x
krn_size (1, 2) img_size (230, 256)
conv2d_slow: Timed best=999.722 ms, mean=999.722 +- 0.0 ms
conv2d_fast: Timed best=4.910 ms, mean=4.910 +- 0.0 ms
speedup 203.61x
krn_size (1, 2) img_size (460, 512)
conv2d_slow: Timed best=4.360 s, mean=4.360 +- 0.0 s
conv2d_fast: Timed best=18.148 ms, mean=18.148 +- 0.0 ms
speedup 240.24x
krn_size (3, 3) img_size (28, 32)
conv2d_slow: Timed best=16.771 ms, mean=17.395 +- 0.7 ms
conv2d_fast: Timed best=256.153 us, mean=268.080 +- 13.5 us
speedup 64.89x
krn_size (3, 3) img_size (57, 64)
conv2d_slow: Timed best=72.964 ms, mean=72.964 +- 0.0 ms
conv2d_fast: Timed best=863.783 us, mean=863.783 +- 0.0 us
speedup 84.47x
krn_size (3, 3) img_size (115, 128)
conv2d_slow: Timed best=302.496 ms, mean=302.496 +- 0.0 ms
conv2d_fast: Timed best=6.490 ms, mean=6.490 +- 0.0 ms
speedup 46.61x
krn_size (3, 3) img_size (230, 256)
conv2d_slow: Timed best=1.281 s, mean=1.281 +- 0.0 s
conv2d_fast: Timed best=24.801 ms, mean=24.801 +- 0.0 ms
speedup 51.66x
krn_size (3, 3) img_size (460, 512)
conv2d_slow: Timed best=5.134 s, mean=5.134 +- 0.0 s
conv2d_fast: Timed best=97.726 ms, mean=97.726 +- 0.0 ms
speedup 52.53x
krn_size (3, 4) img_size (28, 32)
conv2d_slow: Timed best=16.275 ms, mean=16.351 +- 0.1 ms
conv2d_fast: Timed best=286.950 us, mean=295.749 +- 11.9 us
speedup 55.29x
krn_size (3, 4) img_size (57, 64)
conv2d_slow: Timed best=72.170 ms, mean=72.170 +- 0.0 ms
conv2d_fast: Timed best=1.824 ms, mean=1.824 +- 0.0 ms
speedup 39.57x
krn_size (3, 4) img_size (115, 128)
conv2d_slow: Timed best=305.169 ms, mean=305.169 +- 0.0 ms
conv2d_fast: Timed best=8.462 ms, mean=8.462 +- 0.0 ms
speedup 36.06x
krn_size (3, 4) img_size (230, 256)
conv2d_slow: Timed best=1.245 s, mean=1.245 +- 0.0 s
conv2d_fast: Timed best=31.527 ms, mean=31.527 +- 0.0 ms
speedup 39.5x
krn_size (3, 4) img_size (460, 512)
conv2d_slow: Timed best=5.262 s, mean=5.262 +- 0.0 s
conv2d_fast: Timed best=128.232 ms, mean=128.232 +- 0.0 ms
speedup 41.03x
krn_size (5, 6) img_size (28, 32)
conv2d_slow: Timed best=14.060 ms, mean=14.507 +- 0.4 ms
conv2d_fast: Timed best=469.288 us, mean=478.087 +- 8.8 us
speedup 30.34x
krn_size (5, 6) img_size (57, 64)
conv2d_slow: Timed best=67.638 ms, mean=67.638 +- 0.0 ms
conv2d_fast: Timed best=3.542 ms, mean=3.542 +- 0.0 ms
speedup 19.1x
krn_size (5, 6) img_size (115, 128)
conv2d_slow: Timed best=299.806 ms, mean=299.806 +- 0.0 ms
conv2d_fast: Timed best=18.730 ms, mean=18.730 +- 0.0 ms
speedup 16.01x
krn_size (5, 6) img_size (230, 256)
conv2d_slow: Timed best=1.294 s, mean=1.294 +- 0.0 s
conv2d_fast: Timed best=77.809 ms, mean=77.809 +- 0.0 ms
speedup 16.64x
krn_size (5, 6) img_size (460, 512)
conv2d_slow: Timed best=5.336 s, mean=5.336 +- 0.0 s
conv2d_fast: Timed best=317.518 ms, mean=317.518 +- 0.0 ms
speedup 16.81x
krn_size (7, 8) img_size (28, 32)
conv2d_slow: Timed best=12.076 ms, mean=12.076 +- 0.0 ms
conv2d_fast: Timed best=861.827 us, mean=861.827 +- 0.0 us
speedup 14.01x
krn_size (7, 8) img_size (57, 64)
conv2d_slow: Timed best=64.761 ms, mean=64.761 +- 0.0 ms
conv2d_fast: Timed best=8.889 ms, mean=8.889 +- 0.0 ms
speedup 7.29x
krn_size (7, 8) img_size (115, 128)
conv2d_slow: Timed best=293.768 ms, mean=293.768 +- 0.0 ms
conv2d_fast: Timed best=32.908 ms, mean=32.908 +- 0.0 ms
speedup 8.93x
krn_size (7, 8) img_size (230, 256)
conv2d_slow: Timed best=1.245 s, mean=1.245 +- 0.0 s
conv2d_fast: Timed best=139.752 ms, mean=139.752 +- 0.0 ms
speedup 8.91x
krn_size (7, 8) img_size (460, 512)
conv2d_slow: Timed best=5.535 s, mean=5.535 +- 0.0 s
conv2d_fast: Timed best=599.906 ms, mean=599.906 +- 0.0 ms
speedup 9.23x
krn_size (10, 12) img_size (28, 32)
conv2d_slow: Timed best=8.776 ms, mean=8.776 +- 0.0 ms
conv2d_fast: Timed best=1.801 ms, mean=1.801 +- 0.0 ms
speedup 4.87x
krn_size (10, 12) img_size (57, 64)
conv2d_slow: Timed best=55.155 ms, mean=55.155 +- 0.0 ms
conv2d_fast: Timed best=13.861 ms, mean=13.861 +- 0.0 ms
speedup 3.98x
krn_size (10, 12) img_size (115, 128)
conv2d_slow: Timed best=275.688 ms, mean=275.688 +- 0.0 ms
conv2d_fast: Timed best=65.345 ms, mean=65.345 +- 0.0 ms
speedup 4.22x
krn_size (10, 12) img_size (230, 256)
conv2d_slow: Timed best=1.215 s, mean=1.215 +- 0.0 s
conv2d_fast: Timed best=319.263 ms, mean=319.263 +- 0.0 ms
speedup 3.8x
krn_size (10, 12) img_size (460, 512)
conv2d_slow: Timed best=5.413 s, mean=5.413 +- 0.0 s
conv2d_fast: Timed best=1.360 s, mean=1.360 +- 0.0 s
speedup 3.98x
krn_size (14, 16) img_size (28, 32)
conv2d_slow: Timed best=6.660 ms, mean=6.660 +- 0.0 ms
conv2d_fast: Timed best=2.498 ms, mean=2.498 +- 0.0 ms
speedup 2.67x
krn_size (14, 16) img_size (57, 64)
conv2d_slow: Timed best=49.958 ms, mean=49.958 +- 0.0 ms
conv2d_fast: Timed best=24.663 ms, mean=24.663 +- 0.0 ms
speedup 2.03x
krn_size (14, 16) img_size (115, 128)
conv2d_slow: Timed best=273.521 ms, mean=273.521 +- 0.0 ms
conv2d_fast: Timed best=120.138 ms, mean=120.138 +- 0.0 ms
speedup 2.28x
krn_size (14, 16) img_size (230, 256)
conv2d_slow: Timed best=1.329 s, mean=1.329 +- 0.0 s
conv2d_fast: Timed best=537.025 ms, mean=537.025 +- 0.0 ms
speedup 2.47x
krn_size (14, 16) img_size (460, 512)
conv2d_slow: Timed best=5.615 s, mean=5.615 +- 0.0 s
conv2d_fast: Timed best=2.623 s, mean=2.623 +- 0.0 s
speedup 2.14x
krn_size (21, 24) img_size (28, 32)
conv2d_slow: Timed best=2.203 ms, mean=2.203 +- 0.0 ms
conv2d_fast: Timed best=955.684 us, mean=955.684 +- 0.0 us
speedup 2.3x
krn_size (21, 24) img_size (57, 64)
conv2d_slow: Timed best=41.180 ms, mean=41.180 +- 0.0 ms
conv2d_fast: Timed best=40.526 ms, mean=40.526 +- 0.0 ms
speedup 1.02x
krn_size (21, 24) img_size (115, 128)
conv2d_slow: Timed best=288.575 ms, mean=288.575 +- 0.0 ms
conv2d_fast: Timed best=277.724 ms, mean=277.724 +- 0.0 ms
speedup 1.04x
krn_size (21, 24) img_size (230, 256)
conv2d_slow: Timed best=1.334 s, mean=1.334 +- 0.0 s
conv2d_fast: Timed best=1.288 s, mean=1.288 +- 0.0 s
speedup 1.04x
krn_size (21, 24) img_size (460, 512)
conv2d_slow: Timed best=6.351 s, mean=6.351 +- 0.0 s
conv2d_fast: Timed best=5.996 s, mean=5.996 +- 0.0 s
speedup 1.06x
krn_size (1, 1) img_size (28, 32)
conv2d_slow: Timed best=15.644 ms, mean=16.388 +- 0.9 ms
conv2d_medium: Timed best=34.708 us, mean=36.174 +- 1.8 us
speedup 453.03x
krn_size (1, 1) img_size (57, 64)
conv2d_slow: Timed best=62.969 ms, mean=77.952 +- 15.0 ms
conv2d_medium: Timed best=54.261 us, mean=62.300 +- 4.8 us
speedup 1251.24x
krn_size (1, 1) img_size (115, 128)
conv2d_slow: Timed best=285.944 ms, mean=487.718 +- 148.2 ms
conv2d_medium: Timed best=246.865 us, mean=266.581 +- 14.8 us
speedup 1829.53x
krn_size (1, 1) img_size (230, 256)
conv2d_slow: Timed best=1.474 s, mean=1.474 +- 0.0 s
conv2d_medium: Timed best=1.400 ms, mean=1.400 +- 0.0 ms
speedup 1052.98x
krn_size (1, 1) img_size (460, 512)
conv2d_slow: Timed best=4.434 s, mean=4.434 +- 0.0 s
conv2d_medium: Timed best=5.222 ms, mean=5.222 +- 0.0 ms
speedup 849.22x
krn_size (1, 2) img_size (28, 32)
conv2d_slow: Timed best=14.759 ms, mean=15.430 +- 1.2 ms
conv2d_medium: Timed best=69.904 us, mean=77.803 +- 6.8 us
speedup 198.33x
krn_size (1, 2) img_size (57, 64)
conv2d_slow: Timed best=60.825 ms, mean=62.482 +- 1.8 ms
conv2d_medium: Timed best=138.342 us, mean=139.711 +- 1.3 us
speedup 447.22x
krn_size (1, 2) img_size (115, 128)
conv2d_slow: Timed best=261.207 ms, mean=263.777 +- 2.6 ms
conv2d_medium: Timed best=343.167 us, mean=352.699 +- 9.5 us
speedup 747.88x
krn_size (1, 2) img_size (230, 256)
conv2d_slow: Timed best=1.031 s, mean=1.031 +- 0.0 s
conv2d_medium: Timed best=1.887 ms, mean=1.887 +- 0.0 ms
speedup 546.21x
krn_size (1, 2) img_size (460, 512)
conv2d_slow: Timed best=4.126 s, mean=4.126 +- 0.0 s
conv2d_medium: Timed best=10.341 ms, mean=10.341 +- 0.0 ms
speedup 398.96x
krn_size (3, 3) img_size (28, 32)
conv2d_slow: Timed best=16.776 ms, mean=16.975 +- 0.2 ms
conv2d_medium: Timed best=283.528 us, mean=287.829 +- 6.3 us
speedup 58.98x
krn_size (3, 3) img_size