And*_*ers 13 python optimization numpy julia
我尝试将此处提供的NN代码移植到Julia,希望提高网络培训速度.在我的桌面上,事实证明是这样的.
然而,在我的MacBook上,Python + numpy击败了Julia.
使用相同参数进行训练时,Python的速度是Julia的两倍(4.4s vs 10.6s,一个纪元).考虑到Julia比我的桌面上的Python更快(大约2秒),似乎有一些资源,Python/numpy在mac上使用Julia不是.即使并行化代码,我也只能达到~6.6s(尽管这可能是因为我没有在编写并行代码方面经验丰富).我认为问题可能是Julia的BLAS比mac中原生使用的vecLib库慢,但是尝试不同的构建似乎并没有让我更接近.我尝试使用USE_SYSTEM_BLAS = 1和使用MKL构建两者,其中MKL给出了更快的结果(上面公布的时间).
我将发布笔记本电脑的版本信息以及我的Julia实现,以供参考.我目前无法访问桌面,但我在Windows上使用openBLAS运行相同版本的Julia,与使用openBLAS的干净安装Python 2.7相比.
这里有什么我想念的吗?
编辑:我知道我的Julia代码在优化方面有很多不足之处,我非常感谢任何提高速度的技巧.然而,这不是Julia在我的笔记本电脑上运行速度慢的情况,而是Python更快.在我的桌面上,Python在约13秒内运行一个纪元,在笔记本电脑上它只需要大约4.4秒.我最感兴趣的是这种差异来自哪里.我意识到这个问题可能有点糟糕.
笔记本电脑版本:
julia> versioninfo()
Julia Version 0.6.2
Commit d386e40c17 (2017-12-13 18:08 UTC)
Platform Info:
OS: macOS (x86_64-apple-darwin17.4.0)
CPU: Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz
WORD_SIZE: 64
BLAS: libmkl_rt
LAPACK: libmkl_rt
LIBM: libopenlibm
LLVM: libLLVM-3.9.1 (ORCJIT, broadwell)
Run Code Online (Sandbox Code Playgroud)
Python 2.7.14 (default, Mar 22 2018, 14:43:05)
[GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.39.2)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import numpy
>>> numpy.show_config()
lapack_opt_info:
extra_link_args = ['-Wl,-framework', '-Wl,Accelerate']
extra_compile_args = ['-msse3']
define_macros = [('NO_ATLAS_INFO', 3), ('HAVE_CBLAS', None)]
openblas_lapack_info:
NOT AVAILABLE
atlas_3_10_blas_threads_info:
NOT AVAILABLE
atlas_threads_info:
NOT AVAILABLE
openblas_clapack_info:
NOT AVAILABLE
atlas_3_10_threads_info:
NOT AVAILABLE
atlas_blas_info:
NOT AVAILABLE
atlas_3_10_blas_info:
NOT AVAILABLE
atlas_blas_threads_info:
NOT AVAILABLE
openblas_info:
NOT AVAILABLE
blas_mkl_info:
NOT AVAILABLE
blas_opt_info:
extra_link_args = ['-Wl,-framework', '-Wl,Accelerate']
extra_compile_args = ['-msse3', '-I/System/Library/Frameworks/vecLib.framework/Headers']
define_macros = [('NO_ATLAS_INFO', 3), ('HAVE_CBLAS', None)]
blis_info:
NOT AVAILABLE
atlas_info:
NOT AVAILABLE
atlas_3_10_info:
NOT AVAILABLE
lapack_mkl_info:
NOT AVAILABLE
Run Code Online (Sandbox Code Playgroud)
朱莉娅代码(顺序):
using MLDatasets
mutable struct network
num_layers::Int64
sizearr::Array{Int64,1}
biases::Array{Array{Float64,1},1}
weights::Array{Array{Float64,2},1}
end
function network(sizes)
num_layers = length(sizes)
sizearr = sizes
biases = [randn(y) for y in sizes[2:end]]
weights = [randn(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
network(num_layers, sizearr, biases, weights)
end
?(z) = 1/(1+e^(-z))
?_prime(z) = ?(z)*(1-?(z))
function (net::network)(a)
for (w, b) in zip(net.weights, net.biases)
a = ?.(w*a + b)
end
return a
end
function SGDtrain(net::network, training_data, epochs, mini_batch_size, ?, test_data=nothing)
n_test = test_data != nothing ? length(test_data):nothing
n = length(training_data)
for j in 1:epochs
training_data = shuffle(training_data)
mini_batches = [training_data[k:k+mini_batch_size-1] for k in 1:mini_batch_size:n]
@time for batch in mini_batches
update_batch(net, batch, ?)
end
if test_data != nothing
println("Epoch ", j,": ", evaluate(net, test_data), "/", n_test)
else
println("Epoch ", j," complete.")
end
end
end
function update_batch(net::network, batch, ?)
?_b = net.biases .- net.biases
?_w = net.weights .- net.weights
for (x, y) in batch
?_?_b, ?_?_w = backprop(net, x, y)
?_b += ?_?_b
?_w += ?_?_w
end
net.biases -= (?/length(batch))?_b
net.weights -= (?/length(batch))?_w
end
function backprop(net::network, x, y)
?_b = copy(net.biases)
?_w = copy(net.weights)
len = length(net.sizearr)
activation = x
activations = Array{Array{Float64,1}}(len)
activations[1] = x
zs = copy(net.biases)
for i in 1:len-1
b = net.biases[i]; w = net.weights[i]
z = w*activation .+ b
zs[i] = z
activation = ?.(z)
activations[i+1] = activation[:]
end
? = (activations[end] - y) .* ?_prime.(zs[end])
?_b[end] = ?[:]
?_w[end] = ?*activations[end-1]'
for l in 1:net.num_layers-2
z = zs[end-l]
? = net.weights[end-l+1]'? .* ?_prime.(z)
?_b[end-l] = ?[:]
?_w[end-l] = ?*activations[end-l-1]'
end
return (?_b, ?_w)
end
function evaluate(net::network, test_data)
test_results = [(findmax(net(x))[2] - 1, y) for (x, y) in test_data]
return sum(Int(x == y) for (x, y) in test_results)
end
function loaddata(rng = 1:50000)
train_x, train_y = MNIST.traindata(Float64, Vector(rng))
train_x = [train_x[:,:,x][:] for x in 1:size(train_x, 3)]
train_y = [vectorize(x) for x in train_y]
traindata = [(x, y) for (x, y) in zip(train_x, train_y)]
test_x, test_y = MNIST.testdata(Float64)
test_x = [test_x[:,:,x][:] for x in 1:size(test_x, 3)]
testdata = [(x, y) for (x, y) in zip(test_x, test_y)]
return traindata, testdata
end
function vectorize(n)
ev = zeros(10,1)
ev[n+1] = 1
return ev
end
function main()
net = network([784, 30, 10])
traindata, testdata = loaddata()
SGDtrain(net, traindata, 10, 10, 1.25, testdata)
end
Run Code Online (Sandbox Code Playgroud)
Chr*_*kas 34
我开始运行你的代码:
7.110379 seconds (1.37 M allocations: 20.570 GiB, 19.81%gc time)
Epoch 1: 7960/10000
6.147297 seconds (1.27 M allocations: 20.566 GiB, 18.33%gc time)
Run Code Online (Sandbox Code Playgroud)
哎呀,每个纪元分配21GiB?那是你的问题.它经常打垃圾收集,计算机拥有的内存越少,它就越多.让我们来解决这个问题.
主要思想是预先分配缓冲区,然后修改数组而不是创建新数组.在您的代码中,您从以下开始backprop:
?_b = copy(net.biases)
?_w = copy(net.weights)
len = length(net.sizearr)
activation = x
activations = Array{Array{Float64,1}}(len)
activations[1] = x
zs = copy(net.biases)
Run Code Online (Sandbox Code Playgroud)
你正在使用的事实copy意味着你应该预先分配东西!所以让我们从zs和开始吧activations.我扩展了您的网络以保存这些缓存数组:
mutable struct network
num_layers::Int64
sizearr::Array{Int64,1}
biases::Array{Array{Float64,1},1}
weights::Array{Array{Float64,2},1}
zs::Array{Array{Float64,1},1}
activations::Array{Array{Float64,1},1}
end
function network(sizes)
num_layers = length(sizes)
sizearr = sizes
biases = [randn(y) for y in sizes[2:end]]
weights = [randn(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
zs = [randn(y) for y in sizes[2:end]]
activations = [randn(y) for y in sizes[1:end]]
network(num_layers, sizearr, biases, weights, zs, activations)
end
Run Code Online (Sandbox Code Playgroud)
然后我改变你backprop的使用那些缓存:
function backprop(net::network, x, y)
?_b = copy(net.biases)
?_w = copy(net.weights)
len = length(net.sizearr)
activations = net.activations
activations[1] .= x
zs = net.zs
for i in 1:len-1
b = net.biases[i]; w = net.weights[i];
z = zs[i]; activation = activations[i+1]
z .= w*activations[i] .+ b
activation .= ?.(z)
end
? = (activations[end] - y) .* ?_prime.(zs[end])
?_b[end] = ?[:]
?_w[end] = ?*activations[end-1]'
for l in 1:net.num_layers-2
z = zs[end-l]
? = net.weights[end-l+1]'? .* ?_prime.(z)
?_b[end-l] = ?[:]
?_w[end-l] = ?*activations[end-l-1]'
end
return (?_b, ?_w)
end
Run Code Online (Sandbox Code Playgroud)
这导致分配的内存大幅减少.但还有很多事情要做.首先,让我们改变*成A_mul_B!.此函数是矩阵乘法,它写入数组C(A_mul_B!(C,A,B))而不是创建新矩阵,这可以大大减少您的内存分配.所以我做了:
for l in 1:net.num_layers-2
z = zs[end-l]
? = net.weights[end-l+1]'? .* ?_prime.(z)
?_b[end-l] .= vec(?)
atransp = activations[end-l-1]'
A_mul_B!(?_w[end-l],?,atransp)
end
Run Code Online (Sandbox Code Playgroud)
但'我使用的不是分配,而是reshape因为我只想要一个视图:
for l in 1:net.num_layers-2
z = zs[end-l]
? = net.weights[end-l+1]'? .* ?_prime.(z)
?_b[end-l] .= vec(?)
atransp = reshape(activations[end-l-1],1,length(activations[end-l-1]))
A_mul_B!(?_w[end-l],?,atransp)
end
Run Code Online (Sandbox Code Playgroud)
(它也会更快地打开OpenBLAS.虽然这可能与MKL不同).但是你还在复制
?_b = copy(net.biases)
?_w = copy(net.weights)
Run Code Online (Sandbox Code Playgroud)
并且你在每一步都分配了一堆δ,所以我做的下一个更改预先分配了这些并完成所有操作(它看起来就像之前的更改).
然后我做了一些剖析.在Juno,这只是:
@profile main()
Juno.profiler()
Run Code Online (Sandbox Code Playgroud)
或者如果您不使用Juno,您可以使用ProfileView.jl替换第二部分.我有:
所以大部分时间都花在了BLAS上,但是有一个问题.看到像这样的操作?_w += ?_?_w正在创建一堆矩阵!相反,我们希望通过其变化矩阵循环并就地更新每个矩阵.这扩展为:
function update_batch(net::network, batch, ?)
?_b = net.?_b
?_w = net.?_w
for i in 1:length(?_b)
fill!(?_b[i],0.0)
end
for i in 1:length(?_w)
fill!(?_w[i],0.0)
end
for (x, y) in batch
?_?_b, ?_?_w = backprop(net, x, y)
?_b .+= ?_?_b
for i in 1:length(?_w)
?_w[i] .+= ?_?_w[i]
end
end
for i in 1:length(?_b)
net.biases[i] .-= (?/length(batch)).*?_b[i]
end
for i in 1:length(?_w)
net.weights[i] .-= (?/length(batch)).*?_w[i]
end
end
Run Code Online (Sandbox Code Playgroud)
我在同一行上做了一些更改,我的最终代码如下:
mutable struct network
num_layers::Int64
sizearr::Array{Int64,1}
biases::Array{Array{Float64,1},1}
weights::Array{Array{Float64,2},1}
weights_transp::Array{Array{Float64,2},1}
zs::Array{Array{Float64,1},1}
activations::Array{Array{Float64,1},1}
?_b::Array{Array{Float64,1},1}
?_w::Array{Array{Float64,2},1}
?_?_b::Array{Array{Float64,1},1}
?_?_w::Array{Array{Float64,2},1}
?s::Array{Array{Float64,2},1}
end
function network(sizes)
num_layers = length(sizes)
sizearr = sizes
biases = [randn(y) for y in sizes[2:end]]
weights = [randn(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
weights_transp = [randn(x, y) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
zs = [randn(y) for y in sizes[2:end]]
activations = [randn(y) for y in sizes[1:end]]
?_b = [zeros(y) for y in sizes[2:end]]
?_w = [zeros(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
?_?_b = [zeros(y) for y in sizes[2:end]]
?_?_w = [zeros(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
?s = [zeros(y,1) for y in sizes[2:end]]
network(num_layers, sizearr, biases, weights, weights_transp, zs, activations,?_b,?_w,?_?_b,?_?_w,?s)
end
function update_batch(net::network, batch, ?)
?_b = net.?_b
?_w = net.?_w
for i in 1:length(?_b)
?_b[i] .= 0.0
end
for i in 1:length(?_w)
?_w[i] .= 0.0
end
?_?_b = net.?_?_b
?_?_w = net.?_?_w
for (x, y) in batch
backprop!(net, x, y)
for i in 1:length(?_b)
?_b[i] .+= ?_?_b[i]
end
for i in 1:length(?_w)
?_w[i] .+= ?_?_w[i]
end
end
for i in 1:length(?_b)
net.biases[i] .-= (?/length(batch)).*?_b[i]
end
for i in 1:length(?_w)
net.weights[i] .-= (?/length(batch)).*?_w[i]
end
end
function backprop!(net::network, x, y)
?_b = net.?_?_b
?_w = net.?_?_w
len = length(net.sizearr)
activations = net.activations
activations[1] .= x
zs = net.zs
?s = net.?s
for i in 1:len-1
b = net.biases[i]; w = net.weights[i];
z = zs[i]; activation = activations[i+1]
A_mul_B!(z,w,activations[i])
z .+= b
activation .= ?.(z)
end
? = ?s[end]
? .= (activations[end] .- y) .* ?_prime.(zs[end])
?_b[end] .= vec(?)
atransp = reshape(activations[end-1],1,length(activations[end-1]))
A_mul_B!(?_w[end],?,atransp)
for l in 1:net.num_layers-2
z = zs[end-l]
transpose!(net.weights_transp[end-l+1],net.weights[end-l+1])
A_mul_B!(?s[end-l],net.weights_transp[end-l+1],?)
? = ?s[end-l]
? .*= ?_prime.(z)
?_b[end-l] .= vec(?)
atransp = reshape(activations[end-l-1],1,length(activations[end-l-1]))
A_mul_B!(?_w[end-l],?,atransp)
end
return nothing
end
Run Code Online (Sandbox Code Playgroud)
其他一切都保持不变.为了看到我完成了,我加入@time了backprop电话并得到:
0.000070 seconds (8 allocations: 352 bytes)
0.000066 seconds (8 allocations: 352 bytes)
0.000090 seconds (8 allocations: 352 bytes)
Run Code Online (Sandbox Code Playgroud)
所以这是不分配的.我添加@time到for (x, y) in batch循环中并得到
0.000636秒(80个分配:3.438 KiB)0.000610秒(80个分配:3.438 KiB)0.000624秒(80个分配:3.438 KiB)
所以告诉我基本上剩下的所有分配来自迭代器(这可以改进,但可能不会改善时间).所以最后的时机是:
Epoch 2: 8428/10000
4.005540 seconds (586.87 k allocations: 23.925 MiB)
Epoch 1: 8858/10000
3.488674 seconds (414.49 k allocations: 17.082 MiB)
Epoch 2: 9104/10000
Run Code Online (Sandbox Code Playgroud)
这在我的机器上快了近2倍,但每个循环的内存分配减少了1200倍.这意味着在RAM越来越小的机器上,这种方法应该做得更好(我的桌面有相当多的内存,所以它真的不在乎太多!).
最终的配置文件显示大部分时间都在A_mul_B!通话中,因此几乎所有内容都受到我的OpenBLAS速度的限制,所以我完成了.我可以做的一些额外的事情是多线程其他一些循环,但给分析得到的回报将很小,所以我会把它留给你(基本上只是把它Threads.@threads放在循环上?_w[i] .+= ?_?_w[i]).
希望这不仅可以改进您的代码,还可以教授如何分析,预分配,使用就地操作以及考虑性能.