我很难使用 Julia 读取大型文本文件(968MB,870 万行)。每一行是这样的:
0.3295747E+01 0.3045123E+01 0.3325542E+01 0.1185458E+01 -0.4827727E-05 -0.1033694E-04 0.3306459E-03
Run Code Online (Sandbox Code Playgroud)
我使用 parse.(Float64, split(line)) 将每一行转换为数字。
"""
    openfile(path::AbstractString="data.dat") -> Vector{String}

Read the text file at `path` and return all of its lines (without line
terminators) as a vector of strings.

The file handle is managed by the function form of `open`, so it is closed
even if reading throws. Calling `openfile()` with no argument reads
`"data.dat"` from the current working directory.
"""
function openfile(path::AbstractString="data.dat")
    # `open(f, args...)` closes the stream once `f` returns or throws.
    return open(readlines, path, "r")
end
"""
    parseline(lines::AbstractArray{<:AbstractString})

Split every element of `lines` on whitespace and parse each field as a
`Float64`. The parsed values are discarded; the function exists to measure
the cost of `split` + `parse` and always returns `nothing`.

Accepts any array of strings (for example `Vector{String}` or the
`Vector{SubString{String}}` produced by `split`).

# Throws
- `ArgumentError` if a field is not a valid `Float64` literal.
"""
function parseline(lines::AbstractArray{<:AbstractString})
    for line in lines
        # The result is intentionally not stored: only the parsing work matters.
        parse.(Float64, split(line))
    end
    return nothing
end
import Base: tryparse_internal
"""
    myreadfile(str::String, T::Type, dlm=' ', eol='\\n') -> Vector{T}

Scan `str` once and parse every token into a value of type `T`, returning
all values in order of appearance as a flat vector.

A token is a maximal run of characters that are neither `dlm` nor `eol`.
Consecutive separators produce no empty tokens, and a final token that is
not followed by a separator is parsed as well.

# Arguments
- `str`: the full text to parse (e.g. `read(path, String)`).
- `T`: element type to parse; anything supported by `parse(T, ::AbstractString)`.
- `dlm`: field delimiter character.
- `eol`: line terminator character.

# Throws
- `ArgumentError` if a token is not a valid representation of `T`.
"""
function myreadfile(str::String, ::Type{T}, dlm=' ', eol='\n') where {T}
    data = T[]
    # Byte index of the first character of the token being scanned;
    # 0 means the scanner is currently between tokens.
    start = 0
    # `pairs` yields byte indices, which stay valid for non-ASCII text
    # (a character counter from `enumerate` would not).
    for (idx, chr) in pairs(str)
        if chr != eol && chr != dlm
            start == 0 && (start = idx)
        elseif start != 0
            # A separator ends the token; `SubString` avoids copying the text.
            push!(data, parse(T, SubString(str, start, prevind(str, idx))))
            start = 0
        end
    end
    # Flush a trailing token that is not terminated by a separator.
    start != 0 && push!(data, parse(T, SubString(str, start)))
    return data
end
# Benchmark script: times four ways of getting the numbers out of data.dat.
# NOTE(review): if these are the first calls in the session, each `@time`
# figure also includes JIT compilation of the called function.

# 1. Read every line of data.dat into memory; `lines` is reused below.
@time lines = openfile()
# 2. Split and parse each stored line with parseline().
@time parseline(lines)
using DelimitedFiles
# 3. Read the file with readdlm from the DelimitedFiles standard library.
@time readdlm("data.dat")
# 4. Read the whole file into one String and parse it with myreadfile().
@time myreadfile(read("data.dat",String), Float64)
Run Code Online (Sandbox Code Playgroud)
并得到
3.584656 seconds (17.59 M allocations: 1.240 GiB, 28.44% gc time)
78.099010 seconds (276.14 M allocations: 6.080 GiB, 1.50% gc time)
52.504199 seconds (185.93 M allocations: 3.960 GiB, 0.53% gc time)
46.085581 seconds (61.70 M allocations: 2.311 GiB, 0.28% gc time)
Run Code Online (Sandbox Code Playgroud)
与fortran代码比较
! Fortran reference benchmark: times reading data.dat record by record.
! NOTE(review): declarations of start, finish, i and a..g are not shown here.
! Record the CPU time before the read loop.
call cpu_time(start)
! Open the existing file data.dat on unit 10.
open(10, file="data.dat",status="old")
! Read 8773632 records with list-directed input, seven values per record;
! a..g are overwritten on every iteration, so nothing is accumulated.
do i=1, 8773632
read(10,*) a, b, c, d, e, f, g
end do
! Record the CPU time after the loop and print the difference.
call cpu_time(finish)
print '("Time = ",f6.3," seconds.")',finish-start
Run Code Online (Sandbox Code Playgroud)
这是 Time = 14.812 seconds.
似乎 Julia 花更长的时间做同样的事情。有没有更好的方法将字符串转换为浮点数?split 和 parse 太慢了。
正如上面的评论所说,最快的方法很可能是 readdlm 函数。它会返回一个矩阵,这很可能正是您想要的结果。
如果您确实想手动完成,通常最好逐行读取文件并逐行处理,而不是先把所有内容存储在一个大型中间对象中,因为内存读写速度很慢。例如:
# Stream data.dat line by line and collect one vector of Float64 values per
# line; the `do` form of `open` closes the file when the block finishes.
ret = open("data.dat", "r") do io
    map(eachline(io)) do row
        parse.(Float64, split(row))
    end
end
Run Code Online (Sandbox Code Playgroud)
无论如何,它可能不会比你的最后一行快。