Sil*_*ost 931 python text-files line-count
I need to get the line count of a large file (hundreds of thousands of lines) in Python. What is the most efficient way, both memory- and time-wise?
Currently I do this:
def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1
Is it possible to do better?
Kyl*_*yle 578
One line, and probably quite fast:
num_lines = sum(1 for line in open('myfile.txt'))
Yuv*_*dam 329
You can't get any better than that.
After all, any solution will have to read the entire file, figure out how many \n characters it contains, and return that result.
Do you have a better way of doing that without reading the entire file? Not sure... The best solution will always be I/O-bound; the best you can do is make sure you don't use unnecessary memory, and it looks like you have that covered.
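As a minimal sketch of that point (my illustration, not part of the original answer; the function name and chunk size are arbitrary), streaming the file in fixed-size binary chunks keeps memory use constant no matter how large the file is:

def count_newlines(path, chunk_size=1024 * 1024):
    # Read 1 MiB at a time; only one chunk is ever held in memory.
    lines = 0
    with open(path, 'rb') as f:
        chunk = f.read(chunk_size)
        while chunk:
            lines += chunk.count(b'\n')
            chunk = f.read(chunk_size)
    return lines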
Rya*_*rom 193
I believed that a memory-mapped file would be the fastest solution. I tried four functions: the function posted by the OP (opcount); a simple iteration over the lines in the file (simplecount); readline with a memory-mapped file (mmap) (mapcount); and the buffer read solution posted by Mykola Kharechko (bufcount).
I ran each function five times and calculated the average run time for a 1.2-million-line text file.
Windows XP, Python 2.5, 2 GB RAM, 2 GHz AMD processor.
Here are my results:
mapcount : 0.465599966049
simplecount : 0.756399965286
bufcount : 0.546800041199
opcount : 0.718600034714
Edit: numbers for Python 2.6:
mapcount : 0.471799945831
simplecount : 0.634400033951
bufcount : 0.468800067902
opcount : 0.602999973297
So the buffer read strategy seems to be the fastest for Windows/Python 2.6.
Here is the code:
from __future__ import with_statement
import time
import mmap
import random
from collections import defaultdict

def mapcount(filename):
    f = open(filename, "r+")
    buf = mmap.mmap(f.fileno(), 0)
    lines = 0
    readline = buf.readline
    while readline():
        lines += 1
    return lines

def simplecount(filename):
    lines = 0
    for line in open(filename):
        lines += 1
    return lines

def bufcount(filename):
    f = open(filename)
    lines = 0
    buf_size = 1024 * 1024
    read_f = f.read  # loop optimization
    buf = read_f(buf_size)
    while buf:
        lines += buf.count('\n')
        buf = read_f(buf_size)
    return lines

def opcount(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

counts = defaultdict(list)

for i in range(5):
    for func in [mapcount, simplecount, bufcount, opcount]:
        start_time = time.time()
        assert func("big_file.txt") == 1209138
        counts[func].append(time.time() - start_time)

for key, vals in counts.items():
    print key.__name__, ":", sum(vals) / float(len(vals))
Mic*_*con 116
I had to post this on a similar question until my reputation score jumped a bit (thanks to whoever bumped me!).
All of these solutions ignore one way to make this run considerably faster, namely by using the unbuffered (raw) interface, using bytearrays, and doing your own buffering. (This only applies in Python 3. In Python 2, the raw interface may or may not be used by default, but in Python 3 you'll default into Unicode.)
Using a modified version of the timing tool, I believe the following code is faster (and marginally more pythonic) than any of the solutions offered:
def rawcount(filename):
    f = open(filename, 'rb')
    lines = 0
    buf_size = 1024 * 1024
    read_f = f.raw.read
    buf = read_f(buf_size)
    while buf:
        lines += buf.count(b'\n')
        buf = read_f(buf_size)
    return lines
Using a separate generator function, this runs even faster:
def _make_gen(reader):
    b = reader(1024 * 1024)
    while b:
        yield b
        b = reader(1024 * 1024)

def rawgencount(filename):
    f = open(filename, 'rb')
    f_gen = _make_gen(f.raw.read)
    return sum(buf.count(b'\n') for buf in f_gen)
This can be done completely with generator expressions inline using itertools, but it gets pretty weird looking:
from itertools import (takewhile, repeat)

def rawincount(filename):
    f = open(filename, 'rb')
    bufgen = takewhile(lambda x: x, (f.raw.read(1024 * 1024) for _ in repeat(None)))
    return sum(buf.count(b'\n') for buf in bufgen)
Here are my timings:
function average, s min, s ratio
rawincount 0.0043 0.0041 1.00
rawgencount 0.0044 0.0042 1.01
rawcount 0.0048 0.0045 1.09
bufcount 0.008 0.0068 1.64
wccount 0.01 0.0097 2.35
itercount 0.014 0.014 3.41
opcount 0.02 0.02 4.83
kylecount 0.021 0.021 5.05
simplecount 0.022 0.022 5.25
mapcount 0.037 0.031 7.46
Óla*_*age 85
You can execute a subprocess and run wc -l filename:
import subprocess

def file_len(fname):
    p = subprocess.Popen(['wc', '-l', fname], stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])
Mar*_*ark 42
Here is a Python program that uses the multiprocessing library to distribute the line counting across machines/cores. My test improved counting a 20-million-line file from 26 seconds to 7 seconds using an 8-core Windows 64 server. Note: not using memory mapping makes things much slower.
import multiprocessing, sys, time, os, mmap
import logging, logging.handlers

def init_logger(pid):
    console_format = 'P{0} %(levelname)s %(message)s'.format(pid)
    logger = logging.getLogger()  # New logger at root level
    logger.setLevel(logging.INFO)
    logger.handlers.append(logging.StreamHandler())
    logger.handlers[0].setFormatter(logging.Formatter(console_format, '%d/%m/%y %H:%M:%S'))

def getFileLineCount(queues, pid, processes, file1):
    init_logger(pid)
    logging.info('start')

    physical_file = open(file1, "r")
    # mmap.mmap(fileno, length[, tagname[, access[, offset]]]
    m1 = mmap.mmap(physical_file.fileno(), 0, access=mmap.ACCESS_READ)

    # work out file size to divide up line counting
    fSize = os.stat(file1).st_size
    chunk = (fSize / processes) + 1

    lines = 0

    # get where I start and stop
    _seedStart = chunk * (pid)
    _seekEnd = chunk * (pid + 1)
    seekStart = int(_seedStart)
    seekEnd = int(_seekEnd)

    if seekEnd < int(_seekEnd + 1):
        seekEnd += 1

    if _seedStart < int(seekStart + 1):
        seekStart += 1

    if seekEnd > fSize:
        seekEnd = fSize

    # find where to start
    if pid > 0:
        m1.seek(seekStart)
        # read next line
        l1 = m1.readline()  # need to use readline with memory mapped files
        seekStart = m1.tell()

    # tell previous rank my seek start to make their seek end
    if pid > 0:
        queues[pid - 1].put(seekStart)
    if pid < processes - 1:
        seekEnd = queues[pid].get()

    m1.seek(seekStart)
    l1 = m1.readline()

    while len(l1) > 0:
        lines += 1
        l1 = m1.readline()
        if m1.tell() > seekEnd or len(l1) == 0:
            break

    logging.info('done')
    # add up the results
    if pid == 0:
        for p in range(1, processes):
            lines += queues[0].get()
        queues[0].put(lines)  # the total lines counted
    else:
        queues[0].put(lines)

    m1.close()
    physical_file.close()

if __name__ == '__main__':
    init_logger('main')
    if len(sys.argv) > 1:
        file_name = sys.argv[1]
    else:
        logging.fatal('parameters required: file-name [processes]')
        exit()

    t = time.time()
    processes = multiprocessing.cpu_count()
    if len(sys.argv) > 2:
        processes = int(sys.argv[2])
    queues = []  # a queue for each process
    for pid in range(processes):
        queues.append(multiprocessing.Queue())
    jobs = []
    prev_pipe = 0
    for pid in range(processes):
        p = multiprocessing.Process(target=getFileLineCount, args=(queues, pid, processes, file_name,))
        p.start()
        jobs.append(p)

    jobs[0].join()  # wait for counting to finish
    lines = queues[0].get()
    logging.info('finished {} Lines:{}'.format(time.time() - t, lines))
Dan*_*Lee 13
I would use Python's file object method readlines, as follows:
with open(input_file) as foo:
    lines = len(foo.readlines())
This opens the file, creates a list of the lines in the file, counts the length of that list, saves it to a variable, and closes the file again. (Note that readlines loads the whole file into memory at once, so it is not memory-efficient for very large files.)
pki*_*kit 10
def file_len(full_path):
    """ Count number of lines in a file. """
    f = open(full_path)
    nr_of_lines = sum(1 for line in f)
    f.close()
    return nr_of_lines
rad*_*tek 10
Here is what I use, and it seems pretty clean:
import subprocess

def count_file_lines(file_path):
    """
    Counts the number of lines in a file using the wc utility.
    :param file_path: path to file
    :return: int, number of lines
    """
    num = subprocess.check_output(['wc', '-l', file_path])
    return int(num.split()[0])  # split() tolerates leading whitespace and bytes output
Update: this is marginally faster than using pure Python, but at the cost of memory usage: the subprocess will fork a new process with the same memory footprint as the parent process while it executes the command.
I got a small (4-8%) improvement with this version, which re-uses a constant buffer, so it should avoid any memory or GC overhead:
lines = 0
buf = bytearray(2048)
with open(filename, 'rb') as f:  # readinto needs a binary-mode file
    n = f.readinto(buf)
    while n > 0:
        lines += buf[:n].count(b'\n')  # count only the bytes actually read
        n = f.readinto(buf)
You can play around with the buffer size and maybe see a little improvement.
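For instance, a quick sweep with timeit (the helper function and file name here are hypothetical) shows where the sweet spot lies on a given machine:

import timeit

def count_lines(filename, buf_len):
    # Same readinto approach as above, parameterized by buffer size.
    lines = 0
    buf = bytearray(buf_len)
    with open(filename, 'rb') as f:
        n = f.readinto(buf)
        while n > 0:
            lines += buf[:n].count(b'\n')
            n = f.readinto(buf)
    return lines

for buf_len in (2048, 2 ** 16, 2 ** 20):
    t = timeit.timeit(lambda: count_lines('big_file.txt', buf_len), number=3)
    print(buf_len, t)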
num_lines = sum(1 for line in open('my_file.txt'))
is probably your best bet; an alternative is:
num_lines = len(open('my_file.txt').read().splitlines())
Here is a comparison of the performance of the two:
In [20]: timeit sum(1 for line in open('Charts.ipynb'))
100000 loops, best of 3: 9.79 µs per loop
In [21]: timeit len(open('Charts.ipynb').read().splitlines())
100000 loops, best of 3: 12 µs per loop
One-line solution:
import os
os.system("wc -l filename")
My snippet:
os.system('wc -l *.txt')
>>> os.system('wc -l *.txt')
0 bar.txt
1000 command.txt
3 test_file.txt
1003 total
A one-line bash solution similar to this answer, using the modern subprocess.check_output function:
import subprocess

def line_count(file):
    return int(subprocess.check_output('wc -l {}'.format(file), shell=True).split()[0])
This is the fastest thing I have found using pure Python. You can use any amount of memory you want by setting the buffer size, though 2**16 appears to be the sweet spot on my machine.
from functools import partial

buffer = 2 ** 16
with open(myfile) as f:
    print sum(x.count('\n') for x in iter(partial(f.read, buffer), ''))
I found the answer in Why is reading lines from stdin much slower in C++ than Python? and tweaked it just a tiny bit. It's a very good read for understanding how to count lines quickly, though wc -l is still about 75% faster than anything else.
Just to complete the methods above, I tried a variant with the fileinput module:
import fileinput as fi

def filecount(fname):
    for line in fi.input(fname):
        pass
    return fi.lineno()
and passed a 60-million-line file to all the methods above:
mapcount : 6.1331050396
simplecount : 4.588793993
opcount : 4.42918205261
filecount : 43.2780818939
bufcount : 0.170812129974
It's a little surprising to me that fileinput is that bad, and scales so much worse than all the other methods...
After a perfplot analysis, one has to recommend the buffered read solution:
def buf_count_newlines_gen(fname):
    def _make_gen(reader):
        b = reader(2 ** 16)
        while b:
            yield b
            b = reader(2 ** 16)

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count
It's fast and memory-efficient. Most other solutions are about 20 times slower. The Unix command-line tool wc also does well, but only for very large files.
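Usage is a single call (the file name here is just an example):

num_lines = buf_count_newlines_gen("big_file.txt")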
Code to reproduce the plot:
import mmap
import subprocess
from functools import partial

import perfplot

def setup(n):
    fname = "t.txt"
    with open(fname, "w") as f:
        for i in range(n):
            f.write(str(i) + "\n")
    return fname

def for_enumerate(fname):
    i = 0
    with open(fname) as f:
        for i, _ in enumerate(f):
            pass
    return i + 1

def sum1(fname):
    return sum(1 for _ in open(fname))

def mmap_count(fname):
    with open(fname, "r+") as f:
        buf = mmap.mmap(f.fileno(), 0)

    lines = 0
    while buf.readline():
        lines += 1
    return lines

def for_open(fname):
    lines = 0
    for _ in open(fname):
        lines += 1
    return lines

def buf_count_newlines(fname):
    lines = 0
    buf_size = 2 ** 16
    with open(fname) as f:
        buf = f.read(buf_size)
        while buf:
            lines += buf.count("\n")
            buf = f.read(buf_size)
    return lines

def buf_count_newlines_gen(fname):
    def _make_gen(reader):
        b = reader(2 ** 16)
        while b:
            yield b
            b = reader(2 ** 16)

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count

def wc_l(fname):
    return int(subprocess.check_output(["wc", "-l", fname]).split()[0])

def sum_partial(fname):
    with open(fname) as f:
        count = sum(x.count("\n") for x in iter(partial(f.read, 2 ** 16), ""))
    return count

def read_count(fname):
    return open(fname).read().count("\n")

b = perfplot.bench(
    setup=setup,
    kernels=[
        for_enumerate,
        sum1,
        mmap_count,
        for_open,
        wc_l,
        buf_count_newlines,
        buf_count_newlines_gen,
        sum_partial,
        read_count,
    ],
    n_range=[2 ** k for k in range(27)],
    xlabel="num lines",
)
b.save("out.png")
b.show()
For me, this variant will be the fastest:
#!/usr/bin/env python

def main():
    f = open('filename')
    lines = 0
    buf_size = 1024 * 1024
    read_f = f.read  # loop optimization
    buf = read_f(buf_size)
    while buf:
        lines += buf.count('\n')
        buf = read_f(buf_size)
    print lines

if __name__ == '__main__':
    main()
Reason: buffering is faster than reading line by line, and string.count is also very fast.
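A small sketch (the file name is hypothetical) that illustrates the point with timeit: counting newlines over a whole in-memory buffer is much faster than iterating it line by line:

import timeit

data = open('big_file.txt').read()  # hypothetical test file

# str.count over the whole buffer vs. iterating the lines one at a time
print(timeit.timeit(lambda: data.count('\n'), number=10))
print(timeit.timeit(lambda: sum(1 for _ in data.splitlines()), number=10))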
This code is shorter and clearer. It's probably the best way:
num_lines = open('yourfile.ext').read().count('\n')