Python - 字典是否很难找到每个字符的频率？

Question

Python - 字典是否很难找到每个字符的频率？

psi*_*lia 24 python algorithm probability frequency

我试图使用O(n)复杂度的算法在任何给定文本中找到每个符号的频率.我的算法看起来像:

s = len(text) 
P = 1.0/s 
freqs = {} 
for char in text: 
    try: 
       freqs[char]+=P 
    except: 
       freqs[char]=P

Run Code Online (Sandbox Code Playgroud)

但我怀疑这个字典方法足够快,因为它取决于字典方法的底层实现.这是最快的方法吗？

更新:如果使用集合和整数,速度不会增加.这是因为该算法已经具有O(n)复杂度,因此不可能实现必要的加速.

例如,1MB文本的结果:

without collections:
real    0m0.695s

with collections:
real    0m0.625s

Run Code Online (Sandbox Code Playgroud)

Answer 1

jfs*_*jfs 44

性能比较

注意:表中的时间不包括加载文件所需的时间.

| approach       | american-english, |      big.txt, | time w.r.t. defaultdict |
|                |     time, seconds | time, seconds |                         |
|----------------+-------------------+---------------+-------------------------|
| Counter        |             0.451 |         3.367 |                     3.6 |
| setdefault     |             0.348 |         2.320 |                     2.5 |
| list           |             0.277 |         1.822 |                       2 |
| try/except     |             0.158 |         1.068 |                     1.2 |
| defaultdict    |             0.141 |         0.925 |                       1 |
| numpy          |             0.012 |         0.076 |                   0.082 |
| S.Mark's ext.  |             0.003 |         0.019 |                   0.021 |
| ext. in Cython |             0.001 |         0.008 |                  0.0086 |
#+TBLFM: $4=$3/@7$3;%.2g

Run Code Online (Sandbox Code Playgroud)

使用的文件:'/usr/share/dict/american-english'和'big.txt'.

比较'Counter','setdefault','list','try/except','defaultdict','numpy','cython'和@ S.Mark的解决方案的脚本位于http:// gist. github.com/347000

最快的解决方案是用Cython编写的Python扩展:

import cython

@cython.locals(
    chars=unicode,
    i=cython.Py_ssize_t,
    L=cython.Py_ssize_t[0x10000])
def countchars_cython(chars):
    for i in range(0x10000): # unicode code points > 0xffff are not supported
        L[i] = 0

    for c in chars:
        L[c] += 1

    return {unichr(i): L[i] for i in range(0x10000) if L[i]}

Run Code Online (Sandbox Code Playgroud)

以前的比较:

* python (dict) : 0.5  seconds
* python (list) : 0.5  (ascii) (0.2 if read whole file in memory)
* perl          : 0.5
* python (numpy): 0.07 
* c++           : 0.05
* c             : 0.008 (ascii)

Run Code Online (Sandbox Code Playgroud)

输入数据:

$ tail /usr/share/dict/american-english
éclat's
élan
élan's
émigré
émigrés
épée
épées
étude
étude's
études

$ du -h /usr/share/dict/american-english
912K    /usr/share/dict/american-english

Run Code Online (Sandbox Code Playgroud)

python(Counter):0.5秒

#!/usr/bin/env python3.1
import collections, fileinput, textwrap

chars = (ch for word in fileinput.input() for ch in word.rstrip())
# faster (0.4s) but less flexible: chars = open(filename).read()
print(textwrap.fill(str(collections.Counter(chars)), width=79))

Run Code Online (Sandbox Code Playgroud)

运行:

$ time -p python3.1 count_char.py /usr/share/dict/american-english

Run Code Online (Sandbox Code Playgroud)

Counter({'e': 87823, 's': 86620, 'i': 66548, 'a': 62778, 'n': 56696, 'r':
56286, 't': 51588, 'o': 48425, 'l': 39914, 'c': 30020, 'd': 28068, 'u': 25810,
"'": 24511, 'g': 22262, 'p': 20917, 'm': 20747, 'h': 18453, 'b': 14137, 'y':
12367, 'f': 10049, 'k': 7800, 'v': 7573, 'w': 6924, 'z': 3088, 'x': 2082, 'M':
1686, 'C': 1549, 'S': 1515, 'q': 1447, 'B': 1387, 'j': 1376, 'A': 1345, 'P':
974, 'L': 912, 'H': 860, 'T': 858, 'G': 811, 'D': 809, 'R': 749, 'K': 656, 'E':
618, 'J': 539, 'N': 531, 'W': 507, 'F': 502, 'O': 354, 'I': 344, 'V': 330, 'Z':
150, 'Y': 140, 'é': 128, 'U': 117, 'Q': 63, 'X': 42, 'è': 29, 'ö': 12, 'ü': 12,
'ó': 10, 'á': 10, 'ä': 7, 'ê': 6, 'â': 6, 'ñ': 6, 'ç': 4, 'å': 3, 'û': 3, 'í':
2, 'ô': 2, 'Å': 1})
real 0.44
user 0.43
sys 0.01

perl:0.5秒

time -p perl -MData::Dumper -F'' -lanwe'$c{$_}++ for (@F);
END{ $Data::Dumper::Terse = 1; $Data::Dumper::Indent = 0; print Dumper(\%c) }
' /usr/share/dict/american-english

Run Code Online (Sandbox Code Playgroud)

输出:

{'S' => 1515,'K' => 656,'' => 29,'d' => 28068,'Y' => 140,'E' => 618,'y' => 12367,'g' => 22262,'e' => 87823,'' => 2,'J' => 539,'' => 241,'' => 3,'' => 6,'' => 4,'' => 128,'D' => 809,'q' => 1447,'b' => 14137,'z' => 3088,'w' => 6924,'Q' => 63,'' => 10,'M' => 1686,'C' => 1549,'' => 10,'L' => 912,'X' => 42,'P' => 974,'' => 12,'\'' => 24511,'' => 6,'a' => 62778,'T' => 858,'N' => 531,'j' => 1376,'Z' => 150,'u' => 25810,'k' => 7800,'t' => 51588,'' => 6,'W' => 507,'v' => 7573,'s' => 86620,'B' => 1387,'H' => 860,'c' => 30020,'' => 12,'I' => 344,'' => 3,'G' => 811,'U' => 117,'F' => 502,'' => 2,'r' => 56286,'x' => 2082,'V' => 330,'h' => 18453,'f' => 10049,'' => 1,'i' => 66548,'A' => 1345,'O' => 354,'n' => 56696,'m' => 20747,'l' => 39914,'' => 7,'p' => 20917,'R' => 749,'o' => 48425}
real 0.51
user 0.49
sys 0.02

python(numpy):0.07秒

根据Ants Aasma的答案(修改为支持unicode):

#!/usr/bin/env python
import codecs, itertools, operator, sys
import numpy

filename = sys.argv[1] if len(sys.argv)>1 else '/usr/share/dict/american-english'

# ucs2 or ucs4 python?
dtype = {2: numpy.uint16, 4: numpy.uint32}[len(buffer(u"u"))]

# count ordinals
text = codecs.open(filename, encoding='utf-8').read()
a = numpy.frombuffer(text, dtype=dtype)
counts = numpy.bincount(a)

# pretty print
counts = [(unichr(i), v) for i, v in enumerate(counts) if v]
counts.sort(key=operator.itemgetter(1))
print ' '.join('("%s" %d)' % c for c in counts  if c[0] not in ' \t\n')

Run Code Online (Sandbox Code Playgroud)

输出:

("Å" 1) ("í" 2) ("ô" 2) ("å" 3) ("û" 3) ("ç" 4) ("â" 6) ("ê" 6) ("ñ" 6) ("ä" 7) ("á" 10) ("ó" 10) ("ö" 12) ("ü" 12) ("è" 29) ("X" 42) ("Q" 63) ("U" 117) ("é" 128) ("Y" 140) ("Z" 150) ("V" 330) ("I" 344) ("O" 354) ("F" 502) ("W" 507) ("N" 531) ("J" 539) ("E" 618) ("K" 656) ("R" 749) ("D" 809) ("G" 811) ("T" 858) ("H" 860) ("L" 912) ("P" 974) ("A" 1345) ("j" 1376) ("B" 1387) ("q" 1447) ("S" 1515) ("C" 1549) ("M" 1686) ("x" 2082) ("z" 3088) ("w" 6924) ("v" 7573) ("k" 7800) ("f" 10049) ("y" 12367) ("b" 14137) ("h" 18453) ("m" 20747) ("p" 20917) ("g" 22262) ("'" 24511) ("u" 25810) ("d" 28068) ("c" 30020) ("l" 39914) ("o" 48425) ("t" 51588) ("r" 56286) ("n" 56696) ("a" 62778) ("i" 66548) ("s" 86620) ("e" 87823)
real 0.07
user 0.06
sys 0.01

Run Code Online (Sandbox Code Playgroud)

c ++:0.05秒

// $ g++ *.cc -lboost_program_options 
// $ ./a.out /usr/share/dict/american-english    
#include <iostream>
#include <fstream>
#include <cstdlib> // exit

#include <boost/program_options/detail/utf8_codecvt_facet.hpp>
#include <boost/tr1/unordered_map.hpp>
#include <boost/foreach.hpp>

int main(int argc, char* argv[]) {
  using namespace std;

  // open input file
  if (argc != 2) {
    cerr << "Usage: " << argv[0] << " <filename>\n";
    exit(2);
  }
  wifstream f(argv[argc-1]); 

  // assume the file has utf-8 encoding
  locale utf8_locale(locale(""), 
      new boost::program_options::detail::utf8_codecvt_facet);
  f.imbue(utf8_locale); 

  // count characters frequencies
  typedef std::tr1::unordered_map<wchar_t, size_t> hashtable_t;  
  hashtable_t counts;
  for (wchar_t ch; f >> ch; )
    counts[ch]++;

  // print result
  wofstream of("output.utf8");
  of.imbue(utf8_locale);
  BOOST_FOREACH(hashtable_t::value_type i, counts) 
    of << "(" << i.first << " " << i.second << ") ";
  of << endl;
}

Run Code Online (Sandbox Code Playgroud)

结果:

$ cat output.utf8

Run Code Online (Sandbox Code Playgroud)

(í 2) (O 354) (P 974) (Q 63) (R 749) (S 1,515) (ñ 6) (T 858) (U 117) (ó 10) (ô 2) (V 330) (W 507) (X 42) (ö 12) (Y 140) (Z 150) (û 3) (ü 12) (a 62,778) (b 14,137) (c 30,020) (d 28,068) (e 87,823) (f 10,049) (g 22,262) (h 18,453) (i 66,548) (j 1,376) (k 7,800) (l 39,914) (m 20,747) (n 56,696) (o 48,425) (p 20,917) (q 1,447) (r 56,286) (s 86,620) (t 51,588) (u 25,810) (Å 1) (' 24,511) (v 7,573) (w 6,924) (x 2,082) (y 12,367) (z 3,088) (A 1,345) (B 1,387) (C 1,549) (á 10) (â 6) (D 809) (E 618) (F 502) (ä 7) (å 3) (G 811) (H 860) (ç 4) (I 344) (J 539) (è 29) (K 656) (é 128) (ê 6) (L 912) (M 1,686) (N 531)

c(ascii):0.0079秒

// $ gcc -O3 cc_ascii.c -o cc_ascii && time -p ./cc_ascii < input.txt
#include <stdio.h>

enum { N = 256 };
size_t counts[N];

int main(void) {
  // count characters
  int ch = -1;
  while((ch = getchar()) != EOF)
    ++counts[ch];

  // print result
  size_t i = 0;
  for (; i < N; ++i) 
    if (counts[i])
      printf("('%c' %zu) ", (int)i, counts[i]);
  return 0;
}

Run Code Online (Sandbox Code Playgroud)

+1:哇!这是一个非常彻底的比较,有趣的方法! (4认同)
确实很彻底.+1 (2认同)

Answer 2

YOU*_*YOU 16

如何在循环内避免浮点运算并在完成所有操作后执行此操作？

通过这种方式,你可以每次都做+1,它应该更快.

最好使用collections.defaultdict作为S.Lott建议.

freqs=collections.defaultdict(int)

for char in text: 
   freqs[char]+=1

Run Code Online (Sandbox Code Playgroud)

或者你可能想尝试,在Python 2.7+中的collections.Counter

>>> collections.Counter("xyzabcxyz")
Counter({'y': 2, 'x': 2, 'z': 2, 'a': 1, 'c': 1, 'b': 1})

Run Code Online (Sandbox Code Playgroud)

要么

您可以尝试psyco,它为python进行即时编译.你有循环,所以我认为你会用psyco获得一些性能提升

编辑1:

我做了一些基准基big.txt(〜6.5 MB),其中所用的拼写校正由彼得·诺维格

Text Length: 6488666

dict.get : 11.9060001373 s
93 chars {u' ': 1036511, u'$': 110, u'(': 1748, u',': 77675, u'0': 3064, u'4': 2417, u'8': 2527, u'<': 2, u'@': 8, ....

if char in dict : 9.71799993515 s
93 chars {u' ': 1036511, u'$': 110, u'(': 1748, u',': 77675, u'0': 3064, u'4': 2417, u'8': 2527, u'<': 2, u'@': 8, ....

dict try/catch : 7.35899996758 s
93 chars {u' ': 1036511, u'$': 110, u'(': 1748, u',': 77675, u'0': 3064, u'4': 2417, u'8': 2527, u'<': 2, u'@': 8, ....

collections.default : 7.29699993134 s
93 chars defaultdict(<type 'int'>, {u' ': 1036511, u'$': 110, u'(': 1748, u',': 77675, u'0': 3064, u'4': 2417, u'8': 2527, u'<': 2, u'@': 8, ....

Run Code Online (Sandbox Code Playgroud)

_{CPU规格:1.6GHz Intel Mobile Atom CPU}

根据它,dict.get是最慢和collections.defaultdict最快的,try/except也是快速的.

编辑2:

增加了collections.Counter基准测试,它比dict.get我的笔记本电脑慢了15秒

collections.Counter : 15.3439998627 s
93 chars Counter({u' ': 1036511, u'e': 628234, u't': 444459, u'a': 395872, u'o': 382683, u'n': 362397, u'i': 348464,

Run Code Online (Sandbox Code Playgroud)

Answer 3

YOU*_*YOU 10

我已经写了字符计数器C扩展到Python,看起来像300X的速度比collections.Counter和150倍的速度比collections.default(int)

C Char Counter : 0.0469999313354 s
93 chars {u' ': 1036511, u'$': 110, u'(': 1748, u',': 77675, u'0': 3064, u'4': 2417, u'8': 2527, u'<': 2, u'@': 8,

Run Code Online (Sandbox Code Playgroud)

这是Char Counter C扩展代码

static PyObject *
CharCounter(PyObject *self, PyObject *args, PyObject *keywds)
{
    wchar_t *t1;unsigned l1=0;

    if (!PyArg_ParseTuple(args,"u#",&t1,&l1)) return NULL;

    PyObject *resultList,*itemTuple;

    for(unsigned i=0;i<=0xffff;i++)char_counter[i]=0;

    unsigned chlen=0;

    for(unsigned i=0;i<l1;i++){
        if(char_counter[t1[i]]==0)char_list[chlen++]=t1[i];
        char_counter[t1[i]]++;
    }

    resultList = PyList_New(0);

    for(unsigned i=0;i<chlen;i++){
        itemTuple = PyTuple_New(2);

        PyTuple_SetItem(itemTuple, 0,PyUnicode_FromWideChar(&char_list[i],1));
        PyTuple_SetItem(itemTuple, 1,PyInt_FromLong(char_counter[char_list[i]]));

        PyList_Append(resultList, itemTuple);
        Py_DECREF(itemTuple);

    };

    return resultList;
}

Run Code Online (Sandbox Code Playgroud)

其中char_counter和char_list在模块级别进行malloc-ated,因此每次函数调用时都不需要malloc.

char_counter=(unsigned*)malloc(sizeof(unsigned)*0x10000);
char_list=(wchar_t*)malloc(sizeof(wchar_t)*0x10000);

Run Code Online (Sandbox Code Playgroud)

它返回一个带有元组的List

[(u'T', 16282), (u'h', 287323), (u'e', 628234), (u' ', 1036511), (u'P', 8946), (u'r', 303977), (u'o', 382683), ...

Run Code Online (Sandbox Code Playgroud)

要转换为dict格式,就dict()行了.

dict(CharCounter(text))

Run Code Online (Sandbox Code Playgroud)

PS:基准包括转换为dict的时间

CharCounter只接受Python Unicode String u"",如果文本是utf8,则需要.decode("utf8")提前做.

输入支持Unicode直到基本多语言平面(BMP) - 0x0000到0xFFFF

Answer 4

Tuo*_*nen 6

不,它不是最快的,因为您知道角色的范围有限,您可以使用列表和直接索引,使用字符的数字表示来存储频率.

Answer 5

Ign*_*ams 5

这是非常非常难以击败的dict.由于Python中的几乎所有东西都是基于字典的,所以它的调整非常高.

@ S.Lott:从理论上讲,只有渐近性能很重要.在实践中,大O符号隐藏的常数因素也很重要. (7认同)
@Daniel Stutsbach Philosoraptor同意. (2认同)

归档时间：	15 年，10 月前
查看次数：	5131 次
最近记录：	15 年前