Moh*_*shi 19 python algorithm performance
以下代码k为给定列表生成长度的所有分区(k子集分区).该算法可以在本主题中找到.
def algorithm_u(ns, m):
def visit(n, a):
ps = [[] for i in xrange(m)]
for j in xrange(n):
ps[a[j + 1]].append(ns[j])
return ps
def f(mu, nu, sigma, n, a):
if mu == 2:
yield visit(n, a)
else:
for v in f(mu - 1, nu - 1, (mu + sigma) % 2, n, a):
yield v
if nu == mu + 1:
a[mu] = mu - 1
yield visit(n, a)
while a[nu] > 0:
a[nu] = a[nu] - 1
yield visit(n, a)
elif nu > mu + 1:
if (mu + sigma) % 2 == 1:
a[nu - 1] = mu - 1
else:
a[mu] = mu - 1
if (a[nu] + sigma) % 2 == 1:
for v in b(mu, nu - 1, 0, n, a):
yield v
else:
for v in f(mu, nu - 1, 0, n, a):
yield v
while a[nu] > 0:
a[nu] = a[nu] - 1
if (a[nu] + sigma) % 2 == 1:
for v in b(mu, nu - 1, 0, n, a):
yield v
else:
for v in f(mu, nu - 1, 0, n, a):
yield v
def b(mu, nu, sigma, n, a):
if nu == mu + 1:
while a[nu] < mu - 1:
yield visit(n, a)
a[nu] = a[nu] + 1
yield visit(n, a)
a[mu] = 0
elif nu > mu + 1:
if (a[nu] + sigma) % 2 == 1:
for v in f(mu, nu - 1, 0, n, a):
yield v
else:
for v in b(mu, nu - 1, 0, n, a):
yield v
while a[nu] < mu - 1:
a[nu] = a[nu] + 1
if (a[nu] + sigma) % 2 == 1:
for v in f(mu, nu - 1, 0, n, a):
yield v
else:
for v in b(mu, nu - 1, 0, n, a):
yield v
if (mu + sigma) % 2 == 1:
a[nu - 1] = 0
else:
a[mu] = 0
if mu == 2:
yield visit(n, a)
else:
for v in b(mu - 1, nu - 1, (mu + sigma) % 2, n, a):
yield v
n = len(ns)
a = [0] * (n + 1)
for j in xrange(1, m + 1):
a[n - m + j] = j - 1
return f(m, n, 0, n, a)
Run Code Online (Sandbox Code Playgroud)
我们知道给定列表的k子集数量等于,Stirling number对于某些大型列表来说可能非常大.
上面的代码返回一个Python生成器,它可以为给定列表生成所有可能的k子集分区,并调用它的下一个方法.因此,如果我想随机获得这些分区中的一个,我必须在一些随机时间调用next方法(如果Stirling数字很大,这会使它非常慢)或者使用该itertools.islice方法得到一个大小为1的切片和以前一样真的很慢
我试图避免列出所有分区,因为这会浪费时间和速度甚至内存(因为计算很多,内存在我的情况下很重要).
问题是如何只生成一个k子集分区而不生成其余的?或者至少使程序比上面解释的更快.我需要性能因为我每次只需要其中一个并且我运行的应用程序可能超过一千万次.
我很感激任何帮助.
编辑:示例
清单: { 1, 2, 3 }
对于k = 3:
{ {1}, {2}, {3} }
Run Code Online (Sandbox Code Playgroud)
对于k = 2:
{ {1, 2}, {3} }
{ {1, 3}, {2} }
{ {1}, {2, 3} }
Run Code Online (Sandbox Code Playgroud)
并且对于k = 1:
{ {1, 2, 3} }
Run Code Online (Sandbox Code Playgroud)
考虑k = 2,有什么办法可以随机生成这3个分区中的一个,而不生成其他2个分区?请注意,我想为任何给定的k生成随机分区,不仅是任何k的随机分区,这意味着如果我将k设置为2,我想只生成这3个中的一个而不是全部5个中的一个.
问候,
穆罕默德
Pet*_*vaz 12
您可以通过存储先前计算的值,使用递归算法有效地计算斯特林数:
fact=[1]
def nCr(n,k):
"""Return number of ways of choosing k elements from n"""
while len(fact)<=n:
fact.append(fact[-1]*len(fact))
return fact[n]/(fact[k]*fact[n-k])
cache = {}
def count_part(n,k):
"""Return number of ways of partitioning n items into k non-empty subsets"""
if k==1:
return 1
key = n,k
if key in cache:
return cache[key]
# The first element goes into the next partition
# We can have up to y additional elements from the n-1 remaining
# There will be n-1-y left over to partition into k-1 non-empty subsets
# so n-1-y>=k-1
# y<=n-k
t = 0
for y in range(0,n-k+1):
t += count_part(n-1-y,k-1) * nCr(n-1,y)
cache[key] = t
return t
Run Code Online (Sandbox Code Playgroud)
一旦你知道有多少选择,你可以调整这个递归代码来生成一个特定的分区:
def ith_subset(A,k,i):
"""Return ith k-subset of A"""
# Choose first element x
n = len(A)
if n==k:
return A
if k==0:
return []
for x in range(n):
# Find how many cases are possible with the first element being x
# There will be n-x-1 left over, from which we choose k-1
extra = nCr(n-x-1,k-1)
if i<extra:
break
i -= extra
return [A[x]] + ith_subset(A[x+1:],k-1,i)
def gen_part(A,k,i):
"""Return i^th k-partition of elements in A (zero-indexed) as list of lists"""
if k==1:
return [A]
n=len(A)
# First find appropriate value for y - the extra amount in this subset
for y in range(0,n-k+1):
extra = count_part(n-1-y,k-1) * nCr(n-1,y)
if i<extra:
break
i -= extra
# We count through the subsets, and for each subset we count through the partitions
# Split i into a count for subsets and a count for the remaining partitions
count_partition,count_subset = divmod(i,nCr(n-1,y))
# Now find the i^th appropriate subset
subset = [A[0]] + ith_subset(A[1:],y,count_subset)
S=set(subset)
return [subset] + gen_part([a for a in A if a not in S],k-1,count_partition)
Run Code Online (Sandbox Code Playgroud)
作为一个例子,我编写了一个测试程序,它产生4个数字的不同分区:
def test(A):
n=len(A)
for k in [1,2,3,4]:
t = count_part(n,k)
print k,t
for i in range(t):
print " ",i,gen_part(A,k,i)
test([1,2,3,4])
Run Code Online (Sandbox Code Playgroud)
此代码打印:
1 1
0 [[1, 2, 3, 4]]
2 7
0 [[1], [2, 3, 4]]
1 [[1, 2], [3, 4]]
2 [[1, 3], [2, 4]]
3 [[1, 4], [2, 3]]
4 [[1, 2, 3], [4]]
5 [[1, 2, 4], [3]]
6 [[1, 3, 4], [2]]
3 6
0 [[1], [2], [3, 4]]
1 [[1], [2, 3], [4]]
2 [[1], [2, 4], [3]]
3 [[1, 2], [3], [4]]
4 [[1, 3], [2], [4]]
5 [[1, 4], [2], [3]]
4 1
0 [[1], [2], [3], [4]]
Run Code Online (Sandbox Code Playgroud)
作为另一个例子,有1千万个分区1,2,3,... 14分为4个部分.此代码可以使用pypy在44秒内生成所有分区.
有50,369,882,873,307,917,364,901分区1,2,3,...,40分为4个部分.此代码可以在120秒内生成1000万个这样的代码,并在单个处理器上运行pypy.
要将事物联系在一起,您可以使用此代码生成列表A到k个非空子集的单个随机分区:
import random
def random_ksubset(A,k):
i = random.randrange(0,count_part(len(A),k))
return gen_part(A,k,i)
Run Code Online (Sandbox Code Playgroud)
这样的事情怎么样:
import itertools
import random
def random_ksubset(ls, k):
# we need to know the length of ls, so convert it into a list
ls = list(ls)
# sanity check
if k < 1 or k > len(ls):
return []
# Create a list of length ls, where each element is the index of
# the subset that the corresponding member of ls will be assigned
# to.
#
# We require that this list contains k different values, so we
# start by adding each possible different value.
indices = list(range(k))
# now we add random values from range(k) to indices to fill it up
# to the length of ls
indices.extend([random.choice(list(range(k))) for _ in range(len(ls) - k)])
# shuffle the indices into a random order
random.shuffle(indices)
# construct and return the random subset: sort the elements by
# which subset they will be assigned to, and group them into sets
return [{x[1] for x in xs} for (_, xs) in
itertools.groupby(sorted(zip(indices, ls)), lambda x: x[0])]
Run Code Online (Sandbox Code Playgroud)
这会生成随机k子集分区,如下所示:
>>> ls = {1,2,3}
>>> print(random_ksubset(ls, 2))
[set([1, 2]), set([3])]
>>> print(random_ksubset(ls, 2))
[set([1, 3]), set([2])]
>>> print(random_ksubset(ls, 2))
[set([1]), set([2, 3])]
>>> print(random_ksubset(ls, 2))
[set([1]), set([2, 3])]
Run Code Online (Sandbox Code Playgroud)
这种方法满足OP要求获得一个随机生成的分区,而不需要枚举所有可能的分区.这里的内存复杂性是线性的 由于排序,运行时复杂度为O(N log N).我想如果这很重要,可能会使用更复杂的构造返回值的方法将其降低到线性.
正如@Leon指出的那样,这满足了他的选项2在尝试定义问题时的要求.这不会做的是确定性地生成分区#N(这是Leon的选项1,它允许您随机选择一个整数N然后检索相应的分区).莱昂的澄清很重要,因为为了满足问题的精神,应该以相同的概率生成集合的每个可能的划分.关于我们的玩具问题,情况就是这样:
>>> from collections import Counter
>>> Counter(frozenset(map(frozenset, random_ksubset(ls, 2))) for _ in range(10000))
Counter({frozenset({frozenset({2, 3}), frozenset({1})}): 3392,
frozenset({frozenset({1, 3}), frozenset({2})}): 3212,
frozenset({frozenset({1, 2}), frozenset({3})}): 3396})
Run Code Online (Sandbox Code Playgroud)
然而.通常,此方法不会以相同的概率生成每个分区.考虑:
>>> Counter(frozenset(map(frozenset, random_ksubset(range(4), 2)))
... for _ in range(10000)).most_common()
[(frozenset({frozenset({1, 3}), frozenset({0, 2})}), 1671),
(frozenset({frozenset({1, 2}), frozenset({0, 3})}), 1667),
(frozenset({frozenset({2, 3}), frozenset({0, 1})}), 1642),
(frozenset({frozenset({0, 2, 3}), frozenset({1})}), 1285),
(frozenset({frozenset({2}), frozenset({0, 1, 3})}), 1254),
(frozenset({frozenset({0, 1, 2}), frozenset({3})}), 1245),
(frozenset({frozenset({1, 2, 3}), frozenset({0})}), 1236)]
Run Code Online (Sandbox Code Playgroud)
我们在这里可以看到,我们更有可能生成"更平衡"的分区(因为有更多的方法可以构建这些分区).包含单例集的分区生成频率较低.
似乎在集合的k分区上有效的均匀采样方法是一个未解决的研究问题(也见mathoverflow).Nijenhuis和Wilf为所有分区提供了采样代码(第12章),这可以与拒绝测试一起使用,@ PeterdeRivaz的答案也可以统一采样k分区.这两种方法的缺点是它们需要计算斯特林数,它在n中呈指数增长,算法是递归的,我认为这会使它们在大输入上变慢.正如您在评论中提到"数百万"分区一样,我认为这些方法只能在一定的输入大小下处理.
A. Nijenhuis和H. Wilf.计算机和计算器的组合算法.学术出版社,Orlando FL,第二版,1978年.
探索莱昂的选项1可能很有趣.这是一个粗略的过程,使用@Amadan建议将整数值解释为k-ary数来确定性地生成集合的特定分区.请注意,并非每个整数值都会生成有效的k子集分区(因为我们不允许空子集):
def amadan(ls, N, k):
"""
Given a collection `ls` with length `b`, a value `k`, and a
"partition number" `N` with 0 <= `N` < `k**b`, produce the Nth
k-subset paritition of `ls`.
"""
ls = list(ls)
b = len(ls)
if not 0 <= N < k**b: return None
# produce the k-ary index vector from the number N
index = []
# iterate through each of the subsets
for _ in range(b):
index.append(N % k)
N //= k
# subsets cannot be empty
if len(set(index)) != k: return None
return frozenset(frozenset(x[1] for x in xs) for (_, xs) in
itertools.groupby(sorted(zip(index, ls)),
lambda x:x[0]))
Run Code Online (Sandbox Code Playgroud)
我们可以确认这会正确生成斯特林数:
>>> for i in [(4,1), (4,2), (4,3), (4,4), (5,1), (5,2), (5,3), (5,4), (5,5)]:
... b,k = i
... r = [amadan(range(b), N, k) for N in range(k**b)]
... r = [x for x in r if x is not None]
... print(i, len(set(r)))
(4, 1) 1
(4, 2) 7
(4, 3) 6
(4, 4) 1
(5, 1) 1
(5, 2) 15
(5, 3) 25
(5, 4) 10
(5, 5) 1
Run Code Online (Sandbox Code Playgroud)
这也可以以相同的概率产生每个可能的分区; 我不太确定.这是一个测试用例,它起作用:
>>> b,k = 4,3
>>> r = [amadan(range(b), N, k) for N in range(k**b)]
>>> r = [x for x in r if x is not None]
>>> print(Counter([' '.join(sorted(''.join(map(str, x)) for x in p)) for p in r]))
Counter({'0 13 2': 6,
'01 2 3': 6,
'0 12 3': 6,
'03 1 2': 6,
'02 1 3': 6,
'0 1 23': 6})
Run Code Online (Sandbox Code Playgroud)
另一个工作案例:
>>> b,k = 5,4
>>> r = [amadan(range(b), N, k) for N in range(k**b)]
>>> r = [x for x in r if x is not None]
>>> print(Counter([' '.join(sorted(''.join(map(str, x)) for x in p)) for p in r]))
Counter({'0 12 3 4': 24,
'04 1 2 3': 24,
'0 1 23 4': 24,
'01 2 3 4': 24,
'03 1 2 4': 24,
'0 13 2 4': 24,
'0 1 24 3': 24,
'02 1 3 4': 24,
'0 1 2 34': 24,
'0 14 2 3': 24})
Run Code Online (Sandbox Code Playgroud)
所以,将它包装在一个函数中:
def random_ksubset(ls, k):
ls = list(ls)
maxn = k**len(ls)-1
rv = None
while rv is None:
rv = amadan(ls, random.randint(0, maxn), k)
return rv
Run Code Online (Sandbox Code Playgroud)
然后我们可以做到:
>>> random_ksubset(range(3), 2)
frozenset({frozenset({2}), frozenset({0, 1})})
>>> random_ksubset(range(3), 2)
frozenset({frozenset({1, 2}), frozenset({0})})
>>> random_ksubset(range(3), 2)
frozenset({frozenset({1, 2}), frozenset({0})})
>>> random_ksubset(range(3), 2)
frozenset({frozenset({2}), frozenset({0, 1})})
Run Code Online (Sandbox Code Playgroud)
TL;博士:
给定n和k的k子集分区可以被分成类型,基于哪些元素首先进入尚未空的部分.这些类型中的每一个由具有n-1位的位模式表示,其中k-1被设置.虽然分区数量很大(由第二个斯特林数字给出),但类型的数量要小得多,例如:
n = 21,k = 8
分区数:S(21,8)= 132,511,015,347,084
类型数:(n-1选择k-1)= 77,520
根据位模式中零的位置,计算每种类型的分区数是多么简单.如果您列出所有类型(通过迭代所有n:k位模式)并保持分区数的运行总计,则可以在此列表上使用二进制搜索来查找具有分区数的分区给定等级(在Log 2(n-1选择k-1)步骤中;对于上面的例子为17),然后将比特模式转换为分区并计算每个元素进入哪个部分(以n步为单位).该方法的每个部分都可以迭代完成,不需要递归.
这是一个非递归的解决方案.我试图推出自己的,但它可能(部分)与彼得的答案或现有方法重叠.
如果你有一组n个元素,例如n = 8:
{A,B,C,d,E,F,G,H}
然后k子集分区将采用这种形状,例如k = 5:
{a,e} {b,c,h} {d} {f} {g}
此分区也可以写为:
1,2,2,3,1,4,5,2
其中列出了每个元素进入的部分.因此,具有从1到k的值的n个数字序列表示n个元素的k子集分区.
但是,并非所有这些序列都是有效的分区; 从1到k的每个数字必须存在,否则会有空的部分:
1,2,2,3,1,3,5,2→{a,e} {b,c,h} {d,f} {} {g}
另外,为避免重复,数字x只能在使用数字x-1后使用.因此,第一个数字始终为1,第二个数字最多为2,依此类推.如果在示例中我们在3之前使用数字4和5,我们会得到重复的分区:
1,2,2,3,1,4,5,2→{a,e} {b,c,h} {d} {f} {g}
1,2,2,4,1,5,3 ,2→{a,e} {b,c,h} {g} {d} {f}
根据首次使用每个部件的时间对分区进行分组时,您将获得以下类型:
1,1,1,1,2,3,4,5 0001111 11111111 1 1
1,1,1,2,12,3,4,5 0010111 11112111 2 2
1,1,1,2,3,123,4,5 0011011 11111311 3 3
1,1,1,2,3,4,1234,5 0011101 11111141 4 4
1,1,1,2,3,4,5,12345 0011110 11111115 5 5
1,1,2,12,12,3,4,5 0100111 11122111 2*2 4
1,1,2,12,3,123,4,5 0101011 11121311 2*3 6
1,1,2,12,3,4,1234,5 0101101 11121141 2*4 8
1,1,2,12,3,4,5,12345 0101110 11121115 2*5 10
1,1,2,3,123,123,4,5 0110011 11113311 3*3 9
1,1,2,3,123,4,1234,5 0110101 11113141 3*4 12
1,1,2,3,123,4,5,12345 0110110 11113115 3*5 15
1,1,2,3,4,1234,1234,5 0111001 11111441 4*4 16
1,1,2,3,4,1234,5,12345 0111010 11111415 4*5 20
1,1,2,3,4,5,12345,12345 0111100 11111155 5*5 25
1,2,12,12,12,3,4,5 1000111 11222111 2*2*2 8
1,2,12,12,3,123,4,5 1001011 11221311 2*2*3 12
1,2,12,12,3,4,1234,5 1001101 11221141 2*2*4 16
1,2,12,12,3,4,5,12345 1001110 11221115 2*2*5 20
1,2,12,3,123,123,4,5 1010011 11213311 2*3*3 18
1,2,12,3,123,4,1234,5 1010101 11213141 2*3*4 24
1,2,12,3,123,4,5,12345 1010110 11213115 2*3*5 30
1,2,12,3,4,1234,1234,5 1011001 11211441 2*4*4 32
1,2,12,3,4,1234,5,12345 1011010 11211415 2*4*5 40
1,2,12,3,4,5,12345,12345 1011100 11211155 2*5*5 50
1,2,3,123,123,123,4,5 1100011 11133311 3*3*3 27
1,2,3,123,123,4,1234,5 1100101 11133141 3*3*4 36
1,2,3,123,123,4,5,12345 1100110 11133115 3*3*5 45
1,2,3,123,4,1234,1234,5 1101001 11131441 3*4*4 48
1,2,3,123,4,1234,5,12345 1101010 11131415 3*4*5 60
1,2,3,123,4,5,12345,12345 1101100 11131155 3*5*5 75
1,2,3,4,1234,1234,1234,5 1110001 11114441 4*4*4 64
1,2,3,4,1234,1234,5,12345 1110010 11114415 4*4*5 80
1,2,3,4,1234,5,12345,12345 1110100 11114155 4*5*5 100
1,2,3,4,5,12345,12345,12345 1111000 11111555 5*5*5 125 SUM = 1050
Run Code Online (Sandbox Code Playgroud)
在上图中,类型的分区:
1,2,12,3,123,4,1234,5
意思是:
a进入第1部分
b进入第2部分
c进入部分1或2
进入部分3
e进入部分1,2或3
进入部分4
g进入部分1,2,3或4
h进入第5部分
因此,此类型的分区具有可以具有2个值的数字,可以具有3个值的数字,以及可以具有4个值的数字(这在上图的第3列中指示).因此,这种类型总共有2×3×4个分区(如第4和第5列所示).这些的总和当然是斯特林数:S(8,5)= 1050.
图中的第二列是另一种标记分区类型的方法:从1开始,每个数字都是之前使用过的数字,或者是一个向上的数字(即到目前为止使用的最高数字+ 1).如果我们用0和1表示这两个选项,我们得到例如:
1,2,12,3,123,4,1234,5→1010101
其中1010101表示:
从1开始
→逐步升级到2
0→重复1或2
1→升级到3
0→重复1,2或3
1→升级到4
0→重复1,2,3或4
1→升级到五
因此,每个具有n-1个数字和k-1个的二进制序列代表一种分区.我们可以通过从左到右迭代数字来计算一个类型的分区数,当我们找到一个时递增一个因子,并在找到零时乘以因子,例如:
1,2,12,3,123,4,1234,5→1010101
产品= 1,系数= 1
1→增量系数:2
0→乘积×系数= 2
1→增量系数:3
0→乘积×系数= 6
1→增量系数:4
0→乘积×系数= 24
1→增量系数:5
再次对于这个例子,我们发现有24个这种类型的分区.因此,计算每种类型的分区可以通过使用任何方法(例如Gosper的Hack)迭代所有设置了k-1位的n-1位整数来完成:
0001111 1 1
0010111 2 3
0011011 3 6
0011101 4 10
0011110 5 15
0100111 4 19
0101011 6 25
0101101 8 33
0101110 10 43
0110011 9 52
0110101 12 64
0110110 15 79
0111001 16 95
0111010 20 115
0111100 25 140
1000111 8 148
1001011 12 160
1001101 16 176
1001110 20 196
1010011 18 214
1010101 24 238
1010110 30 268
1011001 32 300
1011010 40 340
1011100 50 390
1100011 27 417
1100101 36 453
1100110 45 498
1101001 48 546
1101010 60 606
1101100 75 681
1110001 64 745
1110010 80 825
1110100 100 925
1111000 125 1050
Run Code Online (Sandbox Code Playgroud)
找到一个随机分区然后意味着选择一个从1到S(n,k)的数字,越过每个分区类型的计数,同时保持一个运行总计(上面的第3列),并选择相应的分区类型,然后计算值重复的数字,例如:
S(8,5)= 1050
随机选择:例如333
类型:1011010→1,2,12,3,4,1234,5,12345
范围:301 - 340
变化:333 - 301 = 32
位数选项:2,4 ,5
位数值:20,5,1
变化:32 = 1×20 + 2×5 + 2×1
位:1,2,2(基于0)→2,3,3(基于1)
分区: 1,2,2,3,4,3,5,3
和5个部分中的8个元素的第333个分区是:
1,2,2,3,4,3,5,3→{a} {b,c} {d,f,h} {e} {g}
有许多选项可以将其转换为代码; 如果将n-1位数字存储为运行总计,则可以使用列表上的二进制搜索进行后续查找,其长度为C(n-1,k-1),以减少O(C)的时间复杂度(n-1,k-1))至O(Log 2(C(n-1,k-1))).
我用JavaScript做了第一次测试(抱歉,我不会说Python); 它并不漂亮,但它演示了这种方法并且非常快.例子是n = 21和k = 8的情况; 它为77,520种类型的分区创建计数表,返回分区总数132,511,015,347,084,然后检索该范围内的10个随机选择的分区.在我的计算机上,此代码在3.7秒内返回一百万个随机选择的分区.(注意:代码是从零开始的,与上面的解释不同)
function kSubsetPartitions(n, k) { // Constructor
this.types = [];
this.count = [];
this.total = 0;
this.elems = n;
var bits = (1 << k - 1) - 1, done = 1 << n - 1;
do {
this.total += variations(bits);
this.types.push(bits);
this.count.push(this.total);
}
while (!((bits = next(bits)) & done));
function variations(bits) {
var product = 1, factor = 1, mask = 1 << n - 2;
while (mask) {
if (bits & mask) ++factor;
else product *= factor;
mask >>= 1;
}
return product;
}
function next(a) { // Gosper's Hack
var c = (a & -a), r = a + c;
return (((r ^ a) >> 2) / c) | r;
}
}
kSubsetPartitions.prototype.partition = function(rank) {
var range = 1, type = binarySearch(this.count, rank);
if (type) {
rank -= this.count[type - 1];
range = this.count[type] - this.count[type - 1];
}
return translate(this.types[type], this.elems, range, rank);
// This translates the bit pattern format and creates the correct partition
// for the given rank, using a letter format for demonstration purposes
function translate(bits, len, range, rank) {
var partition = [["A"]], part, max = 0, mask = 1 << len - 2;
for (var i = 1; i < len; i++, mask >>= 1) {
if (!(bits & mask)) {
range /= (max + 1);
part = Math.floor(rank / range);
rank %= range;
}
else part = ++max;
if (!partition[part]) partition[part] = "";
partition[part] += String.fromCharCode(65 + i);
}
return partition.join(" / ");
}
function binarySearch(array, value) {
var low = 0, mid, high = array.length - 1;
while (high - low > 1) {
mid = Math.ceil((high + low) / 2);
if (value < array[mid]) high = mid;
else low = mid;
}
return value < array[low] ? low : high;
}
}
var ksp = new kSubsetPartitions(21, 8);
document.write("Number of k-subset partitions for n,k = 21,8 → " +
ksp.total.toLocaleString("en-US") + "<br>");
for (var tests = 10; tests; tests--) {
var rnd = Math.floor(Math.random() * ksp.total);
document.write("Partition " + rnd.toLocaleString("en-US", {minimumIntegerDigits:
15}) + " → " + ksp.partition(rnd) + "<br>");
}Run Code Online (Sandbox Code Playgroud)
没有必要为每个分区类型存储位模式,因为它们可以从它们的索引重新创建(参见例如本答案中的第二个算法).如果仅存储每个分区类型的变化数的运行总计,则会将内存需求减半.
C++中的第二个代码示例仅存储计数,并将分区作为包含每个元素的部件号的n长度数组返回.代码末尾的用法示例.在我的计算机上,它在12秒内创建n = 40和k = 32的计数列表,然后在24秒内返回1000万个分区.
n的值可以高达65,k可以高达64,但是对于某些组合,分区的数量将大于2 64,这个代码显然无法处理.如果你把它翻译成Python,就不应该有这样的限制.(注意:如果k = 1,则启用二项式系数函数的零检查.)
class kSubsetPartitions {
std::vector <uint64_t> count;
uint64_t total;
uint8_t n;
uint8_t k;
public:
kSubsetPartitions(uint8_t n, uint8_t k) {
this->total = 0;
this->n = n;
this->k = k;
uint64_t bits = ((uint64_t) 1 << k - 1) - 1;
uint64_t types = choose(n - 1, k - 1);
this->count.reserve(types);
while (types--) {
this->total += variations(bits);
this->count.push_back(this->total);
bits = next(bits);
}
}
uint64_t range() {
return this->total;
}
void partition(uint64_t rank, uint8_t *buffer) {
uint64_t range = 1;
uint64_t type = binarySearch(rank);
if (type) {
rank -= this->count[type - 1];
range = this->count[type] - this->count[type - 1];
}
format(pattern(type), range, rank, buffer);
}
private:
uint64_t pattern(uint64_t type) {
uint64_t steps, bits = 0, mask = (uint64_t) 1 << this->n - 2;
uint8_t ones = this->k - 1;
for (uint8_t i = this->n - 1; i; i--, mask >>= 1) {
if (i > ones) {
steps = choose(i - 1, ones);
if (type >= steps) {
type -= steps;
bits |= mask;
--ones;
}
}
else bits |= mask;
}
return bits;
}
uint64_t choose(uint8_t x, uint8_t y) { // C(x,y) using Pascal's Triangle
static std::vector <std::vector <uint64_t> > triangle;
if (triangle.empty()) {
triangle.resize(this->n);
triangle[0].push_back(1);
for (uint8_t i = 1; i < this->n; i++) {
triangle[i].push_back(1);
for (uint8_t j = 1; j < i; j++) {
triangle[i].push_back(triangle[i - 1][j - 1] + triangle[i - 1][j]);
}
triangle[i].push_back(1);
}
}
return triangle[x][y];
}
void format(uint64_t bits, uint64_t range, uint64_t rank, uint8_t *buffer) {
uint64_t mask = (uint64_t) 1 << this->n - 2;
uint8_t max = 0, part;
*buffer = 0;
while (mask) {
if (!(bits & mask)) {
range /= (max + 1);
part = rank / range;
rank %= range;
}
else part = ++max;
*(++buffer) = part;
mask >>= 1;
}
}
uint64_t binarySearch(uint64_t rank) {
uint64_t low = 0, mid, high = this->count.size() - 1;
while (high - low > 1) {
mid = (high + low + 1) / 2;
if (rank < this->count[mid]) high = mid;
else low = mid;
}
return rank < this->count[low] ? low : high;
}
uint64_t variations(uint64_t bits) {
uint64_t product = 1;
uint64_t mask = (uint64_t) 1 << this->n - 2;
uint8_t factor = 1;
while (mask) {
if (bits & mask) ++factor;
else product *= factor;
mask >>= 1;
}
return product;
}
uint64_t next(uint64_t a) { // Gosper's Hack
// if (!a) return a; // k=1 => a=0 => c=0 => division by zero!
uint64_t c = (a & -a), r = a + c;
return (((r ^ a) >> 2) / c) | r;
}
};
// USAGE EXAMPLE:
// uint8_t buffer[40];
// kSubsetPartitions* ksp = new kSubsetPartitions(40, 32);
// uint64_t range = ksp->range();
// ksp->partition(any_integer_below_range, buffer);
Run Code Online (Sandbox Code Playgroud)
下面概述了n和k的值,这些值导致超过2 64个分区,并导致上面代码中的溢出.最多n = 26,k的所有值都给出有效结果.
25: - 26: - 27: 8-13 28: 7-15 29: 6-17 30: 6-18 31: 5-20 32: 5-21
33: 5-22 34: 4-23 35: 4-25 36: 4-26 37: 4-27 38: 4-28 39: 4-29 40: 4-31
41: 4-32 42: 4-33 43: 3-34 44: 3-35 45: 3-36 46: 3-37 47: 3-38 48: 3-39
49: 3-40 50: 3-42 51: 3-43 52: 3-44 53: 3-45 54: 3-46 55: 3-47 56: 3-48
57: 3-49 58: 3-50 59: 3-51 60: 3-52 61: 3-53 62: 3-54 63: 3-55 64: 3-56
65: 3-57
Run Code Online (Sandbox Code Playgroud)
不存储每种类型的分区数的版本是可能的,并且几乎不需要内存.查找与随机选择的整数对应的分区会比较慢,但是如果对整数的选择进行了排序,则它可能比每次查找需要二进制排序的版本更快.
您将从第一个位模式开始,计算此类型的分区数,查看第一个整数是否落入此范围,计算其分区,然后继续下一个位模式.
| 归档时间: |
|
| 查看次数: |
1442 次 |
| 最近记录: |