我目前正在使用C++对一些数据结构进行基准测试,我想在处理Zipf分布式数字时测试它们.
我正在使用本网站提供的发电机:http://www.cse.usf.edu/~christen/tools/toolpage.html
我调整了实现以使用Mersenne Twister生成器.
它运作良好,但它真的很慢.在我的情况下,范围可以很大(大约一百万),并且生成的随机数的数量可以是几百万.
alpha参数不会随时间变化,它是固定的.
我试图预先计算所有sum_prob.它的速度要快得多,但在大范围内仍然会变慢.
有没有更快的方法来生成Zipf分布式数字?即使是不太精确的东西也会受到欢迎
谢谢
我能找到的唯一 C++11 Zipf 随机生成器显式计算并使用了概率std::discrete_distribution。这适用于小范围,但如果您需要生成范围非常宽的 Zipf 值(在我的例子中用于数据库测试),则没有用,因为它会耗尽内存。因此,我用 C++ 实现了下面提到的算法。
我没有严格测试这段代码,一些优化可能是可能的,但它只需要恒定的空间,并且似乎运行良好。
\n\n#include <algorithm>\n#include <cmath>\n#include <random>\n\n/** Zipf-like random distribution.\n *\n * "Rejection-inversion to generate variates from monotone discrete\n * distributions", Wolfgang H\xc3\xb6rmann and Gerhard Derflinger\n * ACM TOMACS 6.3 (1996): 169-184\n */\ntemplate<class IntType = unsigned long, class RealType = double>\nclass zipf_distribution\n{\npublic:\n typedef RealType input_type;\n typedef IntType result_type;\n\n static_assert(std::numeric_limits<IntType>::is_integer, "");\n static_assert(!std::numeric_limits<RealType>::is_integer, "");\n\n zipf_distribution(const IntType n=std::numeric_limits<IntType>::max(),\n const RealType q=1.0)\n : n(n)\n , q(q)\n , H_x1(H(1.5) - 1.0)\n , H_n(H(n + 0.5))\n , dist(H_x1, H_n)\n {}\n\n IntType operator()(std::mt19937& rng)\n {\n while (true) {\n const RealType u = dist(rng);\n const RealType x = H_inv(u);\n const IntType k = clamp<IntType>(std::round(x), 1, n);\n if (u >= H(k + 0.5) - h(k)) {\n return k;\n }\n }\n }\n\nprivate:\n /** Clamp x to [min, max]. */\n template<typename T>\n static constexpr T clamp(const T x, const T min, const T max)\n {\n return std::max(min, std::min(max, x));\n }\n\n /** exp(x) - 1 / x */\n static double\n expxm1bx(const double x)\n {\n return (std::abs(x) > epsilon)\n ? std::expm1(x) / x\n : (1.0 + x/2.0 * (1.0 + x/3.0 * (1.0 + x/4.0)));\n }\n\n /** H(x) = log(x) if q == 1, (x^(1-q) - 1)/(1 - q) otherwise.\n * H(x) is an integral of h(x).\n *\n * Note the numerator is one less than in the paper order to work with all\n * positive q.\n */\n const RealType H(const RealType x)\n {\n const RealType log_x = std::log(x);\n return expxm1bx((1.0 - q) * log_x) * log_x;\n }\n\n /** log(1 + x) / x */\n static RealType\n log1pxbx(const RealType x)\n {\n return (std::abs(x) > epsilon)\n ? std::log1p(x) / x\n : 1.0 - x * ((1/2.0) - x * ((1/3.0) - x * (1/4.0)));\n }\n\n /** The inverse function of H(x) */\n const RealType H_inv(const RealType x)\n {\n const RealType t = std::max(-1.0, x * (1.0 - q));\n return std::exp(log1pxbx(t) * x);\n }\n\n /** That hat function h(x) = 1 / (x ^ q) */\n const RealType h(const RealType x)\n {\n return std::exp(-q * std::log(x));\n }\n\n static constexpr RealType epsilon = 1e-8;\n\n IntType n; ///< Number of elements\n RealType q; ///< Exponent\n RealType H_x1; ///< H(x_1)\n RealType H_n; ///< H(n)\n std::uniform_real_distribution<RealType> dist; ///< [H(x_1), H(n)]\n};\nRun Code Online (Sandbox Code Playgroud)\n
小智 5
单独的预先计算并没有多大帮助。但很明显 sum_prob 是累积的并且具有升序。因此,如果我们使用二进制搜索来查找 zipf_value,我们会将生成 Zipf 分布数的顺序从 O(n) 降低到 O(log(n))。这在效率上有很大的提高。
在这里,只需将其中的zipf()函数替换为genzipf.c以下函数:
int zipf(double alpha, int n)
{
static int first = TRUE; // Static first time flag
static double c = 0; // Normalization constant
static double *sum_probs; // Pre-calculated sum of probabilities
double z; // Uniform random number (0 < z < 1)
int zipf_value; // Computed exponential value to be returned
int i; // Loop counter
int low, high, mid; // Binary-search bounds
// Compute normalization constant on first call only
if (first == TRUE)
{
for (i=1; i<=n; i++)
c = c + (1.0 / pow((double) i, alpha));
c = 1.0 / c;
sum_probs = malloc((n+1)*sizeof(*sum_probs));
sum_probs[0] = 0;
for (i=1; i<=n; i++) {
sum_probs[i] = sum_probs[i-1] + c / pow((double) i, alpha);
}
first = FALSE;
}
// Pull a uniform random number (0 < z < 1)
do
{
z = rand_val(0);
}
while ((z == 0) || (z == 1));
// Map z to the value
low = 1, high = n, mid;
do {
mid = floor((low+high)/2);
if (sum_probs[mid] >= z && sum_probs[mid-1] < z) {
zipf_value = mid;
break;
} else if (sum_probs[mid] >= z) {
high = mid-1;
} else {
low = mid+1;
}
} while (low <= high);
// Assert that zipf_value is between 1 and N
assert((zipf_value >=1) && (zipf_value <= n));
return(zipf_value);
}
Run Code Online (Sandbox Code Playgroud)