在Python中使用pymc随机化网络

Swi*_*ing 8 random networking montecarlo python-3.x pymc

数据集中有两列,分别是user_id和site_name.它记录每个用户浏览的每个站点名称.

toy_dict = {'site_name': {0: u'\u4eac\u4e1c\u7f51\u4e0a\u5546\u57ce',
1: u'\u963f\u91cc\u4e91',
2: u'\u6dd8\u5b9d\u7f51',
3: u'\u624b\u673a\u6dd8\u5b9d\u7f51',
4: u'\u6211\u4eec\u7684\u70b9\u5fc3\u7f51',
5: u'\u8c46\u74e3\u7f51',
6: u'\u9ad8\u5fb7\u5730\u56fe',
7: u'\u817e\u8baf\u7f51',
8: u'\u70b9\u5fc3',
9: u'\u767e\u5ea6',
10: u'\u641c\u72d7',
11: u'\u8c37\u6b4c',
12: u'AccuWeather\u6c14\u8c61\u9884\u62a5',
13: u'\u79fb\u52a8\u68a6\u7f51',
14: u'\u817e\u8baf\u7f51',
15: u'\u641c\u72d7\u7f51',
16: u'360\u624b\u673a\u52a9\u624b',
17: u'\u641c\u72d0',
18: u'\u767e\u5ea6'},
'user_id': {0: 37924550,
1: 37924550,
2: 37924550,
3: 37924550,
4: 37924550,
5: 37924550,
6: 37924550,
7: 37924550,
8: 37924551,
9: 37924551,
10: 37924551,
11: 37924551,
12: 37924551,
13: 37924552,
14: 45285152,
15: 45285153,
16: 45285153,
17: 45285153,
18: 45285153}}
Run Code Online (Sandbox Code Playgroud)

现在我想重建随机网络,同时确保观察到的网络中有n个站点的人在随机网络中也有n个站点.

当数据量巨大时,numpy.random.shuffle 在 Python 中的效率很低。

我目前使用以下Python脚本:

import pandas as pd
import numpy as np
import itertools
from collections import Counter


for i in range(10):  # reconstruct the random network 10 times
    # Rebuild the DataFrame every iteration so each shuffle starts from the
    # original data.  (The original code assigned name='site_exp'+str(i) and
    # immediately overwrote it with the DataFrame -- a dead assignment.)
    df = pd.DataFrame(toy_dict)
    # Shuffle site names in place to break the observed user->site pairing
    # while keeping each user's number of sites unchanged.
    np.random.shuffle(df['site_name'].values)
    users = df['user_id'].drop_duplicates()
    groups = df.groupby('user_id')

    pairs = []
    # NOTE(review): only the first 5 users are processed -- confirm intent.
    for ui in users[:5]:
        user_sites = groups.get_group(ui).drop_duplicates()['site_name'].values
        # all unordered pairs of sites visited by this user
        pairs.extend(itertools.combinations(user_sites, 2))

    site_exp = pd.DataFrame(pairs, columns=['node1', 'node2'], dtype=str)
    site_exp['pair'] = site_exp['node1'] + '<--->' + site_exp['node2']
    pair_counts = Counter(site_exp['pair'].values)
    out = pd.DataFrame(list(pair_counts.items()), columns=['pair', 'site_obs'])
    out.to_csv('site_exp' + str(i) + '.csv')
Run Code Online (Sandbox Code Playgroud)

我想知道我们是否可以在Python中使用蒙特卡罗算法并降低计算复杂度?

Dav*_*vid 0

洗牌复杂性

正如这里所解释的, np.random.shuffle 的时间复杂度是 O(n) ,所以至少在下面的程序中它不应该成为瓶颈,但让我们探讨下面问题的不同方面。

问题形式化和复杂性

如果我理解正确,您的问题可以表示为一个二部图,其中包含 N_u 个用户节点、N_s 个网站节点和它们之间的 N_v 个边,反映了访问情况,请参见下面的面板 (A)。

然后,计算访问同一对网站(您代码中的 counterdict)的用户数量只需对应于 网站节点上的加权二分网络投影,请参见下面的面板 (B)。

暴力方法的加权二分网络投影的复杂度是 O(N_u^2*N_s ) 因此,当迭代多个随机化时,洗牌带来的 O(N_v) 应该可以忽略不计(当然,除非 N_v > N_u^2*N_s)。在图非常大的情况下,还有一些方法可以对二分网络投影进行采样。

在下面的小虚拟示例中,使用二分网络投影比您的实现快大约 150 倍(0.00024 秒与 0.03600 秒),并产生相同的结果。

二分图设置

代码1

# import modules
import collections
import itertools
import time
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import pandas as pd
import pymc3 as pm

# build a small synthetic visit log for demonstration purposes
np.random.seed(0)
nvisits = 24
nusers = 12
nsites = 6
# zero-padded user labels 'U000'..'U011'; RNG call order matches the original
user_labels = ['U' + str(u).zfill(3) for u in range(nusers)]
userz = np.random.choice(user_labels, nvisits)
sitez = np.random.choice(range(nsites), nvisits)
users = np.unique(userz)
sites = np.unique(sitez)

# copy original implementation from the question
def get_site_pairs(users, sites, userz, sitez):
    """Count, for every unordered pair of sites, how many users visited both.

    Returns a DataFrame with columns 'pair' (formatted "a<--->b") and
    'site_obs' (the count).  `sites` is accepted for interface symmetry
    with get_site_graph but is not used here.
    """
    visits = pd.DataFrame({'user': userz, 'site': sitez})
    by_user = visits.groupby('user')
    # for each user: deduplicate visits, then emit every sorted site pair
    pairs = [
        sorted(combo)
        for ui in users
        for combo in itertools.combinations(
            by_user.get_group(ui).drop_duplicates()['site'].values, 2)
    ]
    edges = pd.DataFrame(pairs, columns=['node1', 'node2'], dtype=str)
    edges['pair'] = edges['node1'] + '<--->' + edges['node2']
    tally = collections.Counter(edges['pair'].values)
    return pd.DataFrame(list(tally.items()), columns=['pair', 'site_obs'])

t0 = time.time()  # time the brute-force pair counting
counterdict = get_site_pairs(users, sites, userz, sitez)
print(time.time() - t0)
# 0.03600 seconds

# implement bipartite-graph based algorithm
def get_site_graph(users, sites, userz, sitez):
    """Build the user-site bipartite graph and its weighted projection onto
    the site nodes (edge weight = number of users shared by two sites)."""
    bigraph = nx.Graph()
    bigraph.add_nodes_from(users, bipartite=0)
    bigraph.add_nodes_from(sites, bipartite=1)
    bigraph.add_edges_from(zip(userz, sitez))
    site_graph = nx.algorithms.bipartite.projection.weighted_projected_graph(
        bigraph, sites)
    return bigraph, site_graph

t0 = time.time()  # time the bipartite-projection approach
graph, projection = get_site_graph(users, sites, userz, sitez)
print(time.time() - t0)
# 0.00024 seconds

# verify that the projection weights reproduce the brute-force pair counts
for idr, row in counterdict.iterrows():
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int is the correct dtype argument here
    u, v = np.array(row['pair'].split('<--->')).astype(int)
    pro = projection[u][v]
    assert row['site_obs'] == pro['weight']

# prepare graph layouts for plotting
layers = nx.bipartite_layout(graph, userz)
circle = nx.circular_layout(projection)
# scale edge widths into [0.2, 1.0] relative to the heaviest edge
edge_weights = nx.get_edge_attributes(projection, 'weight')
width = np.array(list(edge_weights.values()))
width = 0.2 + 0.8 * width / width.max()
degrees = graph.degree()

# plot graphs: (A) the raw bipartite graph, (B) its site projection,
# (C) the same bipartite graph annotated for the randomization setup
fig = plt.figure(figsize=(16, 9))
# panel (A): user-site bipartite graph
plt.subplot(131)
plt.title('(A)\nbipartite graph', loc='center')
nx.draw_networkx(graph, layers, width=2)
plt.axis('off')
# panel (B): weighted one-mode projection onto the site nodes;
# edge shade encodes the (normalized) projection weight
plt.subplot(132)
plt.title('(B)\none-mode projection (onto sites)', loc='center')
nx.draw_networkx(projection, circle, edge_color=plt.cm.Greys(width), width=2)
plt.axis('off')
# panel (C): bipartite graph again, annotated with node degrees
plt.subplot(133)
plt.title('(C)\nrandomization setup', loc='center')
nx.draw_networkx(graph, layers, width=2)
# degree labels next to one user node and two site nodes
plt.text(*(layers['U000']-[0.1, 0]), '$n_u=%s$' % degrees['U000'], ha='right')
plt.text(*(layers[0]+[0.1, 0]), '$n_s=%s$' % degrees[0], ha='left')
plt.text(*(layers[1]+[0.1, 0]), '$n_t=%s$' % degrees[1], ha='left')
plt.text(0.3, -1, '$N_v=%s$' % nvisits)
# wide white stroke visually separates the two node layers
plt.plot([0.3]*2, [-1, 1], lw=160, color='white')
plt.axis('off')
Run Code Online (Sandbox Code Playgroud)

网络随机化和 PyMC3 模拟

当随机化用户列表时,正如问题中提到的,我们可以获得站点-站点连接的分布。对于中等规模的网络,这应该相当快,请参阅上面有关洗牌复杂性的争论和下面的代码示例。

如果网络太大,采样可能是一种选择,图形形式化有助于设置采样场景,请参见上面的面板 (C)。对于给定的 n_u 和 n_s 边缘随机化对应于从多元超几何分布中随机抽取。

不幸的是,PyMC3 尚不支持超几何分布。如果这有帮助,我添加了一个使用 PyMC3 并从下面的简单二项式分布中采样的小示例。黑色直方图显示了来自完全网络随机化和二分投影的站点间连接 n_{s,t} 的分布。灰色垂直线表示最大值n_{s,t} <= min(N_u, n_s, n_t)。红点来自二项式近似,假设有 nvisits*(nvisits-1)/2 对要分布的边,并且通过用户 u 连接节点 s 和 t 的机会为 p_s * p_u * p_t * p_u,其中 p_x = n_x / N_x。这里,假设所有边都是独立的,并且结果显然仅产生近似值。

图形随机化结果

代码2

# randomize user visits and record the site-site connection weights
# of each randomized network in a (niters, nsites, nsites) array
niters = 1000
matrix = np.zeros((niters, nsites, nsites))
siten = collections.Counter(sitez)  # visits per site (invariant under shuffle)
for i in range(niters):
    # permuting users preserves every user's visit count by construction
    np.random.shuffle(userz)
    graph, projection = get_site_graph(users, sites, userz, sitez)
    for u, v, d in projection.edges(data=True):
        matrix[i, u, v] = d['weight']

# PyMC3 helper: sample from a binomial distribution
def sample_pymc3(prob, number, bins, draws=1000):
    """Draw `draws` Metropolis samples of nst ~ Binomial(number, prob) and
    return the relative frequency of each value listed in `bins`."""
    with pm.Model():
        nst = pm.Binomial('nst', n=number, p=prob)
        trace = pm.sample(draws=draws, step=pm.Metropolis())
    samples = trace.get_values('nst')
    return [np.mean(samples == val) for val in bins]

# auxiliary variables: empirical selection probabilities
# probability to select site s by chance
probs = [(sitez == s).mean() for s in sites]
# probability to select user u by chance
probu = [(userz == u).mean() for u in users]

# plot connectivity statistics: randomization histogram vs. binomial
# approximation for each site pair (s, t), upper triangle only
nsitez = min(5, nsites)
bins = np.arange(9)
# number of visit pairs that could form an edge
number = nvisits*(nvisits-1)/2
fig, axis = plt.subplots(nrows=nsitez,
                         ncols=nsitez,
                         figsize=(16, 9))
for s in sites[:nsitez]:
    for t in sites[:nsitez]:

        # pairs are unordered: hide the diagonal and lower triangle
        axia = axis[s, t]
        if t <= s:
            axia.set_axis_off()
            continue

        # histogram of n_{s,t} over all network randomizations
        axia.hist(matrix[:, s, t], bins=bins, histtype='step', density=True,
                  zorder=-10, align='left', color='black', lw=2)
        # gray line: theoretical maximum n_{s,t} <= min(n_s, n_t, N_u)
        axia.plot([min(siten[s], siten[t], nusers)+0.5]*2, [0, 0.5], lw=4, color='gray')

        # binomial approximation sampled with PyMC3.
        # (The original called sample_pymc3 twice back-to-back, running the
        # MCMC sampler twice and discarding the first result; once suffices.)
        prob = np.sum([probs[s] * pru * probs[t] * pru for pru in probu])
        freqs = sample_pymc3(prob, number, bins)
        axia.scatter(bins, freqs, color='red')

        # axis labels
        nst = '$n_{s=%s,t=%s}$' % (s, t)
        axia.set_xlabel(nst)
        if t == s+1:
            axia.set_ylabel('frequency')

plt.suptitle('distribution of the number $n_{s,t}$\nof connections between site $s$ and $t$')
plt.tight_layout(rect=[-0.2, -0.2, 1, 0.9])
Run Code Online (Sandbox Code Playgroud)