ast*_*ris 12 python numpy scipy
我在python中有一组数据.我将其绘制为直方图,该图显示双峰分布,因此我试图在双峰中的每个峰上绘制两个高斯分布图.
如果我使用下面的代码,则需要我有两个相同大小的数据集.但是我只有一个数据集,这不能平分.我怎样才能适应这两位高斯人
from sklearn import mixture
import matplotlib.pyplot
import matplotlib.mlab
import numpy as np
clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit(yourdata)
m1, m2 = clf.means_
w1, w2 = clf.weights_
c1, c2 = clf.covars_
histdist = matplotlib.pyplot.hist(yourdata, 100, normed=True)
plotgauss1 = lambda x: plot(x,w1*matplotlib.mlab.normpdf(x,m1,np.sqrt(c1))[0], linewidth=3)
plotgauss2 = lambda x: plot(x,w2*matplotlib.mlab.normpdf(x,m2,np.sqrt(c2))[0], linewidth=3)
plotgauss1(histdist[1])
plotgauss2(histdist[1])
Run Code Online (Sandbox Code Playgroud)
B. *_* M. 26
这里使用scipy工具进行模拟:
from pylab import *
from scipy.optimize import curve_fit
data=concatenate((normal(1,.2,5000),normal(2,.2,2500)))
y,x,_=hist(data,100,alpha=.3,label='data')
x=(x[1:]+x[:-1])/2 # for len(x)==len(y)
def gauss(x,mu,sigma,A):
return A*exp(-(x-mu)**2/2/sigma**2)
def bimodal(x,mu1,sigma1,A1,mu2,sigma2,A2):
return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)
expected=(1,.2,250,2,.2,125)
params,cov=curve_fit(bimodal,x,y,expected)
sigma=sqrt(diag(cov))
plot(x,bimodal(x,*params),color='red',lw=3,label='model')
legend()
print(params,'\n',sigma)
Run Code Online (Sandbox Code Playgroud)
数据是两个正常样本的叠加,该模型是高斯曲线的总和.我们获得 :
估计参数是:
# via pandas :
# pd.DataFrame(data={'params':params,'sigma':sigma},index=bimodal.__code__.co_varnames[1:])
params sigma
mu1 0.999447 0.002683
sigma1 0.202465 0.002696
A1 226.296279 2.597628
mu2 2.003028 0.005036
sigma2 0.193235 0.005058
A2 117.823706 2.658789
Run Code Online (Sandbox Code Playgroud)
sklearn这可以使用Python 库以干净简单的方式实现:
import numpy as np
from sklearn.mixture import GaussianMixture
from pylab import concatenate, normal
# First normal distribution parameters
mu1 = 1
sigma1 = 0.1
# Second normal distribution parameters
mu2 = 2
sigma2 = 0.2
w1 = 2/3 # Proportion of samples from first distribution
w2 = 1/3 # Proportion of samples from second distribution
n = 7500 # Total number of samples
n1 = int(n*w1) # Number of samples from first distribution
n2 = int(n*w2) # Number of samples from second distribution
# Generate n1 samples from the first normal distribution and n2 samples from the second normal distribution
X = concatenate((normal(mu1, sigma1, n1), normal(mu2, sigma2, n2))).reshape(-1, 1)
# Determine parameters mu1, mu2, sigma1, sigma2, w1 and w2
gm = GaussianMixture(n_components=2, random_state=0).fit(X)
print(f'mu1={gm.means_[0]}, mu2={gm.means_[1]}')
print(f'sigma1={np.sqrt(gm.covariances_[0])}, sigma2={np.sqrt(gm.covariances_[1])}')
print(f'w1={gm.weights_[0]}, w2={gm.weights_[1]}')
print(f'n1={int(n * gm.weights_[0])} n2={int(n * gm.weights_[1])}')
Run Code Online (Sandbox Code Playgroud)
由于命名空间混乱,现在 matplotlib 强烈建议不要使用pylab ,因此我用 BM 重写了脚本并添加了两个组件的绘图:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
#data generation
np.random.seed(123)
data=np.concatenate((np.random.normal(1, .2, 5000), np.random.normal(1.6, .3, 2500)))
y,x,_=plt.hist(data, 100, alpha=.3, label='data')
x=(x[1:]+x[:-1])/2 # for len(x)==len(y)
#x, y inputs can be lists or 1D numpy arrays
def gauss(x, mu, sigma, A):
return A*np.exp(-(x-mu)**2/2/sigma**2)
def bimodal(x, mu1, sigma1, A1, mu2, sigma2, A2):
return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)
expected = (1, .2, 250, 2, .2, 125)
params, cov = curve_fit(bimodal, x, y, expected)
sigma=np.sqrt(np.diag(cov))
x_fit = np.linspace(x.min(), x.max(), 500)
#plot combined...
plt.plot(x_fit, bimodal(x_fit, *params), color='red', lw=3, label='model')
#...and individual Gauss curves
plt.plot(x_fit, gauss(x_fit, *params[:3]), color='red', lw=1, ls="--", label='distribution 1')
plt.plot(x_fit, gauss(x_fit, *params[3:]), color='red', lw=1, ls=":", label='distribution 2')
#and the original data points if no histogram has been created before
#plt.scatter(x, y, marker="X", color="black", label="original data")
plt.legend()
print(pd.DataFrame(data={'params': params, 'sigma': sigma}, index=bimodal.__code__.co_varnames[1:]))
plt.show()
Run Code Online (Sandbox Code Playgroud)
params sigma
mu1 1.014589 0.005273
sigma1 0.203826 0.004067
A1 230.654585 3.667409
mu2 1.635225 0.022423
sigma2 0.282690 0.019070
A2 72.621856 2.148459
Run Code Online (Sandbox Code Playgroud)