O.r*_*rka 7 python statistics numpy matplotlib pca
我知道有一些包可以为我做双标图,但我想了解这些加载是如何绘制的,因为到目前为止它看起来像巫术。
我的理解如下:
我正在将我的结果与 PCA 包 ( https://github.com/erdogant/pca ) 进行比较,因为我比我的直觉更相信这些结果。我试图弄清楚我的逻辑哪里有缺陷以及我的绘图做错了什么。在原始的源代码开始绘制双标图这里。
虹膜示例看起来很准确,但我的小样本珊瑚基因表达气候变化数据集看起来根本不正确。
我似乎无法弄清楚我做错了什么。任何帮助都会很棒。
import matplotlib.pyplot as plt
from matplotlib.patches import ConnectionPatch
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# Test version to understand how loadings are plotted
def plot_biplot(
data:pd.DataFrame,
arrow_scale=1,
n_feat=4,
):
# Build PCA
model = PCA(n_components=min(data.shape))
projection = pd.DataFrame(model.fit_transform(data), index=data.index)
projection.columns = projection.columns.map(lambda j: "PC.{}".format(j+1))
projection = projection.iloc[:,:2]
# Get loading subset
loadings = pd.DataFrame(model.components_.T, index=data.columns)
loadings.columns = loadings.columns.map(lambda j: "PC.{}".format(j+1))
loadings = loadings.iloc[:,:2]
features = loadings.iloc[:,0].abs().sort_values(ascending=False).iloc[:n_feat].index
loadings = loadings.loc[features]
# Get contributions of each feature to each PC
contributions = loadings.abs()
contributions = contributions/contributions.sum(axis=0)
contributions = contributions.iloc[:,:2]
# Get projections
x = projection.iloc[:,0].values
y = projection.iloc[:,1].values
mean_x = np.mean(x)
max_x = np.abs(x).max()
mean_y = np.mean(y)
max_y = np.abs(y).max()
# Keywords
_arrow_kws = dict(color="red", linewidth=1, alpha=0.618)
_arrowhead_kws = dict(arrowstyle="-|>", mutation_scale=20, facecolor="red", alpha=0.618)
_text_kws = dict(color="red", ha='center', va='center')
# Plot arrows and text
with plt.style.context("seaborn-white"):
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(x,y,s=50,c="black", linewidth=1, edgecolor="white")
ax.set_xlabel("PC.1({:.3f})".format(model.explained_variance_ratio_[0]), fontsize=15)
ax.set_ylabel("PC.2({:.3f})".format(model.explained_variance_ratio_[1]), fontsize=15)
for i in range(0, loadings.shape[0]):
feature = loadings.index[i]
# Set PC1 vs PC2 direction. Note that these are not neccarily the best loading.
xarrow = contributions.loc[feature].iloc[0] * np.sign(loadings.loc[feature].iloc[0]) * max_x * arrow_scale
yarrow = contributions.loc[feature].iloc[1] * np.sign(loadings.loc[feature].iloc[1]) * max_y * arrow_scale
# Plot arrow
xyA = (mean_x, mean_y)
xyB = (xarrow, yarrow)
con = ConnectionPatch(xyA, xyB, coordsA="data", coordsB="data", **_arrowhead_kws)
ax.plot([xyA[0], xyB[0]], [xyA[1], xyB[1]], **_arrow_kws)
ax.add_artist(con)
# Plot feature label
ax.text(xarrow*1.11, yarrow*1.11, feature, **_text_kws)
return ax
# Gold standard
# !pip install pca
# https://github.com/erdogant/pca
from pca import pca
def pca_wrapper(data, n_feat=4):
# Initialize to reduce the data up to the number of componentes that explains 95% of the variance.
model = pca(n_components=0.95)
# Or reduce the data towards 2 PCs
model = pca(n_components=2)
# Fit transform
results = model.fit_transform(data)
# Make biplot with the number of features
with plt.style.context("seaborn-white"):
fig, ax = model.biplot(y=np.asarray(["_"]*data.shape[0]), n_feat=n_feat, legend=False, label=False, figsize=(5,5))
return ax
# Iris data
iris = load_iris()
X_iris = pd.DataFrame(
data=iris.data,
columns=map(lambda j:j[:-5].replace(" ","_"), iris.feature_names),
index=map(lambda i:"iris_{}".format(i), range(150))
)
# Test version
plot_biplot(X_iris, arrow_scale=2)
Run Code Online (Sandbox Code Playgroud)
# Gold standard
pca_wrapper(X_iris)
Run Code Online (Sandbox Code Playgroud)
# Coral dataset
X_corals = pd.read_csv("https://pastebin.com/raw/iVinU2p3", sep="\t", index_col=0)
# Test version
plot_biplot(X_corals, arrow_scale=2)
Run Code Online (Sandbox Code Playgroud)
# Gold standard
pca_wrapper(X_corals)
Run Code Online (Sandbox Code Playgroud)