为什么我的“载荷”在我的 PCA 双标图中有偏移?(Python,Matplotlib)

O.r*_*rka 7 python statistics numpy matplotlib pca

我知道有一些包可以为我做双标图,但我想了解这些加载是如何绘制的,因为到目前为止它看起来像巫术。

我的理解如下:

  1. 获取载荷
  2. 计算每台PC中每个特征的相对贡献
  3. 使用贡献作为大小,然后使用来自原始载荷的方向性
  4. 绘制这些向量

我正在将我的结果与 PCA 包 ( https://github.com/erdogant/pca ) 进行比较,因为我比我的直觉更相信这些结果。我试图弄清楚我的逻辑哪里有缺陷以及我的绘图做错了什么。在原始的源代码开始绘制双标图这里

虹膜示例看起来很准确,但我的小样本珊瑚基因表达气候变化数据集看起来根本不正确。

我似乎无法弄清楚我做错了什么。任何帮助都会很棒。

import matplotlib.pyplot as plt
from matplotlib.patches import ConnectionPatch
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Test version to understand how loadings are plotted
def plot_biplot(
    data:pd.DataFrame,
    arrow_scale=1,
    n_feat=4,
    ):

    # Build PCA
    model = PCA(n_components=min(data.shape))
    projection = pd.DataFrame(model.fit_transform(data), index=data.index)
    projection.columns = projection.columns.map(lambda j: "PC.{}".format(j+1))
    projection = projection.iloc[:,:2]

    # Get loading subset
    loadings = pd.DataFrame(model.components_.T, index=data.columns)
    loadings.columns = loadings.columns.map(lambda j: "PC.{}".format(j+1))
    loadings = loadings.iloc[:,:2]
    features = loadings.iloc[:,0].abs().sort_values(ascending=False).iloc[:n_feat].index
    loadings = loadings.loc[features]

    # Get contributions of each feature to each PC
    contributions = loadings.abs()
    contributions = contributions/contributions.sum(axis=0)
    contributions = contributions.iloc[:,:2]

    # Get projections
    x = projection.iloc[:,0].values
    y = projection.iloc[:,1].values

    mean_x = np.mean(x)
    max_x = np.abs(x).max()
    mean_y = np.mean(y)
    max_y = np.abs(y).max()

    # Keywords
    _arrow_kws = dict(color="red", linewidth=1, alpha=0.618) 
    _arrowhead_kws = dict(arrowstyle="-|>", mutation_scale=20, facecolor="red", alpha=0.618)
    _text_kws = dict(color="red", ha='center', va='center')

    # Plot arrows and text
    with plt.style.context("seaborn-white"):
        fig, ax = plt.subplots(figsize=(5,5))
        ax.scatter(x,y,s=50,c="black", linewidth=1, edgecolor="white")
        ax.set_xlabel("PC.1({:.3f})".format(model.explained_variance_ratio_[0]), fontsize=15)
        ax.set_ylabel("PC.2({:.3f})".format(model.explained_variance_ratio_[1]), fontsize=15)

        for i in range(0, loadings.shape[0]):
            feature = loadings.index[i]
            # Set PC1 vs PC2 direction. Note that these are not neccarily the best loading.
            xarrow = contributions.loc[feature].iloc[0] * np.sign(loadings.loc[feature].iloc[0]) * max_x * arrow_scale
            yarrow = contributions.loc[feature].iloc[1] * np.sign(loadings.loc[feature].iloc[1]) * max_y * arrow_scale

            # Plot arrow
            xyA = (mean_x, mean_y)
            xyB = (xarrow, yarrow)
            con = ConnectionPatch(xyA, xyB, coordsA="data", coordsB="data", **_arrowhead_kws)
            ax.plot([xyA[0], xyB[0]], [xyA[1], xyB[1]], **_arrow_kws)
            ax.add_artist(con)

            # Plot feature label
            ax.text(xarrow*1.11, yarrow*1.11, feature,  **_text_kws)

        return ax
    
# Gold standard
# !pip install pca
# https://github.com/erdogant/pca
from pca import pca

def pca_wrapper(data, n_feat=4):
    # Initialize to reduce the data up to the number of componentes that explains 95% of the variance.
    model = pca(n_components=0.95)

    # Or reduce the data towards 2 PCs
    model = pca(n_components=2)

    # Fit transform
    results = model.fit_transform(data)

    # Make biplot with the number of features
    with plt.style.context("seaborn-white"):
        fig, ax = model.biplot(y=np.asarray(["_"]*data.shape[0]), n_feat=n_feat, legend=False, label=False, figsize=(5,5))
    return ax

# Iris data
iris = load_iris()
X_iris = pd.DataFrame(
    data=iris.data, 
    columns=map(lambda j:j[:-5].replace(" ","_"), iris.feature_names),
    index=map(lambda i:"iris_{}".format(i), range(150))
)

# Test version
plot_biplot(X_iris, arrow_scale=2)
Run Code Online (Sandbox Code Playgroud)

在此处输入图片说明

# Gold standard
pca_wrapper(X_iris)
Run Code Online (Sandbox Code Playgroud)

在此处输入图片说明

# Coral dataset
X_corals = pd.read_csv("https://pastebin.com/raw/iVinU2p3", sep="\t", index_col=0)


# Test version
plot_biplot(X_corals, arrow_scale=2)
Run Code Online (Sandbox Code Playgroud)

在此处输入图片说明

# Gold standard
pca_wrapper(X_corals)
Run Code Online (Sandbox Code Playgroud)

在此处输入图片说明