Press "Enter" to skip to content

PCA是最简单的以特征量分析多元统计分布的方法。通常情况下，这种运算可以被看作是揭露数据的内部结构，从而更好地解释数据的变量的方法。如果一个多元数据集能够在一个高维数据空间坐标系中被显现出来，那么PCA就能够提供一幅比较低维度的图像，这幅图像即为在讯息最多的点上原对象的一个‘投影’。这样就可以利用少量的主成分使得数据的维度降低了。

PCA跟因子分析密切相关，并且已经有很多混合这两种分析的统计包。而真实要素分析则是假定底层结构，求得微小差异矩阵的特征向量。

PCA，Principal Component Analysis，即主成分分析法，是特征降维的最常用手段。顾名思义，PCA 能从冗余特征中提取主要成分，在不太损失模型质量的情况下，提升了模型训练速度。

import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

```# 导入数据
dataset = pd.read_csv('Wine.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 13].values
dataset.head(10)```

```# 分成训练集与测试集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train[:3]```

array([[1.369e+01, 3.260e+00, 2.540e+00, 2.000e+01, 1.070e+02, 1.830e+00,

5.600e-01, 5.000e-01, 8.000e-01, 5.880e+00, 9.600e-01, 1.820e+00,

6.800e+02],

[1.269e+01, 1.530e+00, 2.260e+00, 2.070e+01, 8.000e+01, 1.380e+00,

1.460e+00, 5.800e-01, 1.620e+00, 3.050e+00, 9.600e-01, 2.060e+00,

4.950e+02],

[1.162e+01, 1.990e+00, 2.280e+00, 1.800e+01, 9.800e+01, 3.020e+00,

2.260e+00, 1.700e-01, 1.350e+00, 3.250e+00, 1.160e+00, 2.960e+00,

3.450e+02]])

```# Feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training set only ...
X_train = sc.fit_transform(X_train)
# ... and reuse the already-fitted scaler to transform the test set.
X_test = sc.transform(X_test)
X_train[:3]```

array([[ 0.87668336, 0.79842885, 0.64412971, 0.12974277, 0.48853231,

-0.70326216, -1.42846826, 1.0724566 , -1.36820277, 0.35193216,

0.0290166 , -1.06412236, -0.2059076 ],

[-0.36659076, -0.7581304 , -0.39779858, 0.33380024, -1.41302392,

-1.44153145, -0.5029981 , 1.70109989, 0.02366802, -0.84114577,

0.0290166 , -0.73083231, -0.81704676],

[-1.69689407, -0.34424759, -0.32337513, -0.45327855, -0.14531976,

1.24904997, 0.31964204, -1.52069698, -0.4346309 , -0.75682931,

0.90197362, 0.51900537, -1.31256499]])

```# Try out PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = None)
# NOTE: X_train / X_test are overwritten with their PCA projections here.
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
# explained_variance_ratio_ is the share of the total variance carried by each
# principal component; the larger the share, the more important the component.
explained_variance = pca.explained_variance_ratio_
explained_variance```

array([0.36884109, 0.19318394, 0.10752862, 0.07421996, 0.06245904,

0.04909 , 0.04117287, 0.02495984, 0.02308855, 0.01864124,

0.01731766, 0.01252785, 0.00696933])

# Keep the first 2 principal components; together they explain (0.3688+0.1931) of the variance

from sklearn.decomposition import PCA

pca = PCA(n_components = 2)

X_train = pca.fit_transform(X_train)

X_test = pca.transform(X_test)

# Refresh explained_variance from the 2-component model; without this the
# print below would still show the ratios computed by the earlier full PCA.
explained_variance = pca.explained_variance_ratio_

print(explained_variance)

X_train[:3]

[0.36884109 0.19318394]

array([[-2.17884511, -1.07218467],

[-1.80819239, 1.57822344],

[ 1.09829474, 2.22124345]])

```# Fit logistic regression on the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)```

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,

intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,

penalty='l2', random_state=0, solver='liblinear', tol=0.0001,

verbose=0, warm_start=False)

# Predict the classes of the test set

y_pred = classifier.predict(X_test)

y_pred[:5]

array([1, 3, 2, 1, 2])

# Confusion matrix of the true test labels against the predictions

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

cm

array([[14, 0, 0],

[ 1, 15, 0],

[ 0, 0, 6]])

# Correct predictions lie on the main diagonal of the confusion matrix, so
# accuracy = trace / total, i.e. (14+15+6) / (14+15+6+1) for the matrix above.
# Computed from cm rather than hard-coded so it stays right if the data or model changes.

print("准确率(精度)为 :", np.trace(cm) / cm.sum())

```# 可视化训练集
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('red', 'green', 'blue')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
c = ListedColormap(('red', 'green', 'blue'))(i), label = j)
plt.title('逻辑回归 (训练集)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()```

```# 可视化测试集
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('red', 'green', 'blue')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
c = ListedColormap(('red', 'green', 'blue'))(i), label = j)
plt.title('逻辑回归 (测试集)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()```

output_12_0.png