## 一、特征工程简介

```python
import pandas as pd

# NOTE(review): `data` is assumed to be loaded earlier,
# e.g. data = pd.read_csv("digit recognizor.csv") -- confirm the source.
print(data.shape)

# First column is the label; the remaining columns are the features.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
print(X.shape)
print(y.shape)
```

## 二、Filter过滤法

### 1、方差过滤

VarianceThreshold有重要参数threshold，表示方差的阈值，即舍弃所有方差小于threshold的特征。不填默认为0，即只删除所有记录都相同的特征。我们通过一段代码看看其实现的效果，具体代码如下：

```python
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

# Instantiate with the default threshold=0: only features that are
# identical across all samples (zero variance) are dropped.
VTS = VarianceThreshold()
X_var0 = VTS.fit_transform(X)  # new feature matrix with the filtered-out features removed
print(X_var0.shape)
```

```python
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]

# Use the median of all feature variances as the threshold,
# which keeps roughly half of the features.
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
print(X_fsvar.shape)
```

```python
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]

VTS = VarianceThreshold(np.median(X.var().values))  # threshold = median of feature variances
VTS = VTS.fit(X)            # fit the selector on X
X_fsvar = VTS.transform(X)  # reduce X, keeping only the selected features

# Inspect the fitted selector:
VTS.get_support(indices=False)
# -> boolean mask with the same length as the original features; kept features are True
VTS.get_support(indices=True)
# -> integer positions of the kept features

# Names of the features that passed the filter
print(X.columns[VTS.get_support(indices=False)])
print(X.columns[VTS.get_support(indices=True)])

# Feature matrix restricted to the kept features.
# The following two expressions are equivalent.
print(X.iloc[:, VTS.get_support(indices=True)])
print(X.loc[:, VTS.get_support()])
```

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

# Median-variance filter: keeps roughly half of the features.
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
print(X_fsvar)
```

KNN方差过滤前数据运行情况

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
# print(X_fsvar)

# KNN 5-fold cross-validation accuracy BEFORE variance filtering (full X).
print(cross_val_score(KNN(), X, y, cv=5).mean())
```

KNN过滤前的准确率为：

KNN过滤前运行的时间为

KNN方差过滤后

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
# print(X_fsvar)

# KNN 5-fold CV accuracy before filtering (full X) ...
print(cross_val_score(KNN(), X, y, cv=5).mean())
# ... and after filtering (X_fsvar), for comparison.
print(cross_val_score(KNN(), X_fsvar, y, cv=5).mean())
```

KNN过滤后的准确率为：

KNN过滤后的运行的时间为：

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
# print(X_fsvar)

# KNN before filtering
# print(cross_val_score(KNN(),X,y,cv=5).mean())
# KNN after filtering
# print(cross_val_score(KNN(),X_fsvar,y,cv=5).mean())

# Random forest 5-fold CV accuracy BEFORE variance filtering (full X).
print(cross_val_score(RFC(n_estimators=10, random_state=0), X, y, cv=5).mean())
```

RFC过滤前运行时间为：

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
# print(X_fsvar)

# KNN before filtering
# print(cross_val_score(KNN(),X,y,cv=5).mean())
# KNN after filtering
# print(cross_val_score(KNN(),X_fsvar,y,cv=5).mean())

# Random forest 5-fold CV accuracy before filtering (full X) ...
print(cross_val_score(RFC(n_estimators=10, random_state=0), X, y, cv=5).mean())
# ... and after filtering (X_fsvar), for comparison.
print(cross_val_score(RFC(n_estimators=10, random_state=0), X_fsvar, y, cv=5).mean())
```

RFC过滤后的准确率与运行前的对比，结果如下：

RFC过滤后的运行时间如下：

1、 为什么随机森林要比KNN运行速度快这么多？

2、 为什么方差过滤对随机森林没有很大的影响？

### 2、相关性过滤

#### (1)、卡方过滤

```python
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)

# Suppose we already know 300 features are needed:
# keep the 300 features with the highest chi-square scores.
X_fschi = SelectKBest(chi2, k=300).fit_transform(X_fsvar, y)
print(X_fschi.shape)

# Cross-validated accuracy on the chi-square-filtered features.
print(cross_val_score(RFC(n_estimators=10, random_state=0), X_fschi, y, cv=5).mean())
```

```python
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)

# Learning curve over k: evaluate CV accuracy for k = 390, 380, ..., 210.
score = []
for i in range(390, 200, -10):
    X_fschi = SelectKBest(chi2, k=i).fit_transform(X_fsvar, y)
    once = cross_val_score(RFC(n_estimators=10, random_state=0), X_fschi, y, cv=5).mean()
    score.append(once)
plt.plot(range(390, 200, -10), score)
plt.show()
```

```python
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)

# chi2 returns the chi-square statistic and the p-value for each feature.
chi2val, pval = chi2(X_fsvar, y)
print(chi2val)
print(pval)

# Choose k as the number of features whose p-value is <= 0.05
# (total features minus those with p > 0.05).
k = chi2val.shape[0] - (pval > 0.05).sum()
print(k)
```

#### (2)、F检验

F检验，又称ANOVA，方差齐性检验，是用来捕捉每个特征与标签之间的线性关系的过滤方法。它既可以做回归也可以做分类，因此包含 `feature_selection.f_classif`（F检验分类）和 `feature_selection.f_regression`（F检验回归）两个类。其中F检验分类用于标签是离散型变量的数据，而F检验回归用于标签是连续型变量的数据。

F检验的本质是寻找两组数据之间的线性关系，其原假设是“数据不存在显著的线性关系”。它返回F值和p值两个统计量。和卡方过滤一样，我们希望选取p值小于0.05或0.01的特征，这些特征与标签是显著线性相关的，而p值大于0.05或0.01的特征则被我们认为是和标签没有显著线性关系的特征，应该被删除。以F检验的分类为例，我们继续在数字数据集上来进行特征选择，具体使用实验代码如下：

```python
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
import numpy as np
from sklearn.feature_selection import f_classif

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)

# f_classif returns the F statistic and the p-value for each feature.
F, pval_f = f_classif(X_fsvar, y)
print(F)
print(pval_f)

# Choose k as the number of features whose p-value is <= 0.05.
k = F.shape[0] - (pval_f > 0.05).sum()
print(k)
```

#### (3)、互信息法

```python
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
import numpy as np
from sklearn.feature_selection import mutual_info_classif as MIC

# NOTE(review): `data` is assumed to be loaded earlier -- confirm the source.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)

# Mutual information between each feature and the label;
# 0 means independent, larger values mean stronger dependence.
result = MIC(X_fsvar, y)

# Choose k as the number of features with positive mutual information.
k = result.shape[0] - sum(result <= 0)
print(k)
```