目录
1. 载入数据
# Imports shared by the rest of the walkthrough, one per line.
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Apply seaborn's dark-grid theme to every figure drawn afterwards.
sns.set_style('darkgrid')

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release -- confirm environment.
boston_dataset = load_boston()

# Wrap the raw feature matrix in a DataFrame labelled with the feature names.
dataset = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
)
dataset.head()
输出:
CRIM ZN INDUS CHAS NOX ... RAD TAX PTRATIO B LSTAT 0 0.00632 18.0 2.31 0.0 0.538 ... 1.0 296.0 15.3 396.90 4.98 1 0.02731 0.0 7.07 0.0 0.469 ... 2.0 242.0 17.8 396.90 9.14 2 0.02729 0.0 7.07 0.0 0.469 ... 2.0 242.0 17.8 392.83 4.03 3 0.03237 0.0 2.18 0.0 0.458 ... 3.0 222.0 18.7 394.63 2.94 4 0.06905 0.0 2.18 0.0 0.458 ... 3.0 222.0 18.7 396.90 5.33 [5 rows x 13 columns]
列解释Columns:
- CRIM: 按城镇划分的人均犯罪率
- ZN: 大于25000平方英尺的住宅用地比例
- INDUS: 每个城镇非零售业务英亩比例
- CHAS : 查尔斯河流 哑变量 (靠近河流为1; 否则为0)
- NOX: 一氧化氮浓度 (单位:千万分之一)
- RM: 每个住宅的房间数
- AGE: 1940年之前建造的自住房屋所占比例
- DIS: 与5个波士顿就业中心的加权距离
- RAD: 到径向高速公路的可达性(通行能力)指数
- TAX: 每 10000 美元的全额财产税率
- PTRATIO: 按城镇划分的师生比例
- B: 1000(Bk − 0.63)², 其中 Bk 为按城镇划分的非裔人口比例
- LSTAT: 低收入人口百分比
- MEDV: 自有住房的中位数价值(单位:1000美元)
从上面的输出可以看到,数据中还没有 MEDV 这一列;
它正是我们要预测的目标列,先把它加进来。
# Attach the regression target (boston_dataset.target) to the frame as 'MEDV'.
dataset['MEDV'] = boston_dataset.target
现在再看就有了。
2. 数据分析
2.1 预处理
看看有没有缺失值
# Count missing entries per column (DataFrame.isna is what isnull aliases).
dataset.isna().sum()
设置特征和标签
# Separate features from the target by column name rather than by hard-coded
# positions, so the split does not silently break if a column is added or
# reordered.
# X: every column except 'MEDV'; y: the 'MEDV' column kept 2-D, shape (n, 1).
X = dataset.drop(columns=['MEDV']).values
y = dataset[['MEDV']].values
分割训练和测试
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
)
看看分割结果
# Report the shape of each split; the labels are printed exactly as written.
for label, split_array in (
    ("Shape of X_train: ", X_train),
    ("Shape of X_test: ", X_test),
    ("Shape of y_train: ", y_train),
    ("Shape of y_test", y_test),
):
    print(label, split_array.shape)

# Output:
# Shape of X_train: (354, 13)
# Shape of X_test: (152, 13)
# Shape of y_train: (354, 1)
# Shape of y_test (152, 1)
2.2 可视化
# Correlation matrix: the correlation coefficient between every pair of columns.
corr = dataset.corr()

import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 10))

# Draw on the axes created above. seaborn centres its tick labels on the
# cells; the previous manual plt.xticks/plt.yticks at integer positions moved
# the labels onto the cell edges. xticklabels/yticklabels=True forces every
# column name to be shown.
sns.heatmap(
    corr,
    cmap='RdBu',
    annot=True,
    fmt=".2f",
    xticklabels=True,
    yticklabels=True,
    ax=ax,
)
plt.show()
# Scatter-plot matrix over every pair of columns in the frame.
sns.pairplot(data=dataset)
plt.show()
3. 训练模型
3.1 线性拟合
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline trained on the raw training features.
regressor_linear = LinearRegression()
regressor_linear.fit(X=X_train, y=y_train)
看看此时的预测得分
from sklearn.metrics import r2_score

# 10-fold cross-validation on the training split: each fold serves once as the
# validation set while the other nine are used for fitting; the ten scores are
# averaged when printed.
cv_linear = cross_val_score(regressor_linear, X_train, y_train, cv=10)

# Predictions on both splits.
y_pred_linear_train = regressor_linear.predict(X_train)
y_pred_linear_test = regressor_linear.predict(X_test)

# R2 (coefficient of determination): the share of the target's variance that
# the regression explains.
r2_score_linear_train = r2_score(y_train, y_pred_linear_train)
r2_score_linear_test = r2_score(y_test, y_pred_linear_test)

# RMSE: root-mean-square error on the test split.
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear_test))

print("CV: ", cv_linear.mean())
print('R2_score (train): ', r2_score_linear_train)
print('R2_score (test): ', r2_score_linear_test)
print("RMSE: ", rmse_linear)
输出:
CV: 0.6984854476156042 R2_score (train): 0.7435787589010061 R2_score (test): 0.7133593313710366 RMSE: 4.6472797457242
3.2 多项式回归(二次)
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features with all degree-2 terms. fit_transform already
# fits the expander on the raw features; the earlier extra
# poly_reg.fit(X_poly, y_train) re-fitted it on the expanded matrix, leaving it
# expecting the wrong number of input columns, so it is dropped.
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X_train)

# Ordinary least squares on the expanded features.
regressor_poly2 = LinearRegression()
regressor_poly2.fit(X_poly, y_train)
看预测得分
from sklearn.metrics import r2_score

# Fit the expander on the raw training features here, so this cell does not
# depend on whatever state poly_reg was left in, then reuse that fit for the
# test split (transform only -- the test data must never be fitted on).
X_poly_train = poly_reg.fit_transform(X_train)
X_poly_test = poly_reg.transform(X_test)

# Cross-validate on the expanded features. Passing the raw X_train here would
# score a plain linear model and merely repeat the linear-regression CV value.
cv_poly2 = cross_val_score(estimator=regressor_poly2, X=X_poly_train, y=y_train, cv=10)

# R2 on the training split.
y_pred_poly2_train = regressor_poly2.predict(X_poly_train)
r2_score_poly2_train = r2_score(y_train, y_pred_poly2_train)

# R2 on the test split.
y_pred_poly2_test = regressor_poly2.predict(X_poly_test)
r2_score_poly2_test = r2_score(y_test, y_pred_poly2_test)

# Root-mean-square error on the test split.
rmse_poly2 = np.sqrt(mean_squared_error(y_test, y_pred_poly2_test))

print('CV: ', cv_poly2.mean())
print('R2_score (train): ', r2_score_poly2_train)
print('R2_score (test): ', r2_score_poly2_test)
print("RMSE: ", rmse_poly2)
输出略,最后汇总
3.3 脊回归(Ridge Regression),又叫岭回归
lasso 回归和岭回归(ridge regression)其实就是在标准线性回归的基础上分别加入 L1 和 L2 正则化(regularization)
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Standardise -> degree-2 expansion -> L2-regularised linear model, chained so
# that every step is fitted together on whatever data the pipeline receives.
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge(alpha=3.8, fit_intercept=True)),
]
ridge_pipe = Pipeline(steps=steps)
ridge_pipe.fit(X=X_train, y=y_train)
评估
from sklearn.metrics import r2_score

# 10-fold cross-validation of the whole pipeline on the training split
# (the target is flattened to 1-D for this call).
cv_ridge = cross_val_score(ridge_pipe, X_train, y_train.ravel(), cv=10)

# Predictions on both splits.
y_pred_ridge_train = ridge_pipe.predict(X_train)
y_pred_ridge_test = ridge_pipe.predict(X_test)

# R2 on train and test.
r2_score_ridge_train = r2_score(y_train, y_pred_ridge_train)
r2_score_ridge_test = r2_score(y_test, y_pred_ridge_test)

# Root-mean-square error on the test split.
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge_test))

print('CV: ', cv_ridge.mean())
print('R2_score (train): ', r2_score_ridge_train)
print('R2_score (test): ', r2_score_ridge_test)
print("RMSE: ", rmse_ridge)
3.4 Lasso 回归
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Standardise -> degree-2 expansion -> L1-regularised linear model.
# max_iter is raised to 3000 for the Lasso solver.
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Lasso(alpha=0.012, fit_intercept=True, max_iter=3000)),
]
lasso_pipe = Pipeline(steps=steps)
lasso_pipe.fit(X=X_train, y=y_train)
评估
from sklearn.metrics import r2_score

# 10-fold cross-validation of the whole pipeline on the training split.
cv_lasso = cross_val_score(lasso_pipe, X_train, y_train, cv=10)

# Predictions on both splits.
y_pred_lasso_train = lasso_pipe.predict(X_train)
y_pred_lasso_test = lasso_pipe.predict(X_test)

# R2 on train and test.
r2_score_lasso_train = r2_score(y_train, y_pred_lasso_train)
r2_score_lasso_test = r2_score(y_test, y_pred_lasso_test)

# Root-mean-square error on the test split.
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso_test))

print('CV: ', cv_lasso.mean())
print('R2_score (train): ', r2_score_lasso_train)
print('R2_score (test): ', r2_score_lasso_test)
print("RMSE: ", rmse_lasso)
3.5 支持向量回归 Support Vector Regression
支持向量分类的方法能被推广到解决回归问题,称为支持向量回归
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Separate scalers for features and target, both fitted on the training split,
# so that predictions can later be mapped back to the original target units.
sc_X = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train.reshape(-1, 1))
X_scaled = sc_X.transform(X_train)
y_scaled = sc_y.transform(y_train.reshape(-1, 1))

# RBF-kernel support vector regressor trained on the standardised data;
# the target is flattened to 1-D for fitting.
regressor_svr = SVR(kernel='rbf', gamma='scale')
regressor_svr.fit(X_scaled, y_scaled.ravel())
评估
from sklearn.metrics import r2_score

# 10-fold cross-validation on the standardised training data.
cv_svr = cross_val_score(estimator=regressor_svr, X=X_scaled, y=y_scaled.ravel(), cv=10)

# SVR.predict returns a 1-D array, but scikit-learn versions that require 2-D
# input reject that in inverse_transform. Reshape to a column before undoing
# the target scaling, then flatten again so the predictions stay 1-D.
y_pred_svr_train = sc_y.inverse_transform(
    regressor_svr.predict(sc_X.transform(X_train)).reshape(-1, 1)
).ravel()
r2_score_svr_train = r2_score(y_train, y_pred_svr_train)

# Same for the test split.
y_pred_svr_test = sc_y.inverse_transform(
    regressor_svr.predict(sc_X.transform(X_test)).reshape(-1, 1)
).ravel()
r2_score_svr_test = r2_score(y_test, y_pred_svr_test)

# Root-mean-square error on the test split, in the original target units.
rmse_svr = np.sqrt(mean_squared_error(y_test, y_pred_svr_test))

print('CV: ', cv_svr.mean())
print('R2_score (train): ', r2_score_svr_train)
print('R2_score (test): ', r2_score_svr_test)
print("RMSE: ", rmse_svr)
3.6 决策树回归 Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor

# Single regression tree; the fixed seed makes the fitted tree reproducible.
regressor_dt = DecisionTreeRegressor(random_state=0)
regressor_dt.fit(X=X_train, y=y_train)
评估
from sklearn.metrics import r2_score

# 10-fold cross-validation on the training split.
cv_dt = cross_val_score(regressor_dt, X_train, y_train, cv=10)

# Predictions on both splits.
y_pred_dt_train = regressor_dt.predict(X_train)
y_pred_dt_test = regressor_dt.predict(X_test)

# R2 on train and test.
r2_score_dt_train = r2_score(y_train, y_pred_dt_train)
r2_score_dt_test = r2_score(y_test, y_pred_dt_test)

# Root-mean-square error on the test split.
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt_test))

print('CV: ', cv_dt.mean())
print('R2_score (train): ', r2_score_dt_train)
print('R2_score (test): ', r2_score_dt_test)
print("RMSE: ", rmse_dt)
3.7 随机森林回归 Random Forest Regression
from sklearn.ensemble import RandomForestRegressor

# Forest of 500 trees with a fixed seed; the target is flattened to 1-D for
# fitting.
regressor_rf = RandomForestRegressor(n_estimators=500, random_state=0)
regressor_rf.fit(X=X_train, y=y_train.ravel())
评估
from sklearn.metrics import r2_score

# Cross-validate on X_train, the same features the forest was fitted on. The
# previous version passed X_scaled, the standardised matrix built for the SVR
# section, which silently tied this cell to that one.
cv_rf = cross_val_score(estimator=regressor_rf, X=X_train, y=y_train.ravel(), cv=10)

# R2 on the training split.
y_pred_rf_train = regressor_rf.predict(X_train)
r2_score_rf_train = r2_score(y_train, y_pred_rf_train)

# R2 on the test split.
y_pred_rf_test = regressor_rf.predict(X_test)
r2_score_rf_test = r2_score(y_test, y_pred_rf_test)

# Root-mean-square error on the test split.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf_test))

print('CV: ', cv_rf.mean())
print('R2_score (train): ', r2_score_rf_train)
print('R2_score (test): ', r2_score_rf_test)
print("RMSE: ", rmse_rf)
4. 评估结果汇总
# One row per model: (name, test RMSE, train R2, test R2, mean CV score).
models = [
    (
        'Linear Regression',
        rmse_linear, r2_score_linear_train, r2_score_linear_test, cv_linear.mean(),
    ),
    (
        'Polynomial Regression (2nd)',
        rmse_poly2, r2_score_poly2_train, r2_score_poly2_test, cv_poly2.mean(),
    ),
    (
        'Ridge Regression',
        rmse_ridge, r2_score_ridge_train, r2_score_ridge_test, cv_ridge.mean(),
    ),
    (
        'Lasso Regression',
        rmse_lasso, r2_score_lasso_train, r2_score_lasso_test, cv_lasso.mean(),
    ),
    (
        'Support Vector Regression',
        rmse_svr, r2_score_svr_train, r2_score_svr_test, cv_svr.mean(),
    ),
    (
        'Decision Tree Regression',
        rmse_dt, r2_score_dt_train, r2_score_dt_test, cv_dt.mean(),
    ),
    (
        'Random Forest Regression',
        rmse_rf, r2_score_rf_train, r2_score_rf_test, cv_rf.mean(),
    ),
]
看看表格
# Tabulate the per-model metrics; the column order matches the tuple layout of
# `models`.
predict = pd.DataFrame(
    data=models,
    columns=[
        'Model',
        'RMSE',
        'R2_Score(training)',
        'R2_Score(test)',
        'Cross-Validation',
    ],
)
predict
输出(注意:表中多项式回归的 Cross-Validation 与线性回归完全相同,均为 0.698485,说明该数值是在未做多项式展开的原始特征上计算的,并不代表二次多项式模型的交叉验证表现):
No | Model | RMSE | R2_Score(training) | R2_Score(test) | Cross-Validation |
---|---|---|---|---|---|
0 | Linear Regression | 4.647280 | 0.743579 | 0.713359 | 0.698485 |
1 | Polynomial Regression (2nd) | 4.194313 | 0.930656 | 0.766513 | 0.698485 |
2 | Ridge Regression | 2.853062 | 0.922818 | 0.891965 | 0.763563 |
3 | Lasso Regression | 2.811451 | 0.923402 | 0.895094 | 0.750544 |
4 | Support Vector Regression | 3.838898 | 0.874272 | 0.804407 | 0.782601 |
5 | Decision Tree Regression | 5.723785 | 1.000000 | 0.565183 | 0.613295 |
6 | Random Forest Regression | 3.211470 | 0.976717 | 0.863118 | 0.818677 |
5. 可视化评估结果
# Horizontal bar chart of the mean cross-validation score, best model first.
f, axe = plt.subplots(1, 1, figsize=(18, 6))

# Sorting in place fixes the bar order (and leaves `predict` sorted afterwards).
predict.sort_values(by=['Cross-Validation'], ascending=False, inplace=True)

sns.barplot(x='Cross-Validation', y='Model', data=predict, ax=axe)
# Axis label spelling corrected (was 'Cross-Validaton Score').
axe.set_xlabel('Cross-Validation Score', size=16)
axe.set_ylabel('Model')
axe.set_xlim(0, 1.0)
plt.show()
# Two stacked bar charts: training R2 on top, test R2 below, each sorted from
# best to worst by its own metric.
f, axes = plt.subplots(2, 1, figsize=(14, 10))

# (metric column, colour palette, x-axis label) for each panel, top to bottom.
panels = (
    ('R2_Score(training)', 'Blues_d', 'R2 Score (Training)'),
    ('R2_Score(test)', 'Reds_d', 'R2 Score (Test)'),
)
for panel_ax, (metric, colours, axis_label) in zip(axes, panels):
    # Re-sort in place before each panel so the bars follow that panel's metric.
    predict.sort_values(by=[metric], ascending=False, inplace=True)
    sns.barplot(x=metric, y='Model', data=predict, palette=colours, ax=panel_ax)
    panel_ax.set_xlabel(axis_label, size=16)
    panel_ax.set_ylabel('Model')
    panel_ax.set_xlim(0, 1.0)

plt.show()
# Vertical bar chart of test RMSE, largest error first.
predict.sort_values(by=['RMSE'], ascending=False, inplace=True)

f, axe = plt.subplots(nrows=1, ncols=1, figsize=(18, 6))
sns.barplot(data=predict, x='Model', y='RMSE', ax=axe)
axe.set_xlabel('Model', size=16)
axe.set_ylabel('RMSE', size=16)
plt.show()
Be First to Comment