## 一、数据无量纲化

### 1、数据归一化

import numpy as np
import pandas as pd

# Min-max normalization demo: scale each feature (column) into [0, 1].
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
x = np.array(data)
print(x)

# NumPy implementation: axis=0 gives per-column min/max.
x_nor = (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0))
print(x_nor)

# Reverse the normalization to recover the original values.
x_returned = x_nor * (x.max(axis=0) - x.min(axis=0)) + x.min(axis=0)
print(x_returned)

# pandas implementation: DataFrame.min()/max() are per-column by default.
# BUG FIX: the original created an empty, unused DataFrame and then applied
# .min()/.max() to the raw ndarray, where they are GLOBAL scalars — that
# normalizes over the whole array instead of per feature. Build the
# DataFrame from the data and normalize it directly.
df = pd.DataFrame(data)
x_normor = (df - df.min()) / (df.max() - df.min())
print(x_normor)

# Reverse the pandas normalization.
x_adverse = x_normor * (df.max() - df.min()) + df.min()

from sklearn.preprocessing import MinMaxScaler

# Min-max scaling with scikit-learn.
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

scaler = MinMaxScaler()            # instantiate
scaler = scaler.fit(data)          # fit: learns per-feature min(x) and max(x)
result = scaler.transform(data)    # export the scaled result
print(result)

result_ = scaler.fit_transform(data)  # fit + transform in one step
scaler.inverse_transform(result)      # undo the scaling

# feature_range scales the data into a range other than the default [0, 1].
scaler_1 = MinMaxScaler(feature_range=[5, 10])
# BUG FIX: the original called scaler.fit_transform(data) here, so
# feature_range=[5, 10] was silently ignored; use scaler_1 instead.
result_1 = scaler_1.fit_transform(data)
print(result_1)

### 2、数据标准化

from sklearn.preprocessing import StandardScaler

# Standardization (z-score scaling) with scikit-learn.
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

std_scaler = StandardScaler()           # instantiate
print(std_scaler.fit(data))             # fit: learns per-feature mean and variance
print(std_scaler.mean_)                 # learned means (attribute mean_)
print(std_scaler.var_)                  # learned variances (attribute var_)

z = std_scaler.transform(data)          # export the standardized array
print(z.mean())                         # overall mean of the result (approx. 0)
print(z.std())                          # overall std of the result (approx. 1)

print(std_scaler.fit_transform(data))   # fit + transform in one call
print(std_scaler.inverse_transform(z))  # undo the standardization

## 二、缺失值处理

```python
import pandas as pd
```

```python
class sklearn.impute.SimpleImputer(missing_values=nan, strategy='mean', fill_value=None,
                                   verbose=0, copy=True)
```

import pandas as pd
from sklearn.impute import SimpleImputer

# Demonstrate SimpleImputer on the `data` frame loaded earlier in the file.
print(data.info())

# How many missing values does each column have?
print(data.isnull().sum())

# sklearn feature matrices must be 2-D, hence the reshape(-1, 1).
age = data.loc[:, "Age"].values.reshape(-1, 1)
print(age[:20])

# Three imputation strategies for the Age column, each fit_transform'ed in one step.
filled_mean = SimpleImputer().fit_transform(age)                       # default: mean
filled_median = SimpleImputer(strategy="median").fit_transform(age)    # median
filled_zero = SimpleImputer(strategy="constant", fill_value=0).fit_transform(age)  # constant 0
print(filled_mean[:20])
print(filled_median[:20])
print(filled_zero[:20])

# Keep the median-imputed version for Age.
data.loc[:, "Age"] = filled_median
print(data.info())

# Impute Embarked with its most frequent value (the mode).
embarked = data.loc[:, "Embarked"].values.reshape(-1, 1)
mode_imputer = SimpleImputer(strategy="most_frequent")
data.loc[:, "Embarked"] = mode_imputer.fit_transform(embarked)
print(data.info())

import pandas as pd

# Fill Age directly inside the DataFrame with pandas .fillna.
data.loc[:, "Age"] = data.loc[:, "Age"].fillna(data.loc[:, "Age"].median())

# Drop any remaining rows that still contain missing values.
# BUG FIX: dropna(..., inplace=True) returns None, so the original
# print() only ever showed "None"; drop in place, then inspect the frame.
data.dropna(axis=0, inplace=True)
print(data.info())

## 三、处理分类型特征：编码与哑变量

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Encode the label column (last column) as integer codes.
# LabelEncoder is meant for targets, so a 1-D input is allowed.
target = data.iloc[:, -1]
encoder = LabelEncoder()              # instantiate
encoder = encoder.fit(target)         # learn the classes
codes = encoder.transform(target)     # integer code for every row
data.iloc[:, -1] = codes              # write the codes back into the frame

# Inspect the fitted encoder.
print(encoder.classes_)                  # the distinct classes that were found
print(codes)                             # the encoded result
print(encoder.fit_transform(target))     # fit + transform in one call
print(encoder.inverse_transform(codes))  # map codes back to the original labels

# Outside a tutorial this is usually written as a one-liner:
data.iloc[:, -1] = LabelEncoder().fit_transform(data.iloc[:, -1])

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

data_ = data.copy()

# categories_ mirrors LabelEncoder's classes_ attribute.
# BUG FIX: inspect categories_ BEFORE encoding. The original fitted a second
# OrdinalEncoder after data_ had already been overwritten with the encoded
# values, so categories_ showed the numeric codes instead of the original labels.
print(OrdinalEncoder().fit(data_.iloc[:, 1:-1]).categories_)

# Encode every middle column in place with its ordinal code.
data_.iloc[:, 1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:, 1:-1])

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal variables (all columns between the first and last).
X = data.iloc[:, 1:-1]
result = OneHotEncoder(categories='auto').fit_transform(X).toarray()

# Append the dummy columns, drop the originals, and rename.
newdata = pd.concat([data, pd.DataFrame(result)], axis=1)
newdata.drop(["Sex", "Embarked"], axis=1, inplace=True)
newdata.columns = ["Age", "Survived", "Female", "Male",
                   "Embarked_C", "Embarked_Q", "Embarked_S"]
print(newdata)

# The same workflow, step by step: instantiate and fit...
enc = OneHotEncoder(categories='auto').fit(X)
# ...then export the encoded array via transform.
re = enc.transform(X).toarray()
print(re)
print(re.shape)

# The encoding is invertible.
print(pd.DataFrame(enc.inverse_transform(re)))

# BUG FIX: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; get_feature_names_out() is the supported replacement.
print(enc.get_feature_names_out())

# Merge the dummies with the original frame.
# axis=1 concatenates side by side (columns); axis=0 would stack rows.
df = pd.concat([data, pd.DataFrame(re)], axis=1)

# Drop the columns that were one-hot encoded.
# BUG FIX: drop(..., inplace=True) returns None, so the original print()
# only showed "None"; drop in place, then print the frame itself.
df.drop(['Embarked', 'Sex'], axis=1, inplace=True)
print(df)

# Rename the columns.
df.columns = ['Age', 'Survived', 'Female', 'Male',
              'Embarked_C', 'Embarked_Q', 'Embarked_S']

## 四、处理连续型特征：二值化与分段

import pandas as pd
from sklearn.preprocessing import Binarizer

# Binarize Age: values above the threshold become 1, the rest 0.
data_2 = data.copy()
print(data_2)

# Binarizer works on feature matrices, so the column must be 2-D, not 1-D.
age_matrix = data_2.iloc[:, 0].values.reshape(-1, 1)
binary_age = Binarizer(threshold=30).fit_transform(age_matrix)
print(binary_age)

`preprocessing.KBinsDiscretizer` 是将连续型变量划分为分类变量的类，能够将连续型变量排序后按顺序分箱后编码。总共包含三个重要参数，主要含义如下：

import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

# Bin the first column (Age) into 3 equal-width bins, encoded as ordinals.
column = data.iloc[:, 0].values.reshape(-1, 1)
binner = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
print(binner.fit_transform(column))

# After binning, the single column holds exactly three distinct bin labels.
print(set(binner.fit_transform(column).ravel()))

# Same bins, but encoded as one-hot dummy columns.
binner = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
print(binner.fit_transform(column).toarray())

# Quantile strategy: dummy columns again, with roughly equal counts per bin.
binner = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='quantile')
print(binner.fit_transform(column).toarray().sum(0))

[1]. 数据无量纲化

[2]. ​ ​数据归一化​

[3]. 编码与哑变量