## 1.前向传播算法

### 1.1 展示数据

```import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Load features and labels from the dataset.
# NOTE(review): `data` is defined outside this chunk — presumably
# scipy.io.loadmat on the ex4 data file; confirm upstream.
X, y = data['X'], data['y']
# Inspect the shapes — presumably (5000, 400) and (5000, 1); TODO confirm.
X.shape,y.shape

def plot_100figs():
    """Show 100 randomly chosen digit images in a shared-axis 10x10 grid."""
    # Draw 100 distinct row indices (no repeats).
    chosen = np.random.choice(np.arange(X.shape[0]), 100, replace=False)
    samples = X[chosen, :]
    # One figure with a 10x10 array of subplots sharing both axes.
    fig, axes = plt.subplots(nrows=10, ncols=10, sharex=True, sharey=True, figsize=(8, 8))
    # axes.flat iterates row-major, matching sample index 10*r + c.
    for idx, ax in enumerate(axes.flat):
        # Each row is a flattened 20x20 image; transpose to display upright.
        ax.matshow(samples[idx, :].reshape((20, 20)).T,
                   cmap=matplotlib.cm.binary)
    # Remove the tick marks.
    plt.xticks([])
    plt.yticks([])
    plt.show()

### 1.2 One-hot编码

def onehot(y, class_num):
    """
    One-hot encode a vector of 1-based class labels.

    Args:
        y: labels in 1..class_num, shape (m,) or (m, 1) (list or ndarray).
        class_num: number of classes, i.e. the width of the output.

    Returns:
        (m, class_num) float array with a single 1 per row at column y[i] - 1.
    """
    y = np.asarray(y)
    y_onehot = np.zeros((y.shape[0], class_num))
    # Vectorized: set column (label - 1) of every row in one fancy-indexing
    # assignment instead of a Python-level loop over the m rows.
    y_onehot[np.arange(y.shape[0]), y.ravel() - 1] = 1
    return y_onehot

### 1.3 参数展开(Unrolling Parameters)

def serialize(a, b):
    """Flatten both weight matrices and join them into one 1-D vector."""
    flat_a = np.ravel(a)
    flat_b = np.ravel(b)
    return np.hstack((flat_a, flat_b))

def deserialize(seq, shape1=(25, 401), shape2=(10, 26)):
    """
    Split an unrolled parameter vector back into two weight matrices.

    Generalized: the target shapes are now parameters; the defaults match
    this network's theta1 (25, 401) and theta2 (10, 26), so all existing
    single-argument calls behave exactly as before.

    Args:
        seq: 1-D vector of length shape1[0]*shape1[1] + shape2[0]*shape2[1].
        shape1: shape of the first matrix.
        shape2: shape of the second matrix.

    Returns:
        Tuple (theta1, theta2) reshaped from seq (views when possible).
    """
    split = shape1[0] * shape1[1]
    return seq[:split].reshape(shape1), seq[split:].reshape(shape2)

### 1.4 数据读取与形状改变

# Read the pre-trained weights.
# NOTE(review): `weight` is defined outside this chunk — presumably
# scipy.io.loadmat on the ex4 weights file; confirm upstream.
theta1, theta2 = weight['Theta1'], weight['Theta2']
# Prepend the bias column x0 = 1 to every sample (rebinds the global X).
X = np.insert(X, 0, values=np.ones(X.shape[0]), axis=1)
# Unroll both weight matrices into a single 1-D parameter vector.
theta = serialize(theta1, theta2)

### 1.5 前馈运算

Sigmoid函数：

def sigmoid(z):
    """Element-wise logistic function 1 / (1 + e^(-z))."""
    neg_exp = np.exp(-z)
    return 1 / (1 + neg_exp)

def feed_forward(theta, X, y):
    """
    Run one forward pass through the 3-layer network.

    Returns every intermediate quantity (a1, z2, a2, z3, h) because the
    backpropagation step needs them all.
    """
    # Recover the two weight matrices from the unrolled vector.
    theta1, theta2 = deserialize(theta)
    # Layer 1 -> layer 2 (X already carries the bias column).
    a1 = X
    z2 = a1 @ theta1.T          # (5000, 401) @ (401, 25) = (5000, 25)
    a2 = sigmoid(z2)
    # Prepend the hidden-layer bias unit.
    a2 = np.insert(a2, 0, values=np.ones(a2.shape[0]), axis=1)  # (5000, 26)
    # Layer 2 -> layer 3.
    z3 = a2 @ theta2.T          # (5000, 26) @ (26, 10) = (5000, 10)
    h = sigmoid(z3)
    return a1, z2, a2, z3, h

### 1.6 损失函数

def cost(theta, X, y):
    """
    Unregularized cross-entropy cost of the network over all m samples.

    Fix: the original converted X and y with np.matrix, which is deprecated
    in NumPy; plain ndarrays give identical results here since all products
    below are element-wise.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Forward pass to get the hypothesis h (per-class probabilities).
    _, _, _, _, h = feed_forward(theta, X, y)
    # Element-wise cross-entropy terms.
    first = np.multiply(-y, np.log(h))
    second = np.multiply((1 - y), np.log(1 - h))
    # Average the summed loss over the m samples.
    return np.sum(first - second) / (len(X))

def regular_cost(theta, X, y, lambd):
    """Cross-entropy cost plus an L2 penalty on all non-bias weights."""
    base = cost(theta, X, y)
    # Recover theta; the first column of each matrix (j = 0, the bias
    # weights) is excluded from the penalty.
    theta1, theta2 = deserialize(theta)
    penalty = np.sum(np.power(theta1[:, 1:], 2)) + np.sum(np.power(theta2[:, 1:], 2))
    return base + lambd / (2 * len(X)) * penalty

## 2.反向传播算法

def sigmoid_gra(z):
    """Derivative of the sigmoid: g(z) * (1 - g(z))."""
    g = sigmoid(z)
    return g * (1 - g)

### 2.2 BP算法

1.数据集：

2.设置

3.for-loop:

#### 代码表示：

def backpropa(theta, X, y):
    """
    Backpropagation: gradient of the unregularized cost w.r.t. all weights.

    Accumulates the per-sample outer products and returns the two averaged
    gradient matrices serialized into one 1-D vector.
    """
    m = X.shape[0]
    theta1, theta2 = deserialize(theta)
    # Gradient accumulators, same shapes as the weight matrices
    # (set Delta = 0 for all i, j, l).
    grad1 = np.zeros(theta1.shape)  # (25, 401)
    grad2 = np.zeros(theta2.shape)  # (10, 26)
    # Quantities produced by the forward pass.
    a1, z2, a2, z3, h = feed_forward(theta, X, y)
    for i in range(m):
        # Per-sample activations.
        a1_i = a1[i, :]  # (401,)
        a2_i = a2[i, :]  # (26,)
        h_i = h[i, :]    # (10,)
        y_i = y[i, :]    # (10,)

        # Output-layer error; there is no delta for the input layer.
        err3 = h_i - y_i  # (10,)
        # Prepend the bias entry to z2 so shapes line up with theta2.
        z2_i = np.insert(z2[i, :], 0, np.ones(1))  # (26,)
        # Hidden-layer error (bias entry included here, dropped below).
        err2 = (theta2.T @ err3) * sigmoid_gra(z2_i)  # (26,)

        # Accumulate the outer products into Delta.
        grad2 += np.outer(err3, a2_i)       # (10, 1) @ (1, 26)
        # Drop the bias entry of the hidden-layer error.
        grad1 += np.outer(err2[1:], a1_i)   # (25, 1) @ (1, 401)

    # Average over the m samples to obtain D_ij, then unroll.
    return serialize(grad1 / m, grad2 / m)

### 2.3 正则化BP算法

#### 代码实现：

def regularized_bp(theta, X, y, lambd=1):
    """
    Regularized gradient: backprop gradient plus (lambd/m) * theta for every
    non-bias weight. The bias column (j = 0) is not regularized.

    BUG FIX: deserialize() returns reshaped *views* into `theta`, so the
    original `theta1[:, 0] = 0` silently zeroed entries of the caller's
    parameter vector (the optimizer's current iterate). Copy before zeroing.
    """
    m = X.shape[0]
    # Unregularized gradient (unrolled).
    delta = backpropa(theta, X, y)
    # Recover theta — copies, so the caller's vector is left untouched.
    theta1, theta2 = deserialize(theta)
    theta1 = theta1.copy()
    theta2 = theta2.copy()
    # Recover D_ij.
    delta1, delta2 = deserialize(delta)
    # Zero the j = 0 column so the bias weights receive no penalty term.
    theta1[:, 0] = 0
    delta1 = delta1 + (lambd / m) * theta1
    theta2[:, 0] = 0
    delta2 = delta2 + (lambd / m) * theta2
    # Unroll the combined gradient back into one vector.
    return serialize(delta1, delta2)

## 3.模型训练

### 3.1 随机初始化变量

# Randomly initialize the parameters to break the symmetry between units.
def random_init(size):
    """Draw `size` samples uniformly from [-0.12, 0.12]."""
    eps = 0.12
    return np.random.uniform(-eps, eps, size)

### 3.2 模型训练

# Search for the parameters with scipy.optimize.minimize.
import scipy.optimize as opt


def nn_training(X, y):
    """Train the network: random init, then TNC on the regularized cost
    with its analytic gradient (lambda = 1, at most 1000 iterations)."""
    # 25*401 + 10*26 = 10285 unrolled parameters in total.
    init_theta = random_init(10285)
    result = opt.minimize(fun=regular_cost,
                          x0=init_theta,
                          args=(X, y, 1),
                          method='TNC',
                          jac=regularized_bp,
                          options={'maxiter': 1000})
    return result

### 3.3 计算准确率

def show_acc(theta, X, y):
    """Print the network's classification accuracy on X."""
    # Forward pass: h holds one row of class probabilities per sample.
    _, _, _, _, h = feed_forward(theta, X, y)
    # Predicted class = index of the largest probability; labels are
    # 1-based, hence the +1.
    predicted = np.argmax(h, axis=1) + 1
    y_predict = np.mat(predicted)
    # NOTE(review): the truth is read from the module-level `data`, not from
    # the `y` argument — confirm that is intentional.
    y_true = np.mat(data['y']).ravel()
    # Matrix == matrix compares element-wise, so the mean of the boolean
    # result is the fraction of correct predictions.
    accuracy = np.mean(y_predict == y_true)
    print('accuracy = {}%'.format(accuracy * 100))

### 3.4 隐层可视化

def plot_hidden(final_theta):
    """Visualize the 25 hidden units as 20x20 images of their input weights."""
    # Only theta1 is needed.
    theta1, _ = deserialize(final_theta)  # (25, 401)
    # Drop the bias column; each remaining row is a 400-pixel image.
    hidden = theta1[:, 1:]
    # One shared-axis 5x5 grid of subplots.
    fig, axes = plt.subplots(nrows=5, ncols=5, sharex=True, sharey=True, figsize=(8, 8))
    # axes.flat iterates row-major, matching unit index 5*r + c.
    for idx, ax in enumerate(axes.flat):
        ax.matshow(hidden[idx, :].reshape((20, 20)).T,
                   cmap=matplotlib.cm.binary)
    # Remove the tick marks.
    plt.xticks([])
    plt.yticks([])
    plt.show()