01 为什么要学习知识蒸馏

1.1 一切源于业务的需要

1.2 主流的模型加速方法

02 详解知识蒸馏

2.1 知识蒸馏的作用和原理

2.2 知识蒸馏为啥有用

2.3 知识蒸馏的流程

L_hard其实和常规模型是一样的，就是根据训练集的label来学习。上面公式中c就是正确答案label，也就是计算学生模型的输出结果q和标签c的交叉熵。

L_soft和L_hard分别对应的是样本soft target和hard target。下面通过手写数字集样本1来对比 soft target和hard target的区别：

2.4 为什么用“蒸馏”一词

2.5 对比softmax-T函数和直接优化logits差异

2.6 知识蒸馏模型效果

03 实战知识蒸馏BERT到TextCNN

```class TextCNN(object):
"""
利用bert作为teacher，指导textcnn学习logits，损失函数为KL散度
"""
def __init__(
self, sequence_length, vocab_size,
embedding_size, filter_sizes, num_filters,dropout_keep_prob=0.2):
self.dropout_keep_prob = dropout_keep_prob
# Placeholders for input, output
self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
self.labels = tf.placeholder(tf.int32, shape=None, name="labels")
self.teacher_logits = tf.placeholder(tf.float32, shape=None, name="teacher_logits")
# Embedding layer
# with tf.device('/cpu:0'), tf.name_scope("embedding"):
with tf.name_scope("embedding"):
self.W = tf.Variable(
tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
name="W")
self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
# Create a convolution + maxpool layer for each filter size
# textcnn模型结构
pooled_outputs = []
for i, filter_size in enumerate(filter_sizes):
with tf.name_scope("conv-maxpool-%s" % filter_size):
# Convolution Layer
filter_shape = [filter_size, embedding_size, 1, num_filters]
W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
conv = tf.nn.conv2d(
self.embedded_chars_expanded,
W,
strides=[1, 1, 1, 1],
name="conv")
# Maxpooling over the outputs
pooled = tf.nn.max_pool(
h,
ksize=[1, sequence_length - filter_size + 1, 1, 1],
strides=[1, 1, 1, 1],
name="pool")
pooled_outputs.append(pooled)

# Combine all the pooled features
num_filters_total = num_filters * len(filter_sizes)
self.h_pool = tf.concat(pooled_outputs, 3)
self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

with tf.name_scope("dropout"):
self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

l2_loss = tf.constant(0.0)
num_classes = 2
# Final (unnormalized) scores and predictions
with tf.name_scope("output"):
W = tf.get_variable(
"W",
shape=[num_filters_total, num_classes],
initializer=tf.contrib.layers.xavier_initializer())
b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
l2_loss += tf.nn.l2_loss(W)
l2_loss += tf.nn.l2_loss(b)
self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")[:,1]
self.logits = tf.nn.softmax(self.scores)
with tf.name_scope("loss"):
loss = 0.1*tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.labels)
loss = tf.reduce_sum(loss)
self.loss = loss + 0.9*tf.keras.losses.KLDivergence()(tf.nn.log_softmax(self.scores), self.teacher_logits)```

```with tf.Graph().as_default():
session_conf = tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=False)
sess = tf.Session(config=session_conf)
with sess.as_default():
cnn = TextCNN(
sequence_length=max_len-2,
vocab_size=21128,  #将bert词典的大小硬编码
embedding_size=128,
filter_sizes=[3,4,5,6], #卷积核大小列表
num_filters=32)
global_step = tf.Variable(0, name="global_step", trainable=False)
#冻结bert参数

# Keep track of gradient values and sparsity (optional)
if g is not None:

# Output directory for models and summaries
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))
loss_summary = tf.summary.scalar("loss", cnn.loss)
# Train Summaries
train_summary_dir = os.path.join(out_dir, "summaries", "train")
train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
# Initialize all variables
sess.run(tf.global_variables_initializer())

def train_step(x_batch, label_batch, teacher_logits):
"""
A single training step
"""
feed_dict = {
cnn.input_x: x_batch,
cnn.labels: label_batch,
cnn.teacher_logits: teacher_logits
}
_, step, summaries, loss = sess.run(
[train_op, global_step, train_summary_op, cnn.loss],
feed_dict)
time_str = datetime.datetime.now().isoformat()
print("{}: step {}, loss {:g}".format(time_str, step, loss))

saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
for batch in batchs:
indices, labels, teacher_logits = [],[],[]

# 数据的输入格式是text,label,logits
for text,label,logits in batch:
idx, _ = tokenizer_.encode(first=text, max_len=max_len)
indices.append(idx)
labels.append(label)
teacher_logits.append(logits)
indices_cnn= [list(filter(lambda x: x!=101 and x!=102, i)) for i in indices] #textcnn不需要CLS SEP
train_step(indices_cnn, labels,teacher_logits)
current_step = tf.train.global_step(sess, global_step)
if current_step%10==0:
path = saver.save(sess, './distil_model', global_step=current_step)
print("Saved model checkpoint to {}\n".format(path))```

[1] Distilling the Knowledge in a Neural Network