### 文章目录

: https://github.com/iioSnail/chaotic-transformer-tutorials

## 本文内容

(原论文)BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
: https://arxiv.org/abs/1810.04805

Pytorch中 nn.Transformer的使用详解与Transformer的黑盒讲解
: https://blog.csdn.net/zhaohongfei_358/article/details/126019181

: https://blog.csdn.net/zhaohongfei_358/article/details/126085246

## 环境准备

import math
import copy
import torch
import torchtext
from torch import nn
import torch.nn.functional as F
from torchtext.vocab import build_vocab_from_iterator

torch.__version__

'1.12.1+cpu'

torchtext.__version__

'0.13.1'

## BERT Embedding

class TokenEmbedding(nn.Embedding):
    """Token lookup table used by the BERT embedding layer.

    This is a plain ``nn.Embedding`` whose ``padding_idx`` is fixed to 0.
    """

    def __init__(self, vocab_size, embed_size):
        """
        :param vocab_size: number of entries in the token vocabulary
        :param embed_size: size of each embedding vector
        """
        super().__init__(vocab_size, embed_size, padding_idx=0)

Token Embedding就是一个nn.Embedding，和Transformer一致。

class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional encoding table.

    The table is computed once in ``__init__`` and stored as the
    non-trainable buffer ``pe`` with shape ``(1, max_len, d_model)``.
    """

    def __init__(self, d_model, max_len=512):
        """
        :param d_model: size of each positional vector (odd values are supported)
        :param max_len: number of positions stored in the table
        """
        super().__init__()
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        # Even columns take the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(position * div_term)
        # When d_model is odd there is one fewer odd column than even column,
        # so trim div_term; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        :param x: tensor whose dimension 1 gives the sequence length; its values are not read
        :return: the first ``x.size(1)`` rows of the table, shape ``(1, x.size(1), d_model)``
        """
        return self.pe[:, :x.size(1)]

Position Embedding和Transformer也一致。

class SegmentEmbedding(nn.Embedding):
    """Segment (sentence A / sentence B) lookup table.

    The table always has 3 rows and ``padding_idx`` 0: index 0 is padding,
    1 marks the first sentence and 2 marks the second sentence.
    """

    def __init__(self, embed_size=512):
        """
        :param embed_size: size of each embedding vector
        """
        super().__init__(3, embed_size, padding_idx=0)

Segment Embedding也是一个nn.Embedding，但需要注意的是其词典大小只有3，其中0是填充，1代表第一句话，2代表第二句话。

class BERTEmbedding(nn.Module):
    """Sum of token, positional and segment embeddings, followed by dropout."""

    def __init__(self, vocab_size, embed_size, dropout=0.1):
        """
        :param vocab_size: size of the token vocabulary
        :param embed_size: size of each embedding vector
        :param dropout: dropout probability applied to the summed embedding
        """
        super().__init__()
        self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
        # The positional and segment tables reuse the token embedding width
        # so that the three terms can be added element-wise.
        self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
        self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, sequence, segment_label):
        """
        :param sequence: token indices
        :param segment_label: segment indices, same shape as ``sequence``
        :return: the summed embeddings after dropout
        """
        x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
        return self.dropout(x)

class BERT(nn.Module):
    """BERT encoder: a BERT embedding followed by a stack of Transformer encoder layers."""

    def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
        """
        :param vocab_size: size of the token vocabulary
        :param hidden: hidden state size, i.e. the embedding size
        :param n_layers: number of TransformerEncoderLayer blocks
        :param attn_heads: number of attention heads in each block
        :param dropout: dropout rate
        """
        super().__init__()
        self.hidden = hidden
        self.n_layers = n_layers

        # The paper states that the feed-forward hidden size is hidden_size * 4.
        feed_forward_hidden = hidden * 4

        # BERT embedding (token + position + segment).
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden, dropout=dropout)

        # The paper states that the Transformer inside BERT uses the GELU activation.
        # batch_first=True because inputs are laid out as (batch, seq_len, hidden).
        transformer_encoder = nn.TransformerEncoderLayer(
            d_model=hidden,
            nhead=attn_heads,
            dim_feedforward=feed_forward_hidden,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
        )

        # Stack n_layers independent copies of the encoder layer.
        self.transformer_blocks = nn.ModuleList([copy.deepcopy(transformer_encoder) for _ in range(n_layers)])

    def forward(self, x, segment_info):
        """
        Run the BERT forward pass.

        :param x: token indices to encode, e.g. [[1,2,3,4,5,5,4,3,2,1,0,0]],
                  i.e. a sentence pair (1,2,3,4,5) and (5,4,3,2,1) where 0 is padding
        :param segment_info: segment indices, e.g. [1,1,1,1,1,2,2,2,2,2,0,0],
                  i.e. the first 5 tokens belong to the first sentence, the next 5
                  to the second sentence, and 0 marks padding
        :return: contextual hidden state of every token, e.g. shape (1, 12, 768):
                 1 sequence, 12 tokens, each encoded as a 768-dimensional vector
        """
        # Turn indices into vectors.
        x = self.embedding(x, segment_info)

        # Pass the vectors through the encoder layers one after another.
        for transformer in self.transformer_blocks:
            x = transformer(x)

        return x

$\bf{BERT_{BASE}}$

$\bf{BERT_{LARGE}}$

# Instantiate the two model configurations and report how many parameters each has.
bert_base = BERT(vocab_size=30522, hidden=768, n_layers=12, attn_heads=12)
bert_large = BERT(vocab_size=30522, hidden=1024, n_layers=24, attn_heads=16)
print("bert_base参数量: ", sum(p.numel() for p in bert_base.parameters()))
print("bert_large参数量: ", sum(p.numel() for p in bert_large.parameters()))

bert_base参数量:  108497664
bert_large参数量:  333566976

# One padded sentence pair (0 is padding) and its segment ids, pushed through both models.
x = torch.tensor([[1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0, 0]], dtype=torch.long)
segment_info = torch.tensor([[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0, 0]], dtype=torch.long)
print("bert_base outputs size:", bert_base(x, segment_info).shape)
print("bert_large outputs size:", bert_large(x, segment_info).shape)

bert_base outputs size: torch.Size([1, 12, 768])
bert_large outputs size: torch.Size([1, 12, 1024])

## 预训练BERT

MLM任务简介：MLM任务就是把一个句子中的部分token给替换掉，然后让bert去结合上下文来预测被替换掉的词是什么。

#### Next Sentence Prediction(NSP)任务：

NSP任务简介：NSP任务就是预测传给BERT的两句话是不是一对儿，是一个二分类任务。预测方式就是使用输入的第一个token（[CLS]）位置的输出进行分类。

# Toy corpus used to build the vocabulary.
sentence = "大家好，我是练习时长两年半的个人练习生蔡徐坤，喜欢唱跳RAP篮球，接下来我会为大家带来一首鸡你太美。"

# Iterating the string yields single characters, so every character becomes a token.
# The special tokens are listed with [PAD] first; the rest of the code treats
# index 0 as padding, so [PAD] is expected to map to 0 (verify against torchtext).
vocab = build_vocab_from_iterator(
    sentence,
    specials=['[PAD]', '[CLS]', '[SEP]', '[MASK]'],
)

# BERT inputs: each starts with [CLS], the two sentences are separated by [SEP],
# and every sequence is padded to 24 tokens.
# NOTE(review): the original string literals were missing from this listing (both
# `inputs` and `mlm_targets` were empty lists). The sequences below are reconstructed
# so that they line up exactly with `segment_label` and only use characters from the
# corpus sentence; the exact [MASK] positions are an assumption and should be
# confirmed against the original article.
inputs = [
    '[CLS] 我 是 练 习 时 长 两 年 半 [SEP] 的 个 人 练 习 生 [MASK] 徐 坤 [SEP] [PAD] [PAD] [PAD]',
    '[CLS] 喜 欢 唱 跳 R A P 篮 球 [SEP] 鸡 你 [MASK] 美 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
]
# Segment information: which sentence each token belongs to (0 = padding).
segment_label = [
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0],
]
# MLM targets: the same sequences with every [MASK] replaced by the original token.
mlm_targets = [
    '[CLS] 我 是 练 习 时 长 两 年 半 [SEP] 的 个 人 练 习 生 蔡 徐 坤 [SEP] [PAD] [PAD] [PAD]',
    '[CLS] 喜 欢 唱 跳 R A P 篮 球 [SEP] 鸡 你 太 美 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
]
# NSP targets, one label per sentence pair.
nsp_targets = [1, 0]

# Map the space-separated token strings to vocabulary indices and wrap everything in int64 tensors.
inputs = torch.tensor([vocab(line.split(' ')) for line in inputs], dtype=torch.long)
segment_label = torch.tensor(segment_label, dtype=torch.long)
mlm_targets = torch.tensor([vocab(line.split(' ')) for line in mlm_targets], dtype=torch.long)
nsp_targets = torch.tensor(nsp_targets, dtype=torch.long)
print("inputs.shape:", inputs.shape)
print("targets.shape:", mlm_targets.shape)

inputs.shape: torch.Size([2, 24])
targets.shape: torch.Size([2, 24])

class MaskedLanguageModel(nn.Module):
    """
    Predict the original token at every position of a masked input sequence.

    An n-class classification head with n = vocab_size: a linear layer
    followed by a log-softmax over the last dimension.
    """

    def __init__(self, hidden, vocab_size):
        """
        :param hidden: output size of the BERT model
        :param vocab_size: total vocabulary size
        """
        super().__init__()
        self.linear = nn.Linear(hidden, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """
        :param x: hidden states whose last dimension is ``hidden``
        :return: log-probabilities over the vocabulary for every position
        """
        return self.softmax(self.linear(x))

class NextSentencePrediction(nn.Module):
    """
    2-class classification head: is_next, is_not_next.

    Only the hidden state at position 0 of each sequence is classified.
    """

    def __init__(self, hidden):
        """
        :param hidden: output size of the BERT model
        """
        super().__init__()
        self.linear = nn.Linear(hidden, 2)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """
        :param x: hidden states indexed as ``x[:, 0]``, i.e. dimension 1 is the token position
        :return: log-probabilities over the 2 classes, one row per sequence
        """
        return self.softmax(self.linear(x[:, 0]))

class BERTLM(nn.Module):
    """BERT encoder with the two pre-training heads (NSP and MLM) on top."""

    def __init__(self, vocab_size):
        """
        :param vocab_size: size of the token vocabulary
        """
        super(BERTLM, self).__init__()
        self.vocab_size = vocab_size
        # Use the bert_base configuration here.
        self.bert = BERT(vocab_size=vocab_size, hidden=768, n_layers=12, attn_heads=12)
        self.next_sentence = NextSentencePrediction(self.bert.hidden)
        # forward() calls self.mask_lm, so the MLM head must be created here.
        self.mask_lm = MaskedLanguageModel(self.bert.hidden, vocab_size)

    def forward(self, x, segment_label):
        """
        :param x: token indices
        :param segment_label: segment indices, same shape as ``x``
        :return: tuple ``(nsp_log_probs, mlm_log_probs)``
        """
        x = self.bert(x, segment_label)
        return self.next_sentence(x), self.mask_lm(x)

# Build the pre-training model for the toy vocabulary and check the shapes of both heads.
bert_mlm = BERTLM(vocab_size=len(vocab))
nsp_outputs, mlm_outputs = bert_mlm(inputs, segment_label)
print("nsp_outputs shape:", nsp_outputs.shape)
print("mlm_outputs shape:", mlm_outputs.shape)

nsp_outputs shape: torch.Size([2, 2])
mlm_outputs shape: torch.Size([2, 24, 46])

# MLM loss: ignore_index=0 skips target positions whose label is 0 (padding).
criterion = nn.NLLLoss(ignore_index=0)
# NSP loss: class 0 ("not next") is a real label here, so it must NOT be ignored.
nsp_criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(bert_mlm.parameters(), lr=3e-5)

bert_mlm.train()
for epoch in range(300):
    # Clear gradients from the previous step; otherwise they accumulate.
    optimizer.zero_grad()
    # NOTE(review): (inputs > 0) gives segment id 1 to every non-padding token, so the
    # prepared `segment_label` tensor is not used during training — confirm this is intended.
    nsp_outputs, mlm_outputs = bert_mlm(inputs, (inputs > 0).int())
    nsp_loss = nsp_criterion(nsp_outputs, nsp_targets)
    # Flatten to (batch * seq_len, vocab_size); take the class count from the tensor itself.
    mlm_loss = criterion(mlm_outputs.view(-1, mlm_outputs.size(-1)), mlm_targets.view(-1))
    loss = nsp_loss + mlm_loss
    loss.backward()
    optimizer.step()
    print("loss {:.4}, nsp loss: {:.4}, mlm_loss {:.4}".format(loss.item(), nsp_loss.item(), mlm_loss.item()))

loss 4.49, nsp loss: 0.4757, mlm_loss 4.014
loss 3.851, nsp loss: 0.007476, mlm_loss 3.843
loss 3.67, nsp loss: 0.002525, mlm_loss 3.667
...
loss 0.004217, nsp loss: 4.911e-05, mlm_loss 0.004168
loss 0.004457, nsp loss: 8.523e-05, mlm_loss 0.004372

# Single test sentence with one masked character.
inputs = '我 是 练 习 时 长 两 年 半 的 个 人 练 习 生 [MASK] 徐 坤'
inputs = torch.LongTensor([vocab(inputs.split(' '))])
# Every token is assigned segment id 1.
segment_label = torch.ones(inputs.size()).long()

# Switch to eval mode so dropout is disabled and predictions are deterministic;
# no gradients are needed for inference.
bert_mlm.eval()
with torch.no_grad():
    nsp_outputs, mlm_outputs = bert_mlm(inputs, segment_label)

# Most likely token at every position, mapped back to characters.
print(vocab.lookup_tokens(mlm_outputs.argmax(-1)[0].tolist()))

['我', '是', '练', '习', '时', '长', '两', '年', '半', '的', '个', '人', '练', '习', '生', '蔡', '徐', '坤']

## 参考资料

(原论文)BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
: https://arxiv.org/abs/1810.04805

(论文阅读)BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
: https://blog.csdn.net/zhaohongfei_358/article/details/126838417

Pytorch中 nn.Transformer的使用详解与Transformer的黑盒讲解
: https://blog.csdn.net/zhaohongfei_358/article/details/126019181

: https://blog.csdn.net/zhaohongfei_358/article/details/126085246

BERT-Pytorch实现
: https://github.com/codertimo/BERT-pytorch