【NLP】 Hit Me If You Can't Learn It! Learn the Basics in Half an Hour, Part 15: BERT Binary Classification in 100 Lines
Starting today, we begin a journey into Natural Language Processing (NLP). NLP enables machines to process, understand, and use human language, bridging the gap between machine language and human language.
BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained language representation model. BERT is built mainly on the Transformer's encoder, which we won't elaborate on here.
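To make that concrete, here is a minimal sketch (the example sentence is made up for illustration) of what the tokenizer and encoder used later in this post return. Note the second output, the pooled `[CLS]` vector, is what our classifier will sit on:

```python
from transformers import BertTokenizer, TFBertModel

# Load the same Mengzi checkpoint used later in this post
tokenizer = BertTokenizer.from_pretrained("Langboat/mengzi-bert-base")
encoder = TFBertModel.from_pretrained("Langboat/mengzi-bert-base", from_pt=True)

# An arbitrary example sentence, chosen only for illustration
inputs = tokenizer("今天天气真好", return_tensors="tf")
outputs = encoder(inputs)

print(outputs[0].shape)  # per-token hidden states: (1, seq_len, 768)
print(outputs[1].shape)  # pooled [CLS] representation: (1, 768)
```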
A Quick Word
With everyone's encouragement, we took 4 first-place and 1 second-place finishes in this season's CCF competitions, and placed 4th on Tianchi. (Figure: competition results)
BERT in 100 Lines
The code below is about as simple as a BERT implementation gets; parts of it come from our competition source code.
Network Architecture
Model: "model" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input_1 (InputLayer) [(None, 780)] 0 __________________________________________________________________________________________________ input_2 (InputLayer) [(None, 780)] 0 __________________________________________________________________________________________________ tf_bert_model (TFBertModel) TFBaseModelOutputWit 102267648 input_1[0][0] input_2[0][0] __________________________________________________________________________________________________ dense (Dense) (None, 2) 7690 tf_bert_model[0][1] ================================================================================================== Total params: 102,275,338 Trainable params: 102,275,338 Non-trainable params: 0 __________________________________________________________________________________________________
```python
# Hyperparameters
EPOCHS = 50                 # number of training epochs
BATCH_SIZE = 8              # samples per batch
learning_rate = 0.00003     # learning rate
INPUT_DIM = 36782 + 1
MAX_LENGTH = 780
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)  # optimizer
loss = tf.keras.losses.CategoricalCrossentropy()                   # loss (expects one-hot labels)
bert_tokenizer = BertTokenizer.from_pretrained('Langboat/mengzi-bert-base')  # BERT tokenizer
```
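One thing worth spelling out: `CategoricalCrossentropy` expects one-hot labels, which is why the CSV stores each label as a comma-separated one-hot string that `get_data` below splits and casts to float. The data files are not included with this post, but a format consistent with the code (the rows here are hypothetical) would look like:

```csv
text,label
"这部电影真好看","1,0"
"剧情拖沓, 不推荐","0,1"
```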
get_data
```python
def get_data():
    """
    Read the data.
    :return: tokenized train/validation inputs, attention masks, and labels
    """
    # Read the data
    data_train = pd.read_csv("../data/train.csv")
    print(data_train.head(), data_train.shape)
    data_val = pd.read_csv("../data/val.csv")
    print(data_val.head(), data_val.shape)

    # Preprocess: split the one-hot label string, e.g. "1,0" -> ["1", "0"]
    data_train["label"] = data_train["label"].apply(lambda x: x.split(","))
    print(data_train.head())
    data_val["label"] = data_val["label"].apply(lambda x: x.split(","))
    print(data_val.head())

    # Get X, y
    X_train = data_train["text"].values.tolist()
    y_train = np.asarray(data_train["label"].values.tolist(), dtype=np.float32)
    X_val = data_val["text"].values.tolist()
    y_val = np.asarray(data_val["label"].values.tolist(), dtype=np.float32)

    # Tokenize
    X_train = bert_tokenizer(X_train, padding=True, truncation=True, max_length=MAX_LENGTH)
    X_val = bert_tokenizer(X_val, padding=True, truncation=True, max_length=MAX_LENGTH)
    print("=" * 20, "vocab size:", bert_tokenizer.vocab_size, "=" * 20)

    # Extract input ids / attention masks
    train_input = np.asarray(X_train["input_ids"])
    train_mask = np.asarray(X_train["attention_mask"])
    val_input = np.asarray(X_val["input_ids"])
    val_mask = np.asarray(X_val["attention_mask"])

    return train_input, val_input, train_mask, val_mask, y_train, y_val
```
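One caveat (an observation about the tokenizer's behavior, not something stated in the original post): `padding=True` pads to the longest sequence in the batch, not to `max_length`. The `Input` layers in `main` are declared with shape `(MAX_LENGTH,)`, so this only lines up if the longest training text is truncated at exactly 780 tokens. If your data may be shorter, `padding="max_length"` guarantees the declared shape:

```python
# Pads every sequence to exactly MAX_LENGTH, matching Input(shape=(MAX_LENGTH,))
X_train = bert_tokenizer(X_train, padding="max_length", truncation=True, max_length=MAX_LENGTH)
X_val = bert_tokenizer(X_val, padding="max_length", truncation=True, max_length=MAX_LENGTH)
```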
```python
def main():
    # Get the data
    X_train_input, X_test_input, X_train_mask, X_test_mask, y_train, y_test = get_data()

    # Debug output
    print(X_train_input[:5], X_train_input.shape)
    print(X_test_input[:5], X_test_input.shape)
    print(X_train_mask[:5], X_train_mask.shape)
    print(X_test_mask[:5], X_test_mask.shape)
    print(y_train[:5], y_train.shape)
    print(y_test[:5], y_test.shape)

    # BERT model
    bert = TFBertModel.from_pretrained("Langboat/mengzi-bert-base", from_pt=True)
    input_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    masks = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    bert = bert([input_ids, masks])
    bert = bert[1]  # pooler output: the transformed [CLS] vector, shape (None, 768)
    classifier = Dense(2, activation="softmax")(bert)

    # Model
    model = Model(inputs=[input_ids, masks], outputs=classifier)
    model.summary()

    # Compile
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

    # Save the best weights (by validation loss)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "../model/bert_mengzi/bert_mengzi.ckpt", monitor='val_loss', verbose=1,
        save_best_only=True, mode='min', save_weights_only=True)

    # Train
    model.fit([X_train_input, X_train_mask], y_train,
              validation_data=([X_test_input, X_test_mask], y_test),
              epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[checkpoint])
```
Full Code
```python
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from transformers import BertTokenizer, TFBertModel

# Hyperparameters
EPOCHS = 50                 # number of training epochs
BATCH_SIZE = 8              # samples per batch
learning_rate = 0.00003     # learning rate
INPUT_DIM = 36782 + 1
MAX_LENGTH = 780
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)  # optimizer
loss = tf.keras.losses.CategoricalCrossentropy()                   # loss (expects one-hot labels)
bert_tokenizer = BertTokenizer.from_pretrained('Langboat/mengzi-bert-base')  # BERT tokenizer


def get_data():
    """
    Read the data.
    :return: tokenized train/validation inputs, attention masks, and labels
    """
    # Read the data
    data_train = pd.read_csv("../data/train.csv")
    print(data_train.head(), data_train.shape)
    data_val = pd.read_csv("../data/val.csv")
    print(data_val.head(), data_val.shape)

    # Preprocess: split the one-hot label string, e.g. "1,0" -> ["1", "0"]
    data_train["label"] = data_train["label"].apply(lambda x: x.split(","))
    print(data_train.head())
    data_val["label"] = data_val["label"].apply(lambda x: x.split(","))
    print(data_val.head())

    # Get X, y
    X_train = data_train["text"].values.tolist()
    y_train = np.asarray(data_train["label"].values.tolist(), dtype=np.float32)
    X_val = data_val["text"].values.tolist()
    y_val = np.asarray(data_val["label"].values.tolist(), dtype=np.float32)

    # Tokenize
    X_train = bert_tokenizer(X_train, padding=True, truncation=True, max_length=MAX_LENGTH)
    X_val = bert_tokenizer(X_val, padding=True, truncation=True, max_length=MAX_LENGTH)
    print("=" * 20, "vocab size:", bert_tokenizer.vocab_size, "=" * 20)

    # Extract input ids / attention masks
    train_input = np.asarray(X_train["input_ids"])
    train_mask = np.asarray(X_train["attention_mask"])
    val_input = np.asarray(X_val["input_ids"])
    val_mask = np.asarray(X_val["attention_mask"])

    return train_input, val_input, train_mask, val_mask, y_train, y_val


def main():
    # Get the data
    X_train_input, X_test_input, X_train_mask, X_test_mask, y_train, y_test = get_data()

    # Debug output
    print(X_train_input[:5], X_train_input.shape)
    print(X_test_input[:5], X_test_input.shape)
    print(X_train_mask[:5], X_train_mask.shape)
    print(X_test_mask[:5], X_test_mask.shape)
    print(y_train[:5], y_train.shape)
    print(y_test[:5], y_test.shape)

    # BERT model
    bert = TFBertModel.from_pretrained("Langboat/mengzi-bert-base", from_pt=True)
    input_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    masks = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    bert = bert([input_ids, masks])
    bert = bert[1]  # pooler output: the transformed [CLS] vector, shape (None, 768)
    classifier = Dense(2, activation="softmax")(bert)

    # Model
    model = Model(inputs=[input_ids, masks], outputs=classifier)
    model.summary()

    # Compile
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

    # Save the best weights (by validation loss)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "../model/bert_mengzi/bert_mengzi.ckpt", monitor='val_loss', verbose=1,
        save_best_only=True, mode='min', save_weights_only=True)

    # Train
    model.fit([X_train_input, X_train_mask], y_train,
              validation_data=([X_test_input, X_test_mask], y_test),
              epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[checkpoint])


if __name__ == '__main__':
    main()
```
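Once training finishes, the best weights live under ../model/bert_mengzi/. Here is a hedged inference sketch (the `predict` helper and the commented example sentence are my additions, not part of the original post): it rebuilds the same graph, loads the checkpoint, and returns class probabilities.

```python
def predict(texts):
    # Rebuild the same graph as in main()
    bert = TFBertModel.from_pretrained("Langboat/mengzi-bert-base", from_pt=True)
    input_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    masks = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
    output = Dense(2, activation="softmax")(bert([input_ids, masks])[1])
    model = Model(inputs=[input_ids, masks], outputs=output)

    # Load the best checkpoint saved during training
    model.load_weights("../model/bert_mengzi/bert_mengzi.ckpt")

    # Tokenize as in training, padding to the declared input length
    enc = bert_tokenizer(texts, padding="max_length", truncation=True, max_length=MAX_LENGTH)
    probs = model.predict([np.asarray(enc["input_ids"]), np.asarray(enc["attention_mask"])])
    return probs  # shape (len(texts), 2): softmax over the two classes


# Hypothetical usage:
# print(predict(["这部电影真好看"]))
```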