About the author: an undergraduate student, Huawei Cloud Expert, Alibaba Cloud Star Blogger, Tencent Cloud Pioneer (TDP) member, lead of the Yunxi Zhihua project, volunteer of the National Committee of Experts on Computer Teaching and Industry Practice Resource Construction in Higher Education (TIPCC), and programming enthusiast. I look forward to learning and improving together with everyone.

Blog homepage: ぃ灵彧が的学习日志
Column: Artificial Intelligence
Column motto: If you decide to shine, no mountain can block you and no sea can stop you.
[Cutting-Edge Deep Learning Applications] Text Classification Fine-Tuning
Applying BERT to Short-Text Sentiment Classification

Table of Contents
I. Data Loading and Preprocessing
II. Loading the Pretrained BERT Model
III. Model Training
IV. Model Prediction
# Import the required modules
import paddle
import paddlenlp as ppnlp
from paddlenlp.data import Stack, Pad, Tuple
import paddle.nn.functional as F
import numpy as np
# partial() fixes some argument values of a function and returns a new callable
from functools import partial

ppnlp.__version__
I. Data Loading and Preprocessing
(1) Data Import
The dataset is ChnSentiCorp, a public Chinese sentiment analysis corpus. It can be loaded directly with PaddleNLP's ppnlp.datasets.ChnSentiCorp.get_datasets method.
# Use the ChnSentiCorp corpus built into paddlenlp, which is mainly intended for sentiment classification.
# The training set is used to train the model, the dev set for model selection,
# and the test set to evaluate generalization performance.
train_ds, dev_ds, test_ds = ppnlp.datasets.ChnSentiCorp.get_datasets(['train', 'dev', 'test'])

# Get the list of labels
label_list = train_ds.get_labels()

# Take a look at the data: print the first example of the train, dev and test sets.
print("Train example: {}".format(train_ds[0:1]))
print("Dev example: {}".format(dev_ds[0:1]))
print("Test example: {}".format(test_ds[0:1]))

print("Number of training samples: {}".format(len(train_ds)))
print("Number of dev samples: {}".format(len(dev_ds)))
print("Number of test samples: {}".format(len(test_ds)))
The output is shown in Figure 1 below:
(2) Data Preprocessing
# Use ppnlp.transformers.BertTokenizer for data processing: the tokenizer converts
# raw input text into the input format the model can accept.
tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained("bert-base-chinese")

# Data preprocessing
def convert_example(example, tokenizer, label_list, max_seq_length=256, is_test=False):
    if is_test:
        text = example
    else:
        text, label = example
    # tokenizer.encode tokenizes the text, maps tokens to token IDs and adds the special tokens
    encoded_inputs = tokenizer.encode(text=text, max_seq_len=max_seq_length)
    input_ids = encoded_inputs["input_ids"]
    segment_ids = encoded_inputs["token_type_ids"]

    if not is_test:
        # Map label strings to integer IDs
        label_map = {}
        for (i, l) in enumerate(label_list):
            label_map[l] = i
        label = label_map[label]
        label = np.array([label], dtype="int64")
        return input_ids, segment_ids, label
    else:
        return input_ids, segment_ids

# Build a data loader
def create_dataloader(dataset, trans_fn=None, mode='train', batch_size=1,
                      use_gpu=False, pad_token_id=0, batchify_fn=None):
    if trans_fn:
        dataset = dataset.apply(trans_fn, lazy=True)

    if mode == 'train' and use_gpu:
        sampler = paddle.io.DistributedBatchSampler(dataset=dataset, batch_size=batch_size, shuffle=True)
    else:
        shuffle = True if mode == 'train' else False  # only shuffle the training set
        sampler = paddle.io.BatchSampler(dataset=dataset, batch_size=batch_size, shuffle=shuffle)  # build a sampler
    dataloader = paddle.io.DataLoader(dataset, batch_sampler=sampler, return_list=True, collate_fn=batchify_fn)
    return dataloader

# Use partial() to fix the tokenizer, label_list, max_seq_length and is_test arguments of convert_example
trans_fn = partial(convert_example, tokenizer=tokenizer, label_list=label_list,
                   max_seq_length=128, is_test=False)
batchify_fn = lambda samples, fn=Tuple(Pad(axis=0, pad_val=tokenizer.pad_token_id),
                                       Pad(axis=0, pad_val=tokenizer.pad_token_id),
                                       Stack(dtype="int64")): [data for data in fn(samples)]

# Data loaders for the training, dev and test sets
train_loader = create_dataloader(train_ds, mode='train', batch_size=64, batchify_fn=batchify_fn, trans_fn=trans_fn)
dev_loader = create_dataloader(dev_ds, mode='dev', batch_size=64, batchify_fn=batchify_fn, trans_fn=trans_fn)
test_loader = create_dataloader(test_ds, mode='test', batch_size=64, batchify_fn=batchify_fn, trans_fn=trans_fn)
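To make the preprocessing step more concrete, here is a minimal sketch of my own (not part of the original post): it runs convert_example on a single made-up example, assuming, as the original convert_example implies, that the labels in label_list are the strings '0' and '1'.

# Hypothetical (text, label) example, for illustration only
sample = ('这家酒店的房间很干净,服务也不错。', '1')
input_ids, segment_ids, label = convert_example(
    sample, tokenizer, label_list=label_list, max_seq_length=128, is_test=False)

print(len(input_ids))    # number of token IDs, including [CLS] and [SEP]
print(segment_ids[:5])   # token_type_ids are all 0 for a single-sentence input
print(label)             # label mapped to an int64 numpy array, e.g. array([1])

The batchify_fn defined above then pads input_ids and segment_ids to the longest sequence in each mini-batch (Pad) and stacks the labels (Stack).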
II. Loading the Pretrained BERT Model
# Load BertForSequenceClassification, the fine-tuning network for text classification:
# it attaches a fully connected classification layer on top of the pretrained BERT model.
# Sentiment classification here is a binary task, so num_classes is set to 2.
model = ppnlp.transformers.BertForSequenceClassification.from_pretrained("bert-base-chinese", num_classes=2)
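As a quick sanity check (my addition, not from the original tutorial), you can push one batch from train_loader through the freshly loaded model and confirm that the classification head produces one logit per class:

# Take a single batch and inspect the output shape of the classification head
for input_ids, segment_ids, labels in train_loader:
    logits = model(input_ids, segment_ids)
    print(logits.shape)  # expected: [batch_size, 2], one logit per sentiment class
    break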
III. Model Training
(1) Setting Training Hyperparameters
# Training hyperparameters
learning_rate = 1e-5       # learning rate
epochs = 8                 # number of training epochs
warmup_proportion = 0.1    # proportion of steps used for learning-rate warmup
weight_decay = 0.01        # weight decay coefficient

num_training_steps = len(train_loader) * epochs
num_warmup_steps = int(warmup_proportion * num_training_steps)

def get_lr_factor(current_step):
    # Linear warmup followed by linear decay
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    else:
        return max(0.0, float(num_training_steps - current_step) /
                   float(max(1, num_training_steps - num_warmup_steps)))

# Learning-rate scheduler
lr_scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate,
                                               lr_lambda=lambda current_step: get_lr_factor(current_step))

# Optimizer: apply weight decay to all parameters except biases and norm-layer weights
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=weight_decay,
    apply_decay_param_fun=lambda x: x in [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ])

# Loss function
criterion = paddle.nn.loss.CrossEntropyLoss()
# Evaluation metric
metric = paddle.metric.Accuracy()
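The scheduler above implements linear warmup followed by linear decay. A small worked example of my own (with made-up step counts, independent of the real num_training_steps) shows the shape of the learning-rate factor:

# Standalone illustration of the warmup/decay factor with hypothetical numbers:
# 1000 total steps, 100 of which are warmup.
def example_factor(step, warmup=100, total=1000):
    if step < warmup:
        return step / max(1, warmup)                           # linear warmup: 0 -> 1
    return max(0.0, (total - step) / max(1, total - warmup))   # linear decay: 1 -> 0

print([round(example_factor(s), 2) for s in (0, 50, 100, 550, 1000)])
# [0.0, 0.5, 1.0, 0.5, 0.0]

The learning rate actually used by AdamW is learning_rate multiplied by this factor at each step, so it peaks at 1e-5 right after warmup and falls to 0 by the final step.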
(2) Evaluation Function
# Evaluation function
def evaluate(model, criterion, metric, data_loader):
    model.eval()
    metric.reset()
    losses = []
    for batch in data_loader:
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        loss = criterion(logits, labels)
        losses.append(loss.numpy())
        correct = metric.compute(logits, labels)
        metric.update(correct)
        accu = metric.accumulate()
    print("eval loss: %.5f, accu: %.5f" % (np.mean(losses), accu))
    model.train()
    metric.reset()
(3) Training Loop
# Start training
global_step = 0
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_loader):  # fetch a batch from the training data loader
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        loss = criterion(logits, labels)  # compute the loss
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()

        global_step += 1
        if global_step % 50 == 0:
            print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f"
                  % (global_step, epoch, step, loss, acc))

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_gradients()
    # Evaluate on the dev set at the end of each epoch
    evaluate(model, criterion, metric, dev_loader)
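The loop above only evaluates on the validation set after each epoch. If you also want the test-set generalization estimate mentioned when the data was loaded, one option (my addition, not in the original post) is to reuse the same evaluate helper once training has finished:

# Final evaluation on the held-out test set, using the test_loader built earlier
evaluate(model, criterion, metric, test_loader)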
IV. Model Prediction
def predict(model, data, tokenizer, label_map, batch_size=1):
    # Convert each raw text into model inputs
    examples = []
    for text in data:
        input_ids, segment_ids = convert_example(text, tokenizer,
                                                 label_list=label_map.values(),
                                                 max_seq_length=128, is_test=True)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(Pad(axis=0, pad_val=tokenizer.pad_token_id),
                                           Pad(axis=0, pad_val=tokenizer.pad_token_id)): fn(samples)

    # Group the examples into batches
    batches = []
    one_batch = []
    for example in examples:
        one_batch.append(example)
        if len(one_batch) == batch_size:
            batches.append(one_batch)
            one_batch = []
    if one_batch:
        # The last, possibly incomplete batch
        batches.append(one_batch)

    results = []
    model.eval()
    for batch in batches:
        input_ids, segment_ids = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        segment_ids = paddle.to_tensor(segment_ids)
        logits = model(input_ids, segment_ids)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results
data = ['这个商品虽然看着样式挺好看的,但是不耐用。', '这个老师讲课水平挺高的。']
label_map = {0: 'negative', 1: 'positive'}

predictions = predict(model, data, tokenizer, label_map, batch_size=32)
for idx, text in enumerate(data):
    print('Text: {}    Sentiment: {}'.format(text, predictions[idx]))
The output is shown in Figure 2 below:
This series of articles consists of my notes and reflections on 《机器学习实践》 (Machine Learning in Practice), published by Tsinghua University Press. All code is developed with Baidu PaddlePaddle. If anything here infringes on your rights or is otherwise inappropriate, please message me privately and I will handle it promptly; I reply to every message.