Press "Enter" to skip to content

Hugging Face Series | Sentence-Pair Classification


Task: fine-tune a BERT-family model for sentence-pair classification.

 

Dataset: MRPC from the GLUE benchmark.

 

!pip install datasets==1.0.1
!pip install transformers==3.1.0
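
Note: the code below targets transformers 3.x, where calling the model returns a plain tuple (last_hidden_state, pooler_output). On transformers 4.x the forward pass returns a ModelOutput object by default, so you would need to pass return_dict=False or index the outputs by name.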

 

import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric

 

Download the dataset

 

dataset = load_dataset('glue', 'mrpc')
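
As a quick sanity check, you can print the returned DatasetDict to see the splits and columns (a minimal sketch; MRPC comes with train/validation/test splits and the columns sentence1, sentence2, label and idx):

# Optional: inspect the splits and columns of the loaded DatasetDict
print(dataset)
print(dataset['train'][0])  # one raw example with sentence1, sentence2, label, idx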

 

Split the dataset

 

split = dataset['train'].train_test_split(test_size=0.1, seed=1)
train = split['train']  # 90 % of the original training data
val = split['test']   # 10 % of the original training data
test = dataset['validation']
# Transform data into pandas dataframes
df_train = pd.DataFrame(train)
df_val = pd.DataFrame(val)
df_test = pd.DataFrame(test)
df_train.head()

 

The preview shows that each row contains sentence1, sentence2 and a binary label (plus an idx column).

 

print(df_train.shape)  # (3301, 4)
print(df_val.shape)  # (367, 4)
print(df_test.shape)  # (408, 4)

 

Define the hyperparameters

 

bert_model = "albert-base-v2"  # 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', 'albert-xxlarge-v2', 'bert-base-uncased', ...
freeze_bert = False  # if True, freeze BERT's parameters and train only the classification head on top
maxlen = 128
bs = 16  # batch size
iters_to_accumulate = 2  # number of gradient-accumulation steps
lr = 2e-5
epochs = 4
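
With bs = 16 and iters_to_accumulate = 2, gradients are accumulated over two mini-batches before each optimizer step, so the effective batch size per update is 16 × 2 = 32.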

 

Define the dataset as a Dataset subclass

 

class CustomDataset(Dataset):
    def __init__(self, data, maxlen, bert_model='albert-base-v2', with_labels=True):
        self.data = data  # pandas dataframe
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)  # initialize the tokenizer
        self.maxlen = maxlen
        self.with_labels = with_labels
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        sent1 = str(self.data.loc[index, 'sentence1'])
        sent2 = str(self.data.loc[index, 'sentence2'])
        # Use the tokenizer to get the token ids, attention mask and token type ids
        encoded_pair = self.tokenizer(sent1, 
                                      sent2, 
                                      padding='max_length',
                                      truncation=True,
                                      max_length=self.maxlen,  
                                      return_tensors='pt')
        
        token_ids = encoded_pair['input_ids'].squeeze(0)
        attn_masks = encoded_pair['attention_mask'].squeeze(0)
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)
        if self.with_labels:
            label = self.data.loc[index, 'label']
            return token_ids, attn_masks, token_type_ids, label  
        else:
            return token_ids, attn_masks, token_type_ids
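
To see what __getitem__ produces, here is a minimal sketch that encodes one sentence pair directly with the tokenizer (the two sentences are purely illustrative):

# Sketch: what the tokenizer produces for a sentence pair (the example sentences are made up)
tok = AutoTokenizer.from_pretrained(bert_model)
enc = tok("The cat sat on the mat.",
          "A cat was sitting on the mat.",
          padding='max_length', truncation=True,
          max_length=maxlen, return_tensors='pt')
print(enc['input_ids'].shape)         # torch.Size([1, 128])
print(enc['token_type_ids'][0][:20])  # 0s for sentence1 tokens, 1s for sentence2 tokens (padding positions are 0)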

 

train_set = CustomDataset(df_train, maxlen, bert_model)
val_set = CustomDataset(df_val, maxlen, bert_model)

 

Define the model

 

class SentencePairClassifier(nn.Module):
    def __init__(self, bert_model="albert-base-v2", freeze_bert=False):
        super(SentencePairClassifier, self).__init__()
        self.bert_layer = AutoModel.from_pretrained(bert_model)
        if bert_model == "albert-base-v2":  # 12M parameters
            hidden_size = 768
        elif bert_model == "albert-large-v2":  # 18M parameters
            hidden_size = 1024
        elif bert_model == "albert-xlarge-v2":  # 60M parameters
            hidden_size = 2048
        elif bert_model == "albert-xxlarge-v2":  # 235M parameters
            hidden_size = 4096
        elif bert_model == "bert-base-uncased": # 110M parameters
            hidden_size = 768
        # If freeze_bert is True, disable gradients for all of BERT's parameters
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False
        self.cls_layer = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(p=0.1)
    @autocast()  # automatic mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        cont_reps, pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids)  # transformers 3.x returns (last_hidden_state, pooler_output)
        logits = self.cls_layer(self.dropout(pooler_output))
        return logits
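
A quick interface check (a sketch; it builds a classifier with a pretrained encoder but an untrained head, reusing train_set from above purely to verify shapes): the model emits a single logit per sentence pair, which is why BCEWithLogitsLoss is used below rather than a two-class CrossEntropyLoss.

# Sketch: one logit per sentence pair
ids, mask, tti, label = train_set[0]
clf = SentencePairClassifier(bert_model)
with torch.no_grad():
    out = clf(ids.unsqueeze(0), mask.unsqueeze(0), tti.unsqueeze(0))
print(out.shape)  # torch.Size([1, 1])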

 

Set the random seeds

 

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

 

Evaluation code

 

def evaluate_loss(net, device, criterion, dataloader):
    net.eval()
    mean_loss = 0
    count = 0
    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            count += 1
    return mean_loss / count

 

!mkdir models

 

Training code

 

def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):
    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the loss 5 times per epoch
    iters = []
    train_losses = []
    val_losses = []
    scaler = GradScaler()
    for ep in range(epochs):
        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):
            # Move the tensors to the GPU (if available)
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
    
            with autocast():
                logits = net(seq, attn_masks, token_type_ids)
                # Compute the loss
                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate  # Normalize, since gradients accumulate over iters_to_accumulate steps
            scaler.scale(loss).backward()
            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()

            running_loss += loss.item()
            if (it + 1) % print_every == 0:
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))
                running_loss = 0.0

        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))
        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1
    # Saving the model
    path_to_model='models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))
    del loss
    torch.cuda.empty_cache()

 

set_seed(1)
train_set = CustomDataset(df_train, maxlen, bert_model)
val_set = CustomDataset(df_val, maxlen, bert_model)
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)
if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)
net.to(device)
criterion = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0  # number of warmup steps for the scheduler
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # total optimizer steps, accounting for gradient accumulation
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)
train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)
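
A quick sanity check on the scheduler horizon: df_train has 3301 rows and bs = 16, so len(train_loader) = 207 batches per epoch; with iters_to_accumulate = 2 the optimizer (and hence the scheduler) steps 207 // 2 = 103 times per epoch, giving t_total = 103 × 4 = 412 steps over the 4 epochs.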

 

Epoch 4 complete! Validation Loss : 0.36528593172197754

 

The model has been saved in models/albert-base-v2_lr_2e-05_val_loss_0.35007_ep_3.pt

 

def get_probs_from_logits(logits):
    """
    Converts a tensor of logits into an array of probabilities by applying the sigmoid function
    """
    probs = torch.sigmoid(logits.unsqueeze(-1))
    return probs.detach().cpu().numpy()
def test_prediction(net, device, dataloader, with_labels=True, result_file="results/output.txt"):
    """
    Predict the probabilities on a dataset with or without labels and print the result in a file
    """
    net.eval()
    w = open(result_file, 'w')
    probs_all = []
    with torch.no_grad():
        if with_labels:
            for seq, attn_masks, token_type_ids, _ in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()
        else:
            for seq, attn_masks, token_type_ids in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()
    w.writelines(str(prob)+'\n' for prob in probs_all)
    w.close()

 

!mkdir results

 

path_to_model = 'models/albert-base-v2_lr_2e-05_val_loss_0.35007_ep_3.pt'  

path_to_output_file = 'results/output.txt'
print("Reading test data...")
test_set = CustomDataset(df_test, maxlen, bert_model)
test_loader = DataLoader(test_set, batch_size=bs, num_workers=5)
model = SentencePairClassifier(bert_model)
if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
print()
print("Loading the weights of the model...")
model.load_state_dict(torch.load(path_to_model))
model.to(device)
print("Predicting on test data...")
test_prediction(net=model, device=device, dataloader=test_loader, with_labels=True,  # set with_labels=False if you want predictions on a dataset without labels
                result_file=path_to_output_file)
print()
print("Predictions are available in : {}".format(path_to_output_file))

 

path_to_output_file = 'results/output.txt'  # path to the file with prediction probabilities
labels_test = df_test['label']  # true labels
probs_test = pd.read_csv(path_to_output_file, header=None)[0]  # prediction probabilities
threshold = 0.5   # you can adjust this threshold for your own dataset
preds_test = (probs_test >= threshold).astype('uint8')  # predicted labels using the above fixed threshold
metric = load_metric("glue", "mrpc")
metric.compute(predictions=preds_test, references=labels_test)

 

{'accuracy': 0.875, 'f1': 0.911917098445596}
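
As a cross-check (a sketch, assuming scikit-learn is available in the environment), the same two numbers can be reproduced directly from the saved predictions:

# Optional cross-check of the GLUE metric with scikit-learn
from sklearn.metrics import accuracy_score, f1_score
print(accuracy_score(labels_test, preds_test))
print(f1_score(labels_test, preds_test))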
