## 1. 介绍

自然语言推断（natural language inference）主要研究假设（hypothesis）是否可以从前提（premise）中推断出来。标签分为以下三类：

- 蕴涵（entailment）：假设可以从前提中推断出来。

- 矛盾（contradiction）：假设的否定可以从前提中推断出来。

- 中性（neutral）：所有其他情况。

## 2. 下载SNLI数据集

SNLI是由500000多个带标签的英语句子对组成的集合。我们将SNLI数据集下载并提取到路径../data/snli_1.0中。

```import torch
import torch.nn
import d2l.torch
import os
import re
d2l.torch.DATA_HUB['SNLI'] = (
'https://nlp.stanford.edu/projects/snli/snli_1.0.zip',
'9fcde07509c7e87ec61c640c1b2753d9041758e4')

## 3. 数据集读取

```def read_snli(data_dir,is_train=True):
"""将SNLI数据集解析为前提、假设和标签"""
def extract_text(s):
# 删除我们不会使用的信息
s = re.sub('\\(','',s)
s = re.sub('\\)','',s)
# 用一个空格替换两个或多个连续的空格
s = re.sub('\\s{2,}',' ',s)
return s.strip()
label_set = {
file_path = os.path.join(data_dir,'snli_1.0_train.txt' if is_train else 'snli_1.0_test.txt')
with open(file_path,'r') as f :
rows = [row.split('\t') for row in f.readlines()[1:]] # rows是一个list of list嵌套列表
premises = [extract_text(row[1]) for row in rows if row[0] in label_set] # premises是一个列表，里面元素是一个每一个样本的前提
hypotheses = [extract_text(row[2]) for row in rows if row[0] in label_set] #hypotheses是一个列表，里面元素是一个每一个样本（每一行）的假设
labels = [label_set[row[0]] for row in rows if row[0] in label_set] #labels是一个列表，里面元素是一个每一个样本的label,为0,1,2标签
return premises,hypotheses,labels```

```train_data = read_snli(data_dir,is_train=True)
for x0,x1,y in zip(train_data[0][:3],train_data[1][:3],train_data[2][:3]):
print('premise:',x0)
print('hypothesis:',x1)
print('label:',y)```

```输出结果如下：
premise: A person on a horse jumps over a broken down airplane .
hypothesis: A person is training his horse for a competition .
label: 2
premise: A person on a horse jumps over a broken down airplane .
hypothesis: A person is at a diner , ordering an omelette .
label: 1
premise: A person on a horse jumps over a broken down airplane .
hypothesis: A person is outdoors , on a horse .
label: 0```

```test_data = read_snli(data_dir,is_train=False)
for data in [train_data,test_data]:
print([[label for label in data[2]].count(i) for i in range(3)])```

```输出结果如下：
[183416, 183187, 182764]
[3368, 3237, 3219]```

## 4. 定义用于加载数据集的类

```class SNLIDataset(torch.utils.data.Dataset):
"""用于加载SNLI数据集的自定义数据集"""
def __init__(self,dataset,num_steps,vocab=None):
self.num_steps = num_steps
all_premises_tokens = d2l.torch.tokenize(dataset[0],token='word') # all_premises_tokens为一个list of list嵌套列表，列表里面每个元素是每个样本的token词元列表
all_hypotheses_tokens = d2l.torch.tokenize(dataset[1],token='word') # all_hypotheses_tokens为一个list of list嵌套列表，列表里面每个元素是每个样本的token词元列表
if vocab is None:
else:
self.vocab = vocab
self.all_labels = torch.tensor(dataset[2])
def __getitem__(self, idx):
return (self.all_premises_tokens[idx],self.all_hypotheses_tokens[idx]),self.all_labels[idx]
def __len__(self):
return len(self.all_premises_tokens)```

## 5. 整合代码

注意：测试集必须复用从训练集构建的词表。因此，在训练集上训练的模型将不知道来自测试集的任何新词元。

```def load_data_snli(batch_size,num_steps = 50):
"""下载SNLI数据集并返回数据迭代器和词表"""
train_dataset = SNLIDataset(train_data,num_steps,vocab=None)#训练集需要构建自己的vocab
test_dataset = SNLIDataset(test_data,num_steps,vocab=train_dataset.vocab)#注意测试集需要使用train_dataset训练集里面的vocab
return train_iter,test_iter,train_dataset.vocab```

```train_iter,test_iter,vocab = load_data_snli(128,50)
len(vocab)```

```输出结果如下：
18678```

```#打印第一个批量的相关输入数据和label数据
for X,Y in train_iter:
print(X[0].shape) #前提的一个批量序列数据
print(X[1].shape) #假设的一个批量序列数据
print(Y.shape)  # label的一个批量label数据
break```

```输出结果如下：
torch.Size([128, 50])
torch.Size([128, 50])
torch.Size([128])```

## 7. 全部代码

```import torch
import torch.nn
import d2l.torch
import os
import re
d2l.torch.DATA_HUB['SNLI'] = (
'https://nlp.stanford.edu/projects/snli/snli_1.0.zip',
'9fcde07509c7e87ec61c640c1b2753d9041758e4')
"""将SNLI数据集解析为前提、假设和标签"""
def extract_text(s):
# 删除我们不会使用的信息
s = re.sub('\\(', '', s)
s = re.sub('\\)', '', s)
# 用一个空格替换两个或多个连续的空格
s = re.sub('\\s{2,}', ' ', s)
return s.strip()
label_set = {
'entailment': 0, 'contradiction': 1, 'neutral': 2}
file_path = os.path.join(data_dir, 'snli_1.0_train.txt' if is_train else 'snli_1.0_test.txt')
with open(file_path, 'r') as f:
rows = [row.split('\t') for row in f.readlines()[1:]]  # rows是一个list of list嵌套列表
premises = [extract_text(row[1]) for row in rows if row[0] in label_set]  # premises是一个列表，里面元素是一个每一个样本的前提
hypotheses = [extract_text(row[2]) for row in rows if row[0] in label_set]  #hypotheses是一个列表，里面元素是一个每一个样本（每一行）的假设
labels = [label_set[row[0]] for row in rows if row[0] in label_set]  #labels是一个列表，里面元素是一个每一个样本的label,为0,1,2标签
return premises, hypotheses, labels
for x0, x1, y in zip(train_data[0][:3], train_data[1][:3], train_data[2][:3]):
print('premise:', x0)
print('hypothesis:', x1)
print('label:', y)
for data in [train_data, test_data]:
print([[label for label in data[2]].count(i) for i in range(3)])
[[1, 2, 3], [5, 8, 9]] + [[1, 2, 3], [5, 8, 9]]
class SNLIDataset(torch.utils.data.Dataset):
"""用于加载SNLI数据集的自定义数据集"""
def __init__(self, dataset, num_steps, vocab=None):
self.num_steps = num_steps
all_premises_tokens = d2l.torch.tokenize(dataset[0],
token='word')  # all_premises_tokens为一个list of list嵌套列表，列表里面每个元素是每个样本的token词元列表
all_hypotheses_tokens = d2l.torch.tokenize(dataset[1],
token='word')  # all_hypotheses_tokens为一个list of list嵌套列表，列表里面每个元素是每个样本的token词元列表
if vocab is None:
self.vocab = d2l.torch.Vocab(tokens=all_premises_tokens + all_hypotheses_tokens, min_freq=5,
else:
self.vocab = vocab
self.all_labels = torch.tensor(dataset[2])
def __getitem__(self, idx):
return (self.all_premises_tokens[idx], self.all_hypotheses_tokens[idx]), self.all_labels[idx]
def __len__(self):
return len(self.all_premises_tokens)
"""下载SNLI数据集并返回数据迭代器和词表"""
train_dataset = SNLIDataset(train_data, num_steps, vocab=None)  #训练集需要构建自己的vocab
test_dataset = SNLIDataset(test_data, num_steps, vocab=train_dataset.vocab)  #注意测试集需要使用train_dataset训练集里面的vocab
train_iter = torch.utils.data.DataLoader(train_dataset, batch_size, shuffle=True, num_workers=num_workers)