`Cars built after 2020 and manufactured in Italy`

`SELECT name FROM cars WHERE location = 'Italy' AND date > 2020`

#### 1.建立训练数据

```templates = [
["[prop1] of [nns]","SELECT [prop1] FROM [nns]"],
["[agg] [prop1] for each [breakdown]","SELECT [agg]([prop1]) , [breakdown] FROM [prop1] GROUP BY [breakdown]"],
["[prop1] of [nns] by [breakdown]","SELECT [prop1] , [breakdown] FROM [nns] GROUP BY [breakdown]"],
["[prop1] of [nns] in [location] by [breakdown]","SELECT [prop1] , [breakdown] FROM [nns] WHERE location = '[location]' GROUP BY [breakdown]"],
["[nns] having [prop1] between [number1] and [number2]","SELECT name FROM [nns] WHERE [prop1] > [number1] and [prop1] < [number2]"],
["[prop] by [breakdown]","SELECT name , [breakdown] FROM [prop] GROUP BY [breakdown]"],
["[agg] of [prop1] of [nn]","SELECT [agg]([prop1]) FROM [nn]"],
["[prop1] of [nns] before [year]","SELECT [prop1] FROM [nns] WHERE date < [year]"],
["[prop1] of [nns] after [year] in [location]","SELECT [prop1] FROM [nns] WHERE date > [year] AND location='[location]'"],
["[nns] [verb] after [year] in [location]","SELECT name FROM [nns] WHERE location = '[location]' AND date > [year]"],
["[nns] having [prop1] between [number1] and [number2] by [breakdown]","SELECT name , [breakdown] FROM [nns] WHERE [prop1] < [number1] AND [prop1] > [number2] GROUP BY [breakdown]"],
["[nns] with a [prop1] of maximum [number1] by their [breakdown]","SELECT name , [breakdown] FROM [nns] WHERE [prop1] <= [number1] GROUP BY [breakdown]"],
["[prop1] and [prop2] of [nns] since [year]","SELECT [prop1] , [prop2] FROM [nns] WHERE date > [year]"],
["[nns] which have both [prop1] and [prop2]","SELECT name FROM [nns] WHERE [prop1] IS true AND [prop2] IS true"],
["Top [number1] [nns] by [prop1]","SELECT name FROM [nns] ORDER BY [prop1] DESC LIMIT [number1]"]
]
template = random.choice(templates)
print("Sample Query Template  :", template[0])
print("SQL Translation        :", template[1])```

`layer.run([build_dataset])`

```from torch.utils.data import Dataset
class EnglishToSQLDataSet(Dataset):
def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
self.tokenizer = tokenizer
self.data = dataframe
self.source_len = source_len
self.target_len = target_len
self.target_text = self.data[target_text]
self.source_text = self.data[source_text]
self.data["query"] = "translate English to SQL: "+self.data["query"]
self.data["sql"] = "<pad>" + self.data["sql"] + "</s>"
def __len__(self):
return len(self.target_text)
def __getitem__(self, index):
source_text = str(self.source_text[index])
target_text = str(self.target_text[index])
source_text = ' '.join(source_text.split())
target_text = ' '.join(target_text.split())
source_ids = source['input_ids'].squeeze()
target_ids = target['input_ids'].squeeze()
return {
'source_ids': source_ids.to(dtype=torch.long),
'target_ids': target_ids.to(dtype=torch.long),
'target_ids_y': target_ids.to(dtype=torch.long)
}```

#### 3.细调T5模型

```def train(epoch, tokenizer, model, device, loader, optimizer):
import torch
model.train()
y = data['target_ids'].to(device, dtype = torch.long)
y_ids = y[:, :-1].contiguous()
lm_labels = y[:, 1:].clone().detach()
lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
ids = data['source_ids'].to(device, dtype = torch.long)
loss = outputs[0]
step = (epoch * len(loader)) + _
layer.log({"loss": float(loss)}, step)
loss.backward()
optimizer.step()```

@model：告诉Layer层这个函数用于训练一个ML模型。
@fabric：用于告诉Layer层训练模型所需的计算资源（CPU、GPU等）。由于T5是一个大型模型，所以我们需要使用GPU对其进行微调。下面列举的是一个你可以使用Layer层操作的组装列表。
@pip_requirements：指示Python包需要对我们的模型进行微调。

```@model("t5-tokenizer")
@fabric("f-medium")
@pip_requirements(packages=["torch","transformers","sentencepiece"])
def build_tokenizer():
from transformers import T5Tokenizer
#从Hugging face加载分词器
tokenizer = T5Tokenizer.from_pretrained("t5-small")
@model("t5-english-to-sql")
@fabric("f-gpu-small")
@pip_requirements(packages=["torch","transformers","sentencepiece"])
def build_model():
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch.nn.functional as F
from torch import cuda
import torch
parameters={
"BATCH_SIZE":8,
"EPOCHS":3,
"LEARNING_RATE":2e-05,
"MAX_SOURCE_TEXT_LENGTH":75,
"MAX_TARGET_TEXT_LENGTH":75,
"SEED": 42
}
#把参数加载到Layer层中
layer.log(parameters)
#为重复性设定种子参数
torch.manual_seed(parameters["SEED"])
np.random.seed(parameters["SEED"])
torch.backends.cudnn.deterministic = True
#从Layer层加载分词器
tokenizer = layer.get_model("t5-tokenizer").get_train()
#从Hugging face中加载预训练模型
model = T5ForConditionalGeneration.from_pretrained("t5-small")
device = 'cuda' if cuda.is_available() else 'cpu'
model.to(device)
dataframe = layer.get_dataset("english_sql_translations").to_pandas()
source_text = "query"
target_text = "sql"
dataframe = dataframe[[source_text,target_text]]
train_dataset = dataframe.sample(frac=0.8,random_state = parameters["SEED"])
train_dataset = train_dataset.reset_index(drop=True)
layer.log({"FULL Dataset": str(dataframe.shape),
"TRAIN Dataset": str(train_dataset.shape)
})
training_set = EnglishToSQLDataSet(train_dataset, tokenizer, parameters["MAX_SOURCE_TEXT_LENGTH"], parameters["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
'batch_size': parameters["BATCH_SIZE"],
'shuffle': True,
'num_workers': 0
}
optimizer = torch.optim.Adam(params =  model.parameters(), lr=parameters["LEARNING_RATE"])
for epoch in range(parameters["EPOCHS"]):
train(epoch, tokenizer, model, device, training_loader, optimizer)
return model```

`layer.run([build_tokenizer, build_model], debug=True)`

```import gradio as gr
import layer
model = layer.get_model('layer/t5-fine-tuning-with-layer/models/t5-english-to-sql').get_train()
tokenizer = layer.get_model('layer/t5-fine-tuning-with-layer/models/t5-tokenizer').get_train()
def greet(query):
input_ids = tokenizer.encode(f"translate English to SQL: {query}", return_tensors="pt")
outputs = model.generate(input_ids, max_length=1024)
sql = tokenizer.decode(outputs[0], skip_special_tokens=True)
return sql
iface = gr.Interface(fn=greet, inputs="text", outputs="text", examples=[
"Show me the average price of wines in Italy by provinces",
"Cars built after 2020 and manufactured in Italy",
"Top 10 cities by their population"
])
iface.launch()```

```layer-sdk==0.9.350435
torch==1.11.0
sentencepiece==0.1.96```

（1）打开Hugging face官网（译者注：Hugging Face是美国的一家开源创业公司，其业务领域已经从聊天机器人扩展到机器学习等领域），创建一个空间。

`\$ git clone [YOUR_HUGGINGFACE_SPACE_URL]`

```\$ git add app.py
\$ git commit -m "Add application files"
\$ git push```

#### 5.小结

https://app.layer.ai/layer/t5-fine-tuning-with-layer

https://huggingface.co/spaces

https://www.kdnuggets.com/2022/05/query-table-t5.html