Hugging Face Tutorial Notes


Using the Transformers framework

# Import the required libraries
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Load the pretrained BERT model and its tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Define two sample sequences to process
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

# Tokenize the input sequences and build a batch of tensors
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# Add the label tensor to the batch
batch["labels"] = torch.tensor([1, 1])

# Define the optimizer and compute the loss
optimizer = AdamW(model.parameters())
loss = model(**batch).loss

# Backpropagate to compute the gradients, then take one optimizer step
loss.backward()
optimizer.step()
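
With the weights updated, the same model and tokenizer can already be used for inference. A minimal sketch reusing the objects defined above (the classification head has barely been trained here, so the probabilities are only illustrative):

# Run the model in inference mode on the same two sequences
model.eval()
with torch.no_grad():
    outputs = model(**tokenizer(sequences, padding=True, truncation=True, return_tensors="pt"))
# Turn the logits into class probabilities
probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(probs)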

Fine-tuning a model

Data preprocessing

# Import the required libraries and modules
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader

# Load the SST-2 dataset
raw_datasets = load_dataset("glue", "sst2")

# Set the pretrained checkpoint
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Define the preprocessing function: tokenize the input with truncation and padding
def tokenize_func(example):
    return tokenizer(
        example['sentence'],
        truncation=True,
        padding=True,
    )

# Tokenize the dataset and drop the columns that are not needed later:
# 'idx' and 'sentence' are never used downstream, so they can be removed safely
tokenized_datasets = raw_datasets.map(tokenize_func, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['idx', 'sentence'])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets = tokenized_datasets.with_format('torch')

# Define the data collator, which pads each batch on the fly
data_collator = DataCollatorWithPadding(tokenizer)

# Create a batched data loader that uses the collator for padding
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Iterate over the data loader and print the input_ids shape of each batch
for step, batch in enumerate(train_dataloader):
    print(batch['input_ids'].shape)
    if step > 5:
        break
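
As a sanity check on the dynamic padding, one can pull a single batch out of the loader and look at every tensor it contains; a minimal sketch using the names defined above:

# Take one collated batch and print the shape of each tensor;
# the sequence length is determined by the longest sentence in that batch
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})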

Training with the Trainer

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import evaluate
import numpy as np

raw_datasets = load_dataset("glue", "sst2")

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # load the pretrained tokenizer from the checkpoint
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)  # load the pretrained classification model; num_labels is the number of classes

def tokenize_func(example):
    return tokenizer(
        example['sentence'],
        truncation=True,
        padding=True,
    )

def compute_metrics(predictions):
    metrics = evaluate.load('glue', 'sst2')
    logits, labels = predictions
    pred = np.argmax(logits, axis=-1)
    return metrics.compute(predictions=pred, references=labels)

tokenized_datasets = raw_datasets.map(tokenize_func, batched=True)  # tokenize the dataset

data_collator = DataCollatorWithPadding(tokenizer)  # pad and collate batches with the tokenizer

# (This DataLoader is only for manual inspection; the Trainer builds its own loaders internally)
train_dataloader = DataLoader(
    tokenized_datasets['train'],  # the tokenized training split
    batch_size=16,                # batch size
    shuffle=True,                 # shuffle the data
    collate_fn=data_collator,     # collate function
)

training_args = TrainingArguments("test_training", evaluation_strategy="epoch")  # training arguments

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],      # training set
    eval_dataset=tokenized_datasets['validation'],  # validation set
    data_collator=data_collator,                    # collator
    tokenizer=tokenizer,                            # tokenizer
    compute_metrics=compute_metrics,                # metric function
)

trainer.train()  # train the model
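
After training, the same Trainer object can evaluate and predict on the validation split; a minimal sketch using the standard Trainer API (names follow the code above):

# Evaluate with the metric defined in compute_metrics
print(trainer.evaluate())

# Get raw predictions; the .predictions field holds the logits
predictions = trainer.predict(tokenized_datasets['validation'])
preds = np.argmax(predictions.predictions, axis=-1)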

Complete code with Accelerate

from datasets import load_dataset
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import AdamW, get_scheduler
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from accelerate import Accelerator

# Load the tokenizer and the dataset
checkpoint = "bert-base-uncased"
raw_datasets = load_dataset("glue", "sst2")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Preprocess the dataset (no padding here; the collator pads each batch dynamically)
def tokenize_func(example):
    return tokenizer(
        example['sentence'],
        truncation=True,
    )

tokenized_datasets = raw_datasets.map(tokenize_func, batched=True)
# Drop the unneeded columns and fix the label column name and format
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Build the data loaders with dynamic padding
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

# Define the model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Set up the optimizer and the learning-rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_train_steps = num_epochs * len(train_dataloader)  # total number of optimization steps
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps,
)

# Set up the accelerator for training
accelerator = Accelerator()
# Wrap the model, data loaders, and optimizer
model, train_dataloader, eval_dataloader, optimizer = accelerator.prepare(
    model, train_dataloader, eval_dataloader, optimizer
)

# Select the device (accelerator.prepare already places the model, so this is largely redundant)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Training loop
progress_bar = tqdm(range(num_train_steps))  # tqdm progress bar
model.train()  # switch to training mode
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # No need to move the batch to the device: the prepared data loader already does it
        # batch = {k: v.to(device) for k, v in batch.items()}
        output = model(**batch)
        # Read the loss from the output
        loss = output.loss
        # Backward pass through the accelerator (replaces loss.backward())
        accelerator.backward(loss)
        # Update the parameters
        optimizer.step()
        # Update the learning rate
        lr_scheduler.step()
        # Reset the gradients so they do not leak into the next step
        optimizer.zero_grad()
        progress_bar.update(1)

# Load the evaluation metric
metrics = evaluate.load("glue", "sst2")
model.eval()  # switch to evaluation mode
for batch in eval_dataloader:
    # Move the batch tensors to the device (GPU or CPU)
    batch = {k: v.to(device) for k, v in batch.items()}
    # Disable gradient computation to speed up inference
    with torch.no_grad():
        output = model(**batch)  # run the model
    # Get the predictions
    logits = output.logits
    pred = torch.argmax(logits, dim=-1)
    metrics.add_batch(predictions=accelerator.gather(pred), references=accelerator.gather(batch['labels']))
print(metrics.compute())  # compute the final metric values

train.py (two screenshots of the run; original images not available)
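
Assuming the code above is saved as train.py, the standard Accelerate workflow is to run accelerate config once to describe the hardware setup, and then accelerate launch train.py to start training; the same script then scales to multiple GPUs without changes.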

