Fine-Tuning Hyperparameter Tuning
Overview
Hyperparameter tuning is a critical step in successful fine-tuning. It covers the learning rate, batch size, number of training epochs, and several other settings. A sensible configuration can significantly improve model performance, while a poor one can cause training to fail or produce disappointing results.
Core Hyperparameters in Detail
Learning Rate
Basic Setting Principles
# Suggested learning rates for different fine-tuning methods
learning_rates = {
    "full fine-tuning": 1e-5,   # small, to avoid disrupting the pretrained weights
    "LoRA": 1e-4,               # can be larger, since only a small number of parameters is updated
    "QLoRA": 2e-4,              # quantized models tolerate a larger learning rate
    "classification": 2e-5,     # classification heads need a smaller learning rate
    "generation": 1e-5          # generation tasks are more sensitive to the learning rate
}
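In practice these values map directly onto the training configuration. A minimal sketch with the Hugging Face Trainer API, using the LoRA suggestion from the table above (the output directory, batch size, and epoch count are illustrative assumptions):

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",                    # illustrative path
    learning_rate=learning_rates["LoRA"],     # 1e-4 from the table above
    per_device_train_batch_size=16,
    num_train_epochs=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
)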
Learning Rate Scheduling Strategies
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# 1. Linear decay (recommended)
def setup_linear_schedule(optimizer, num_training_steps, warmup_ratio=0.1):
    num_warmup_steps = int(num_training_steps * warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    return scheduler

# 2. Cosine annealing
def setup_cosine_schedule(optimizer, num_training_steps, warmup_ratio=0.1):
    num_warmup_steps = int(num_training_steps * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    return scheduler

# 3. Layer-wise learning rates (different layers use different learning rates)
def setup_layered_learning_rates(model, base_lr=1e-5):
    """Assign a different learning rate to each group of layers."""
    param_groups = []

    # Embedding layers: smallest learning rate
    embedding_params = []
    for name, param in model.named_parameters():
        if 'embedding' in name.lower():
            embedding_params.append(param)
    if embedding_params:
        param_groups.append({
            'params': embedding_params,
            'lr': base_lr * 0.1
        })

    # Encoder layers: progressively larger learning rates
    num_layers = getattr(model.config, 'num_hidden_layers', 12)
    for layer_idx in range(num_layers):
        layer_params = []
        for name, param in model.named_parameters():
            # Match "layer.<idx>." with a trailing dot so that layer.1 does not also match layer.10
            if f'layer.{layer_idx}.' in name:
                layer_params.append(param)
        if layer_params:
            # The closer to the output, the larger the learning rate
            layer_lr = base_lr * (0.5 + 0.5 * layer_idx / num_layers)
            param_groups.append({
                'params': layer_params,
                'lr': layer_lr
            })

    # Classification head: largest learning rate
    classifier_params = []
    for name, param in model.named_parameters():
        if 'classifier' in name.lower() or 'head' in name.lower():
            classifier_params.append(param)
    if classifier_params:
        param_groups.append({
            'params': classifier_params,
            'lr': base_lr * 10
        })

    return param_groups
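The parameter groups plug straight into a standard PyTorch optimizer, and the warmup scheduler is stepped once per optimizer update. A minimal sketch, assuming `model` and `train_dataloader` are already defined and using an arbitrary total of 1000 training steps:

import torch

param_groups = setup_layered_learning_rates(model, base_lr=1e-5)
optimizer = torch.optim.AdamW(param_groups)      # each group carries its own 'lr'
scheduler = setup_linear_schedule(optimizer, num_training_steps=1000)

for batch in train_dataloader:
    loss = model(**batch).loss
    loss.backward()
    optimizer.step()
    scheduler.step()        # advance warmup/decay after every optimizer step
    optimizer.zero_grad()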
Batch Size
Dynamic Batch Size
import torch

def find_optimal_batch_size(model, dataloader, max_batch_size=32):
    """Automatically probe for the largest batch size that fits in memory."""
    optimal_batch_size = None
    for batch_size in [2, 4, 8, 16, 32]:
        if batch_size > max_batch_size:
            break
        try:
            # Run a single forward/backward pass as a memory probe
            model.train()
            batch = next(iter(dataloader))
            # Trim the batch; the probe can only test sizes up to the dataloader's own batch size
            if len(batch['input_ids']) > batch_size:
                batch = {k: v[:batch_size] for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss if hasattr(outputs, 'loss') else outputs[0]
            loss.backward()
            print(f"Batch size {batch_size}: OK")
            optimal_batch_size = batch_size
        except RuntimeError as e:
            if "out of memory" in str(e):
                print(f"Batch size {batch_size}: OOM")
                break
            else:
                raise e
        finally:
            model.zero_grad(set_to_none=True)
            torch.cuda.empty_cache()
    return optimal_batch_size

# Use gradient accumulation to reach a larger effective batch size
def setup_gradient_accumulation(desired_batch_size, actual_batch_size):
    """Compute the number of gradient accumulation steps."""
    accumulation_steps = desired_batch_size // actual_batch_size
    return max(1, accumulation_steps)
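The accumulation count is used inside the training loop: the loss is divided by the number of accumulation steps, and the optimizer only steps after that many micro-batches. A minimal sketch, assuming `model`, `optimizer`, and `train_dataloader` already exist:

accumulation_steps = setup_gradient_accumulation(desired_batch_size=64, actual_batch_size=8)

optimizer.zero_grad()
for step, batch in enumerate(train_dataloader):
    loss = model(**batch).loss
    (loss / accumulation_steps).backward()   # scale so the accumulated gradient matches one large batch
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()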
LoRA-Specific Parameters
LoRA Parameter Configuration
from peft import LoraConfig

def get_lora_config(task_type, model_type="causal_lm"):
    """Return a LoRA configuration based on the task type."""
    base_config = {
        "task_type": "CAUSAL_LM" if model_type == "causal_lm" else "SEQ_CLS",
        "inference_mode": False,
        "bias": "none",
        "lora_dropout": 0.1,
    }
    if task_type == "classification":
        # Classification: smaller rank and alpha
        config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["q_proj", "v_proj"],
            **base_config
        )
    elif task_type == "generation":
        # Generation: larger rank and alpha
        config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
            **base_config
        )
    elif task_type == "complex_reasoning":
        # Complex reasoning: largest rank and alpha
        config = LoraConfig(
            r=32,
            lora_alpha=64,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            **base_config
        )
    else:
        # Default configuration
        config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            **base_config
        )
    return config
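Applying one of these configurations is a single call to peft's get_peft_model; printing the trainable-parameter count is a quick sanity check that only the LoRA matrices will be updated. A minimal sketch, assuming `base_model` is an already loaded causal language model:

from peft import get_peft_model

lora_config = get_lora_config("generation")
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()    # shows trainable vs. total parameters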
# Analysis of how LoRA parameters affect performance
def analyze_lora_parameters():
    """Summarize how each LoRA parameter influences results."""
    analysis = {
        "r (rank)": {
            "role": "Rank of the low-rank matrices; controls adapter capacity",
            "suggestions": {
                "simple tasks": "r=4-8",
                "medium tasks": "r=16-32",
                "complex tasks": "r=64-128"
            },
            "note": "A larger r means more parameters, but not necessarily better results"
        },
        "lora_alpha": {
            "role": "Scaling factor that controls how strongly LoRA affects the model",
            "suggestions": {
                "typical": "alpha = 2 * r",
                "conservative": "alpha = r",
                "aggressive": "alpha = 4 * r"
            },
            "note": "The ratio alpha/r determines LoRA's actual influence"
        },
        "target_modules": {
            "role": "Which modules LoRA is applied to",
            "suggestions": {
                "minimal": "['q_proj', 'v_proj']",
                "standard": "['q_proj', 'v_proj', 'k_proj', 'o_proj']",
                "full": "all linear layers"
            }
        }
    }
    return analysis
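The note that alpha/r drives LoRA's actual influence follows from the LoRA update itself: the low-rank product is scaled by alpha/r before being added to the frozen weight. A small illustration using the configurations from this section:

# LoRA adds (lora_alpha / r) * B @ A on top of the frozen weight,
# so configurations with the same alpha/r ratio apply the same overall scaling.
for r, alpha in [(8, 16), (16, 32), (32, 64)]:
    print(f"r={r:<3d} alpha={alpha:<3d} scaling={alpha / r}")   # all three print scaling=2.0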
Training Epochs and Early Stopping
Early Stopping Mechanism
class EarlyStopping:
    """Stop training when the validation score stops improving (higher score = better)."""
    def __init__(self, patience=3, min_delta=0.001, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_score = None
        self.counter = 0
        self.best_weights = None

    def __call__(self, val_score, model):
        if self.best_score is None:
            self.best_score = val_score
            self.save_checkpoint(model)
        elif val_score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                if self.restore_best_weights:
                    self.restore_checkpoint(model)
                return True
        else:
            self.best_score = val_score
            self.counter = 0
            self.save_checkpoint(model)
        return False

    def save_checkpoint(self, model):
        """Keep a copy of the best model weights."""
        self.best_weights = {name: param.detach().clone() for name, param in model.named_parameters()}

    def restore_checkpoint(self, model):
        """Restore the best model weights."""
        if self.best_weights:
            for name, param in model.named_parameters():
                param.data.copy_(self.best_weights[name])

# Usage example (max_epochs, train_epoch, and validate_epoch come from your own training loop)
early_stopping = EarlyStopping(patience=3)
for epoch in range(max_epochs):
    # Train
    train_loss = train_epoch(model, train_dataloader)
    # Validate
    val_loss = validate_epoch(model, val_dataloader)
    # Early-stopping check: the class assumes "higher is better", so pass the negated loss
    if early_stopping(-val_loss, model):
        print(f"Early stopping at epoch {epoch}")
        break
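If training runs through the Hugging Face Trainer rather than a hand-written loop, equivalent behaviour is available via EarlyStoppingCallback. A minimal sketch, assuming `model`, `train_dataset`, and `eval_dataset` already exist (the argument values are illustrative, and the evaluation-strategy argument is named eval_strategy in newer transformers releases):

from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

args = TrainingArguments(
    output_dir="./output",
    evaluation_strategy="epoch",       # evaluate once per epoch
    save_strategy="epoch",
    load_best_model_at_end=True,       # needed so the best checkpoint is restored
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)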
Regularization Parameters
Weight Decay
def setup_weight_decay(model, weight_decay=0.01, no_decay_keywords=['bias', 'LayerNorm']):
    """Apply weight decay while excluding specific parameters (biases, LayerNorm)."""
    decay_params = []
    no_decay_params = []
    for name, param in model.named_parameters():
        if any(keyword in name for keyword in no_decay_keywords):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    optimizer_grouped_parameters = [
        {
            'params': decay_params,
            'weight_decay': weight_decay
        },
        {
            'params': no_decay_params,
            'weight_decay': 0.0
        }
    ]
    return optimizer_grouped_parameters
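The grouped parameters are handed straight to the optimizer; the learning rate is supplied separately and applies to both groups. A minimal sketch:

import torch

optimizer = torch.optim.AdamW(setup_weight_decay(model, weight_decay=0.01), lr=2e-5)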
Dropout Settings
import torch

def adjust_dropout_rates(model, task_type):
    """Adjust all dropout rates in place according to the task type."""
    dropout_rates = {
        "classification": 0.1,
        "generation": 0.05,
        "complex_reasoning": 0.15
    }
    target_dropout = dropout_rates.get(task_type, 0.1)
    for module in model.modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = target_dropout
Hyperparameter Search Strategies
Grid Search
def grid_search_hyperparameters(model_class, train_data, val_data):
    """Grid search over the hyperparameter space."""
    param_grid = {
        'learning_rate': [1e-5, 2e-5, 5e-5],
        'batch_size': [8, 16, 32],
        'lora_r': [8, 16, 32],
        'lora_alpha': [16, 32, 64]
    }
    best_score = 0
    best_params = None
    for lr in param_grid['learning_rate']:
        for bs in param_grid['batch_size']:
            for r in param_grid['lora_r']:
                for alpha in param_grid['lora_alpha']:
                    params = {
                        'learning_rate': lr,
                        'batch_size': bs,
                        'lora_r': r,
                        'lora_alpha': alpha
                    }
                    # Train the model (train_and_evaluate is a user-supplied helper)
                    score = train_and_evaluate(model_class, train_data, val_data, params)
                    if score > best_score:
                        best_score = score
                        best_params = params
                    print(f"Params: {params}, Score: {score}")
    return best_params, best_score
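The four nested loops can be flattened with itertools.product, which makes it easier to add or drop hyperparameters without restructuring the code. A sketch of the equivalent inner loop (it would replace the nested loops inside grid_search_hyperparameters):

from itertools import product

keys = list(param_grid.keys())
for values in product(*param_grid.values()):
    params = dict(zip(keys, values))
    score = train_and_evaluate(model_class, train_data, val_data, params)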
Bayesian Optimization
from skopt import gp_minimize
from skopt.space import Real, Integer

def bayesian_optimization(model_class, train_data, val_data, n_calls=20):
    """Bayesian optimization over the hyperparameter space."""
    # Define the search space
    space = [
        Real(1e-6, 1e-3, "log-uniform", name='learning_rate'),
        Integer(4, 64, name='batch_size'),
        Integer(4, 64, name='lora_r'),
        Integer(8, 128, name='lora_alpha')
    ]

    def objective(params):
        lr, bs, r, alpha = params
        param_dict = {
            'learning_rate': lr,
            'batch_size': int(bs),
            'lora_r': int(r),
            'lora_alpha': int(alpha)
        }
        # Train and return the negated score (gp_minimize minimizes)
        score = train_and_evaluate(model_class, train_data, val_data, param_dict)
        return -score

    # Run the optimization
    result = gp_minimize(objective, space, n_calls=n_calls, random_state=42)
    best_params = {
        'learning_rate': result.x[0],
        'batch_size': int(result.x[1]),
        'lora_r': int(result.x[2]),
        'lora_alpha': int(result.x[3])
    }
    return best_params, -result.fun
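Each call to the objective trains a full model, so n_calls directly bounds the compute budget of the search; a few dozen calls is a common starting point. Usage, with the same placeholder names as above:

best_params, best_score = bayesian_optimization(model_class, train_data, val_data, n_calls=20)
print(best_params, best_score)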
Practical Tuning Examples
Classification Task Tuning
def tune_classification_model():
    """Hyperparameter tuning example for a classification task."""
    # Base configuration
    base_config = {
        "model_name": "bert-base-chinese",
        "num_classes": 10,
        "max_length": 512
    }
    # Search space
    search_space = {
        "learning_rate": [1e-5, 2e-5, 5e-5],
        "batch_size": [16, 32],
        "num_epochs": [3, 5, 8],
        "warmup_ratio": [0.1, 0.2],
        "weight_decay": [0.01, 0.1]
    }
    best_f1 = 0
    best_config = None
    for lr in search_space["learning_rate"]:
        for bs in search_space["batch_size"]:
            for epochs in search_space["num_epochs"]:
                for warmup in search_space["warmup_ratio"]:
                    for wd in search_space["weight_decay"]:
                        config = {
                            **base_config,
                            "learning_rate": lr,
                            "batch_size": bs,
                            "num_epochs": epochs,
                            "warmup_ratio": warmup,
                            "weight_decay": wd
                        }
                        # Train and evaluate (train_classification_model is a user-supplied helper)
                        f1_score = train_classification_model(config)
                        if f1_score > best_f1:
                            best_f1 = f1_score
                            best_config = config
                        print(f"Config: {config}")
                        print(f"F1 Score: {f1_score:.4f}")
                        print("-" * 50)
    return best_config, best_f1
Generation Task Tuning
def tune_generation_model():
    """Hyperparameter tuning example for a generation task."""
    # LoRA configurations to search
    lora_configs = [
        {"r": 8, "alpha": 16, "dropout": 0.1},
        {"r": 16, "alpha": 32, "dropout": 0.1},
        {"r": 32, "alpha": 64, "dropout": 0.05},
    ]
    # Training configurations to search
    training_configs = [
        {"lr": 1e-4, "epochs": 3, "warmup": 0.1},
        {"lr": 2e-4, "epochs": 5, "warmup": 0.2},
        {"lr": 5e-4, "epochs": 2, "warmup": 0.05},
    ]
    best_bleu = 0
    best_combination = None
    for lora_config in lora_configs:
        for train_config in training_configs:
            full_config = {
                **lora_config,
                **train_config,
                "model_name": "chatglm3-6b",
                "max_length": 1024
            }
            # Train and evaluate (train_generation_model is a user-supplied helper)
            bleu_score = train_generation_model(full_config)
            if bleu_score > best_bleu:
                best_bleu = bleu_score
                best_combination = full_config
            print(f"LoRA: {lora_config}")
            print(f"Training: {train_config}")
            print(f"BLEU: {bleu_score:.4f}")
            print("-" * 50)
    return best_combination, best_bleu
Tuning Best Practices
Tuning Order
def systematic_tuning_process():
    """A staged, systematic tuning workflow."""
    tuning_stages = [
        {
            "stage": "1. Basic configuration",
            "parameters": ["learning_rate", "batch_size"],
            "strategy": "Coarse search to quickly find a reasonable range"
        },
        {
            "stage": "2. Model structure",
            "parameters": ["lora_r", "lora_alpha", "target_modules"],
            "strategy": "Optimize the adapter structure on top of the basic configuration"
        },
        {
            "stage": "3. Training strategy",
            "parameters": ["num_epochs", "warmup_ratio", "scheduler_type"],
            "strategy": "Optimize the training procedure"
        },
        {
            "stage": "4. Regularization",
            "parameters": ["weight_decay", "dropout", "gradient_clipping"],
            "strategy": "Prevent overfitting and improve generalization"
        },
        {
            "stage": "5. Fine-grained tuning",
            "parameters": ["small adjustments to all parameters"],
            "strategy": "Refine everything found in the earlier stages"
        }
    ]
    return tuning_stages

# Metrics to monitor
def setup_monitoring():
    """Define the metrics to monitor during training."""
    metrics_to_monitor = {
        "training": ["loss", "learning_rate", "gradient_norm"],
        "validation": ["val_loss", "val_accuracy", "val_f1"],
        "system": ["gpu_memory", "training_speed", "throughput"],
        "model": ["parameter_count", "model_size", "inference_time"]
    }
    return metrics_to_monitor
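Most of these metrics can be captured directly inside the training loop. A minimal sketch that logs the loss, current learning rate, and gradient norm (clip_grad_norm_ conveniently returns the total norm it computed), assuming `model`, `optimizer`, `scheduler`, and `train_dataloader` already exist:

import torch

history = {"loss": [], "learning_rate": [], "gradient_norm": []}

for batch in train_dataloader:
    loss = model(**batch).loss
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

    history["loss"].append(loss.item())
    history["learning_rate"].append(scheduler.get_last_lr()[0])
    history["gradient_norm"].append(float(grad_norm))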