微调参数调优

概述

参数调优是微调成功的关键环节,涉及学习率、批次大小、训练轮数等多个超参数的设置。合理的参数配置能显著提升模型性能,而错误的设置可能导致训练失败或效果不佳。

核心超参数详解

学习率(Learning Rate)

基础设置原则

# 不同微调方法的学习率建议
learning_rates = {
    "全参数微调": 1e-5,      # 较小,避免破坏预训练权重
    "LoRA微调": 1e-4,        # 可以稍大,只更新少量参数
    "QLoRA微调": 2e-4,       # 量化模型可以用更大学习率
    "分类任务": 2e-5,        # 分类头需要较小学习率
    "生成任务": 1e-5         # 生成任务对学习率更敏感
}
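
上面的数值可以直接作为训练配置的起点。下面是一个示意用法(假设使用 transformers 的 TrainingArguments,输出目录等均为示意值),展示如何把建议学习率填入训练参数:

from transformers import TrainingArguments

# 示意:以 LoRA 微调的建议学习率为起点构造训练参数
training_args = TrainingArguments(
    output_dir="./output",                      # 示意路径
    learning_rate=learning_rates["LoRA微调"],
    per_device_train_batch_size=16,
    num_train_epochs=3,
)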

学习率调度策略

from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
 
# 1. 线性衰减(推荐)
def setup_linear_schedule(optimizer, num_training_steps, warmup_ratio=0.1):
    num_warmup_steps = int(num_training_steps * warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    return scheduler
 
# 2. 余弦退火
def setup_cosine_schedule(optimizer, num_training_steps, warmup_ratio=0.1):
    num_warmup_steps = int(num_training_steps * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    return scheduler
 
# 3. 分层学习率(不同层使用不同学习率)
def setup_layered_learning_rates(model, base_lr=1e-5):
    """为不同层设置不同的学习率"""
    param_groups = []
    
    # 嵌入层 - 最小学习率
    embedding_params = []
    for name, param in model.named_parameters():
        if 'embedding' in name.lower():
            embedding_params.append(param)
    
    if embedding_params:
        param_groups.append({
            'params': embedding_params,
            'lr': base_lr * 0.1
        })
    
    # 编码器层 - 渐进式学习率
    num_layers = getattr(model.config, 'num_hidden_layers', 12)
    for layer_idx in range(num_layers):
        layer_params = []
        for name, param in model.named_parameters():
            # 末尾加点号,避免 layer.1 误匹配 layer.10、layer.11 等层
            if f'layer.{layer_idx}.' in name:
                layer_params.append(param)
        
        if layer_params:
            # 越靠近输出层,学习率越大
            layer_lr = base_lr * (0.5 + 0.5 * layer_idx / num_layers)
            param_groups.append({
                'params': layer_params,
                'lr': layer_lr
            })
    
    # 分类头 - 最大学习率
    classifier_params = []
    for name, param in model.named_parameters():
        if 'classifier' in name.lower() or 'head' in name.lower():
            classifier_params.append(param)
    
    if classifier_params:
        param_groups.append({
            'params': classifier_params,
            'lr': base_lr * 10
        })
    
    return param_groups
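
下面是分层学习率的示意用法(model、train_dataloader、num_epochs 均为示意变量),展示如何把参数分组交给优化器,并配合前面的线性预热调度器:

import torch

# 示意:分层学习率 + 线性预热调度器
param_groups = setup_layered_learning_rates(model, base_lr=1e-5)
optimizer = torch.optim.AdamW(param_groups)

num_training_steps = len(train_dataloader) * num_epochs
scheduler = setup_linear_schedule(optimizer, num_training_steps, warmup_ratio=0.1)

for epoch in range(num_epochs):
    for batch in train_dataloader:
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()          # 每个训练步更新一次学习率
        optimizer.zero_grad()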

批次大小(Batch Size)

批次大小自动探测

def find_optimal_batch_size(model, dataloader, max_batch_size=32):
    """自动寻找最优批次大小"""
    import torch

    optimal_batch_size = 1  # 兜底值,避免最小批次就 OOM 时变量未定义

    for batch_size in [2, 4, 8, 16, 32]:
        if batch_size > max_batch_size:
            break

        try:
            # 测试该批次大小能否完成一次前向 + 反向
            model.train()
            batch = next(iter(dataloader))

            # 截取到目标批次大小
            if len(batch['input_ids']) > batch_size:
                batch = {k: v[:batch_size] for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss if hasattr(outputs, 'loss') else outputs[0]
            loss.backward()

            print(f"Batch size {batch_size}: OK")
            optimal_batch_size = batch_size

        except RuntimeError as e:
            if "out of memory" in str(e):
                print(f"Batch size {batch_size}: OOM")
                break
            else:
                raise e
        finally:
            # 清理本次试探产生的梯度与显存缓存
            model.zero_grad(set_to_none=True)
            torch.cuda.empty_cache()

    return optimal_batch_size
 
# 梯度累积实现更大的有效批次大小
def setup_gradient_accumulation(desired_batch_size, actual_batch_size):
    """计算梯度累积步数"""
    accumulation_steps = desired_batch_size // actual_batch_size
    return max(1, accumulation_steps)
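
梯度累积的关键在于:把损失除以累积步数,并且每累积 accumulation_steps 个小批次才执行一次参数更新。下面是一个示意训练循环(model、optimizer、train_dataloader 均为示意变量):

# 示意:有效批次大小 = actual_batch_size × accumulation_steps
accumulation_steps = setup_gradient_accumulation(desired_batch_size=64, actual_batch_size=8)

optimizer.zero_grad()
for step, batch in enumerate(train_dataloader):
    loss = model(**batch).loss
    (loss / accumulation_steps).backward()    # 缩放损失,保持累积后梯度量级不变
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()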

LoRA特定参数

LoRA参数配置

from peft import LoraConfig
 
def get_lora_config(task_type, model_type="causal_lm"):
    """根据任务类型获取LoRA配置"""
    
    base_config = {
        "task_type": "CAUSAL_LM" if model_type == "causal_lm" else "SEQ_CLS",
        "inference_mode": False,
        "bias": "none",
        "lora_dropout": 0.1,
    }
    
    if task_type == "classification":
        # 分类任务 - 较小的rank和alpha
        config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["q_proj", "v_proj"],
            **base_config
        )
    elif task_type == "generation":
        # 生成任务 - 较大的rank和alpha
        config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
            **base_config
        )
    elif task_type == "complex_reasoning":
        # 复杂推理任务 - 最大的rank和alpha
        config = LoraConfig(
            r=32,
            lora_alpha=64,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            **base_config
        )
    else:
        # 默认配置
        config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            **base_config
        )
    
    return config
 
# LoRA参数影响分析
def analyze_lora_parameters():
    """分析LoRA参数对性能的影响"""
    analysis = {
        "r (rank)": {
            "作用": "控制低秩矩阵的秩,影响模型容量",
            "建议": {
                "简单任务": "r=4-8",
                "中等任务": "r=16-32", 
                "复杂任务": "r=64-128"
            },
            "注意": "r越大,参数越多,但不一定效果更好"
        },
        "lora_alpha": {
            "作用": "缩放因子,控制LoRA的影响程度",
            "建议": {
                "通常设置": "alpha = 2 * r",
                "保守设置": "alpha = r",
                "激进设置": "alpha = 4 * r"
            },
            "注意": "alpha/r的比值决定了LoRA的实际影响"
        },
        "target_modules": {
            "作用": "指定应用LoRA的模块",
            "建议": {
                "最小配置": "['q_proj', 'v_proj']",
                "标准配置": "['q_proj', 'v_proj', 'k_proj', 'o_proj']",
                "完整配置": "包含所有线性层"
            }
        }
    }
    return analysis
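
配置确定后,可以用 peft 的 get_peft_model 把 LoRA 适配器挂到基础模型上,并查看可训练参数占比。下面是一个示意用法(模型路径为占位符,且假设基座模型采用 q_proj/v_proj 这类 LLaMA 风格的投影层命名):

from transformers import AutoModelForCausalLM
from peft import get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("path/to/base-model")  # 示意路径
lora_config = get_lora_config(task_type="generation")

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()   # 打印 LoRA 可训练参数占总参数量的比例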

训练轮数与早停

早停机制

class EarlyStopping:
    """基于验证损失的早停:验证损失越低越好,与下方使用示例保持一致"""
    def __init__(self, patience=3, min_delta=0.001, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_score = None
        self.counter = 0
        self.best_weights = None
    
    def __call__(self, val_loss, model):
        if self.best_score is None:
            self.best_score = val_loss
            self.save_checkpoint(model)
        elif val_loss > self.best_score - self.min_delta:
            # 验证损失没有足够幅度的下降,视为一次"未改善"
            self.counter += 1
            if self.counter >= self.patience:
                if self.restore_best_weights:
                    self.restore_checkpoint(model)
                return True
        else:
            self.best_score = val_loss
            self.counter = 0
            self.save_checkpoint(model)
        
        return False
    
    def save_checkpoint(self, model):
        """保存最佳模型权重"""
        self.best_weights = {name: param.detach().clone() for name, param in model.named_parameters()}
    
    def restore_checkpoint(self, model):
        """恢复最佳模型权重"""
        if self.best_weights:
            for name, param in model.named_parameters():
                param.data.copy_(self.best_weights[name])
 
# 使用示例
early_stopping = EarlyStopping(patience=3)
 
for epoch in range(max_epochs):
    # 训练
    train_loss = train_epoch(model, train_dataloader)
    
    # 验证
    val_loss = validate_epoch(model, val_dataloader)
    
    # 早停检查
    if early_stopping(val_loss, model):
        print(f"Early stopping at epoch {epoch}")
        break

正则化参数

权重衰减

def setup_weight_decay(model, weight_decay=0.01, no_decay_keywords=['bias', 'LayerNorm']):
    """设置权重衰减,排除特定参数"""
    decay_params = []
    no_decay_params = []
    
    for name, param in model.named_parameters():
        if any(keyword in name for keyword in no_decay_keywords):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    
    optimizer_grouped_parameters = [
        {
            'params': decay_params,
            'weight_decay': weight_decay
        },
        {
            'params': no_decay_params,
            'weight_decay': 0.0
        }
    ]
    
    return optimizer_grouped_parameters
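
分组结果可以直接传给优化器,下面是一个简短的示意用法(model 为示意变量,学习率数值仅作示例):

import torch

grouped_params = setup_weight_decay(model, weight_decay=0.01)
optimizer = torch.optim.AdamW(grouped_params, lr=2e-5)   # 两组参数共用同一学习率,仅权重衰减不同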

Dropout设置

import torch

def adjust_dropout_rates(model, task_type):
    """根据任务类型调整dropout率"""
    dropout_rates = {
        "classification": 0.1,
        "generation": 0.05,
        "complex_reasoning": 0.15
    }
    
    target_dropout = dropout_rates.get(task_type, 0.1)
    
    # 就地修改模型中所有 Dropout 层的丢弃概率
    for module in model.modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = target_dropout

参数搜索策略

网格搜索

def grid_search_hyperparameters(model_class, train_data, val_data):
    """网格搜索超参数"""
    param_grid = {
        'learning_rate': [1e-5, 2e-5, 5e-5],
        'batch_size': [8, 16, 32],
        'lora_r': [8, 16, 32],
        'lora_alpha': [16, 32, 64]
    }
    
    best_score = 0
    best_params = None
    
    for lr in param_grid['learning_rate']:
        for bs in param_grid['batch_size']:
            for r in param_grid['lora_r']:
                for alpha in param_grid['lora_alpha']:
                    params = {
                        'learning_rate': lr,
                        'batch_size': bs,
                        'lora_r': r,
                        'lora_alpha': alpha
                    }
                    
                    # 训练模型
                    score = train_and_evaluate(model_class, train_data, val_data, params)
                    
                    if score > best_score:
                        best_score = score
                        best_params = params
                        
                    print(f"Params: {params}, Score: {score}")
    
    return best_params, best_score

贝叶斯优化

from skopt import gp_minimize
from skopt.space import Real, Integer
 
def bayesian_optimization(model_class, train_data, val_data, n_calls=20):
    """贝叶斯优化超参数"""
    
    # 定义搜索空间
    space = [
        Real(1e-6, 1e-3, "log-uniform", name='learning_rate'),
        Integer(4, 64, name='batch_size'),
        Integer(4, 64, name='lora_r'),
        Integer(8, 128, name='lora_alpha')
    ]
    
    def objective(params):
        lr, bs, r, alpha = params
        param_dict = {
            'learning_rate': lr,
            'batch_size': int(bs),
            'lora_r': int(r),
            'lora_alpha': int(alpha)
        }
        
        # 训练并返回负分数(因为gp_minimize是最小化)
        score = train_and_evaluate(model_class, train_data, val_data, param_dict)
        return -score
    
    # 执行优化
    result = gp_minimize(objective, space, n_calls=n_calls, random_state=42)
    
    best_params = {
        'learning_rate': result.x[0],
        'batch_size': int(result.x[1]),
        'lora_r': int(result.x[2]),
        'lora_alpha': int(result.x[3])
    }
    
    return best_params, -result.fun

实战调优案例

分类任务调优

def tune_classification_model():
    """分类任务参数调优示例"""
    
    # 基础配置
    base_config = {
        "model_name": "bert-base-chinese",
        "num_classes": 10,
        "max_length": 512
    }
    
    # 参数搜索空间
    search_space = {
        "learning_rate": [1e-5, 2e-5, 5e-5],
        "batch_size": [16, 32],
        "num_epochs": [3, 5, 8],
        "warmup_ratio": [0.1, 0.2],
        "weight_decay": [0.01, 0.1]
    }
    
    best_f1 = 0
    best_config = None
    
    for lr in search_space["learning_rate"]:
        for bs in search_space["batch_size"]:
            for epochs in search_space["num_epochs"]:
                for warmup in search_space["warmup_ratio"]:
                    for wd in search_space["weight_decay"]:
                        
                        config = {
                            **base_config,
                            "learning_rate": lr,
                            "batch_size": bs,
                            "num_epochs": epochs,
                            "warmup_ratio": warmup,
                            "weight_decay": wd
                        }
                        
                        # 训练和评估
                        f1_score = train_classification_model(config)
                        
                        if f1_score > best_f1:
                            best_f1 = f1_score
                            best_config = config
                        
                        print(f"Config: {config}")
                        print(f"F1 Score: {f1_score:.4f}")
                        print("-" * 50)
    
    return best_config, best_f1

生成任务调优

def tune_generation_model():
    """生成任务参数调优示例"""
    
    # LoRA配置搜索
    lora_configs = [
        {"r": 8, "alpha": 16, "dropout": 0.1},
        {"r": 16, "alpha": 32, "dropout": 0.1},
        {"r": 32, "alpha": 64, "dropout": 0.05},
    ]
    
    # 训练配置搜索
    training_configs = [
        {"lr": 1e-4, "epochs": 3, "warmup": 0.1},
        {"lr": 2e-4, "epochs": 5, "warmup": 0.2},
        {"lr": 5e-4, "epochs": 2, "warmup": 0.05},
    ]
    
    best_bleu = 0
    best_combination = None
    
    for lora_config in lora_configs:
        for train_config in training_configs:
            
            full_config = {
                **lora_config,
                **train_config,
                "model_name": "chatglm3-6b",
                "max_length": 1024
            }
            
            # 训练和评估
            bleu_score = train_generation_model(full_config)
            
            if bleu_score > best_bleu:
                best_bleu = bleu_score
                best_combination = full_config
            
            print(f"LoRA: {lora_config}")
            print(f"Training: {train_config}")
            print(f"BLEU: {bleu_score:.4f}")
            print("-" * 50)
    
    return best_combination, best_bleu

调优最佳实践

参数调优顺序

def systematic_tuning_process():
    """系统化的参数调优流程"""
    
    tuning_stages = [
        {
            "stage": "1. 基础配置",
            "parameters": ["learning_rate", "batch_size"],
            "strategy": "粗调,快速找到大致范围"
        },
        {
            "stage": "2. 模型结构",
            "parameters": ["lora_r", "lora_alpha", "target_modules"],
            "strategy": "在基础配置基础上优化模型结构"
        },
        {
            "stage": "3. 训练策略",
            "parameters": ["num_epochs", "warmup_ratio", "scheduler_type"],
            "strategy": "优化训练过程"
        },
        {
            "stage": "4. 正则化",
            "parameters": ["weight_decay", "dropout", "gradient_clipping"],
            "strategy": "防止过拟合,提升泛化能力"
        },
        {
            "stage": "5. 精细调优",
            "parameters": ["所有参数的微调"],
            "strategy": "在前面基础上进行精细调整"
        }
    ]
    
    return tuning_stages
 
# 监控指标
def setup_monitoring():
    """设置训练监控"""
    metrics_to_monitor = {
        "训练指标": ["loss", "learning_rate", "gradient_norm"],
        "验证指标": ["val_loss", "val_accuracy", "val_f1"],
        "系统指标": ["gpu_memory", "training_speed", "throughput"],
        "模型指标": ["parameter_count", "model_size", "inference_time"]
    }
    
    return metrics_to_monitor
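
其中训练指标里的梯度范数与当前学习率可以在训练循环内直接获取。下面是一个示意的记录函数(函数名为示意,假设使用前文创建的 optimizer 对应的 scheduler):

import torch

def log_step_metrics(model, scheduler, max_grad_norm=1.0):
    """示意:记录单个训练步的梯度范数与当前学习率"""
    # clip_grad_norm_ 返回裁剪前的总梯度范数,可同时完成梯度裁剪与监控
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    current_lr = scheduler.get_last_lr()[0]
    print(f"grad_norm={float(grad_norm):.4f}, lr={current_lr:.2e}")
    return {"gradient_norm": float(grad_norm), "learning_rate": current_lr}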

相关概念