# Adapter Fine-Tuning

## What Is an Adapter?

Adapters are a parameter-efficient fine-tuning method: small neural network modules are inserted between the layers of a pretrained model to adapt it to a task, while the original model parameters are left unchanged.

## Core Architecture

### Adapter Module Design
```python
import torch
import torch.nn as nn

class AdapterLayer(nn.Module):
    def __init__(self, hidden_size, adapter_size, activation="relu"):
        super().__init__()
        self.hidden_size = hidden_size
        self.adapter_size = adapter_size
        # Down-projection: reduce dimensionality
        self.down_project = nn.Linear(hidden_size, adapter_size)
        # Non-linearity
        if activation == "relu":
            self.activation = nn.ReLU()
        elif activation == "gelu":
            self.activation = nn.GELU()
        else:
            self.activation = nn.Identity()
        # Up-projection: restore dimensionality
        self.up_project = nn.Linear(adapter_size, hidden_size)
        # Zero-init the up-projection so the module starts as an identity map
        nn.init.zeros_(self.up_project.weight)
        nn.init.zeros_(self.up_project.bias)

    def forward(self, x):
        # Adapter forward pass: down-project -> activation -> up-project
        adapter_output = self.down_project(x)
        adapter_output = self.activation(adapter_output)
        adapter_output = self.up_project(adapter_output)
        # Residual connection
        return x + adapter_output
```
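Because the up-projection is zero-initialized, a freshly constructed adapter leaves its input untouched. A quick sanity check (with hypothetical dimensions):

```python
adapter = AdapterLayer(hidden_size=768, adapter_size=48)
x = torch.randn(2, 16, 768)  # (batch, seq_len, hidden)
assert torch.allclose(adapter(x), x)  # exact identity at initialization
```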
### Bottleneck Architecture

Adapters use a bottleneck design:

input (d) → down-projection (d→r) → activation → up-projection (r→d) → residual connection → output (d)

where r << d, which sharply reduces the number of trainable parameters.
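As a back-of-the-envelope check (assuming BERT-base dimensions, d = 768, and a reduction factor of 16, so r = 48), each adapter adds 2dr + d + r parameters:

```python
d, r = 768, 48                           # hidden size, bottleneck size (768 / 16)
per_adapter = (d * r + r) + (r * d + d)  # down-projection + up-projection (with biases)
print(per_adapter)                       # 74544 parameters per adapter
# Two adapters in each of BERT-base's 12 layers:
print(24 * per_adapter)                  # ~1.8M, roughly 1.6% of the 110M base model
```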
## Insertion Strategy

### Insertion Points in a Transformer
```python
class TransformerWithAdapter(nn.Module):
    def __init__(self, transformer_layer, adapter_config):
        super().__init__()
        self.transformer_layer = transformer_layer
        # Insert adapters at two positions within the layer
        self.adapter_after_attention = AdapterLayer(
            hidden_size=adapter_config["hidden_size"],
            adapter_size=adapter_config["adapter_size"]
        )
        self.adapter_after_ffn = AdapterLayer(
            hidden_size=adapter_config["hidden_size"],
            adapter_size=adapter_config["adapter_size"]
        )

    def forward(self, x, attention_mask=None):
        # Original attention block (simplified; layer norms omitted)
        attention_output = self.transformer_layer.attention(x, attention_mask)
        # First adapter
        x = self.adapter_after_attention(attention_output)
        # Original feed-forward block
        ffn_output = self.transformer_layer.ffn(x)
        # Second adapter
        output = self.adapter_after_ffn(ffn_output)
        return output
```
### Choosing Insertion Points

- After the FFN only: the most common choice, balancing accuracy and parameter count
- After attention only: suited to attention-heavy tasks
- Both positions: best accuracy, at the cost of more parameters
- Parallel insertion: the adapter runs alongside the original layer rather than after it (see the sketch below)
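A minimal sketch of parallel insertion, reusing the `AdapterLayer` defined above (a hypothetical wrapper, not any particular library's API). Since `AdapterLayer` returns `x + delta`, subtracting `x` isolates the adapter branch:

```python
class ParallelAdapter(nn.Module):
    """Run the adapter branch in parallel with the wrapped layer."""
    def __init__(self, layer, hidden_size, adapter_size):
        super().__init__()
        self.layer = layer
        self.adapter = AdapterLayer(hidden_size, adapter_size)

    def forward(self, x):
        # AdapterLayer returns x + delta, so (adapter(x) - x) is the pure
        # adapter branch; add it to the original layer's output. At
        # initialization delta is zero, so this equals the plain layer.
        return self.layer(x) + (self.adapter(x) - x)
```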
## Implementation

### Using AdapterHub
```python
import adapters
from adapters import AdapterConfig, AdapterTrainer  # AdapterTrainer can stand in for the plain Trainer
from transformers import AutoModel

# Load the pretrained model
model = AutoModel.from_pretrained("bert-base-uncased")
# Prepare the plain transformers model for adapter support
# (required by the standalone `adapters` library)
adapters.init(model)
# Configure the adapter
adapter_config = AdapterConfig.load(
    "pfeiffer",          # adapter architecture: the Pfeiffer sequential bottleneck
    reduction_factor=16, # bottleneck ratio: adapter_size = hidden_size / 16
    non_linearity="relu"
)
# Add the adapter
model.add_adapter("task_adapter", config=adapter_config)
# Activate it
model.set_active_adapters("task_adapter")
# Freeze the base model and train only the adapter
model.train_adapter("task_adapter")
```
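After training, the adapter can be saved and reloaded on its own, without copying the base model, via the library's `save_adapter`/`load_adapter` methods (paths here are illustrative):

```python
# Persist only the adapter weights (a few MB, not the full model)
model.save_adapter("./saved/task_adapter", "task_adapter")

# Later, load it back into a freshly initialized base model
adapter_name = model.load_adapter("./saved/task_adapter")
model.set_active_adapters(adapter_name)
```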
### Implementing Adapters by Hand

```python
def add_adapters_to_model(model, adapter_config):
    """Insert an AdapterLayer after each MLP block (GPT-2-style module names)."""
    # Collect target names first: mutating the module tree while
    # iterating over named_modules() is unsafe
    target_names = [
        name for name, _ in model.named_modules()
        if "transformer.h" in name and name.endswith(".mlp")
    ]
    for name in target_names:
        module = model.get_submodule(name)
        adapter = AdapterLayer(
            hidden_size=adapter_config["hidden_size"],
            adapter_size=adapter_config["adapter_size"]
        )
        # Replace the MLP with a module that runs the adapter right after it
        parent_name, child_name = name.rsplit(".", 1)
        parent_module = model.get_submodule(parent_name)
        setattr(parent_module, child_name, nn.Sequential(module, adapter))
    return model
```
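For example, with GPT-2 small (hidden size 768), whose MLP blocks live under `transformer.h.<i>.mlp`:

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
model = add_adapters_to_model(model, {"hidden_size": 768, "adapter_size": 48})
```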
## Training Strategy

### Single-Task Training
```python
from transformers import Trainer, TrainingArguments

# Training configuration
training_args = TrainingArguments(
    output_dir="./adapter_output",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    learning_rate=1e-3,  # adapters tolerate larger learning rates than full fine-tuning
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="epoch",
    evaluation_strategy="epoch",
)

# Train only the adapter parameters
def freeze_base_model(model):
    for name, param in model.named_parameters():
        param.requires_grad = "adapter" in name

freeze_base_model(model)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()
```
### Multi-Task Training

```python
class MultiTaskAdapterModel(nn.Module):
    def __init__(self, base_model, task_configs):
        super().__init__()
        self.base_model = base_model
        self.adapters = nn.ModuleDict()
        # One independent adapter per task
        for task_name, config in task_configs.items():
            self.adapters[task_name] = AdapterLayer(
                hidden_size=config["hidden_size"],
                adapter_size=config["adapter_size"]
            )

    def forward(self, x, task_name):
        # Shared base model forward pass
        base_output = self.base_model(x)
        # Apply the task-specific adapter, if one exists
        if task_name in self.adapters:
            return self.adapters[task_name](base_output)
        return base_output

# Multi-task training loop
def multi_task_training(model, task_dataloaders, optimizer, compute_loss, num_epochs):
    for epoch in range(num_epochs):
        for task_name, dataloader in task_dataloaders.items():
            for batch in dataloader:
                # Forward pass
                outputs = model(batch["input"], task_name=task_name)
                loss = compute_loss(outputs, batch["labels"])
                # Backward pass
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
```
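The optimizer passed in above should cover only the adapter parameters so the shared base model stays frozen; a minimal sketch, assuming the `MultiTaskAdapterModel` defined above:

```python
# Adapter parameters live under self.adapters, so their names contain "adapters"
for name, param in model.named_parameters():
    param.requires_grad = "adapters" in name  # freeze everything else
adapter_params = [p for n, p in model.named_parameters() if "adapters" in n]
optimizer = torch.optim.AdamW(adapter_params, lr=1e-3)
```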
## Advanced Techniques

### AdapterFusion
```python
class AdapterFusion(nn.Module):
    """Simplified fusion of several task adapters' outputs.

    Note: the original AdapterFusion (Pfeiffer et al.) queries with the layer
    input and uses the adapter outputs as keys/values; this sketch instead
    applies plain self-attention over the stacked adapter outputs.
    """
    def __init__(self, adapter_names, hidden_size):
        super().__init__()
        self.adapter_names = adapter_names
        self.num_adapters = len(adapter_names)
        # Attention over the adapter dimension
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=8,
            batch_first=True
        )
        # Learned per-adapter fusion weights, initialized uniform
        self.fusion_weights = nn.Parameter(
            torch.ones(self.num_adapters) / self.num_adapters
        )

    def forward(self, base_output, adapter_outputs):
        # Each adapter output is assumed pooled to (batch, hidden);
        # stacking yields (batch, num_adapters, hidden)
        stacked_outputs = torch.stack(adapter_outputs, dim=1)
        # Self-attention across the adapter dimension
        fused_output, _ = self.attention(
            stacked_outputs, stacked_outputs, stacked_outputs
        )
        # Weighted sum over adapters -> (batch, hidden)
        weighted_output = torch.sum(
            fused_output * self.fusion_weights.view(1, -1, 1),
            dim=1
        )
        return base_output + weighted_output
```
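A usage example making the assumed shapes explicit (hypothetical task names and dimensions):

```python
fusion = AdapterFusion(["sst2", "mnli", "qqp"], hidden_size=768)
base = torch.randn(4, 768)                           # pooled base-model output
adapter_outs = [torch.randn(4, 768) for _ in range(3)]
fused = fusion(base, adapter_outs)                   # -> (4, 768)
```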
### Hierarchical Adapters

```python
class HierarchicalAdapter(nn.Module):
    def __init__(self, hidden_size, adapter_sizes):
        super().__init__()
        self.adapters = nn.ModuleList()
        # Stack adapters with (possibly) different bottleneck sizes
        for adapter_size in adapter_sizes:
            self.adapters.append(AdapterLayer(hidden_size, adapter_size))

    def forward(self, x):
        # Apply the adapters in sequence
        for adapter in self.adapters:
            x = adapter(x)
        return x
```
## Performance Optimization

### Parameter Sharing

```python
class SharedAdapter(nn.Module):
    def __init__(self, hidden_size, adapter_size, num_layers):
        super().__init__()
        # All layers share a single adapter, dividing the
        # adapter parameter count by num_layers
        self.shared_adapter = AdapterLayer(hidden_size, adapter_size)
        self.num_layers = num_layers

    def forward(self, layer_outputs):
        # Apply the same adapter to every layer's output
        return [self.shared_adapter(output) for output in layer_outputs]
```
### Dynamic Adapters

```python
import torch.nn.functional as F

class DynamicAdapter(nn.Module):
    def __init__(self, hidden_size, max_adapter_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.max_adapter_size = max_adapter_size
        # Projections sized for the largest allowed bottleneck
        self.down_project = nn.Linear(hidden_size, max_adapter_size)
        self.up_project = nn.Linear(max_adapter_size, hidden_size)
        # Predicts how much of the bottleneck to use
        self.size_controller = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # Choose a bottleneck width from the sequence-pooled input
        # (batch-averaged; the hard cut-off is a non-differentiable heuristic)
        size_logit = self.size_controller(x.mean(dim=1))
        adapter_size = max(
            1, int(torch.sigmoid(size_logit).mean().item() * self.max_adapter_size)
        )
        # Keep only the first adapter_size bottleneck channels,
        # then zero-pad back to full width for the up-projection
        down_output = self.down_project(x)[:, :, :adapter_size]
        up_output = self.up_project(
            F.pad(down_output, (0, self.max_adapter_size - adapter_size))
        )
        return x + up_output
```
## Deployment and Inference

### Adapter Switching
```python
class AdapterSwitcher:
    def __init__(self, model, adapter_configs):
        self.model = model
        self.adapters = {}
        # Load every adapter's weights up front
        for name, config in adapter_configs.items():
            adapter_path = config["path"]
            self.adapters[name] = torch.load(adapter_path, map_location="cpu")

    def switch_adapter(self, adapter_name):
        """Switch the model to the given adapter."""
        if adapter_name not in self.adapters:
            raise ValueError(f"Adapter {adapter_name} not found")
        # Copy the adapter weights into the model in place
        adapter_weights = self.adapters[adapter_name]
        for name, param in self.model.named_parameters():
            if "adapter" in name and name in adapter_weights:
                param.data.copy_(adapter_weights[name])

    def inference(self, inputs, adapter_name):
        """Run inference with the given adapter (inputs are tokenized tensors)."""
        self.switch_adapter(adapter_name)
        with torch.no_grad():
            return self.model(inputs)
```
### Batched Inference Optimization

```python
def batch_inference_with_adapters(model, inputs, adapter_names):
    """Batched inference where each input may use a different adapter."""
    results = []
    # Group inputs by adapter to avoid switching on every example
    adapter_groups = {}
    for i, (input_text, adapter_name) in enumerate(zip(inputs, adapter_names)):
        adapter_groups.setdefault(adapter_name, []).append((i, input_text))
    # Process each group with its adapter active
    for adapter_name, group_inputs in adapter_groups.items():
        # Switch adapters once per group
        model.set_active_adapters(adapter_name)
        # Run the whole group as one batch
        batch_texts = [text for _, text in group_inputs]
        batch_outputs = model(batch_texts)
        # Remember each output with its original position
        for (original_idx, _), output in zip(group_inputs, batch_outputs):
            results.append((original_idx, output))
    # Restore the original input order
    results.sort(key=lambda x: x[0])
    return [output for _, output in results]
```
## Strengths and Limitations

### Strengths

- Modularity: each task gets its own adapter, which simplifies management
- Parameter efficiency: typically only 0.5%-2% additional parameters
- Task transfer: adapters can transfer between related tasks
- Parallel training: different tasks can train their adapters independently and in parallel
- Hot-swapping: adapters can be switched dynamically at inference time

### Limitations

- Inference overhead: the extra adapter layers add forward-pass computation
- Architectural changes: the model's forward pass must be modified
- Task relatedness: effectiveness is limited for very dissimilar tasks
- Hyperparameter sensitivity: the adapter size needs careful tuning
## Best Practices

### Choosing the Adapter Size

```python
def choose_adapter_size(hidden_size, task_complexity):
    """Pick an adapter size based on task complexity."""
    size_ratios = {
        "simple": 1 / 16,   # simple tasks: hidden_size / 16
        "medium": 1 / 8,    # medium tasks: hidden_size / 8
        "complex": 1 / 4,   # complex tasks: hidden_size / 4
    }
    ratio = size_ratios.get(task_complexity, 1 / 8)
    adapter_size = max(8, int(hidden_size * ratio))  # floor of 8
    return adapter_size
```
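For BERT-base (hidden size 768) this gives:

```python
print(choose_adapter_size(768, "simple"))   # 48
print(choose_adapter_size(768, "medium"))   # 96
print(choose_adapter_size(768, "complex"))  # 192
```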
### Training Tips

```python
def adapter_training_strategy():
    """Rules of thumb for training adapters."""
    return {
        "learning_rate": "1e-4 to 1e-3, higher than full fine-tuning",
        "epochs": "5-10, typically more than LoRA needs",
        "batch_size": "larger batch sizes are usually fine",
        "regularization": "moderate weight decay",
        "initialization": "near-identity initialization",
        "gradient_clipping": "guards against exploding gradients",
    }
```
## Related Concepts

- Parameter-Efficient Fine-Tuning (PEFT) - the family of techniques adapters belong to
- LoRA Fine-Tuning - another parameter-efficient method
- Multi-Task Joint Fine-Tuning - multi-task training strategies
- Model Inference and Deployment - deploying adapters in production
- Fine-Tuning Strategy Selection Guide - guidance on choosing a method