源码级别解析 · 源码解析 · Hugging Face
2026-05-02 | 每日技术深度解读
传统全参数微调在3B+模型上面临严重瓶颈
| 项目 | 传统微调 | PEFT 微调 | 性能差距 |
|---|---|---|---|
| 12B 模型内存需求 | 56GB GPU | 22GB GPU | 60%↓ |
| 3B 模型训练时间 | 24小时 | 8小时 | 67%↓ |
| 模型存储大小 | 11GB | 19MB | 99.8%↓ |
| 参数训练比例 | 100% | 0.1%-2% | 98%-99.9%↓ |
PEFT 通过注入轻量级适配器,保留基础模型权重不变
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model
# Load the base model.
model_id = "Qwen/Qwen2.5-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id)

# Describe the LoRA adapter: rank 16, alpha 32, applied to q_proj and v_proj.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
)

# Wrap the base model with the adapter described above.
model = get_peft_model(model, peft_config)

# Report how many parameters are trainable.
model.print_trainable_parameters()
# trainable params: 3,686,400 || all params: 3,089,625,088 || trainable%: 0.1193
仅需训练 0.12% 的参数即可获得接近全参数微调的性能
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
# Load the tokenizer and the base model.
tokenizer = AutoTokenizer.from_pretrained(model_id)
base_model = AutoModelForCausalLM.from_pretrained(model_id)

# Attach the saved adapter to the base model.
model = PeftModel.from_pretrained(base_model, "qwen2.5-3b-lora")

# Tokenize a prompt, generate up to 100 new tokens, and print the decoded text.
inputs = tokenizer("你好,请介绍一下自己", return_tensors="pt")
outputs = model.generate(max_new_tokens=100, **inputs)
first_sequence = outputs[0]
print(tokenizer.decode(first_sequence))
加载适配器后推理代码无需改动:`generate` 等接口与原生模型保持一致
PEFT 采用插件化架构,支持多种微调方法
class PeftModel(PushToHubMixin, torch.nn.Module):
    """Wrap a model with the PEFT method described by a config.

    Args:
        model: The model to adapt.
        peft_config: Config object; this constructor reads its ``peft_type``
            and ``is_prompt_learning`` attributes.
        adapter_name: Name of the adapter; it is stored as the active adapter
            and used as the key of the config mapping passed to the tuner.
    """

    def __init__(self, model, peft_config, adapter_name="default"):
        super().__init__()
        self.active_adapter = adapter_name
        self.peft_type = peft_config.peft_type

        # Prompt-learning methods are delegated to a dedicated helper
        # (not shown in this excerpt).
        if peft_config.is_prompt_learning:
            self._setup_prompt_learning(model, peft_config)
            return

        # Every other method: look up the tuner class by PEFT type and let it
        # wrap the model.
        tuner_cls = PEFT_TYPE_TO_TUNER_MAPPING[peft_config.peft_type]
        self.base_model = tuner_cls(model, {adapter_name: peft_config}, adapter_name)
PeftModel 是 PEFT 的核心基类,支持多种微调方法的统一接口
LoRA 通过低秩矩阵分解,冻结原始权重 $W$,仅训练低秩适配器矩阵 $B$ 与 $A$(权重增量 $\Delta W = BA$)
class LoraLayer(BaseTunerLayer):
    """Linear layer with a frozen weight plus a trainable low-rank (LoRA) update.

    The forward pass computes ``x @ W.T + scaling * (x @ A.T) @ B.T`` where
    ``W`` is the frozen weight and ``A`` / ``B`` are the adapter matrices.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        r: Rank of the low-rank update; must be positive.
        lora_alpha: Numerator of the scaling factor ``lora_alpha / r``.
            Defaults to ``r``, i.e. a scaling of 1.0 (the unscaled update).

    Raises:
        ValueError: If ``r`` is not positive.
    """

    def __init__(self, in_features, out_features, r=8, lora_alpha=None):
        super().__init__()
        if r <= 0:
            raise ValueError("Rank must be positive")
        self.scaling = (r if lora_alpha is None else lora_alpha) / r

        # Frozen original weight, created without gradient tracking.
        # NOTE(review): the storage is uninitialised here; presumably it is
        # overwritten with pretrained weights by the caller - confirm.
        self.weight = nn.Parameter(
            torch.empty(out_features, in_features), requires_grad=False
        )

        # Trainable LoRA factors: A is (r, in_features), B is (out_features, r).
        self.lora_A = nn.Parameter(torch.empty(r, in_features))
        self.lora_B = nn.Parameter(torch.empty(out_features, r))

        # B starts at zero, so the adapter contributes nothing until trained.
        nn.init.kaiming_uniform_(self.lora_A, a=5 ** 0.5)
        nn.init.zeros_(self.lora_B)

    def forward(self, x):
        """Apply the frozen projection plus the scaled low-rank update.

        Args:
            x: Tensor whose last dimension is ``in_features``; any leading
                batch dimensions are preserved.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            ``out_features``.
        """
        # Right-multiplying by the transposes keeps batch dimensions leading;
        # for a 1-D input this equals W @ x + B @ (A @ x).
        frozen_out = x @ self.weight.T
        lora_out = (x @ self.lora_A.T) @ self.lora_B.T
        return frozen_out + self.scaling * lora_out
前向传播时,将冻结权重的输出与低秩适配器的输出相加
@dataclass
class LoraConfig(PeftConfig):
    """Configuration for a LoRA adapter.

    Raises:
        ValueError: If ``r`` or ``lora_alpha`` is not positive, or if
            ``lora_dropout`` lies outside ``[0, 1]``.
    """

    # Rank of the low-rank update.
    r: int = 8
    # LoRA alpha; must be positive.
    lora_alpha: int = 16
    # Names of the modules to adapt (e.g. "q_proj"); None by default.
    target_modules: List[str] = None
    # Dropout probability; must lie in [0, 1].
    lora_dropout: float = 0.0

    def __post_init__(self):
        """Validate the numeric fields once the dataclass is initialised."""
        if self.r <= 0:
            raise ValueError("Rank must be positive")
        if self.lora_alpha <= 0:
            raise ValueError("Alpha must be positive")
        # Reject an impossible probability at configuration time instead of
        # leaving it undetected.
        if not 0.0 <= self.lora_dropout <= 1.0:
            raise ValueError("Dropout must be in [0, 1]")
LoraConfig 提供了丰富的配置选项,支持精细控制
| 方法 | 特点 | 参数效率 | 适用场景 |
|---|---|---|---|
| LoRA | 标准低秩分解 | 高 | 通用微调 |
| AdaLoRA | 动态秩分配 | 极高 | 资源受限场景 |
| QLoRA | 量化压缩 | 极高 | 大模型训练 |
| LoHa | Hadamard 积低秩分解 | 中高 | 特定任务 |
| XLora | 层次化 | 极高 | 超大模型 |
# Without @dataclass on this class, the annotated attributes below are not
# turned into fields: `quant_config` would stay a raw `Field` object and
# neither attribute would be accepted by the generated constructor.
@dataclass
class QLoraConfig(LoraConfig):
    """LoRA configuration extended with quantization settings."""

    @dataclass
    class QuantConfig:
        """Quantization options."""

        # Bit width used for quantization.
        bits: int = 4
        # Whether double quantization is enabled.
        double_quant: bool = True

    # default_factory gives every config instance its own QuantConfig.
    quant_config: QuantConfig = field(default_factory=QuantConfig)
    torch_dtype: torch.dtype = torch.bfloat16
def prepare_model_for_kbit_training(model):
    """Cast every trainable parameter of ``model`` to bfloat16, in place.

    Args:
        model: Module whose parameters are inspected; parameters with
            ``requires_grad=False`` are left untouched.

    Returns:
        The same ``model`` object, so the call can be written as
        ``model = prepare_model_for_kbit_training(model)``.
    """
    for param in model.parameters():
        if param.requires_grad:
            param.data = param.data.to(torch.bfloat16)
    # Return the model so that rebinding the result does not yield None.
    return model
QLoRA 通过量化技术,进一步减少内存占用
提示微调在输入序列开头插入可学习的连续提示
class PromptEncoder(nn.Module):
    """Learnable embedding table that produces the virtual prompt vectors.

    Args:
        config: Object providing ``num_virtual_tokens``, ``token_dim`` and
            ``prompt_dropout``.
    """

    def __init__(self, config):
        super().__init__()
        self.embedding = nn.Embedding(config.num_virtual_tokens, config.token_dim)
        self.dropout = nn.Dropout(config.prompt_dropout)

    def forward(self, input_ids):
        """Return the embeddings of all virtual tokens, after dropout.

        Args:
            input_ids: Tensor used only to pick the device for the indices.

        Returns:
            Tensor with one row per virtual token.
        """
        # One index per row of the embedding table, on the caller's device.
        virtual_token_ids = torch.arange(
            self.embedding.num_embeddings, device=input_ids.device
        )
        return self.dropout(self.embedding(virtual_token_ids))
PromptEncoder 生成可学习的连续提示向量
Prefix Tuning 在注意力机制中注入可学习的前缀
| 方法 | 训练速度 | 内存使用 | 推理延迟 | 任务适应性 |
|---|---|---|---|---|
| LoRA | 中 | 中 | 低 | 高 |
| Prompt Tuning | 快 | 低 | 极低 | 中 |
| Prefix Tuning | 慢 | 中 | 中 | 高 |
| QLoRA | 中 | 极低 | 中 | 高 |
| AdaLoRA | 慢 | 低 | 低 | 极高 |
class PersonalizedChatModel:
    """Chat wrapper that pairs a base causal LM with a per-user adapter.

    Args:
        base_model_name: Identifier used to load both the model and the tokenizer.
        user_id: Identifier used to build the adapter path and the prompt prefix.
    """

    def __init__(self, base_model_name, user_id):
        self.base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        self.user_id = user_id
        # The user-specific adapter is read from "adapters/user_<user_id>".
        self.peft_model = PeftModel.from_pretrained(
            self.base_model, f"adapters/user_{user_id}"
        )

    def chat(self, message):
        """Generate a reply to ``message`` and return the decoded first sequence."""
        prompt = f"User {self.user_id}: {message}"
        encoded = self.tokenizer(prompt, return_tensors="pt")
        generated = self.peft_model.generate(**encoded)
        return self.tokenizer.decode(generated[0])
每个用户使用独立的 LoRA 适配器实现个性化
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
# Training hyperparameters.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # 8 * 4 = 32 samples per optimizer step per device
    learning_rate=2e-4,
    fp16=True,  # mixed-precision training
    gradient_checkpointing=True,
    save_strategy="epoch",
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # current transformers releases name it `eval_strategy`.
    eval_strategy="epoch",
)

# Build the Trainer. `model`, `train_dataset` and `eval_dataset` are expected
# to be defined before this snippet runs (the datasets are not shown here).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start training.
trainer.train()
使用梯度累积和混合精度优化训练过程
class AdapterManager:
    """Load several adapters onto one base model and switch between them.

    All adapters share a single PEFT wrapper: the first adapter creates it
    with ``PeftModel.from_pretrained`` and later ones are added with
    ``load_adapter``. Wrapping the same base model once per adapter would
    re-wrap a model that has already been modified by the previous wrap.

    Attributes:
        base_model: The model passed to the constructor.
        loaded_adapters: Maps each loaded adapter name to the PEFT model that
            holds it (the same shared wrapper for every name).
        current_model: The model returned by the last ``switch_adapter`` call,
            or None before the first switch.
    """

    def __init__(self, base_model):
        self.base_model = base_model
        self.loaded_adapters = {}
        self.current_model = None
        self._peft_model = None

    def load_adapter(self, adapter_name, adapter_path):
        """Load the adapter at ``adapter_path`` under ``adapter_name``.

        Loading a name that is already present is a no-op.
        """
        if adapter_name in self.loaded_adapters:
            return
        if self._peft_model is None:
            # First adapter: create the shared wrapper around the base model.
            self._peft_model = PeftModel.from_pretrained(
                self.base_model,
                adapter_path,
                adapter_name=adapter_name,
            )
        else:
            # Later adapters are added to the existing wrapper.
            self._peft_model.load_adapter(adapter_path, adapter_name=adapter_name)
        self.loaded_adapters[adapter_name] = self._peft_model

    def switch_adapter(self, adapter_name):
        """Activate ``adapter_name`` and return the model to use.

        Raises:
            ValueError: If the adapter has not been loaded.
        """
        if adapter_name not in self.loaded_adapters:
            raise ValueError(f"Adapter {adapter_name} not loaded")
        peft_model = self.loaded_adapters[adapter_name]
        # The wrapper is shared, so the requested adapter must be made active.
        peft_model.set_adapter(adapter_name)
        self.current_model = peft_model
        return self.current_model
动态管理多个适配器,实现快速任务切换
# Suggested LoRA settings per model scale.
model_configs = dict(
    # < 1B parameters
    small=dict(
        r=8,
        alpha=16,
        target_modules=["q_proj", "v_proj"],
    ),
    # 1B-10B parameters
    medium=dict(
        r=16,
        alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    ),
    # > 10B parameters
    large=dict(
        r=32,
        alpha=64,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj"],
    ),
)

# Pick the settings that match the model size.
config = model_configs["medium"]
不同规模模型需要不同的参数配置
感谢阅读!
访问 https://atcfu.com/ai-articles/peft-parameter-efficient-fine-tuning/ 回顾本文