源码级别解析 · 源码解析 · 内存优化 · 性能突破
2026-04-16 | 每日技术深度解读
标准注意力机制在处理长序列时面临严重的内存和性能瓶颈
FlashAttention通过巧妙的分块算法,显著减少内存访问次数
每个版本都在性能和适用性上有显著提升
def standard_attention(q, k, v):
    """Compute scaled dot-product attention by materialising the full score matrix.

    Args:
        q: Query tensor whose last two dimensions are (seq_q, d_k).
        k: Key tensor whose last two dimensions are (seq_k, d_k).
        v: Value tensor whose last two dimensions are (seq_k, d_v).

    Returns:
        ``softmax(q @ k^T / sqrt(d_k)) @ v`` with last two dimensions
        (seq_q, d_v).
    """
    # The scaling factor is the feature size of the queries/keys; take it
    # from the input rather than relying on a global.
    d_k = q.size(-1)
    # Attention scores: one row per query, one column per key.
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    # Normalise each query row over all keys.
    attention_weights = F.softmax(scores, dim=-1)
    # Weighted sum of the values.
    return torch.matmul(attention_weights, v)
标准实现:需要存储完整的QK^T矩阵,内存复杂度O(n²)
def flash_attention_forward(q, k, v):
    """Reference (pure PyTorch) blockwise attention using an online softmax.

    The result equals ``softmax(q @ k^T / sqrt(head_dim)) @ v`` computed per
    head, but the score matrix is only ever materialised one
    (block_m x block_n) tile at a time.

    Args:
        q: Queries, shape (batch, seqlen_q, nheads, head_dim).
        k: Keys, shape (batch, seqlen_k, nheads, head_dim).
        v: Values, shape (batch, seqlen_k, nheads, head_dim).

    Returns:
        Tensor with the same shape and dtype as ``q``.
    """
    batch_size, seqlen_q, nheads, head_dim = q.shape
    seqlen_k = k.shape[1]

    # Tile sizes chosen from the head dimension.
    block_m = 128 if head_dim <= 64 else 64
    block_n = 64 if head_dim <= 128 else 32

    output = torch.zeros_like(q)
    if seqlen_q == 0 or seqlen_k == 0:
        # Nothing to attend to: avoid dividing by a zero normaliser.
        return output

    scale = 1.0 / math.sqrt(head_dim)
    # Put heads in front of the sequence axis so matmul contracts over
    # head_dim / keys rather than over heads.
    q_heads = q.transpose(1, 2)
    k_heads = k.transpose(1, 2)
    v_heads = v.transpose(1, 2)

    for i in range(0, seqlen_q, block_m):
        q_block = q_heads[:, :, i:i + block_m]
        rows = q_block.shape[2]
        # Running row-wise maximum, softmax normaliser and unnormalised
        # output, kept in float32 for numerical stability.
        row_max = torch.full(
            (batch_size, nheads, rows, 1), float("-inf"),
            dtype=torch.float32, device=q.device,
        )
        row_sum = torch.zeros_like(row_max)
        acc = torch.zeros(
            (batch_size, nheads, rows, head_dim),
            dtype=torch.float32, device=q.device,
        )

        for j in range(0, seqlen_k, block_n):
            k_block = k_heads[:, :, j:j + block_n]
            v_block = v_heads[:, :, j:j + block_n]

            scores = torch.matmul(q_block, k_block.transpose(-2, -1)).float() * scale

            # Online softmax: rescale what was accumulated so far to the new
            # running maximum before adding this tile's contribution. A plain
            # per-tile softmax would normalise each tile independently and
            # give the wrong result once tiles are summed.
            new_max = torch.maximum(row_max, scores.amax(dim=-1, keepdim=True))
            correction = torch.exp(row_max - new_max)
            probs = torch.exp(scores - new_max)

            row_sum = correction * row_sum + probs.sum(dim=-1, keepdim=True)
            acc = correction * acc + torch.matmul(probs, v_block.float())
            row_max = new_max

        # Final normalisation, written back in the caller's layout.
        output[:, i:i + block_m] = (acc / row_sum).transpose(1, 2).to(q.dtype)

    return output
FlashAttention通过分块计算将内存复杂度从O(n²)降低到O(n)
块大小需要根据GPU架构和序列长度动态优化
| GPU架构 | 头维度 | 块大小M | 块大小N |
|---|---|---|---|
| Ampere | 64 | 128 | 64 |
| Ampere | 128 | 64 | 32 |
| Hopper | 64 | 128 | 64 |
| Hopper | 128 | 64 | 32 |
| Turing | 64 | 64 | 32 |
GPU内存访问是性能瓶颈,FlashAttention通过算法优化减少90%+的内存访问
// FlashAttention CUDA kernel (illustrative).
//
// blockIdx.x is used as the batch index and blockIdx.y as the head index.
// The kernel walks the sequence in steps of blockDim.x along two nested
// loops, and for every (row, col) step it hands a pointer into q to
// load_to_shared() and then calls compute_block().
//
// NOTE(review): load_to_shared() and compute_block() are not defined in this
// snippet; how many elements they read/write and whether they synchronise
// threads cannot be determined from here.
// NOTE(review): compute_block() is not given the loop indices, and k_shared
// is declared but never referenced in this body - confirm against the real
// implementation.
__global__ void flash_attention_kernel(
    float* q, float* k, float* v,
    float* out, float* softmax_lse,
    int batch_size, int seqlen, int nheads, int head_dim
) {
    const int batch_idx = blockIdx.x;
    const int head_idx = blockIdx.y;

    // Shared-memory staging buffers.
    __shared__ float q_shared[128];
    __shared__ float k_shared[128];

    // Start of this (batch, head) slice of q, using the same offset
    // arithmetic for every tile.
    float* head_base = q + batch_idx * seqlen * nheads * head_dim +
                       head_idx * seqlen * head_dim;

    // Blocked traversal of the sequence.
    for (int row = 0; row < seqlen; row += blockDim.x) {
        float* q_tile = head_base + row * head_dim;
        for (int col = 0; col < seqlen; col += blockDim.x) {
            // Stage the current q tile in shared memory.
            load_to_shared(q_tile, q_shared);
            // Process the current block.
            compute_block(q_shared, k, v, out, softmax_lse);
        }
    }
}
CUDA内核充分利用共享内存和线程级并行
FlashAttention完美支持因果注意力,适合语言模型训练
MQA允许多个查询头部共享相同的键值向量,显著提高效率
GQA在保持MQA优势的同时,解决了多GPU训练的通信问题
def flash_attention_mqa(q, kv, dropout_p=0.0, causal=False):
    """FlashAttention wrapper for multi-query / grouped-query attention.

    Args:
        q: Queries, shape (batch, seqlen, nheads, head_dim).
        kv: Packed keys and values, shape
            (batch, seqlen, 2, nheads_kv, head_dim); index 0 along dim 2 is
            K and index 1 is V.
        dropout_p: Forwarded to ``flash_attention_func``.
        causal: Forwarded to ``flash_attention_func``.

    Returns:
        The result of ``flash_attention_func`` reshaped to
        (batch, seqlen, nheads, head_dim).

    Raises:
        AssertionError: If ``nheads`` is not a multiple of ``nheads_kv``.
    """
    batch_size, seqlen_q, nheads_q, head_dim = q.shape
    nheads_kv = kv.shape[3]

    # Every KV head must serve the same number of query heads.
    assert nheads_q % nheads_kv == 0, (
        f"nheads ({nheads_q}) must be divisible by nheads_kv ({nheads_kv})"
    )
    group_factor = nheads_q // nheads_kv

    # Group the query heads by the KV head they share.
    # NOTE(review): this hands a 5-D query to flash_attention_func; confirm
    # that the helper accepts the grouped layout.
    q_reshaped = q.reshape(batch_size, seqlen_q, nheads_kv, group_factor, head_dim)

    # K and V live along dim 2 of the packed tensor. Indexing the last axis
    # (``kv[..., 0]``) would slice head_dim instead.
    k = kv[:, :, 0]
    v = kv[:, :, 1]

    output = flash_attention_func(q_reshaped, k, v,
                                  dropout_p=dropout_p, causal=causal)
    return output.reshape(batch_size, seqlen_q, nheads_q, head_dim)
MQA/GQA通过分组策略实现计算效率的提升
FlashAttention支持滑动窗口注意力,限制每个查询只能关注附近的位置
FlashAttention原生支持ALiBi位置编码,提供更好的位置感知能力
def get_alibi_slopes(nheads):
    """Return the per-head ALiBi slopes as a list of floats.

    For a power-of-two head count the slopes form a geometric sequence whose
    first term and ratio are both ``2 ** (-(2 ** -(log2(nheads) - 3)))``.
    For any other head count, the slopes of the largest power of two below
    ``nheads`` are used first, and the remainder is filled with every other
    slope of the next power of two.

    Args:
        nheads: Number of attention heads.

    Returns:
        A list with ``nheads`` slope values.
    """
    def _geometric_slopes(count):
        # First term and common ratio are the same value.
        base = 2 ** (-(2 ** -(math.log2(count) - 3)))
        return [base * base**step for step in range(count)]

    log_heads = math.log2(nheads)
    if log_heads.is_integer():
        return _geometric_slopes(nheads)

    # Not a power of two: combine the lower power of two with interleaved
    # slopes taken from the next power of two.
    lower_pow2 = 2 ** math.floor(log_heads)
    missing = nheads - lower_pow2
    primary = _geometric_slopes(lower_pow2)
    interleaved = get_alibi_slopes(2 * lower_pow2)[0::2]
    return primary + interleaved[:missing]
ALiBi使用几何级数递减的斜率来创建位置偏置
FlashAttention-2引入了页式KV缓存,大幅提升推理效率
def flash_attn_with_kvcache(
    q, k_cache, v_cache, k=None, v=None,
    cache_seqlens=None, block_table=None,
    causal=False, window_size=(-1, -1),
    rotary_cos=None, rotary_sin=None
):
    """FlashAttention with KV-cache support.

    Args:
        q: Queries, (batch, seqlen_q, nheads, head_dim).
        k_cache: Cached keys.
        v_cache: Cached values.
        k: Optional new keys to write into the cache.
        v: Optional new values to write into the cache.
        cache_seqlens: Current sequence lengths in the cache. The cache is
            only updated when this is provided together with ``k`` and ``v``.
        block_table: Block table for a paged cache.
            NOTE(review): accepted but not used by this function.
        causal: Forwarded to ``flash_attention_func``.
        window_size: Forwarded to ``flash_attention_func``.
        rotary_cos: Optional rotary cosine table. Rotary embedding is applied
            only when both ``rotary_cos`` and ``rotary_sin`` are given.
        rotary_sin: Optional rotary sine table.

    Returns:
        The output of ``flash_attention_func``.
    """
    # Incrementally update the KV cache with the new keys/values.
    if k is not None and v is not None:
        if cache_seqlens is not None:
            update_kv_cache(k_cache, v_cache, k, v, cache_seqlens)

    # Apply rotary position embedding when the tables are supplied. These are
    # explicit, optional parameters so that the check below cannot fail with
    # a NameError.
    if rotary_cos is not None and rotary_sin is not None:
        q, k_cache = apply_rotary_embedding(q, k_cache,
                                            rotary_cos, rotary_sin,
                                            cache_seqlens)

    # Run attention against the (possibly updated) cache.
    out = flash_attention_func(q, k_cache, v_cache,
                               causal=causal, window_size=window_size)
    return out
KV缓存管理是FlashAttention推理优化的关键
FlashAttention通过cu_seqlens(累积序列长度)支持处理变长序列,非常适合真实世界的应用场景
def flash_attention_varlen(
    q, k, v, cu_seqlens_q, cu_seqlens_k,
    max_seqlen_q, max_seqlen_k, dropout_p=0.0, causal=True
):
    """Run FlashAttention over a packed batch of variable-length sequences.

    Each sequence is sliced out of the packed tensors using the cumulative
    length arrays and attended independently; the per-sequence outputs are
    concatenated back along dim 0.

    Args:
        q: (total_tokens, nheads, head_dim)
        k: (total_tokens_k, nheads_k, head_dim)
        v: (total_tokens_k, nheads_k, head_dim)
        cu_seqlens_q: (batch_size + 1) cumulative query lengths.
        cu_seqlens_k: (batch_size + 1) cumulative key lengths.
        max_seqlen_q: Accepted for interface compatibility; not used here.
        max_seqlen_k: Accepted for interface compatibility; not used here.
        dropout_p: Forwarded to ``flash_attention_func``.
        causal: Forwarded to ``flash_attention_func``. Defaults to True,
            which is what this function always used.

    Returns:
        Tensor of shape (total_tokens, nheads, head_dim). For an empty batch
        an empty tensor with zero rows is returned.
    """
    batch_size = len(cu_seqlens_q) - 1
    if batch_size <= 0:
        # torch.cat rejects an empty list, so return an empty result directly.
        return q.new_empty((0,) + tuple(q.shape[1:]))

    outputs = []
    for i in range(batch_size):
        q_start, q_end = cu_seqlens_q[i], cu_seqlens_q[i + 1]
        k_start, k_end = cu_seqlens_k[i], cu_seqlens_k[i + 1]

        # Slice out the tokens belonging to sequence i.
        q_i = q[q_start:q_end]
        k_i = k[k_start:k_end]
        v_i = v[k_start:k_end]

        out_i = flash_attention_func(q_i, k_i, v_i,
                                     dropout_p=dropout_p, causal=causal)
        outputs.append(out_i)

    # Re-pack the per-sequence outputs.
    return torch.cat(outputs, dim=0)
变长序列处理是FlashAttention在实际应用中的重要特性
FlashAttention支持确定性后向传播,便于调试和复现结果
FlashAttention充分利用现代GPU的FP16/BF16能力,在保证精度的同时提升性能
FlashAttention-3开始支持FP8,为Hopper GPU提供极致性能
# FP8量化支持
from flash_attn import flash_attn_func
def flash_attention_fp8(q, k, v):
    """Run ``flash_attn_func`` on FP8 (e4m3) copies of the inputs.

    The three inputs are cast to ``torch.float8_e4m3fn``, attention is
    computed with ``softmax_scale = 1 / sqrt(q.shape[-1])``, and the result
    is cast to ``torch.float16``.

    NOTE(review): the cast is a plain ``.to()`` with no scaling factors;
    confirm that the installed ``flash_attn_func`` accepts FP8 tensors.

    Args:
        q, k, v: Input tensors; converted to FP8 inside this function.

    Returns:
        The attention output as a float16 tensor.
    """
    fp8_dtype = torch.float8_e4m3fn
    # Quantise all three operands to the same FP8 format.
    q_fp8, k_fp8, v_fp8 = (tensor.to(fp8_dtype) for tensor in (q, k, v))

    # Standard 1/sqrt(d) scaling, with d taken from the last dimension of q.
    scale = 1.0 / math.sqrt(q.shape[-1])
    result = flash_attn_func(q_fp8, k_fp8, v_fp8, softmax_scale=scale)

    # Hand the result back in FP16.
    return result.to(torch.float16)
FP8量化在保持精度的同时进一步减少内存占用
FlashAttention在各种序列长度下都表现出显著的性能优势
| 序列长度 | 标准注意力 | FlashAttention | 加速比 | 内存节省 |
|---|---|---|---|---|
| 512 | 125 | 45 | 2.8x | 75% |
| 1024 | 520 | 180 | 2.9x | 78% |
| 2048 | 2100 | 720 | 2.9x | 81% |
| 4096 | 8500 | 2900 | 2.9x | 84% |
| 8192 | 34000 | 11600 | 2.9x | 87% |
FlashAttention的内存节省效果在长序列处理时尤为显著
FlashAttention在多个层面进行了深度优化,远超标准PyTorch实现
FlashAttention和xformers是互补的技术,各有优势
FlashAttention可以轻松集成到PyTorch项目中
from flash_attn import flash_attn_func, flash_attn_qkvpacked_func
import torch
import torch.nn as nn
class FlashAttentionMHA(nn.Module):
    """Multi-head self-attention layer backed by ``flash_attn_func``.

    Args:
        embed_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Attention dropout probability, applied only in training mode.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.0):
        super().__init__()
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # Kept so that forward() can honour the requested dropout rate.
        self.dropout = dropout
        # Fused QKV projection and output projection.
        self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, causal=False):
        """Apply self-attention to ``x``.

        Args:
            x: Input of shape (batch, seqlen, embed_dim).
            causal: Whether to apply a causal mask.

        Returns:
            Tensor of shape (batch, seqlen, embed_dim).
        """
        batch_size, seqlen = x.shape[0], x.shape[1]

        # Project to Q, K and V and split the heads. flash_attn_func expects
        # (batch, seqlen, nheads, head_dim), so the sequence axis must stay
        # ahead of the head axis - no transpose to (batch, nheads, seqlen, ...).
        qkv = self.Wqkv(x).view(batch_size, seqlen, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(dim=2)

        # Dropout is only active while training.
        dropout_p = self.dropout if self.training else 0.0
        attn_output = flash_attn_func(q, k, v, dropout_p=dropout_p, causal=causal)

        # Merge heads back into the embedding dimension and project.
        output = attn_output.reshape(batch_size, seqlen, self.embed_dim)
        return self.out_proj(output)
FlashAttention可以轻松集成到PyTorch模块中
FlashAttention已经集成到HuggingFace模型中,可以轻松使用
from transformers import GPT2Config, GPT2Model
from flash_attn.modules.mha import MHA
# 配置使用FlashAttention
class GPT2WithFlashAttention(GPT2Model):
    """GPT-2 model whose per-block attention modules are replaced by ``MHA``.

    After the regular ``GPT2Model`` construction, every block in ``self.h``
    gets a new ``MHA`` attention layer configured from ``config`` with
    ``causal=True`` and ``use_flash_attn=True``.

    NOTE(review): the replacement layers are freshly constructed; whether
    pretrained attention weights map onto them is not visible here - verify.
    """

    def __init__(self, config):
        super().__init__(config)
        # Settings shared by every layer; only layer_idx differs per block.
        shared_kwargs = dict(
            embed_dim=config.n_embd,
            num_heads=config.n_head,
            dropout=config.attn_pdrop,
            causal=True,
            use_flash_attn=True,  # enable FlashAttention
        )
        # Swap out the standard attention layer of each transformer block.
        for layer_idx, transformer_block in enumerate(self.h):
            transformer_block.attn = MHA(layer_idx=layer_idx, **shared_kwargs)
# Usage example: load the GPT-2 configuration once and pass it on explicitly,
# so the configuration object that was loaded is the one the model is built with.
config = GPT2Config.from_pretrained('gpt2')
model = GPT2WithFlashAttention.from_pretrained('gpt2', config=config)
FlashAttention可以无缝集成到HuggingFace模型中
FlashAttention支持分布式训练,适合大规模模型训练
import torch.distributed as dist
from flash_attn.modules.mha import ParallelMHA
class DistributedFlashAttention(nn.Module):
    """Thin module wrapper around ``ParallelMHA`` with FlashAttention enabled.

    Args:
        embed_dim: Passed through to ``ParallelMHA``.
        num_heads: Passed through to ``ParallelMHA``.
        process_group: Process group passed through to ``ParallelMHA``.
    """

    def __init__(self, embed_dim, num_heads, process_group):
        super().__init__()
        # Parallel attention layer; FlashAttention and sequence parallelism
        # are always switched on.
        parallel_options = {
            "embed_dim": embed_dim,
            "num_heads": num_heads,
            "process_group": process_group,
            "use_flash_attn": True,
            "sequence_parallel": True,
        }
        self.mha = ParallelMHA(**parallel_options)

    def forward(self, x):
        """Run the wrapped parallel attention layer on ``x``."""
        attended = self.mha(x)
        return attended
并行FlashAttention支持大规模分布式训练
使用FlashAttention时需要调整训练策略以获得最佳性能
FlashAttention在推理时也需要特殊的优化策略
class FlashAttentionInference:
    """Autoregressive generation helper that carries inference parameters.

    Args:
        model: Callable invoked as ``model(input_ids, inference_params=...)``
            that returns an object with a ``logits`` attribute indexed as
            ``[:, -1, :]``.
        max_batch_size: Forwarded to ``InferenceParams``.
        max_seqlen: Forwarded to ``InferenceParams``.
    """

    def __init__(self, model, max_batch_size=32, max_seqlen=2048):
        self.model = model
        self.max_batch_size = max_batch_size
        self.max_seqlen = max_seqlen
        # Initialise the KV-cache bookkeeping object.
        self.inference_params = InferenceParams(
            max_batch_size=max_batch_size,
            max_seqlen=max_seqlen,
            cache={}
        )

    def sample(self, logits):
        """Pick the next token greedily.

        Args:
            logits: Tensor of shape (batch, vocab).

        Returns:
            Tensor of shape (batch, 1) holding the arg-max token ids, so it
            can be concatenated to the running sequence along dim 1.
        """
        return torch.argmax(logits, dim=-1, keepdim=True)

    def generate(self, input_ids, max_new_tokens=100):
        """Generate ``max_new_tokens`` tokens after ``input_ids``.

        NOTE(review): the whole running sequence is fed to the model on every
        step; confirm whether the model expects only the newest token once
        its cache is populated.

        Args:
            input_ids: Tensor of shape (batch, seqlen) with the prompt ids.
            max_new_tokens: Number of decoding steps.

        Returns:
            Tensor of shape (batch, seqlen + max_new_tokens).
        """
        generated = input_ids
        for _ in range(max_new_tokens):
            # Forward pass without building an autograd graph.
            with torch.no_grad():
                outputs = self.model(
                    generated,
                    inference_params=self.inference_params
                )
            # Choose the next token from the last position's logits.
            next_token = self.sample(outputs.logits[:, -1, :])
            generated = torch.cat([generated, next_token], dim=1)
        return generated
FlashAttention推理优化需要专门的缓存管理
FlashAttention使用过程中可能会遇到一些问题,需要了解如何调试
FlashAttention支持多种GPU架构,但性能有所差异
正确的安装配置是使用FlashAttention的基础
# Install FlashAttention
pip install flash-attn --no-build-isolation
# Check the installation
python -c "import flash_attn; print(flash_attn.__version__)"
# Verify CUDA support (each check runs through the Python interpreter;
# bare print(...) lines are not valid shell commands)
python -c "import torch; print(torch.cuda.is_available())"
python -c "import torch; print(f'CUDA版本: {torch.version.cuda}')"
python -c "import torch; print(f'设备数量: {torch.cuda.device_count()}')"
正确的安装和配置是使用FlashAttention的前提
性能测试可以帮助验证FlashAttention的实际效果
def benchmark_flash_attention(model, input_shapes, num_runs=10, head_dim=64):
    """Time ``flash_attn_func`` on random float16 CUDA inputs.

    Args:
        model: Accepted for interface compatibility; not used here.
        input_shapes: Iterable of (batch_size, seq_len, embed_dim) triples.
        num_runs: Number of timed invocations per shape.
        head_dim: Per-head feature size used to split ``embed_dim`` into
            heads, because ``flash_attn_func`` takes
            (batch, seqlen, nheads, head_dim) tensors.

    Returns:
        A list with one dict per shape containing ``batch_size``,
        ``seq_len``, ``embed_dim``, ``avg_time_ms``, ``memory_gb`` (peak
        allocated memory for that shape) and ``throughput`` (elements of one
        input tensor processed per second, divided by 1024**3).

    Raises:
        ValueError: If an ``embed_dim`` is not divisible by ``head_dim``.
    """
    import time

    results = []
    for batch_size, seq_len, embed_dim in input_shapes:
        if embed_dim % head_dim != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by head_dim ({head_dim})"
            )
        # Build 4-D inputs in the layout flash_attn_func expects.
        shape = (batch_size, seq_len, embed_dim // head_dim, head_dim)
        q = torch.randn(shape, device='cuda', dtype=torch.float16)
        k = torch.randn(shape, device='cuda', dtype=torch.float16)
        v = torch.randn(shape, device='cuda', dtype=torch.float16)

        # Warm-up run so one-off initialisation is not timed.
        with torch.no_grad():
            _ = flash_attn_func(q, k, v)

        # Start from a clean slate: finish pending work and reset the peak
        # counter so the reported memory belongs to this shape only.
        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()

        start_time = time.perf_counter()
        with torch.no_grad():
            for _ in range(num_runs):
                _ = flash_attn_func(q, k, v)
        # Kernels launch asynchronously; wait before stopping the clock.
        torch.cuda.synchronize()
        end_time = time.perf_counter()

        avg_time = (end_time - start_time) / num_runs
        memory_used = torch.cuda.max_memory_allocated() / 1024**3
        results.append({
            'batch_size': batch_size,
            'seq_len': seq_len,
            'embed_dim': embed_dim,
            'avg_time_ms': avg_time * 1000,
            'memory_gb': memory_used,
            'throughput': batch_size * seq_len * embed_dim / avg_time / 1024**3
        })
    return results
性能测试可以帮助验证FlashAttention的实际效果
FlashAttention在各种AI场景中都有重要应用
FlashAttention已成为现代语言模型的标准组件
FlashAttention也可以用于计算机视觉任务
FlashAttention支持复杂的多模态模型
FlashAttention仍在持续发展中,未来会有更多优化
FlashAttention的性能还有进一步优化的空间
FlashAttention是AI计算优化的重要里程碑
感谢阅读!
访问 https://atcfu.com/ai-articles/flash-attention/ 回顾本文