PyTorch Model Evaluation and Debugging
Training and tuning deep learning models calls for a systematic approach to evaluation and debugging. This section covers how to analyze loss curves, identify overfitting and underfitting, evaluate classifiers with confusion matrices, and apply common debugging techniques.
1. Loss Analysis
The loss function is the core metric during training: it measures the gap between the model's predictions and the ground truth. Reading the loss curve correctly is the key to debugging a model.
1.1 Comparing Training and Validation Loss
Comparing the training loss against the validation loss tells you what state the model is in (a minimal heuristic applying these rules is sketched after the list):
- Training loss falls, validation loss falls: normal; the model is learning
- Training loss falls, validation loss plateaus or rises: overfitting; add regularization
- Training loss does not fall: learning-rate or architecture problem
- Training loss oscillates: learning rate too high or batch size too small
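As a rough illustration, the sketch below encodes the first three rules as a small diagnostic function (the oscillation case needs a variance check and is omitted for brevity). The `window` and `flat_tol` values are arbitrary assumptions chosen for the example, not established thresholds.
import numpy as np

def diagnose_loss_curves(train_loss, val_loss, window=5, flat_tol=1e-3):
    """Apply the rules above to the trend over the last `window` epochs."""
    train_trend = np.mean(np.diff(train_loss[-window:]))  # average change per epoch
    val_trend = np.mean(np.diff(val_loss[-window:]))
    if train_trend > -flat_tol:
        return "Training loss is not falling: check the learning rate or the architecture"
    if val_trend > flat_tol:
        return "Training loss falls but validation loss rises: likely overfitting"
    return "Both losses are falling: training looks healthy"

# Example: a run that starts overfitting after epoch 15
train = np.linspace(1.5, 0.1, 30)
val = np.concatenate([np.linspace(1.5, 0.3, 15), np.linspace(0.3, 0.8, 15)])
print(diagnose_loss_curves(train, val))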
1.2 Visualizing the Loss Curve
Example
import matplotlib.pyplot as plt
import numpy as np

def plot_training_history(history):
    """
    Plot the training history.
    history: a dict containing 'train_loss', 'val_loss', 'train_acc', 'val_acc'
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    # Loss curves
    axes[0].plot(history['train_loss'], label='Train Loss', alpha=0.8)
    axes[0].plot(history['val_loss'], label='Val Loss', alpha=0.8)
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Loss')
    axes[0].set_title('Training and Validation Loss')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)
    # Accuracy curves
    axes[1].plot(history['train_acc'], label='Train Acc', alpha=0.8)
    axes[1].plot(history['val_acc'], label='Val Acc', alpha=0.8)
    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Accuracy')
    axes[1].set_title('Training and Validation Accuracy')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# Simulated training history
history = {
    'train_loss': np.linspace(2.0, 0.2, 50) + np.random.normal(0, 0.05, 50),
    'val_loss': np.linspace(2.0, 0.3, 50) + np.random.normal(0, 0.08, 50),
    'train_acc': np.linspace(0.3, 0.95, 50),
    'val_acc': np.linspace(0.3, 0.88, 50),
}
plot_training_history(history)
1.3 Identifying Overfitting
Overfitting begins when the validation loss starts to rise while the training loss keeps falling: the model is starting to memorize the training data instead of learning general patterns.
Example
import numpy as np

def detect_overfitting(history, patience=5):
    """
    Detect overfitting from the validation loss curve.
    patience: flag overfitting after this many consecutive epochs without improvement
    """
    val_loss = history['val_loss']
    best_loss = float('inf')
    best_epoch = 0
    early_stop_epoch = None
    epochs_without_improvement = 0
    for epoch, current_loss in enumerate(val_loss):
        if current_loss < best_loss:
            best_loss = current_loss
            best_epoch = epoch
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            # Validation loss has not improved for `patience` epochs in a row
            if epochs_without_improvement >= patience:
                early_stop_epoch = epoch
                break
    if early_stop_epoch is not None:
        print(f"Overfitting detected! Best epoch: {best_epoch}, training should stop at epoch {early_stop_epoch}")
    else:
        print("No overfitting detected")
    return early_stop_epoch, best_epoch

# Overfitting example
overfitting_history = {
    'train_loss': np.linspace(1.5, 0.1, 30),
    'val_loss': np.concatenate([np.linspace(1.5, 0.3, 15), np.linspace(0.3, 0.8, 15)]),
}
detect_overfitting(overfitting_history)
2. Handling Overfitting and Underfitting
2.1 Remedies for Overfitting
Several of these techniques are shown in code after the table.
| Method | Description | Implementation |
|---|---|---|
| More training data | Collect more data | data augmentation |
| Dropout | Randomly drop neurons | nn.Dropout(0.5) |
| Weight decay | L2 regularization | weight_decay=1e-4 |
| Early stopping | Stop when validation loss stops improving | patience parameter |
| Reduce model complexity | Fewer layers or neurons | lower hidden_size |
| Data augmentation | Random transforms to diversify the data | rotation, flipping, cropping |
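As a minimal sketch of how these remedies look in code, assuming torchvision is available; the layer sizes and hyperparameter values are illustrative assumptions, not recommendations:
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms

# Dropout and a reduced hidden size built into the model
model = nn.Sequential(
    nn.Linear(784, 128),   # smaller hidden_size => lower capacity
    nn.ReLU(),
    nn.Dropout(0.5),       # randomly zero 50% of activations during training
    nn.Linear(128, 10),
)

# Weight decay (L2 regularization) applied through the optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

# Data augmentation: random transforms applied to each training image
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.RandomCrop(28, padding=4),
    transforms.ToTensor(),
])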
2.2 Remedies for Underfitting
| Method | Description |
|---|---|
| Increase model capacity | Add layers or neurons |
| Train longer | Increase the number of epochs |
| Weaken regularization | Lower Dropout or weight_decay |
| Tune the learning rate | Try a larger learning rate |
2.3 Implementing Early Stopping
Example
import torch
import torch.nn as nn
import torch.optim as optim

class EarlyStopping:
    """
    Early stopping: stop training when the validation loss stops improving.
    """
    def __init__(self, patience=7, min_delta=0, mode='min'):
        """
        patience: stop after this many epochs without improvement
        min_delta: minimum change that counts as an improvement
        mode: 'min' or 'max' -- whether smaller or larger metric values are better
        """
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, val_loss):
        score = -val_loss if self.mode == 'min' else val_loss
        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0
        return self.early_stop

# Training with early stopping
def train_with_early_stopping(model, train_loader, val_loader, patience=7):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    early_stopping = EarlyStopping(patience=patience, mode='min')
    best_val_loss = float('inf')
    best_model_state = None
    for epoch in range(100):
        # Train
        model.train()
        train_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        # Validate
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
        val_loss /= len(val_loader)
        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
        # Save the best model before the early-stop check
        # (clone the tensors: dict.copy() alone would alias the live parameters)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        # Early-stopping check
        if early_stopping(val_loss):
            print(f"Early stopping triggered: validation loss did not improve for {patience} epochs")
            break
    # Restore the best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model
3. Confusion Matrix
The confusion matrix is an essential tool for evaluating classifiers: it shows how the model's predictions relate to the true labels.
3.1 Computing the Confusion Matrix
Example
import torch
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def calculate_confusion_matrix(model, dataloader, device, num_classes):
    """
    Compute the confusion matrix over a dataloader.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())
    # Pass labels explicitly so every class appears even if absent from the data
    cm = confusion_matrix(all_labels, all_preds, labels=range(num_classes))
    return cm

def plot_confusion_matrix(cm, class_names):
    """
    Plot the confusion matrix as a heatmap.
    """
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.show()

# Usage example: assume 10 classes
class_names = [f'Class {i}' for i in range(10)]
cm = np.array([
    [45, 1, 0, 0, 0, 0, 2, 1, 1, 0],
    [0, 42, 2, 0, 0, 0, 0, 3, 2, 1],
    [1, 1, 38, 3, 0, 0, 0, 2, 3, 2],
    [0, 0, 1, 41, 2, 1, 0, 1, 1, 3],
    [0, 0, 0, 1, 44, 0, 1, 0, 2, 2],
    [1, 0, 0, 0, 0, 43, 2, 1, 1, 2],
    [2, 1, 0, 0, 1, 1, 39, 2, 2, 2],
    [0, 2, 1, 1, 0, 0, 1, 40, 3, 2],
    [1, 2, 2, 0, 1, 1, 1, 2, 36, 4],
    [0, 1, 1, 2, 2, 2, 2, 1, 2, 37],
])
plot_confusion_matrix(cm, class_names)
3.2 Classification Metrics
A confusion matrix yields a range of evaluation metrics:
Example
import numpy as np

def classification_metrics(cm):
    """
    Compute common evaluation metrics from a confusion matrix.
    """
    # Accuracy
    accuracy = np.trace(cm) / np.sum(cm)
    # Per-class precision, recall, and F1
    num_classes = cm.shape[0]
    precisions = []
    recalls = []
    f1s = []
    for i in range(num_classes):
        tp = cm[i, i]
        fp = np.sum(cm[:, i]) - tp
        fn = np.sum(cm[i, :]) - tp
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
    # Macro average: unweighted mean over classes
    macro_precision = np.mean(precisions)
    macro_recall = np.mean(recalls)
    macro_f1 = np.mean(f1s)
    # Weighted average: weighted by class support
    class_counts = np.sum(cm, axis=1)
    weights = class_counts / np.sum(class_counts)
    weighted_precision = np.average(precisions, weights=weights)
    weighted_recall = np.average(recalls, weights=weights)
    weighted_f1 = np.average(f1s, weights=weights)
    return {
        'accuracy': accuracy,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1': macro_f1,
        'weighted_precision': weighted_precision,
        'weighted_recall': weighted_recall,
        'weighted_f1': weighted_f1,
    }

# Compute the metrics (reusing `cm` from the previous example)
metrics = classification_metrics(cm)
print("=" * 50)
print("Classification Metrics")
print("=" * 50)
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"Macro Precision: {metrics['macro_precision']:.4f}")
print(f"Macro Recall: {metrics['macro_recall']:.4f}")
print(f"Macro F1: {metrics['macro_f1']:.4f}")
print(f"Weighted Precision: {metrics['weighted_precision']:.4f}")
print(f"Weighted Recall: {metrics['weighted_recall']:.4f}")
print(f"Weighted F1: {metrics['weighted_f1']:.4f}")
3.3 Classification Report
Example
from sklearn.metrics import classification_report

# sklearn's built-in classification report
y_true = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 10  # simulated true labels
y_pred = [0, 1, 2, 3, 4, 5, 6, 7, 8, 8] * 10  # simulated predictions
report = classification_report(y_true, y_pred, digits=4)
print(report)
4. Learning-Rate Scheduling
The learning rate is one of the most important hyperparameters when training deep learning models. A well-chosen schedule can significantly improve training.
4.1 Scheduling Strategies
| Strategy | Description | When to use |
|---|---|---|
| StepLR | Decay at a fixed interval | a good base learning rate is known |
| MultiStepLR | Decay at specified epochs | non-uniform decay |
| CosineAnnealingLR | Cosine annealing | smooth convergence |
| ReduceLROnPlateau | Decay when validation loss plateaus | automatic tuning |
| Warmup | Ramp up, then decay | stabilizing early training |
4.2 Implementing the Schedulers
Example
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

# Number of simulated training epochs
epochs = 50

def make_optimizer(lr=0.1):
    """A throwaway SGD optimizer on a dummy layer, used only to plot LR curves."""
    return optim.SGD(nn.Linear(10, 10).parameters(), lr=lr)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. StepLR: halve the learning rate every 10 epochs
optimizer = make_optimizer()
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
lr_history = []
for _ in range(epochs):
    lr_history.append(optimizer.param_groups[0]['lr'])
    scheduler.step()
axes[0, 0].plot(lr_history)
axes[0, 0].set_title('Step LR')

# 2. MultiStepLR: decay at the specified milestone epochs
optimizer = make_optimizer()
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[15, 30, 45], gamma=0.1)
lr_history = []
for _ in range(epochs):
    lr_history.append(optimizer.param_groups[0]['lr'])
    scheduler.step()
axes[0, 1].plot(lr_history)
axes[0, 1].set_title('MultiStep LR')

# 3. Cosine annealing
optimizer = make_optimizer()
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
lr_history = []
for _ in range(epochs):
    lr_history.append(optimizer.param_groups[0]['lr'])
    scheduler.step()
axes[1, 0].plot(lr_history)
axes[1, 0].set_title('Cosine Annealing LR')

# 4. Warmup + cosine, implemented as a LambdaLR multiplier on the base LR
def warmup_cosine(optimizer, warmup_epochs, total_epochs, min_lr=1e-6):
    def lr_lambda(epoch):
        if epoch < warmup_epochs:
            return epoch / warmup_epochs
        progress = (epoch - warmup_epochs) / (total_epochs - warmup_epochs)
        return min_lr + 0.5 * (1 - min_lr) * (1 + np.cos(np.pi * progress))
    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

optimizer = make_optimizer()
scheduler = warmup_cosine(optimizer, warmup_epochs=5, total_epochs=50)
lr_history = []
for _ in range(epochs):
    lr_history.append(optimizer.param_groups[0]['lr'])
    scheduler.step()
axes[1, 1].plot(lr_history)
axes[1, 1].set_title('Warmup + Cosine')

# ReduceLROnPlateau steps on a metric rather than the epoch count,
# so it is not plotted here; see the sketch after this example.

for ax in axes.flat:
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Learning Rate')
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
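ReduceLROnPlateau, listed in the table above, is driven by a metric rather than the epoch counter, so it does not fit the epoch-indexed plots. A minimal usage sketch with simulated validation losses (the loss values are fabricated for illustration):
import numpy as np
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 10)
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)

# Simulated validation losses that plateau after epoch 10
val_losses = np.concatenate([np.linspace(1.0, 0.3, 10), np.full(40, 0.3)])
for epoch, val_loss in enumerate(val_losses):
    scheduler.step(val_loss)  # pass the metric, not the epoch count
    print(f"epoch {epoch}: lr={optimizer.param_groups[0]['lr']:.4f}")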
5. Diagnosing Gradient Problems
5.1 Vanishing and Exploding Gradients
Example
import torch
import torch.nn as nn

def analyze_gradients(model):
    """
    Collect gradient statistics for every layer.
    """
    grad_stats = {}
    for name, param in model.named_parameters():
        if param.grad is not None:
            grad = param.grad
            grad_stats[name] = {
                'mean': grad.mean().item(),
                'std': grad.std().item(),
                'max': grad.abs().max().item(),
                'min': grad.abs().min().item(),
                'norm': grad.norm().item(),
            }
    return grad_stats

def detect_gradient_issues(model):
    """
    Detect vanishing or exploding gradients.
    """
    issues = []
    grad_norms = []
    for param in model.parameters():
        if param.grad is not None:
            grad_norms.append(param.grad.norm().item())
    avg_grad_norm = sum(grad_norms) / len(grad_norms) if grad_norms else 0
    if avg_grad_norm < 1e-7:
        issues.append("Vanishing gradients: gradients are too small, the model may not learn")
    elif avg_grad_norm > 100:
        issues.append("Exploding gradients: gradients are too large, training is unstable")
    return issues, avg_grad_norm

# Example: check the gradients
model = nn.Sequential(
    nn.Linear(100, 50),
    nn.ReLU(),
    nn.Linear(50, 50),
    nn.ReLU(),
    nn.Linear(50, 10)
)

# Simulate a forward and backward pass
x = torch.randn(32, 100)
y = model(x)
loss = y.sum()
loss.backward()

issues, avg_norm = detect_gradient_issues(model)
print(f"Average gradient norm: {avg_norm:.6f}")
if issues:
    for issue in issues:
        print(f"Warning: {issue}")
else:
    print("Gradients look healthy")
5.2 Gradient Clipping
Example
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils as utils

# Gradient-clipping example
def train_with_gradient_clipping(model, dataloader, max_norm=1.0):
    """
    Train with gradient-norm clipping.
    max_norm: gradients whose total norm exceeds this value are rescaled to it
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    model.train()
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        # Clip gradients to prevent explosion
        utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
        optimizer.step()
    return model

# Element-wise clipping (more conservative); PyTorch also provides
# the equivalent built-in utils.clip_grad_value_
def clip_grad_by_value(model, clip_value=1.0):
    """
    Clip each gradient element to [-clip_value, clip_value].
    """
    for param in model.parameters():
        if param.grad is not None:
            param.grad.clamp_(min=-clip_value, max=clip_value)
6. Analyzing Model Performance
6.1 Counting Parameters and FLOPs
Example
import torch
import torch.nn as nn

def count_parameters(model):
    """Count total and trainable parameters."""
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable

def count_flops(model, input_size=(1, 3, 224, 224)):
    """Count FLOPs (requires the thop package)."""
    from thop import profile  # imported here so the ImportError below can catch it
    input_tensor = torch.randn(input_size)
    flops, params = profile(model, inputs=(input_tensor,), verbose=False)
    return flops, params

# Example: profile a model
model = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Conv2d(64, 128, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Conv2d(128, 256, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(256, 10)
)

total, trainable = count_parameters(model)
print(f"Total parameters: {total:,}")
print(f"Trainable parameters: {trainable:,}")
print(f"Model size: {total * 4 / 1024 / 1024:.2f} MB")  # assumes float32 (4 bytes/param)

# FLOPs
try:
    flops, _ = count_flops(model)
    print(f"FLOPs: {flops / 1e9:.2f} G")
except ImportError:
    print("Please install thop: pip install thop")
6.2 Measuring Inference Speed
Example
import torch
import torch.nn as nn
import time

def measure_inference_speed(model, input_size, device='cuda', num_iterations=100):
    """
    Measure the average inference time per forward pass.
    """
    model.eval()
    model = model.to(device)
    dummy_input = torch.randn(input_size).to(device)
    # Warm-up
    with torch.no_grad():
        for _ in range(10):
            _ = model(dummy_input)
    if device == 'cuda':
        torch.cuda.synchronize()  # make sure warm-up kernels have finished
    # Timed runs
    start = time.time()
    with torch.no_grad():
        for _ in range(num_iterations):
            _ = model(dummy_input)
    if device == 'cuda':
        torch.cuda.synchronize()  # wait for all kernels before stopping the clock
    end = time.time()
    avg_time = (end - start) / num_iterations * 1000  # ms
    return avg_time

# Usage example
model = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(64, 64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(64, 10)
)

# CPU inference speed
cpu_time = measure_inference_speed(model, (1, 3, 224, 224), device='cpu')
print(f"CPU inference time: {cpu_time:.2f} ms/image")

# GPU inference speed (if CUDA is available)
if torch.cuda.is_available():
    gpu_time = measure_inference_speed(model, (1, 3, 224, 224), device='cuda')
    print(f"GPU inference time: {gpu_time:.2f} ms/image")
6.3 Analyzing GPU Memory Usage
Example
import torch
import torch.nn as nn

def analyze_memory(model, input_size):
    """
    Analyze the model's GPU memory usage for one forward pass.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # Create the input
    x = torch.randn(input_size).to(device)
    # Clear the cache and reset peak statistics
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
    # Forward pass
    with torch.no_grad():
        output = model(x)
    # Read memory statistics
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**2  # MB
        reserved = torch.cuda.memory_reserved() / 1024**2
        max_allocated = torch.cuda.max_memory_allocated() / 1024**2
        print(f"Currently allocated: {allocated:.2f} MB")
        print(f"Reserved by the caching allocator: {reserved:.2f} MB")
        print(f"Peak allocated: {max_allocated:.2f} MB")
    else:
        print("CUDA support required")
    return output

# Usage example
model = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(64, 128, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(128, 256, kernel_size=3, padding=1),
    nn.ReLU(),
)
analyze_memory(model, (8, 3, 224, 224))
7. Common Problems and Debugging Tips
7.1 Training Problem Cheat Sheet
A gradient-accumulation sketch for the out-of-memory case follows the table.
| Symptom | Likely cause | Fix |
|---|---|---|
| Loss not decreasing | LR too small/large, gradient problems | tune the LR, inspect gradients |
| Loss oscillating | LR too large, batch size too small | lower the LR, increase the batch |
| NaN appears | division by zero, log(0), exploding gradients | add an epsilon, clip gradients |
| Overfitting | model too complex, too little data | more data, add regularization |
| Underfitting | model too simple, undertrained | larger model, train longer |
| Out of GPU memory | batch size or model too large | smaller batch, gradient accumulation |
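Gradient accumulation simulates a large batch by summing gradients over several small batches before each optimizer step. A minimal sketch, assuming a standard classification loop; `accum_steps=4` is an illustrative value:
import torch.nn as nn
import torch.optim as optim

def train_with_accumulation(model, dataloader, accum_steps=4):
    """Effective batch size = dataloader batch size * accum_steps."""
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    model.train()
    optimizer.zero_grad()
    for step, (inputs, labels) in enumerate(dataloader):
        outputs = model(inputs)
        # Scale the loss so accumulated gradients average over accum_steps batches
        loss = criterion(outputs, labels) / accum_steps
        loss.backward()  # gradients accumulate in param.grad
        if (step + 1) % accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
    return model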
7.2 A Collection of Debugging Tricks
Example
import torch
import torch.nn as nn
import random
import numpy as np

def set_seed(seed=42):
    """Set all random seeds for reproducible results."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def debug_forward(model, x):
    """Debug the forward pass: inspect every layer's output."""
    print("=" * 50)
    print("Forward-pass debugging")
    print("=" * 50)
    for name, layer in model.named_children():
        x = layer(x)
        print(f"{name}: shape={tuple(x.shape)}, mean={x.mean().item():.4f}, std={x.std().item():.4f}")
        if torch.isnan(x).any():
            print("  ⚠️ NaN detected!")
        if torch.isinf(x).any():
            print("  ⚠️ Inf detected!")
    print("=" * 50)

def debug_backward(model):
    """Debug the backward pass: inspect every parameter's gradient."""
    print("=" * 50)
    print("Backward-pass debugging")
    print("=" * 50)
    for name, param in model.named_parameters():
        if param.grad is not None:
            grad_norm = param.grad.norm().item()
            if torch.isnan(param.grad).any():
                print(f"{name}: ⚠️ NaN gradient!")
            elif torch.isinf(param.grad).any():
                print(f"{name}: ⚠️ Inf gradient!")
            elif grad_norm > 10:
                print(f"{name}: ⚠️ exploding gradient! norm={grad_norm:.2f}")
            elif grad_norm < 1e-7:
                print(f"{name}: ⚠️ vanishing gradient! norm={grad_norm:.2e}")
            else:
                print(f"{name}: OK, norm={grad_norm:.4f}")
    print("=" * 50)

# Usage example
set_seed(42)
model = nn.Sequential(
    nn.Linear(10, 20),
    nn.ReLU(),
    nn.Linear(20, 20),
    nn.ReLU(),
    nn.Linear(20, 5)
)
x = torch.randn(2, 10)
debug_forward(model, x)
y = model(x)
loss = y.sum()
loss.backward()
debug_backward(model)
8. Using the PyTorch Profiler
The PyTorch Profiler is the official performance-analysis tool; it breaks down the time and memory consumed by each part of a model.
8.1 Basic Profiler Usage
Example
import torch
import torch.nn as nn
from torch.profiler import profile, ProfilerActivity, schedule

# A simple model (this example assumes a CUDA GPU)
model = nn.Sequential(
    nn.Conv2d(3, 64, 3, padding=1),
    nn.ReLU(),
    nn.Conv2d(64, 64, 3, padding=1),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(64, 10)
).cuda()

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Profile a few training steps
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(wait=1, warmup=1, active=3, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'),
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    for step in range(5):
        inputs = torch.randn(8, 3, 224, 224).cuda()
        labels = torch.randint(0, 10, (8,)).cuda()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        prof.step()  # advance the profiler schedule

# Print a summary
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
8.2 Visualizing Profiler Results
Example
# The traces written to ./logs by tensorboard_trace_handler are viewed with
# TensorBoard (pip install torch-tb-profiler, then: tensorboard --logdir ./logs).
# For chrome://tracing, export a Chrome trace instead:
#   prof.export_chrome_trace("trace.json")
# Or print the aggregated tables directly:
print("=" * 80)
print("Top 10 by CPU time:")
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
print("\n" + "=" * 80)
print("Top 10 by CUDA time:")
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
print("\n" + "=" * 80)
print("Top 10 by GPU memory usage:")
print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))