Performance Metrics Explained
Choosing the right evaluation metrics is crucial for properly assessing model performance. Different problem types require different evaluation metrics. This chapter explains in detail the meaning, calculation methods, and use cases of various performance metrics.
Classification Problem Evaluation Metrics
1. Basic Metrics: Accuracy, Precision, Recall, F1 Score
python
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report
)
import matplotlib.pyplot as plt
import seaborn as sns
# Create sample data
# Synthetic 3-class problem: 20 features, of which 10 are informative and
# 10 are redundant linear combinations.
X, y = make_classification(
n_samples=1000, n_features=20, n_informative=10,
n_redundant=10, n_classes=3, random_state=42
)
# Hold out 20% of the samples for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Hard class predictions (for accuracy/precision/recall) and per-class
# probabilities (for ROC/PR curves later in the chapter).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)
def calculate_basic_metrics(y_true, y_pred):
    """Compute and print accuracy plus macro/weighted precision, recall and F1.

    Returns all seven scores in a flat dict keyed by metric name.
    """
    # Overall fraction of correct predictions.
    accuracy = accuracy_score(y_true, y_pred)

    # Macro average: every class contributes equally, regardless of its size.
    precision_macro, recall_macro, f1_macro = (
        precision_score(y_true, y_pred, average='macro'),
        recall_score(y_true, y_pred, average='macro'),
        f1_score(y_true, y_pred, average='macro'),
    )

    # Weighted average: each class contributes in proportion to its support.
    precision_weighted, recall_weighted, f1_weighted = (
        precision_score(y_true, y_pred, average='weighted'),
        recall_score(y_true, y_pred, average='weighted'),
        f1_score(y_true, y_pred, average='weighted'),
    )

    print("=== Basic Classification Metrics ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"\nMacro Average:")
    print(f" Precision: {precision_macro:.4f}")
    print(f" Recall: {recall_macro:.4f}")
    print(f" F1 Score: {f1_macro:.4f}")
    print(f"\nWeighted Average:")
    print(f" Precision: {precision_weighted:.4f}")
    print(f" Recall: {recall_weighted:.4f}")
    print(f" F1 Score: {f1_weighted:.4f}")

    return dict(
        accuracy=accuracy,
        precision_macro=precision_macro,
        recall_macro=recall_macro,
        f1_macro=f1_macro,
        precision_weighted=precision_weighted,
        recall_weighted=recall_weighted,
        f1_weighted=f1_weighted,
    )
# Calculate basic metrics
basic_metrics = calculate_basic_metrics(y_test, y_pred)

2. Confusion Matrix
python
def plot_confusion_matrix(y_true, y_pred, class_names=None):
    """Render the confusion matrix as a heatmap and print per-class metrics.

    Returns the raw confusion-matrix array.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Fall back to generic labels when none are supplied.
    if class_names is None:
        class_names = [f'Class {i}' for i in range(len(np.unique(y_true)))]

    # Heatmap: rows are true labels, columns are predicted labels.
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

    # Per-class precision/recall/F1/support from the classification report.
    print("\n=== Detailed Metrics by Class ===")
    report = classification_report(y_true, y_pred, target_names=class_names, output_dict=True)
    for class_name in class_names:
        per_class = report[class_name]
        print(f"{class_name}:")
        print(f" Precision: {per_class['precision']:.4f}")
        print(f" Recall: {per_class['recall']:.4f}")
        print(f" F1 Score: {per_class['f1-score']:.4f}")
        print(f" Support: {per_class['support']}")

    return cm
# Plot confusion matrix
cm = plot_confusion_matrix(y_test, y_pred, ['Class A', 'Class B', 'Class C'])

3. ROC Curve and AUC
python
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
from itertools import cycle
def plot_roc_curves(y_true, y_pred_proba, class_names=None):
    """Plot one-vs-rest ROC curves (per class plus micro-average) and print AUCs.

    Parameters:
        y_true: integer class labels, shape (n_samples,)
        y_pred_proba: per-class probabilities, shape (n_samples, n_classes)
        class_names: optional display names, one per class

    Returns the dict of AUC values keyed by class index plus "micro".
    """
    n_classes = y_pred_proba.shape[1]
    if class_names is None:
        class_names = [f'Class {i}' for i in range(n_classes)]
    # Binarize labels: one-hot columns give each class its own binary problem.
    y_true_bin = label_binarize(y_true, classes=range(n_classes))
    # Calculate ROC curve for each class (one-vs-rest)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Micro-average: pool every (sample, class) decision into one curve.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true_bin.ravel(), y_pred_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # Plot ROC curves
    plt.figure(figsize=(10, 8))
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red', 'green'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label=f'{class_names[i]} (AUC = {roc_auc[i]:.2f})')
    plt.plot(fpr["micro"], tpr["micro"], color='deeppink', linestyle=':', linewidth=4,
             label=f'Micro-average (AUC = {roc_auc["micro"]:.2f})')
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()
    # Print AUC scores
    print("=== AUC Scores ===")
    for i, class_name in enumerate(class_names):
        print(f"{class_name}: {roc_auc[i]:.4f}")
    print(f"Micro-average: {roc_auc['micro']:.4f}")
    # Multi-class AUC (one-vs-rest). roc_auc_score raises ValueError for
    # unsuitable inputs (e.g. a class absent from y_true); catch only that.
    # The previous bare `except:` also swallowed KeyboardInterrupt/SystemExit
    # and would hide genuine bugs.
    try:
        macro_auc = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro')
        weighted_auc = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='weighted')
        print(f"Macro-average AUC: {macro_auc:.4f}")
        print(f"Weighted-average AUC: {weighted_auc:.4f}")
    except ValueError:
        print("Unable to calculate multi-class AUC")
    return roc_auc
# Plot ROC curves
roc_results = plot_roc_curves(y_test, y_pred_proba, ['Class A', 'Class B', 'Class C'])

4. Precision-Recall Curve
python
from sklearn.metrics import precision_recall_curve, average_precision_score
def plot_precision_recall_curves(y_true, y_pred_proba, class_names=None):
    """Plot per-class and micro-averaged precision-recall curves.

    Returns the average-precision scores keyed by class index plus "micro".
    """
    n_classes = y_pred_proba.shape[1]
    if class_names is None:
        class_names = [f'Class {i}' for i in range(n_classes)]

    # One-hot encode the labels so each class has its own binary PR problem.
    y_true_bin = label_binarize(y_true, classes=range(n_classes))

    precision, recall, average_precision = {}, {}, {}
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_true_bin[:, i], y_pred_proba[:, i])
        average_precision[i] = average_precision_score(y_true_bin[:, i], y_pred_proba[:, i])

    # Micro-average: pool every (sample, class) decision into one curve.
    precision["micro"], recall["micro"], _ = precision_recall_curve(
        y_true_bin.ravel(), y_pred_proba.ravel()
    )
    average_precision["micro"] = average_precision_score(y_true_bin, y_pred_proba, average="micro")

    plt.figure(figsize=(10, 8))
    colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(recall[i], precision[i], color=color, lw=2,
                 label=f'{class_names[i]} (AP = {average_precision[i]:.2f})')
    plt.plot(recall["micro"], precision["micro"], color='gold', linestyle=':', linewidth=4,
             label=f'Micro-average (AP = {average_precision["micro"]:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.grid(True)
    plt.show()

    print("=== Average Precision ===")
    for i, class_name in enumerate(class_names):
        print(f"{class_name}: {average_precision[i]:.4f}")
    print(f"Micro-average: {average_precision['micro']:.4f}")

    return average_precision
# Plot PR curves
pr_results = plot_precision_recall_curves(y_test, y_pred_proba, ['Class A', 'Class B', 'Class C'])

Regression Problem Evaluation Metrics
python
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
mean_squared_error, mean_absolute_error, r2_score,
mean_absolute_percentage_error, explained_variance_score
)
# Create regression data
# Synthetic linear regression problem with Gaussian noise (sigma = 10).
X_reg, y_reg = make_regression(
n_samples=1000, n_features=10, noise=10, random_state=42
)
# Same 80/20 split and seed as the classification example above.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
X_reg, y_reg, test_size=0.2, random_state=42
)
# Train regression model
reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
reg_model.fit(X_train_reg, y_train_reg)
y_pred_reg = reg_model.predict(X_test_reg)
def calculate_regression_metrics(y_true, y_pred):
    """Compute and print standard regression metrics plus a residual summary.

    Returns a dict with mse/rmse/mae/r2/mape (in percent)/explained_variance
    and the raw residuals array.
    """
    # Basic metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    # MAPE, expressed in percent. sklearn's helper returns a *fraction*, so
    # scale by 100 to match the manual fallback and the "%" in the printout;
    # previously the two paths disagreed by a factor of 100. The fallback
    # covers environments whose sklearn lacks the helper; the original bare
    # `except:` is narrowed to `except Exception` so Ctrl-C still works.
    try:
        mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    except Exception:
        # NOTE(review): this fallback divides by y_true and misbehaves on
        # zero targets — acceptable for this synthetic demo data.
        mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    explained_var = explained_variance_score(y_true, y_pred)
    print("=== Regression Evaluation Metrics ===")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R-squared (R²): {r2:.4f}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.4f}%")
    print(f"Explained Variance Score: {explained_var:.4f}")
    # Residual analysis: a near-zero mean suggests unbiased predictions.
    residuals = y_true - y_pred
    print(f"\n=== Residual Analysis ===")
    print(f"Residual Mean: {np.mean(residuals):.4f}")
    print(f"Residual Std: {np.std(residuals):.4f}")
    print(f"Residual Max: {np.max(residuals):.4f}")
    print(f"Residual Min: {np.min(residuals):.4f}")
    return {
        'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2,
        'mape': mape, 'explained_variance': explained_var,
        'residuals': residuals
    }
# Calculate regression metrics
reg_metrics = calculate_regression_metrics(y_test_reg, y_pred_reg)

Regression Visualization Analysis
python
def plot_regression_analysis(y_true, y_pred, metrics):
    """Draw a 2x2 diagnostic grid: fit quality, residual behavior, normality.

    `metrics` is the dict returned by calculate_regression_metrics (uses the
    'r2' and 'residuals' entries).
    """
    from scipy import stats

    residuals = metrics['residuals']
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    ax_fit, ax_resid = axes[0]
    ax_hist, ax_qq = axes[1]

    # 1. Predicted vs actual — points on the diagonal are perfect predictions.
    ax_fit.scatter(y_true, y_pred, alpha=0.6)
    lo, hi = y_true.min(), y_true.max()
    ax_fit.plot([lo, hi], [lo, hi], 'r--', lw=2)
    ax_fit.set_xlabel('Actual Values')
    ax_fit.set_ylabel('Predicted Values')
    ax_fit.set_title(f'Predicted vs Actual (R² = {metrics["r2"]:.3f})')
    ax_fit.grid(True)

    # 2. Residuals vs predictions — visible structure signals bias or
    # heteroscedasticity.
    ax_resid.scatter(y_pred, residuals, alpha=0.6)
    ax_resid.axhline(y=0, color='r', linestyle='--')
    ax_resid.set_xlabel('Predicted Values')
    ax_resid.set_ylabel('Residuals')
    ax_resid.set_title('Residual Plot')
    ax_resid.grid(True)

    # 3. Residual histogram — should look roughly bell-shaped.
    ax_hist.hist(residuals, bins=30, alpha=0.7, edgecolor='black')
    ax_hist.set_xlabel('Residuals')
    ax_hist.set_ylabel('Frequency')
    ax_hist.set_title('Residual Distribution')
    ax_hist.grid(True)

    # 4. Q-Q plot of the residuals against a normal distribution.
    stats.probplot(residuals, dist="norm", plot=ax_qq)
    ax_qq.set_title('Residual Q-Q Plot')
    ax_qq.grid(True)

    plt.tight_layout()
    plt.show()
# Plot regression analysis
plot_regression_analysis(y_test_reg, y_pred_reg, reg_metrics)

Clustering Evaluation Metrics
python
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import (
adjusted_rand_score, normalized_mutual_info_score,
silhouette_score, calinski_harabasz_score, davies_bouldin_score
)
# Create clustering data
# Four well-separated 2-D Gaussian blobs (std 0.60) — an easy case where
# KMeans with the matching k should recover the structure.
X_cluster, y_cluster_true = make_blobs(
n_samples=300, centers=4, n_features=2,
random_state=42, cluster_std=0.60
)
# Perform clustering
# k matches the true number of centers; fixed seed for reproducibility.
kmeans = KMeans(n_clusters=4, random_state=42)
y_cluster_pred = kmeans.fit_predict(X_cluster)
def calculate_clustering_metrics(X, y_true, y_pred):
    """Print external (label-based) and internal (geometry-based) cluster scores.

    Returns all five scores in a dict.
    """
    scores = {
        # External metrics compare the partition against the true labels.
        'ari': adjusted_rand_score(y_true, y_pred),
        'nmi': normalized_mutual_info_score(y_true, y_pred),
        # Internal metrics judge cluster geometry without true labels.
        'silhouette': silhouette_score(X, y_pred),
        'calinski_harabasz': calinski_harabasz_score(X, y_pred),
        'davies_bouldin': davies_bouldin_score(X, y_pred),
    }
    print("=== Clustering Evaluation Metrics ===")
    print(f"Adjusted Rand Index (ARI): {scores['ari']:.4f}")
    print(f"Normalized Mutual Information (NMI): {scores['nmi']:.4f}")
    print(f"Silhouette Score: {scores['silhouette']:.4f}")
    print(f"Calinski-Harabasz Index: {scores['calinski_harabasz']:.4f}")
    print(f"Davies-Bouldin Index: {scores['davies_bouldin']:.4f}")
    return scores

# Calculate clustering metrics
cluster_metrics = calculate_clustering_metrics(X_cluster, y_cluster_true, y_cluster_pred)
# Visualize clustering results
def plot_clustering_results(X, y_true, y_pred):
    """Scatter the data twice side by side, colored by true vs. predicted labels."""
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    panels = [
        (axes[0], y_true, 'True Clusters'),
        (axes[1], y_pred, 'Predicted Clusters'),
    ]
    for ax, labels, title in panels:
        ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
        ax.set_title(title)
        ax.set_xlabel('Feature 1')
        ax.set_ylabel('Feature 2')
    plt.tight_layout()
    plt.show()
plot_clustering_results(X_cluster, y_cluster_true, y_cluster_pred)

Cross-Validation Evaluation
python
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer
def comprehensive_cross_validation(X, y, model, problem_type='classification'):
    """Cross-validate `model` with a metric suite chosen by problem type.

    Prints per-metric train/test means, an overfitting gap, and timing
    information; returns the raw `cross_validate` result dict.
    """
    if problem_type == 'classification':
        # Macro-averaged scorers weigh all classes equally; stratified folds
        # keep class proportions stable across splits.
        scoring = {
            'accuracy': 'accuracy',
            'precision': make_scorer(precision_score, average='macro'),
            'recall': make_scorer(recall_score, average='macro'),
            'f1': make_scorer(f1_score, average='macro'),
        }
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    else:
        # Regression: sklearn exposes error metrics negated so that larger
        # is always better.
        scoring = {
            'r2': 'r2',
            'neg_mse': 'neg_mean_squared_error',
            'neg_mae': 'neg_mean_absolute_error',
        }
        cv = 5

    cv_results = cross_validate(
        model, X, y, cv=cv, scoring=scoring,
        return_train_score=True, n_jobs=-1
    )

    print(f"=== {problem_type.upper()} Cross-Validation Results ===")
    print(f"Cross-validation folds: {cv if isinstance(cv, int) else cv.n_splits}")

    for metric in scoring:
        test_scores = cv_results[f'test_{metric}']
        train_scores = cv_results[f'train_{metric}']
        print(f"\n{metric.upper()}:")
        print(f" Test set: {test_scores.mean():.4f} (+/- {test_scores.std() * 2:.4f})")
        print(f" Train set: {train_scores.mean():.4f} (+/- {train_scores.std() * 2:.4f})")
        # Large positive gap = the model fits the training folds much better
        # than the held-out folds.
        print(f" Overfitting degree: {train_scores.mean() - test_scores.mean():.4f}")

    # Per-fold timings collected by cross_validate.
    fit_times = cv_results['fit_time']
    score_times = cv_results['score_time']
    print(f"\nTime Analysis:")
    print(f" Average training time: {fit_times.mean():.4f}s (+/- {fit_times.std() * 2:.4f}s)")
    print(f" Average evaluation time: {score_times.mean():.4f}s (+/- {score_times.std() * 2:.4f}s)")

    return cv_results
# Classification model cross-validation
print("=== Classification Model Evaluation ===")
clf_cv_results = comprehensive_cross_validation(
X, y, RandomForestClassifier(n_estimators=100, random_state=42), 'classification'
)
print("\n" + "="*50)
# Regression model cross-validation
print("=== Regression Model Evaluation ===")
reg_cv_results = comprehensive_cross_validation(
X_reg, y_reg, RandomForestRegressor(n_estimators=100, random_state=42), 'regression'
)Model Comparison and Statistical Testing
python
from scipy import stats
from sklearn.model_selection import cross_val_score
def compare_models_statistically(X, y, models, model_names, cv=5, scoring='accuracy'):
    """Compare models via cross-validation scores and paired t-tests.

    Prints summary statistics and a matrix of pairwise p-values, shows a
    box plot of the score distributions, and returns
    (scores DataFrame, p-values DataFrame).
    """
    # One row of per-fold CV scores per model.
    all_scores = [cross_val_score(m, X, y, cv=cv, scoring=scoring) for m in models]
    # Transpose so each model becomes a column of fold scores.
    results_df = pd.DataFrame(all_scores, index=model_names).T
    print(f"=== Model Performance Comparison ({scoring}) ===")
    print(results_df.describe())

    # Paired t-test on the per-fold scores of every ordered model pair.
    print(f"\n=== Paired t-test (p-values) ===")
    n_models = len(models)
    p_values = np.zeros((n_models, n_models))
    for i in range(n_models):
        for j in range(n_models):
            if i == j:
                continue
            _, p_values[i, j] = stats.ttest_rel(all_scores[i], all_scores[j])
    p_values_df = pd.DataFrame(p_values, index=model_names, columns=model_names)
    print(p_values_df)

    # Box plot of the fold-score distributions, one box per model.
    plt.figure(figsize=(10, 6))
    results_df.boxplot()
    plt.title(f'Model Performance Comparison ({scoring})')
    plt.ylabel(scoring)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    return results_df, p_values_df
# Compare multiple classification models
models_to_compare = [
RandomForestClassifier(n_estimators=100, random_state=42),
GradientBoostingClassifier(n_estimators=100, random_state=42),
SVC(random_state=42)
]
model_names = ['Random Forest', 'Gradient Boosting', 'SVM']
comparison_results, p_values = compare_models_statistically(
X, y, models_to_compare, model_names, cv=5, scoring='accuracy'
)Custom Evaluation Metrics
python
from sklearn.metrics import make_scorer
def custom_business_metric(y_true, y_pred):
    """Score binary predictions by their net business value.

    Payoff schedule for this example scenario:
      correct positive (+10), correct negative (+1),
      false positive (-5), false negative (-2).
    """
    tp = np.count_nonzero((y_true == 1) & (y_pred == 1))  # True positives
    tn = np.count_nonzero((y_true == 0) & (y_pred == 0))  # True negatives
    fp = np.count_nonzero((y_true == 0) & (y_pred == 1))  # False positives
    fn = np.count_nonzero((y_true == 1) & (y_pred == 0))  # False negatives
    # Net value: rewards for correct calls minus the cost of mistakes.
    return tp * 10 + tn * 1 + fp * (-5) + fn * (-2)
# Create binary classification data for demonstration
# (the business metric above is defined for 0/1 labels only).
X_binary, y_binary = make_classification(
n_samples=1000, n_features=10, n_classes=2, random_state=42
)
# Create custom scorer
# greater_is_better=True: a higher business value means a better model.
business_scorer = make_scorer(custom_business_metric, greater_is_better=True)
# Evaluate model using custom metric
model_binary = RandomForestClassifier(n_estimators=100, random_state=42)
business_scores = cross_val_score(model_binary, X_binary, y_binary,
cv=5, scoring=business_scorer)
print("=== Custom Business Metric Evaluation ===")
print(f"Business value score: {business_scores.mean():.2f} (+/- {business_scores.std() * 2:.2f})")
# Compare with standard metrics
accuracy_scores = cross_val_score(model_binary, X_binary, y_binary,
cv=5, scoring='accuracy')
print(f"Accuracy: {accuracy_scores.mean():.4f} (+/- {accuracy_scores.std() * 2:.4f})")

Evaluation Metric Selection Guide
python
def metric_selection_guide():
    """Print a decision-tree style guide for choosing evaluation metrics."""
    print("""
=== Evaluation Metric Selection Guide ===
Classification Problems:
├── Balanced Datasets
│ ├── Overall performance → Accuracy
│ ├── Per-class performance → Macro-average F1 Score
│ └── Probability prediction → AUC-ROC
│
├── Imbalanced Datasets
│ ├── Focus on minority class → Recall, AUC-PR
│ ├── Precision prediction → Precision
│ └── Balanced consideration → F1 Score, Weighted-average metrics
│
├── Multi-class Problems
│ ├── Macro-average → Equal weight for each class
│ ├── Micro-average → Equal weight for each sample
│ └── Weighted-average → Weighted by class sample count
│
└── Business Scenarios
├── Medical diagnosis → Recall (avoid missed diagnoses)
├── Spam detection → Precision (avoid false positives)
└── Recommendation systems → AUC, Top-K Accuracy
Regression Problems:
├── Error Magnitude
│ ├── Average error → MAE
│ ├── Large error sensitivity → MSE, RMSE
│ └── Relative error → MAPE
│
├── Interpretability
│ ├── Goodness of fit → R²
│ ├── Variance explained → Explained variance score
│ └── Baseline comparison → Relative improvement
│
└── Business Scenarios
├── Price prediction → MAPE (relative error important)
├── Sales prediction → MAE (absolute error important)
└── Risk assessment → MSE (large errors costly)
Clustering Problems:
├── With True Labels
│ ├── Clustering quality → ARI, NMI
│ └── Label consistency → Adjusted mutual information
│
├── Without True Labels
│ ├── Cluster compactness → Silhouette score
│ ├── Cluster separation → Calinski-Harabasz Index
│ └── Cluster compactness → Davies-Bouldin Index
│
└── Choosing Number of Clusters
├── Elbow method → Within-cluster sum of squares
├── Silhouette analysis → Silhouette score
└── Gap statistic → Comparison with random data
""")
# Display metric selection guide
metric_selection_guide()

Summary
Choosing appropriate evaluation metrics is key to the success of machine learning projects:
Key Principles:
- Problem-Oriented: Choose metrics based on specific problem types
- Business-Relevant: Consider actual business scenarios and costs
- Data Characteristics: Consider data balance, noise, etc.
- Multi-Metric Evaluation: Use multiple metrics for comprehensive assessment
- Statistical Significance: Perform statistical tests to ensure reliable results
Common Combinations:
- Classification: Accuracy + F1 Score + AUC
- Regression: R² + RMSE + MAE
- Clustering: Silhouette Score + Calinski-Harabasz Index
Important Notes:
- Avoid repeatedly tuning on the test set
- Use cross-validation to obtain stable estimates
- Consider computational cost and interpretability
- Customize metrics based on business requirements
In the next chapter, we will learn about Pipelines and Workflows to understand how to build efficient machine learning pipelines.