Skip to content

Performance Metrics Explained

Choosing the right evaluation metrics is crucial for properly assessing model performance. Different problem types require different evaluation metrics. This chapter explains in detail the meaning, calculation methods, and use cases of various performance metrics.

Classification Problem Evaluation Metrics

1. Basic Metrics: Accuracy, Precision, Recall, F1 Score

python
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
import matplotlib.pyplot as plt
import seaborn as sns

# Create sample data
# 3-class synthetic dataset: 20 features, half informative, half redundant.
X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=10,
    n_redundant=10, n_classes=3, random_state=42
)

# Hold out 20% of the samples for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
# Keep both hard predictions and per-class probabilities: the probabilities
# are needed later for the ROC and precision-recall curves.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

def calculate_basic_metrics(y_true, y_pred):
    """Compute, print, and return the core classification scores.

    Args:
        y_true: ground-truth class labels.
        y_pred: predicted class labels.

    Returns:
        dict with accuracy plus macro- and weighted-averaged
        precision/recall/F1.
    """
    scores = {'accuracy': accuracy_score(y_true, y_pred)}

    # Macro weights every class equally; weighted scales by class frequency.
    for avg in ('macro', 'weighted'):
        scores[f'precision_{avg}'] = precision_score(y_true, y_pred, average=avg)
        scores[f'recall_{avg}'] = recall_score(y_true, y_pred, average=avg)
        scores[f'f1_{avg}'] = f1_score(y_true, y_pred, average=avg)

    print("=== Basic Classification Metrics ===")
    print(f"Accuracy: {scores['accuracy']:.4f}")
    for avg, header in (('macro', 'Macro Average'), ('weighted', 'Weighted Average')):
        print(f"\n{header}:")
        print(f"  Precision: {scores[f'precision_{avg}']:.4f}")
        print(f"  Recall: {scores[f'recall_{avg}']:.4f}")
        print(f"  F1 Score: {scores[f'f1_{avg}']:.4f}")

    return scores

# Calculate basic metrics
basic_metrics = calculate_basic_metrics(y_test, y_pred)

2. Confusion Matrix

python
def plot_confusion_matrix(y_true, y_pred, class_names=None):
    """Plot a confusion-matrix heatmap and print per-class metrics.

    Args:
        y_true: 1-D array of ground-truth class labels.
        y_pred: 1-D array of predicted class labels.
        class_names: optional display names, one per class; defaults to
            "Class 0", "Class 1", ... based on the classes seen in y_true.

    Returns:
        The confusion-matrix ndarray (rows = true labels, cols = predicted).
    """

    cm = confusion_matrix(y_true, y_pred)

    if class_names is None:
        class_names = [f'Class {i}' for i in range(len(np.unique(y_true)))]

    # Annotated heatmap; fmt='d' renders the raw integer count in each cell.
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

    # Calculate metrics for each class
    # output_dict=True lets us index the report by class name below.
    print("\n=== Detailed Metrics by Class ===")
    report = classification_report(y_true, y_pred, target_names=class_names, output_dict=True)

    for class_name in class_names:
        metrics = report[class_name]
        print(f"{class_name}:")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1 Score: {metrics['f1-score']:.4f}")
        print(f"  Support: {metrics['support']}")

    return cm

# Plot confusion matrix
cm = plot_confusion_matrix(y_test, y_pred, ['Class A', 'Class B', 'Class C'])

3. ROC Curve and AUC

python
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
from itertools import cycle

def plot_roc_curves(y_true, y_pred_proba, class_names=None):
    """Plot one-vs-rest ROC curves for a multi-class classifier.

    Args:
        y_true: 1-D array of integer class labels (0 .. n_classes-1).
        y_pred_proba: (n_samples, n_classes) array of class probabilities.
        class_names: optional display names, one per class.

    Returns:
        dict mapping each class index (and "micro") to its AUC.
    """

    n_classes = y_pred_proba.shape[1]

    if class_names is None:
        class_names = [f'Class {i}' for i in range(n_classes)]

    # Binarize labels so each class can be scored one-vs-rest.
    # NOTE(review): label_binarize returns a single column for 2 classes,
    # which would not match y_pred_proba's shape — this helper assumes 3+.
    y_true_bin = label_binarize(y_true, classes=range(n_classes))

    # Calculate ROC curve for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool all (label, score) pairs across classes.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true_bin.ravel(), y_pred_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Plot ROC curves
    plt.figure(figsize=(10, 8))
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red', 'green'])

    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                label=f'{class_names[i]} (AUC = {roc_auc[i]:.2f})')

    plt.plot(fpr["micro"], tpr["micro"], color='deeppink', linestyle=':', linewidth=4,
             label=f'Micro-average (AUC = {roc_auc["micro"]:.2f})')

    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()

    # Print AUC scores
    print("=== AUC Scores ===")
    for i, class_name in enumerate(class_names):
        print(f"{class_name}: {roc_auc[i]:.4f}")
    print(f"Micro-average: {roc_auc['micro']:.4f}")

    # Multi-class AUC (one-vs-rest). roc_auc_score raises ValueError when it
    # cannot score the inputs (e.g. a class missing from y_true). Catch only
    # that, instead of the original bare `except:` which also swallowed
    # KeyboardInterrupt/SystemExit.
    try:
        macro_auc = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro')
        weighted_auc = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='weighted')
        print(f"Macro-average AUC: {macro_auc:.4f}")
        print(f"Weighted-average AUC: {weighted_auc:.4f}")
    except ValueError:
        print("Unable to calculate multi-class AUC")

    return roc_auc

# Plot ROC curves
roc_results = plot_roc_curves(y_test, y_pred_proba, ['Class A', 'Class B', 'Class C'])

4. Precision-Recall Curve

python
from sklearn.metrics import precision_recall_curve, average_precision_score

def plot_precision_recall_curves(y_true, y_pred_proba, class_names=None):
    """Plot one-vs-rest precision-recall curves with a micro-average.

    Args:
        y_true: 1-D array of integer class labels (0 .. n_classes-1).
        y_pred_proba: (n_samples, n_classes) array of class probabilities.
        class_names: optional display names, one per class.

    Returns:
        dict mapping each class index (and "micro") to its average precision.
    """

    n_classes = y_pred_proba.shape[1]

    if class_names is None:
        class_names = [f'Class {i}' for i in range(n_classes)]

    # Binarize labels so each class can be scored one-vs-rest.
    # NOTE(review): label_binarize returns a single column for 2 classes,
    # which would not match y_pred_proba's shape — this helper assumes 3+.
    y_true_bin = label_binarize(y_true, classes=range(n_classes))

    # Calculate PR curve for each class
    precision = dict()
    recall = dict()
    average_precision = dict()

    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_true_bin[:, i], y_pred_proba[:, i])
        average_precision[i] = average_precision_score(y_true_bin[:, i], y_pred_proba[:, i])

    # Micro-average: pool all (label, score) pairs across classes.
    precision["micro"], recall["micro"], _ = precision_recall_curve(
        y_true_bin.ravel(), y_pred_proba.ravel()
    )
    average_precision["micro"] = average_precision_score(y_true_bin, y_pred_proba, average="micro")

    # Plot PR curves
    plt.figure(figsize=(10, 8))
    colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])

    for i, color in zip(range(n_classes), colors):
        plt.plot(recall[i], precision[i], color=color, lw=2,
                label=f'{class_names[i]} (AP = {average_precision[i]:.2f})')

    plt.plot(recall["micro"], precision["micro"], color='gold', linestyle=':', linewidth=4,
             label=f'Micro-average (AP = {average_precision["micro"]:.2f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.grid(True)
    plt.show()

    # Print average precision
    print("=== Average Precision ===")
    for i, class_name in enumerate(class_names):
        print(f"{class_name}: {average_precision[i]:.4f}")
    print(f"Micro-average: {average_precision['micro']:.4f}")

    return average_precision

# Plot PR curves
pr_results = plot_precision_recall_curves(y_test, y_pred_proba, ['Class A', 'Class B', 'Class C'])

Regression Problem Evaluation Metrics

python
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error, explained_variance_score
)

# Create regression data
# Single-target synthetic regression problem with Gaussian noise (std = 10).
X_reg, y_reg = make_regression(
    n_samples=1000, n_features=10, noise=10, random_state=42
)

# 80/20 train/test split with a fixed seed for reproducibility.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Train regression model
reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
reg_model.fit(X_train_reg, y_train_reg)
y_pred_reg = reg_model.predict(X_test_reg)

def calculate_regression_metrics(y_true, y_pred):
    """Compute, print, and return standard regression metrics.

    Args:
        y_true: 1-D array of ground-truth targets.
        y_pred: 1-D array of predictions, same length as y_true.

    Returns:
        dict with mse, rmse, mae, r2, mape (in percent),
        explained_variance, and the raw residuals array (y_true - y_pred).
    """

    # Basic metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # MAPE: sklearn's mean_absolute_percentage_error returns a *fraction*
    # (0.1 for 10%), but the printout below appends "%". Scale by 100 so the
    # sklearn path and the manual fallback report the same unit (the original
    # printed the raw fraction with a percent sign). The fallback also
    # narrows the original bare `except:` to Exception so it cannot swallow
    # KeyboardInterrupt/SystemExit.
    try:
        mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    except Exception:
        # Manual MAPE; undefined when y_true contains zeros.
        mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    explained_var = explained_variance_score(y_true, y_pred)

    print("=== Regression Evaluation Metrics ===")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R-squared (R²): {r2:.4f}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.4f}%")
    print(f"Explained Variance Score: {explained_var:.4f}")

    # Residual analysis: mean near 0 suggests an unbiased model.
    residuals = y_true - y_pred
    print(f"\n=== Residual Analysis ===")
    print(f"Residual Mean: {np.mean(residuals):.4f}")
    print(f"Residual Std: {np.std(residuals):.4f}")
    print(f"Residual Max: {np.max(residuals):.4f}")
    print(f"Residual Min: {np.min(residuals):.4f}")

    return {
        'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2,
        'mape': mape, 'explained_variance': explained_var,
        'residuals': residuals
    }

# Calculate regression metrics
reg_metrics = calculate_regression_metrics(y_test_reg, y_pred_reg)

Regression Visualization Analysis

python
def plot_regression_analysis(y_true, y_pred, metrics):
    """Plot regression analysis charts.

    Draws a 2x2 diagnostic grid: predicted-vs-actual scatter, residuals
    vs. predictions, residual histogram, and a normal Q-Q plot.

    Args:
        y_true: ground-truth targets (needs .min()/.max(), e.g. an ndarray).
        y_pred: model predictions, same length as y_true.
        metrics: dict produced by calculate_regression_metrics; only the
            'r2' and 'residuals' entries are read here.
    """

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Predicted vs Actual — the red dashed line is the ideal y = x.
    axes[0, 0].scatter(y_true, y_pred, alpha=0.6)
    axes[0, 0].plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)
    axes[0, 0].set_xlabel('Actual Values')
    axes[0, 0].set_ylabel('Predicted Values')
    axes[0, 0].set_title(f'Predicted vs Actual (R² = {metrics["r2"]:.3f})')
    axes[0, 0].grid(True)

    # 2. Residual plot — points should scatter evenly around the zero line.
    residuals = metrics['residuals']
    axes[0, 1].scatter(y_pred, residuals, alpha=0.6)
    axes[0, 1].axhline(y=0, color='r', linestyle='--')
    axes[0, 1].set_xlabel('Predicted Values')
    axes[0, 1].set_ylabel('Residuals')
    axes[0, 1].set_title('Residual Plot')
    axes[0, 1].grid(True)

    # 3. Residual histogram
    axes[1, 0].hist(residuals, bins=30, alpha=0.7, edgecolor='black')
    axes[1, 0].set_xlabel('Residuals')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].set_title('Residual Distribution')
    axes[1, 0].grid(True)

    # 4. Q-Q plot (normality test) — points on the line mean the residuals
    # are approximately normally distributed.
    from scipy import stats
    stats.probplot(residuals, dist="norm", plot=axes[1, 1])
    axes[1, 1].set_title('Residual Q-Q Plot')
    axes[1, 1].grid(True)

    plt.tight_layout()
    plt.show()

# Plot regression analysis
plot_regression_analysis(y_test_reg, y_pred_reg, reg_metrics)

Clustering Evaluation Metrics

python
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import (
    adjusted_rand_score, normalized_mutual_info_score,
    silhouette_score, calinski_harabasz_score, davies_bouldin_score
)

# Create clustering data
# Four well-separated 2-D Gaussian blobs (small within-cluster std).
X_cluster, y_cluster_true = make_blobs(
    n_samples=300, centers=4, n_features=2,
    random_state=42, cluster_std=0.60
)

# Perform clustering
# n_init=10 is pinned explicitly: it was the long-standing KMeans default,
# but scikit-learn changed the default to 'auto' (and warned about it on
# 1.2-1.3), which would silently alter results across versions.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
y_cluster_pred = kmeans.fit_predict(X_cluster)

def calculate_clustering_metrics(X, y_true, y_pred):
    """Print and return external + internal clustering quality scores.

    Args:
        X: feature matrix that was clustered.
        y_true: ground-truth cluster labels (for the external metrics).
        y_pred: predicted cluster assignments.

    Returns:
        dict with ari, nmi, silhouette, calinski_harabasz, davies_bouldin.
    """
    results = {
        # External metrics — compare assignments against true labels.
        'ari': adjusted_rand_score(y_true, y_pred),
        'nmi': normalized_mutual_info_score(y_true, y_pred),
        # Internal metrics — judged from the data geometry alone.
        'silhouette': silhouette_score(X, y_pred),
        'calinski_harabasz': calinski_harabasz_score(X, y_pred),
        'davies_bouldin': davies_bouldin_score(X, y_pred),
    }

    print("=== Clustering Evaluation Metrics ===")
    print(f"Adjusted Rand Index (ARI): {results['ari']:.4f}")
    print(f"Normalized Mutual Information (NMI): {results['nmi']:.4f}")
    print(f"Silhouette Score: {results['silhouette']:.4f}")
    print(f"Calinski-Harabasz Index: {results['calinski_harabasz']:.4f}")
    print(f"Davies-Bouldin Index: {results['davies_bouldin']:.4f}")

    return results

# Calculate clustering metrics
cluster_metrics = calculate_clustering_metrics(X_cluster, y_cluster_true, y_cluster_pred)

# Visualize clustering results
def plot_clustering_results(X, y_true, y_pred):
    """Scatter the data twice: colored by true labels, then by predictions.

    Args:
        X: (n_samples, 2) feature matrix; only the first two columns plot.
        y_true: ground-truth cluster labels.
        y_pred: predicted cluster assignments.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: ground truth; right panel: model output.
    panels = [(y_true, 'True Clusters'), (y_pred, 'Predicted Clusters')]
    for ax, (labels, title) in zip(axes, panels):
        ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
        ax.set_title(title)
        ax.set_xlabel('Feature 1')
        ax.set_ylabel('Feature 2')

    plt.tight_layout()
    plt.show()

plot_clustering_results(X_cluster, y_cluster_true, y_cluster_pred)

Cross-Validation Evaluation

python
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer

def comprehensive_cross_validation(X, y, model, problem_type='classification'):
    """Run 5-fold cross-validation with multiple scorers and print a summary.

    Reports per-metric train/test means (±2 std), the train-test gap as a
    rough overfitting indicator, and fit/score timing.

    Args:
        X: feature matrix.
        y: target vector.
        model: unfitted scikit-learn estimator (cross_validate clones it).
        problem_type: 'classification' uses stratified shuffled folds with
            accuracy/precision/recall/F1; any other value uses regression
            scorers with plain 5-fold CV.

    Returns:
        The raw dict returned by sklearn's cross_validate.
    """

    if problem_type == 'classification':
        # Classification metrics (macro averages for the multi-class case).
        scoring = {
            'accuracy': 'accuracy',
            'precision': make_scorer(precision_score, average='macro'),
            'recall': make_scorer(recall_score, average='macro'),
            'f1': make_scorer(f1_score, average='macro')
        }
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    else:
        # Regression metrics; sklearn negates error metrics so that higher
        # is always better ('neg_mse'/'neg_mae' values are <= 0).
        scoring = {
            'r2': 'r2',
            'neg_mse': 'neg_mean_squared_error',
            'neg_mae': 'neg_mean_absolute_error'
        }
        # NOTE(review): an integer cv is an *unshuffled* KFold, unlike the
        # shuffled stratified folds above — confirm the asymmetry is intended.
        cv = 5

    # Perform cross-validation across all scorers in one pass.
    cv_results = cross_validate(
        model, X, y, cv=cv, scoring=scoring,
        return_train_score=True, n_jobs=-1
    )

    print(f"=== {problem_type.upper()} Cross-Validation Results ===")
    print(f"Cross-validation folds: {cv if isinstance(cv, int) else cv.n_splits}")

    for metric in scoring.keys():
        test_scores = cv_results[f'test_{metric}']
        train_scores = cv_results[f'train_{metric}']

        print(f"\n{metric.upper()}:")
        print(f"  Test set: {test_scores.mean():.4f} (+/- {test_scores.std() * 2:.4f})")
        print(f"  Train set: {train_scores.mean():.4f} (+/- {train_scores.std() * 2:.4f})")
        print(f"  Overfitting degree: {train_scores.mean() - test_scores.mean():.4f}")

    # Training time analysis
    fit_times = cv_results['fit_time']
    score_times = cv_results['score_time']

    print(f"\nTime Analysis:")
    print(f"  Average training time: {fit_times.mean():.4f}s (+/- {fit_times.std() * 2:.4f}s)")
    print(f"  Average evaluation time: {score_times.mean():.4f}s (+/- {score_times.std() * 2:.4f}s)")

    return cv_results

# Classification model cross-validation
print("=== Classification Model Evaluation ===")
clf_cv_results = comprehensive_cross_validation(
    X, y, RandomForestClassifier(n_estimators=100, random_state=42), 'classification'
)

print("\n" + "="*50)

# Regression model cross-validation
print("=== Regression Model Evaluation ===")
reg_cv_results = comprehensive_cross_validation(
    X_reg, y_reg, RandomForestRegressor(n_estimators=100, random_state=42), 'regression'
)

Model Comparison and Statistical Testing

python
from scipy import stats
from sklearn.model_selection import cross_val_score

def compare_models_statistically(X, y, models, model_names, cv=5, scoring='accuracy'):
    """Compare models via cross-validation and pairwise paired t-tests.

    Args:
        X: feature matrix.
        y: target vector.
        models: list of unfitted estimators.
        model_names: display name per model, same order as `models`.
        cv: number of CV folds (or a CV splitter).
        scoring: sklearn scoring string used for every model.

    Returns:
        (results_df, p_values_df): fold-by-model score table and the
        model-by-model paired t-test p-value matrix (diagonal left at 0).
    """
    # One row of per-fold scores per model.
    all_scores = [cross_val_score(m, X, y, cv=cv, scoring=scoring) for m in models]

    # Fold-by-model score table for summary stats and plotting.
    results_df = pd.DataFrame(all_scores, index=model_names).T

    print(f"=== Model Performance Comparison ({scoring}) ===")
    print(results_df.describe())

    # Pairwise paired t-tests on the per-fold scores.
    print(f"\n=== Paired t-test (p-values) ===")
    n_models = len(models)
    p_values = np.zeros((n_models, n_models))

    for i in range(n_models):
        for j in range(n_models):
            if i == j:
                continue
            p_values[i, j] = stats.ttest_rel(all_scores[i], all_scores[j])[1]

    p_values_df = pd.DataFrame(p_values, index=model_names, columns=model_names)
    print(p_values_df)

    # Box plot of the fold scores per model.
    plt.figure(figsize=(10, 6))
    results_df.boxplot()
    plt.title(f'Model Performance Comparison ({scoring})')
    plt.ylabel(scoring)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    return results_df, p_values_df

# Compare multiple classification models
# BUGFIX: GradientBoostingClassifier and SVC were never imported anywhere in
# this chapter, so the original snippet raised NameError — import them here
# so the snippet runs standalone.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

models_to_compare = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    GradientBoostingClassifier(n_estimators=100, random_state=42),
    SVC(random_state=42)
]

model_names = ['Random Forest', 'Gradient Boosting', 'SVM']

comparison_results, p_values = compare_models_statistically(
    X, y, models_to_compare, model_names, cv=5, scoring='accuracy'
)

Custom Evaluation Metrics

python
from sklearn.metrics import make_scorer

def custom_business_metric(y_true, y_pred):
    """Custom business metric example.

    Scores predictions against a hypothetical payoff matrix:
    correct positive +10, correct negative +1,
    false positive -5, false negative -2.

    Args:
        y_true: array of true binary labels (0/1).
        y_pred: array of predicted binary labels (0/1).

    Returns:
        Total business value of the predictions (integer).
    """
    positives_hit = np.sum((y_true == 1) & (y_pred == 1))  # True positives
    negatives_hit = np.sum((y_true == 0) & (y_pred == 0))  # True negatives
    false_alarms = np.sum((y_true == 0) & (y_pred == 1))   # False positives
    misses = np.sum((y_true == 1) & (y_pred == 0))         # False negatives

    return 10 * positives_hit + negatives_hit - 5 * false_alarms - 2 * misses

# Create binary classification data for demonstration
X_binary, y_binary = make_classification(
    n_samples=1000, n_features=10, n_classes=2, random_state=42
)

# Create custom scorer
# greater_is_better=True: higher business value is better, so make_scorer
# does not flip the sign.
business_scorer = make_scorer(custom_business_metric, greater_is_better=True)

# Evaluate model using custom metric
# (cross_val_score was imported in the model-comparison section above.)
model_binary = RandomForestClassifier(n_estimators=100, random_state=42)
business_scores = cross_val_score(model_binary, X_binary, y_binary,
                                cv=5, scoring=business_scorer)

print("=== Custom Business Metric Evaluation ===")
print(f"Business value score: {business_scores.mean():.2f} (+/- {business_scores.std() * 2:.2f})")

# Compare with standard metrics
accuracy_scores = cross_val_score(model_binary, X_binary, y_binary,
                                cv=5, scoring='accuracy')
print(f"Accuracy: {accuracy_scores.mean():.4f} (+/- {accuracy_scores.std() * 2:.4f})")

Evaluation Metric Selection Guide

python
def metric_selection_guide():
    """Print a decision-tree style guide for choosing evaluation metrics.

    Pure side-effect helper: prints the guide text and returns None.
    """

    # BUGFIX: the Davies-Bouldin line was labeled "Cluster compactness",
    # duplicating the silhouette line above it — Davies-Bouldin measures
    # the average similarity (overlap) between clusters; lower is better.
    guide = """
    === Evaluation Metric Selection Guide ===

    Classification Problems:
    ├── Balanced Datasets
    │   ├── Overall performance → Accuracy
    │   ├── Per-class performance → Macro-average F1 Score
    │   └── Probability prediction → AUC-ROC

    ├── Imbalanced Datasets
    │   ├── Focus on minority class → Recall, AUC-PR
    │   ├── Precision prediction → Precision
    │   └── Balanced consideration → F1 Score, Weighted-average metrics

    ├── Multi-class Problems
    │   ├── Macro-average → Equal weight for each class
    │   ├── Micro-average → Equal weight for each sample
    │   └── Weighted-average → Weighted by class sample count

    └── Business Scenarios
        ├── Medical diagnosis → Recall (avoid missed diagnoses)
        ├── Spam detection → Precision (avoid false positives)
        └── Recommendation systems → AUC, Top-K Accuracy

    Regression Problems:
    ├── Error Magnitude
    │   ├── Average error → MAE
    │   ├── Large error sensitivity → MSE, RMSE
    │   └── Relative error → MAPE

    ├── Interpretability
    │   ├── Goodness of fit → R²
    │   ├── Variance explained → Explained variance score
    │   └── Baseline comparison → Relative improvement

    └── Business Scenarios
        ├── Price prediction → MAPE (relative error important)
        ├── Sales prediction → MAE (absolute error important)
        └── Risk assessment → MSE (large errors costly)

    Clustering Problems:
    ├── With True Labels
    │   ├── Clustering quality → ARI, NMI
    │   └── Label consistency → Adjusted mutual information

    ├── Without True Labels
    │   ├── Cluster compactness → Silhouette score
    │   ├── Cluster separation → Calinski-Harabasz Index
    │   └── Cluster similarity → Davies-Bouldin Index

    └── Choosing Number of Clusters
        ├── Elbow method → Within-cluster sum of squares
        ├── Silhouette analysis → Silhouette score
        └── Gap statistic → Comparison with random data
    """

    print(guide)

# Display metric selection guide
metric_selection_guide()

Summary

Choosing appropriate evaluation metrics is key to the success of machine learning projects:

Key Principles:

  1. Problem-Oriented: Choose metrics based on specific problem types
  2. Business-Relevant: Consider actual business scenarios and costs
  3. Data Characteristics: Consider data balance, noise, etc.
  4. Multi-Metric Evaluation: Use multiple metrics for comprehensive assessment
  5. Statistical Significance: Perform statistical tests to ensure reliable results

Common Combinations:

  • Classification: Accuracy + F1 Score + AUC
  • Regression: R² + RMSE + MAE
  • Clustering: Silhouette Score + Calinski-Harabasz Index

Important Notes:

  • Avoid repeatedly tuning on the test set
  • Use cross-validation to obtain stable estimates
  • Consider computational cost and interpretability
  • Customize metrics based on business requirements

In the next chapter, we will learn about Pipelines and Workflows to understand how to build efficient machine learning pipelines.

Content is for learning and research only.