Skip to content

Hyperparameter Tuning

Hyperparameter tuning is a critical step in machine learning for improving model performance. Unlike model parameters, hyperparameters are configuration parameters set before training begins, and they control the learning process itself.

What are Hyperparameters?

Hyperparameters are parameters in machine learning algorithms that need to be manually set before training. They cannot be directly learned from the training data. Common hyperparameters include:

  • Learning rate
  • Regularization parameters
  • Tree depth
  • Number of clusters
  • Kernel function parameters

Grid search is the most intuitive hyperparameter tuning method, which tries all possible parameter combinations.

python
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load data and hold out 20% as a final test set
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42  # fixed seed for reproducibility
)

# Define parameter grid: 3 * 4 * 3 * 3 = 108 combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],  # None = grow trees without a depth limit
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create model (seeded so every grid cell starts from the same state)
rf = RandomForestClassifier(random_state=42)

# Grid search: 108 combinations x 5 folds = 540 model fits
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,  # Use all CPU cores
    verbose=1
)

# Execute search (cross-validates every combination on the training set)
grid_search.fit(X_train, y_train)

# View best parameters and the corresponding mean CV score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Use best model (refit on the full training set) for prediction on held-out data
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test set score:", test_score)

When the parameter space is large, random search is more efficient than grid search.

python
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint  # removed unused `uniform` import

# Define parameter distributions: randint(a, b) samples integers uniformly
# from [a, b); plain lists are sampled uniformly over their elements.
param_distributions = {
    'n_estimators': randint(50, 500),
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None]
}

# Random search: evaluates n_iter sampled combinations instead of the
# full cross-product, which scales to much larger parameter spaces.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=100,  # Try 100 combinations
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,  # makes the sampled combinations reproducible
    verbose=1
)

random_search.fit(X_train, y_train)

print("Random search best parameters:", random_search.best_params_)
print("Random search best score:", random_search.best_score_)

Bayesian Optimization

Bayesian optimization is a more sample-efficient hyperparameter tuning method: it builds a probabilistic model of the objective from previous evaluation results and uses it to choose the next parameter combination to evaluate.

python
# Need to install: pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Define search space
search_spaces = {
    'n_estimators': Integer(50, 500),
    'max_depth': Categorical([3, 5, 7, 10, None]),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 10),
    'max_features': Categorical(['sqrt', 'log2', None])
}

# Bayesian search
bayes_search = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    search_spaces=search_spaces,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

bayes_search.fit(X_train, y_train)

print("Bayesian optimization best parameters:", bayes_search.best_params_)
print("Bayesian optimization best score:", bayes_search.best_score_)

Common Hyperparameters for Different Algorithms

Support Vector Machine (SVM)

python
from sklearn.svm import SVC

# Typical SVM search grid:
#   C      - inverse regularization strength (larger = less regularization)
#   kernel - shape of the decision boundary
#   gamma  - kernel coefficient ('scale'/'auto' are data-driven defaults)
# NOTE(review): gamma has no effect with kernel='linear', so some grid
# cells are redundant (harmless, just wasted fits).
svm_params = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
}

svm_grid = GridSearchCV(SVC(), svm_params, cv=5)

Logistic Regression

python
from sklearn.linear_model import LogisticRegression

# Not every penalty/solver pair is valid in scikit-learn:
#   - 'liblinear' supports only 'l1' and 'l2'
#   - 'elasticnet' is supported only by 'saga' and requires `l1_ratio`
# A single flat grid crossing all of them makes GridSearchCV fail on the
# invalid combinations, so the grid is given as a list of compatible
# sub-grids (GridSearchCV accepts a list of param dicts).
lr_params = [
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 500, 1000]
    },
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['elasticnet'],
        'solver': ['saga'],            # only saga supports elasticnet
        'l1_ratio': [0.2, 0.5, 0.8],   # 0.0 = pure l2, 1.0 = pure l1
        'max_iter': [100, 500, 1000]
    }
]

lr_grid = GridSearchCV(LogisticRegression(), lr_params, cv=5)

Gradient Boosting

python
from sklearn.ensemble import GradientBoostingClassifier

# Key gradient-boosting knobs. learning_rate and n_estimators trade off
# against each other: a lower rate usually needs more estimators.
gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0]  # < 1.0 trains each stage on a random subset
}

gb_grid = GridSearchCV(GradientBoostingClassifier(), gb_params, cv=5)

Validation Curve Analysis

Validation curves help us understand the impact of individual hyperparameters on model performance.

python
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt
import numpy as np

# Sweep n_estimators and collect cross-validated train/validation scores
# for each candidate value.
n_estimators_values = [10, 50, 100, 200, 300, 400, 500]
scores_train, scores_val = validation_curve(
    RandomForestClassifier(random_state=42),
    X_train, y_train,
    param_name='n_estimators',
    param_range=n_estimators_values,
    cv=5,
    scoring='accuracy'
)

# Plot each curve as its mean across folds with a +/- 1 std band.
plt.figure(figsize=(10, 6))
for fold_scores, color, label in (
    (scores_train, 'blue', 'Training score'),
    (scores_val, 'red', 'Validation score'),
):
    mean = np.mean(fold_scores, axis=1)
    std = np.std(fold_scores, axis=1)
    plt.plot(n_estimators_values, mean, 'o-', color=color, label=label)
    plt.fill_between(n_estimators_values, mean - std, mean + std, alpha=0.1, color=color)

plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
plt.title('Random Forest Validation Curve')
plt.legend()
plt.grid(True)
plt.show()

Learning Curve Analysis

Learning curves show how model performance changes with the number of training samples.

python
from sklearn.model_selection import learning_curve

# Fit the model on growing fractions of the training data (10%..100% in
# 10 steps) to see how performance scales with sample count.
sizes, fold_scores_train, fold_scores_val = learning_curve(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X_train, y_train,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy'
)

# Plot the mean cross-fold score at every training-set size.
plt.figure(figsize=(10, 6))
plt.plot(sizes, fold_scores_train.mean(axis=1), 'o-', label='Training score')
plt.plot(sizes, fold_scores_val.mean(axis=1), 'o-', label='Validation score')
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.title('Learning Curve')
plt.legend()
plt.grid(True)
plt.show()

Best Practices for Hyperparameter Tuning

1. Hierarchical Tuning

python
# Step 1: Coarse tuning
coarse_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 7, None]
}

coarse_search = GridSearchCV(rf, coarse_params, cv=3)
coarse_search.fit(X_train, y_train)

# Step 2: Fine tuning
fine_params = {
    'n_estimators': [80, 100, 120],
    'max_depth': [5, 7, 9],
    'min_samples_split': [2, 3, 4]
}

fine_search = GridSearchCV(rf, fine_params, cv=5)
fine_search.fit(X_train, y_train)

2. Early Stopping Strategy

python
from sklearn.ensemble import GradientBoostingClassifier

# Use validation set for early stopping: 20% of the training data is held
# out internally, and boosting stops once that score has not improved for
# 10 consecutive iterations.
gb = GradientBoostingClassifier(
    n_estimators=1000,        # upper bound; early stopping may fit fewer stages
    validation_fraction=0.2,
    n_iter_no_change=10,
    random_state=42
)

gb.fit(X_train, y_train)
# n_estimators_ reports how many stages were actually fitted
print(f"Optimal number of iterations: {gb.n_estimators_}")

3. Cross-Validation Strategy Selection

python
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit

# Stratified K-fold (suitable for classification problems: keeps class
# proportions roughly equal across folds)
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Time series cross-validation (suitable for time series data: training
# folds always precede the validation fold, so there is no look-ahead leakage)
ts_cv = TimeSeriesSplit(n_splits=5)

# A splitter object can be passed to cv= in place of an integer fold count
grid_search_stratified = GridSearchCV(
    rf, param_grid, cv=stratified_cv, scoring='f1_macro'
)

Multi-Objective Optimization

Sometimes we need to optimize multiple metrics simultaneously.

python
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score

# Define multiple scoring metrics; 'macro' averaging weights all classes equally
scoring = {
    'accuracy': 'accuracy',
    'f1': make_scorer(f1_score, average='macro'),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro')
}

# Multi-metric grid search: every combination is scored on all metrics
multi_score_search = GridSearchCV(
    rf, param_grid,
    cv=5,
    scoring=scoring,
    refit='f1',  # Use F1 score to select best model (refit is required with dict scoring)
    return_train_score=True
)

multi_score_search.fit(X_train, y_train)

# View results for all metrics at the index of the refit-selected model
results = multi_score_search.cv_results_
for metric in scoring.keys():
    print(f"Best {metric}: {results[f'mean_test_{metric}'][multi_score_search.best_index_]:.4f}")

Practical Example: Complete Tuning Process

python
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load data (binary classification dataset) and hold out 20% for final testing
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline: the scaler is refitted inside every CV fold, so no
# information leaks from validation folds into preprocessing
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid (note parameter naming in pipeline: <step>__<param>)
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__class_weight': [None, 'balanced']
}

# Execute grid search, optimizing ROC AUC instead of plain accuracy
grid_search = GridSearchCV(
    pipeline, param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Evaluate final model on the held-out test set
# NOTE(review): .score() here reports the classifier's default metric
# (accuracy), not roc_auc, so it is not directly comparable to best_score_
final_model = grid_search.best_estimator_
test_score = final_model.score(X_test, y_test)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Cross-validation score: {grid_search.best_score_:.4f}")
print(f"Test set score: {test_score:.4f}")

# Feature importance analysis (impurity-based importances from the forest step)
feature_importance = final_model.named_steps['classifier'].feature_importances_
feature_names = cancer.feature_names

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("\nTop 10 important features:")
print(importance_df.head(10))

Summary

Hyperparameter tuning is an important means to improve model performance:

  1. Choose appropriate search strategy: Grid search is suitable for small parameter spaces, random search for large parameter spaces
  2. Use cross-validation: Ensure reliability of results
  3. Analyze validation curves: Understand the impact of parameters on performance
  4. Consider computational cost: Balance search precision and time cost
  5. Avoid overfitting: Do not tune parameters on the test set

The next chapter will cover Model Selection Strategies, learning how to choose the most suitable model among multiple algorithms.

Content is for learning and research only.