Skip to content

Hyperparameter Tuning

Hyperparameter tuning is a critical step in machine learning for improving model performance. Unlike model parameters, hyperparameters are configuration parameters set before training begins, and they control the learning process itself.

What are Hyperparameters?

Hyperparameters are parameters in machine learning algorithms that need to be manually set before training. They cannot be directly learned from the training data. Common hyperparameters include:

  • Learning rate
  • Regularization parameters
  • Tree depth
  • Number of clusters
  • Kernel function parameters

Grid search is the most intuitive hyperparameter tuning method, which tries all possible parameter combinations.

python
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load data and hold out 20% as a final test set
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42  # fixed seed for reproducibility
)

# Define parameter grid: 3 * 4 * 3 * 3 = 108 combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],  # None = grow trees without a depth limit
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create model (seeded so every grid cell starts from the same state)
rf = RandomForestClassifier(random_state=42)

# Grid search: 108 combinations x 5 folds = 540 model fits
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,  # Use all CPU cores
    verbose=1
)

# Execute search (cross-validates every combination on the training set)
grid_search.fit(X_train, y_train)

# View best parameters and the corresponding mean CV score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Use best model (refit on the full training set) for prediction on held-out data
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test set score:", test_score)

When the parameter space is large, random search is more efficient than grid search.

python
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint  # removed unused `uniform` import

# Define parameter distributions: randint(a, b) samples integers uniformly
# from [a, b); plain lists are sampled uniformly over their elements.
param_distributions = {
    'n_estimators': randint(50, 500),
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None]
}

# Random search: evaluates n_iter sampled combinations instead of the
# full cross-product, which scales to much larger parameter spaces.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=100,  # Try 100 combinations
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,  # makes the sampled combinations reproducible
    verbose=1
)

random_search.fit(X_train, y_train)

print("Random search best parameters:", random_search.best_params_)
print("Random search best score:", random_search.best_score_)

Bayesian Optimization

Bayesian optimization is a more sample-efficient hyperparameter tuning method: it builds a probabilistic model of the objective from previous evaluation results and uses it to choose the next parameter combination to evaluate.

python
# Need to install: pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Define search space
search_spaces = {
    'n_estimators': Integer(50, 500),
    'max_depth': Categorical([3, 5, 7, 10, None]),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 10),
    'max_features': Categorical(['sqrt', 'log2', None])
}

# Bayesian search
bayes_search = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    search_spaces=search_spaces,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

bayes_search.fit(X_train, y_train)

print("Bayesian optimization best parameters:", bayes_search.best_params_)
print("Bayesian optimization best score:", bayes_search.best_score_)

Common Hyperparameters for Different Algorithms

Support Vector Machine (SVM)

python
from sklearn.svm import SVC

# Typical SVM search grid:
#   C      - inverse regularization strength (larger = less regularization)
#   kernel - shape of the decision boundary
#   gamma  - kernel coefficient ('scale'/'auto' are data-driven defaults)
# NOTE(review): gamma has no effect with kernel='linear', so some grid
# cells are redundant (harmless, just wasted fits).
svm_params = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
}

svm_grid = GridSearchCV(SVC(), svm_params, cv=5)

Logistic Regression

python
from sklearn.linear_model import LogisticRegression

# Not every penalty/solver pair is valid in scikit-learn:
#   - 'liblinear' supports only 'l1' and 'l2'
#   - 'elasticnet' is supported only by 'saga' and requires `l1_ratio`
# A single flat grid crossing all of them makes GridSearchCV fail on the
# invalid combinations, so the grid is given as a list of compatible
# sub-grids (GridSearchCV accepts a list of param dicts).
lr_params = [
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 500, 1000]
    },
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['elasticnet'],
        'solver': ['saga'],            # only saga supports elasticnet
        'l1_ratio': [0.2, 0.5, 0.8],   # 0.0 = pure l2, 1.0 = pure l1
        'max_iter': [100, 500, 1000]
    }
]

lr_grid = GridSearchCV(LogisticRegression(), lr_params, cv=5)

Gradient Boosting

python
from sklearn.ensemble import GradientBoostingClassifier

# Key gradient-boosting knobs. learning_rate and n_estimators trade off
# against each other: a lower rate usually needs more estimators.
gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0]  # < 1.0 trains each stage on a random subset
}

gb_grid = GridSearchCV(GradientBoostingClassifier(), gb_params, cv=5)

Validation Curve Analysis

Validation curves help us understand the impact of individual hyperparameters on model performance.

python
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt
import numpy as np

# Sweep n_estimators and collect cross-validated train/validation scores
# for each candidate value.
n_estimators_values = [10, 50, 100, 200, 300, 400, 500]
scores_train, scores_val = validation_curve(
    RandomForestClassifier(random_state=42),
    X_train, y_train,
    param_name='n_estimators',
    param_range=n_estimators_values,
    cv=5,
    scoring='accuracy'
)

# Plot each curve as its mean across folds with a +/- 1 std band.
plt.figure(figsize=(10, 6))
for fold_scores, color, label in (
    (scores_train, 'blue', 'Training score'),
    (scores_val, 'red', 'Validation score'),
):
    mean = np.mean(fold_scores, axis=1)
    std = np.std(fold_scores, axis=1)
    plt.plot(n_estimators_values, mean, 'o-', color=color, label=label)
    plt.fill_between(n_estimators_values, mean - std, mean + std, alpha=0.1, color=color)

plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
plt.title('Random Forest Validation Curve')
plt.legend()
plt.grid(True)
plt.show()

Learning Curve Analysis

Learning curves show how model performance changes with the number of training samples.

python
from sklearn.model_selection import learning_curve

# Fit the model on growing fractions of the training data (10%..100% in
# 10 steps) to see how performance scales with sample count.
sizes, fold_scores_train, fold_scores_val = learning_curve(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X_train, y_train,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy'
)

# Plot the mean cross-fold score at every training-set size.
plt.figure(figsize=(10, 6))
plt.plot(sizes, fold_scores_train.mean(axis=1), 'o-', label='Training score')
plt.plot(sizes, fold_scores_val.mean(axis=1), 'o-', label='Validation score')
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.title('Learning Curve')
plt.legend()
plt.grid(True)
plt.show()

Best Practices for Hyperparameter Tuning

1. Hierarchical Tuning

python
# Step 1: Coarse tuning
coarse_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 7, None]
}

coarse_search = GridSearchCV(rf, coarse_params, cv=3)
coarse_search.fit(X_train, y_train)

# Step 2: Fine tuning
fine_params = {
    'n_estimators': [80, 100, 120],
    'max_depth': [5, 7, 9],
    'min_samples_split': [2, 3, 4]
}

fine_search = GridSearchCV(rf, fine_params, cv=5)
fine_search.fit(X_train, y_train)

2. Early Stopping Strategy

python
from sklearn.ensemble import GradientBoostingClassifier

# Use validation set for early stopping: 20% of the training data is held
# out internally, and boosting stops once that score has not improved for
# 10 consecutive iterations.
gb = GradientBoostingClassifier(
    n_estimators=1000,        # upper bound; early stopping may fit fewer stages
    validation_fraction=0.2,
    n_iter_no_change=10,
    random_state=42
)

gb.fit(X_train, y_train)
# n_estimators_ reports how many stages were actually fitted
print(f"Optimal number of iterations: {gb.n_estimators_}")

3. Cross-Validation Strategy Selection

python
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit

# Stratified K-fold (suitable for classification problems: keeps class
# proportions roughly equal across folds)
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Time series cross-validation (suitable for time series data: training
# folds always precede the validation fold, so there is no look-ahead leakage)
ts_cv = TimeSeriesSplit(n_splits=5)

# A splitter object can be passed to cv= in place of an integer fold count
grid_search_stratified = GridSearchCV(
    rf, param_grid, cv=stratified_cv, scoring='f1_macro'
)

Multi-Objective Optimization

Sometimes we need to optimize multiple metrics simultaneously.

python
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score

# Define multiple scoring metrics; 'macro' averaging weights all classes equally
scoring = {
    'accuracy': 'accuracy',
    'f1': make_scorer(f1_score, average='macro'),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro')
}

# Multi-metric grid search: every combination is scored on all metrics
multi_score_search = GridSearchCV(
    rf, param_grid,
    cv=5,
    scoring=scoring,
    refit='f1',  # Use F1 score to select best model (refit is required with dict scoring)
    return_train_score=True
)

multi_score_search.fit(X_train, y_train)

# View results for all metrics at the index of the refit-selected model
results = multi_score_search.cv_results_
for metric in scoring.keys():
    print(f"Best {metric}: {results[f'mean_test_{metric}'][multi_score_search.best_index_]:.4f}")

Practical Example: Complete Tuning Process

python
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load data (binary classification dataset) and hold out 20% for final testing
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline: the scaler is refitted inside every CV fold, so no
# information leaks from validation folds into preprocessing
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid (note parameter naming in pipeline: <step>__<param>)
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__class_weight': [None, 'balanced']
}

# Execute grid search, optimizing ROC AUC instead of plain accuracy
grid_search = GridSearchCV(
    pipeline, param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Evaluate final model on the held-out test set
# NOTE(review): .score() here reports the classifier's default metric
# (accuracy), not roc_auc, so it is not directly comparable to best_score_
final_model = grid_search.best_estimator_
test_score = final_model.score(X_test, y_test)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Cross-validation score: {grid_search.best_score_:.4f}")
print(f"Test set score: {test_score:.4f}")

# Feature importance analysis (impurity-based importances from the forest step)
feature_importance = final_model.named_steps['classifier'].feature_importances_
feature_names = cancer.feature_names

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("\nTop 10 important features:")
print(importance_df.head(10))

Summary

Hyperparameter tuning is an important means to improve model performance:

  1. Choose appropriate search strategy: Grid search is suitable for small parameter spaces, random search for large parameter spaces
  2. Use cross-validation: Ensure reliability of results
  3. Analyze validation curves: Understand the impact of parameters on performance
  4. Consider computational cost: Balance search precision and time cost
  5. Avoid overfitting: Do not tune parameters on the test set

The next chapter will cover Model Selection Strategies, learning how to choose the most suitable model among multiple algorithms.

Content is for learning and research only.