PyTorch Convolutional Neural Networks

Introduction to Convolutional Neural Networks

The Convolutional Neural Network (CNN) is one of the most important architectures in deep learning and is especially well suited to image processing tasks. By combining convolutional layers, pooling layers, and fully connected layers, a CNN automatically learns hierarchical features from images.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# The three basic layer types that the rest of this chapter combines
conv_layer = nn.Conv2d(3, 16, 3, padding=1)  # 3 -> 16 channels, 3x3 kernel, padding 1
pool_layer = nn.MaxPool2d(2, 2)              # 2x2 window moved with stride 2
fc_layer = nn.Linear(1024, 10)               # 1024 input features -> 10 outputs

Detailed Convolutional Layer

1. Basic Convolution Operation

# 2D convolution layer with every constructor option written out:
#   3 input channels, 64 output channels (filters), 3x3 kernel,
#   then stride, padding, dilation, groups and bias at the values below.
conv2d = nn.Conv2d(3, 64, 3, stride=1, padding=1, dilation=1, groups=1, bias=True)

# 1D convolution (for sequence data)
conv1d = nn.Conv1d(100, 128, 3)

# 3D convolution (for video data)
conv3d = nn.Conv3d(3, 64, 3)

# Transposed convolution (deconvolution)
conv_transpose = nn.ConvTranspose2d(
    in_channels=64,
    out_channels=3,
    kernel_size=3,
    stride=2,
    padding=1,
)

2. Convolution Parameter Calculation

def conv_output_size(input_size, kernel_size, stride=1, padding=0, dilation=1):
    """Return the output length of a convolution along one spatial dimension.

    Args:
        input_size: Length of the input along the dimension.
        kernel_size: Length of the kernel along the dimension.
        stride: Step between consecutive kernel positions.
        padding: Amount of padding added to each side of the input.
        dilation: Spacing between kernel elements.

    Returns:
        The number of kernel positions, using floor division by ``stride``.
    """
    # A dilated kernel spans dilation * (kernel_size - 1) + 1 input positions.
    kernel_extent = dilation * (kernel_size - 1) + 1
    padded_size = input_size + 2 * padding
    return (padded_size - kernel_extent) // stride + 1

# Worked example: 3x3 kernel, stride 1, padding 1 applied to a 32x32 input
input_h = input_w = 32
kernel_size, stride, padding = 3, 1, 1

# Height and width are computed independently with the same helper.
output_h, output_w = (
    conv_output_size(size, kernel_size, stride, padding)
    for size in (input_h, input_w)
)
print(f"Output size: {output_h} x {output_w}")  # 32 x 32

3. Different Types of Convolution

# Depthwise separable convolution
class DepthwiseSeparableConv(nn.Module):
    """Per-channel spatial convolution followed by a 1x1 channel-mixing convolution.

    Args:
        in_channels: Number of channels in the input.
        out_channels: Number of channels produced by the pointwise convolution.
        kernel_size: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        padding: Padding of the depthwise convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        # groups == in_channels gives every input channel its own filter.
        self.depthwise = nn.Conv2d(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=in_channels,
        )
        # The 1x1 convolution combines the per-channel results across channels.
        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        """Apply the depthwise convolution, then the pointwise convolution."""
        return self.pointwise(self.depthwise(x))

# Dilated convolution: kernel taps are spaced 2 apart, with padding 2
dilated_conv = nn.Conv2d(
    in_channels=64, out_channels=64, kernel_size=3, padding=2, dilation=2
)

# Grouped convolution: the 64 channels are processed in 4 separate groups
group_conv = nn.Conv2d(
    in_channels=64, out_channels=64, kernel_size=3, padding=1, groups=4
)

Pooling Layer

1. Common Pooling Operations

# Max pooling over 2x2 windows
max_pool = nn.MaxPool2d(2, stride=2)

# Average pooling over 2x2 windows
avg_pool = nn.AvgPool2d(2, stride=2)

# Adaptive pooling: the output size is fixed, whatever the input size
adaptive_avg_pool = nn.AdaptiveAvgPool2d(output_size=(7, 7))
adaptive_max_pool = nn.AdaptiveMaxPool2d(output_size=(1, 1))  # Global pooling

# Try max pooling on a random batch and compare the shapes
x = torch.randn(1, 64, 32, 32)
pooled = max_pool(x)
print(f"Before pooling: {x.shape}")      # [1, 64, 32, 32]
print(f"After pooling: {pooled.shape}")  # [1, 64, 16, 16]

2. Custom Pooling

class StochasticPool2d(nn.Module):
    """Stochastic pooling over sliding windows of a 4D ``(N, C, H, W)`` input.

    In training mode one activation is sampled from every pooling window, with
    probability proportional to its (non-negative) value. In evaluation mode
    the window is average-pooled instead. Both modes use the same
    ``kernel_size`` / ``stride`` window geometry, so they return tensors of
    the same shape.

    Args:
        kernel_size: Window size, an int or an ``(h, w)`` pair.
        stride: Window stride, an int or an ``(h, w)`` pair. Defaults to
            ``kernel_size`` when omitted.
    """

    def __init__(self, kernel_size, stride=None):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride or kernel_size

    @staticmethod
    def _as_pair(value):
        """Return ``value`` as an ``(h, w)`` pair, duplicating a single int."""
        if isinstance(value, int):
            return value, value
        first, second = value
        return first, second

    def forward(self, x):
        """Pool ``x``: sample per window when training, average when evaluating."""
        if not self.training:
            # Use average pooling during testing
            return F.avg_pool2d(x, self.kernel_size, self.stride)

        kernel_h, kernel_w = self._as_pair(self.kernel_size)
        stride_h, stride_w = self._as_pair(self.stride)
        batch, channels, height, width = x.shape
        # Same floor-mode, zero-padding output size that avg_pool2d produces.
        out_h = (height - kernel_h) // stride_h + 1
        out_w = (width - kernel_w) // stride_w + 1

        # Fold channels into the batch so each channel is pooled on its own,
        # then lay every window out as one row of kernel_h * kernel_w values.
        windows = F.unfold(
            x.reshape(batch * channels, 1, height, width),
            kernel_size=(kernel_h, kernel_w),
            stride=(stride_h, stride_w),
        )
        windows = windows.transpose(1, 2).reshape(-1, kernel_h * kernel_w)

        # The sampling itself is not differentiable; gradients flow only
        # through the gather of the chosen activations below.
        with torch.no_grad():
            weights = windows.clamp_min(0)
            # Windows with no positive activation are sampled uniformly,
            # because multinomial needs a positive total weight per row.
            no_mass = weights.sum(dim=1, keepdim=True) <= 0
            weights = torch.where(no_mass, torch.ones_like(weights), weights)
            choice = torch.multinomial(weights, num_samples=1)

        sampled = windows.gather(1, choice)
        return sampled.reshape(batch, channels, out_h, out_w)

Classic CNN Architectures

1. LeNet-5

class LeNet5(nn.Module):
    """LeNet-5: two conv/tanh/average-pool stages followed by three linear layers.

    The classifier expects 16 * 5 * 5 features, which is what the feature
    extractor yields for single-channel 32x32 inputs.

    Args:
        num_classes: Size of the final output layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        conv_stack = [
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
            nn.Tanh(),
            nn.AvgPool2d(2, stride=2),
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            nn.Tanh(),
            nn.AvgPool2d(2, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        dense_stack = [
            nn.Linear(in_features=16 * 5 * 5, out_features=120),
            nn.Tanh(),
            nn.Linear(in_features=120, out_features=84),
            nn.Tanh(),
            nn.Linear(in_features=84, out_features=num_classes),
        ]
        self.classifier = nn.Sequential(*dense_stack)

    def forward(self, x):
        """Extract features, flatten them per sample, and classify."""
        feature_maps = self.features(x)
        flat = feature_maps.view(feature_maps.size(0), -1)
        return self.classifier(flat)

2. AlexNet

class AlexNet(nn.Module):
    """AlexNet: five convolutions, a fixed 6x6 adaptive pool, and a three-layer classifier.

    Args:
        num_classes: Size of the final output layer.
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        conv_layers = [
            # Stage 1: large strided kernel, then overlapping max pooling
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2),
            # Stage 2
            nn.Conv2d(in_channels=64, out_channels=192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2),
            # Stage 3: three 3x3 convolutions with a single pool at the end
            nn.Conv2d(in_channels=192, out_channels=384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2),
        ]
        self.features = nn.Sequential(*conv_layers)

        # Fixes the spatial size at 6x6 so the first linear layer always
        # receives 256 * 6 * 6 features.
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=(6, 6))

        dense_layers = [
            nn.Dropout(p=0.5),
            nn.Linear(in_features=256 * 6 * 6, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=4096, out_features=num_classes),
        ]
        self.classifier = nn.Sequential(*dense_layers)

    def forward(self, x):
        """Run features -> adaptive pool -> flatten -> classifier."""
        pooled = self.avgpool(self.features(x))
        return self.classifier(pooled.flatten(1))

3. VGG Network

class VGG(nn.Module):
    """VGG head: a caller-supplied feature extractor, a 7x7 adaptive pool, and a classifier.

    Args:
        features: Module used as the convolutional feature extractor. Its
            output must have 512 channels to match the first linear layer.
        num_classes: Size of the final output layer.
    """

    def __init__(self, features, num_classes=1000):
        super().__init__()
        self.features = features
        # Fixes the spatial size so the classifier input is always 512 * 7 * 7.
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=(7, 7))

        head = [
            nn.Linear(in_features=512 * 7 * 7, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=4096, out_features=num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Run features -> adaptive pool -> flatten -> classifier."""
        pooled = self.avgpool(self.features(x))
        return self.classifier(pooled.flatten(1))

def make_vgg_layers(cfg, batch_norm=False):
    """Translate a VGG configuration list into an ``nn.Sequential``.

    Args:
        cfg: Sequence whose entries are either ``'M'`` (a 2x2 max pool) or an
            int giving the output channels of a 3x3, padding-1 convolution.
        batch_norm: When true, insert ``BatchNorm2d`` between each
            convolution and its ReLU.

    Returns:
        The layers in ``cfg`` order; the first convolution takes 3 channels.
    """
    modules = []
    channels = 3

    for entry in cfg:
        if entry == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue

        modules.append(nn.Conv2d(channels, entry, kernel_size=3, padding=1))
        if batch_norm:
            modules.append(nn.BatchNorm2d(entry))
        modules.append(nn.ReLU(inplace=True))
        # The next convolution consumes what this one produced.
        channels = entry

    return nn.Sequential(*modules)

# VGG configurations, one stage per line.
# Integers are convolution output channels; 'M' marks a max-pooling layer.
vgg_configs = {
    'VGG11': [
        64, 'M',
        128, 'M',
        256, 256, 'M',
        512, 512, 'M',
        512, 512, 'M',
    ],
    'VGG16': [
        64, 64, 'M',
        128, 128, 'M',
        256, 256, 256, 'M',
        512, 512, 512, 'M',
        512, 512, 512, 'M',
    ],
    'VGG19': [
        64, 64, 'M',
        128, 128, 'M',
        256, 256, 256, 256, 'M',
        512, 512, 512, 512, 'M',
        512, 512, 512, 512, 'M',
    ],
}

def vgg16(num_classes=1000, batch_norm=True):
    """Build a VGG model from the 'VGG16' configuration.

    Args:
        num_classes: Size of the final output layer.
        batch_norm: Whether the feature layers include batch normalization.
    """
    feature_extractor = make_vgg_layers(vgg_configs['VGG16'], batch_norm)
    return VGG(feature_extractor, num_classes)

Modern CNN Architectures

1. ResNet (Residual Network)

class BasicBlock(nn.Module):
    """ResNet basic block: two 3x3 conv/batch-norm pairs plus a shortcut connection.

    Args:
        in_planes: Number of input channels.
        planes: Number of channels produced by both convolutions.
        stride: Stride of the first convolution (and of the projection
            shortcut, when one is needed).
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)

        # An empty Sequential is the identity; a 1x1 projection is used only
        # when the main path changes the resolution or the channel count.
        shape_changes = stride != 1 or in_planes != out_planes
        if shape_changes:
            projection = [
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            ]
        else:
            projection = []
        self.shortcut = nn.Sequential(*projection)

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(hidden))
        main += self.shortcut(x)  # Residual connection
        return F.relu(main)

class ResNet(nn.Module):
    """ResNet with a 3x3 stem, four residual stages, and a linear classifier.

    Args:
        block: Residual block class. It is called as
            ``block(in_planes, planes, stride)`` and must expose an integer
            ``expansion`` class attribute.
        num_blocks: Four ints, the number of blocks in each stage.
        num_classes: Size of the final output layer.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        # Running input width for the next block; updated by _make_layer.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for block_stride in strides:
            layers.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Global average pooling. For 32x32 inputs the final feature map is
        # 4x4, so this matches the former fixed 4x4 average pool. For other
        # resolutions it still yields the single value per channel that
        # self.linear expects, averaging over the whole feature map.
        out = F.adaptive_avg_pool2d(out, 1)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

def ResNet18():
    """Build a ResNet from BasicBlock with two blocks in each of the four stages."""
    blocks_per_stage = [2, 2, 2, 2]
    return ResNet(BasicBlock, blocks_per_stage)

2. DenseNet (Densely Connected Network)

class DenseBlock(nn.Module):
    """Dense block: every layer receives the concatenation of all earlier outputs.

    Args:
        in_channels: Number of channels in the block input.
        growth_rate: Number of channels each layer adds.
        num_layers: Number of BN -> ReLU -> 3x3 conv layers in the block.
    """

    def __init__(self, in_channels, growth_rate, num_layers):
        super().__init__()
        self.layers = nn.ModuleList()

        # Width of the concatenated input seen by the layer being built.
        width = in_channels
        for _ in range(num_layers):
            self.layers.append(
                nn.Sequential(
                    nn.BatchNorm2d(width),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(width, growth_rate, kernel_size=3, padding=1),
                )
            )
            width += growth_rate

    def forward(self, x):
        """Return the input and all layer outputs concatenated along the channel dim."""
        collected = [x]
        for layer in self.layers:
            stacked = torch.cat(collected, dim=1)
            collected.append(layer(stacked))
        return torch.cat(collected, dim=1)

class TransitionLayer(nn.Module):
    """Transition between dense blocks: BN -> ReLU -> 1x1 conv -> 2x2 average pool.

    Args:
        in_channels: Number of channels in the input.
        out_channels: Number of channels produced by the 1x1 convolution.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        steps = [
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            # The 1x1 convolution changes the channel count ...
            nn.Conv2d(in_channels, out_channels, kernel_size=1),
            # ... and the pooling halves height and width.
            nn.AvgPool2d(2, stride=2),
        ]
        self.transition = nn.Sequential(*steps)

    def forward(self, x):
        """Apply the transition pipeline to ``x``."""
        out = self.transition(x)
        return out

Attention Mechanisms

1. Channel Attention (SE Module)

class SEBlock(nn.Module):
    """Squeeze-and-Excitation block: rescale each channel by a learned gate.

    Each channel is averaged to a single value (squeeze). A two-layer
    bottleneck MLP ending in a sigmoid turns those values into one gate per
    channel (excitation), and the input is multiplied by that gate.

    Args:
        channels: Number of channels of the input.
        reduction: Factor by which the bottleneck is narrower than
            ``channels``. The bottleneck always keeps at least one unit, so
            ``channels < reduction`` does not create a zero-width layer.
    """

    def __init__(self, channels, reduction=16):
        super(SEBlock, self).__init__()
        # channels // reduction is 0 when channels < reduction; a zero-width
        # bottleneck would make the gate a constant 0.5 for every input.
        hidden = max(1, channels // reduction)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channels, hidden, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, channels, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``x`` with every channel multiplied by its gate."""
        b, c, _, _ = x.size()
        # Squeeze: (b, c, h, w) -> (b, c)
        y = self.avg_pool(x).view(b, c)
        # Excite: per-channel gate, reshaped so it broadcasts over h and w
        y = self.fc(y).view(b, c, 1, 1)
        return x * y.expand_as(x)

2. Spatial Attention

class SpatialAttention(nn.Module):
    """Spatial attention: gate every location using channel-wise mean and max maps.

    Args:
        kernel_size: Kernel size of the convolution that turns the two
            pooled maps into one attention map; padding is ``kernel_size // 2``.
    """

    def __init__(self, kernel_size=7):
        super().__init__()
        half = kernel_size // 2
        self.conv = nn.Conv2d(
            in_channels=2, out_channels=1, kernel_size=kernel_size, padding=half, bias=False
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` multiplied by a single-channel sigmoid attention map."""
        # Collapse the channel dimension in two ways and stack the results.
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        pooled = torch.cat((channel_mean, channel_max), dim=1)
        gate = self.sigmoid(self.conv(pooled))
        return x * gate

Practical Application Examples

1. CIFAR-10 Image Classification

# Data preprocessing
# Training pipeline: random crop and horizontal flip for augmentation,
# then tensor conversion and per-channel normalization.
transform_train = transforms.Compose([
    transforms.RandomCrop(size=32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.4914, 0.4822, 0.4465),
        std=(0.2023, 0.1994, 0.2010),
    ),
])

# Test pipeline: no augmentation, same normalization as training.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.4914, 0.4822, 0.4465),
        std=(0.2023, 0.1994, 0.2010),
    ),
])

# Load data (downloaded into ./data when not already present)
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)
trainloader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
)

testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)
testloader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=100,
    shuffle=False,
    num_workers=2,
)

# Define model
class CIFAR10CNN(nn.Module):
    """Three-stage convolutional classifier for 3-channel images.

    Each stage is conv-BN-ReLU twice, followed by a 2x2 max pool and dropout.
    The classifier pools globally, so it needs only the 256 channels of the
    last stage.

    Args:
        num_classes: Size of the final output layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # (input channels, output channels) of the three convolution blocks
        stage_widths = ((3, 64), (64, 128), (128, 256))
        feature_layers = []
        for in_channels, out_channels in stage_widths:
            feature_layers.extend(self._conv_stage(in_channels, out_channels))
        self.features = nn.Sequential(*feature_layers)

        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d(output_size=(1, 1)),
            nn.Flatten(),
            nn.Linear(in_features=256, out_features=512),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=512, out_features=num_classes),
        )

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return the eight layers of one convolution block, in order."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(p=0.25),
        ]

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        return self.classifier(self.features(x))

# Training setup: pick a device, then create the model, loss, and optimizer
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = CIFAR10CNN()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=0.001,
    weight_decay=0.01,
)

Visualization and Analysis

1. Feature Map Visualization

def visualize_feature_maps(model, input_tensor, layer_name):
    """Plot up to 32 channels of the output one layer produces for ``input_tensor``.

    A temporary forward hook records the layer output during one forward
    pass in evaluation mode. The hook is removed afterwards, and every
    module's training flag is put back to its previous value, so the call
    leaves ``model`` as it found it.

    Args:
        model: Module to run.
        input_tensor: Input handed to ``model`` for the forward pass.
        layer_name: Name of the submodule to inspect, as yielded by
            ``model.named_modules()`` (for example ``'features.0'``).

    Raises:
        KeyError: If ``model`` has no submodule named ``layer_name``, or the
            layer is not executed during the forward pass.
    """
    modules = dict(model.named_modules())
    if layer_name not in modules:
        raise KeyError(f"No module named {layer_name!r} in model")

    activation = {}

    def hook(module, inputs, output):
        activation[layer_name] = output.detach()

    # Remember each module's mode: model.eval() below flips all of them.
    previous_modes = [(module, module.training) for module in model.modules()]
    handle = modules[layer_name].register_forward_hook(hook)
    try:
        # Forward pass
        model.eval()
        with torch.no_grad():
            model(input_tensor)
    finally:
        # Without this, every call would leave one more hook on the layer.
        handle.remove()
        for module, was_training in previous_modes:
            module.training = was_training

    # Get feature maps
    feature_maps = activation[layer_name]

    # Visualization
    fig, axes = plt.subplots(4, 8, figsize=(16, 8))
    # Hide every cell first so unused cells do not show empty frames.
    for ax in axes.flat:
        ax.axis('off')
    num_shown = min(32, feature_maps.shape[1])
    for i in range(num_shown):
        ax = axes[i // 8, i % 8]
        ax.imshow(feature_maps[0, i].cpu(), cmap='viridis')
        ax.set_title(f'Channel {i}')

    plt.tight_layout()
    plt.show()

# Usage example: show the output of the first feature layer for a random input
sample_input = torch.randn(1, 3, 32, 32)
sample_input = sample_input.to(device)
visualize_feature_maps(model, sample_input, layer_name='features.0')

2. Convolution Kernel Visualization

def visualize_conv_filters(model, layer_name):
    """Plot the first 16 filters (at most) of one ``nn.Conv2d`` layer in a 4x4 grid.

    Filters with exactly 3 input channels are shown as RGB images after
    min-max scaling to [0, 1]. For any other channel count only the first
    input channel is shown, in grayscale. Nothing is plotted when
    ``layer_name`` does not name an ``nn.Conv2d`` submodule of ``model``.

    Args:
        model: Module that contains the layer.
        layer_name: Name of the layer as yielded by ``model.named_modules()``.
    """
    module = dict(model.named_modules()).get(layer_name)
    if not isinstance(module, nn.Conv2d):
        return

    weights = module.weight.detach().cpu()

    # Show only first 16 filters
    num_filters = min(16, weights.shape[0])
    fig, axes = plt.subplots(4, 4, figsize=(8, 8))
    # Hide every cell first so unused cells do not show empty frames.
    for ax in axes.flat:
        ax.axis('off')

    is_rgb = weights.shape[1] == 3
    for i in range(num_filters):
        ax = axes[i // 4, i % 4]
        if is_rgb:
            # (channels, h, w) -> (h, w, channels), the layout imshow expects
            filter_img = weights[i].permute(1, 2, 0)
            low, high = filter_img.min(), filter_img.max()
            if high > low:
                filter_img = (filter_img - low) / (high - low)
            else:
                # A constant filter has zero range; show mid-gray instead of
                # dividing by zero and plotting NaNs.
                filter_img = torch.full_like(filter_img, 0.5)
            ax.imshow(filter_img.numpy())
        else:
            ax.imshow(weights[i, 0].numpy(), cmap='gray')
        ax.set_title(f'Filter {i}')

    plt.tight_layout()
    plt.show()

# Usage example: show the kernels of the first feature layer
visualize_conv_filters(model, layer_name='features.0')

Summary

Convolutional Neural Networks are the foundation of computer vision. This chapter introduced:

  1. Basic Concepts: Principles and implementations of convolution and pooling layers
  2. Classic Architectures: Important networks like LeNet, AlexNet, VGG, ResNet
  3. Modern Techniques: Advanced techniques like attention mechanisms and dense connections
  4. Practical Applications: Complete image classification project implementation
  5. Visualization Analysis: Methods for visualizing feature maps and convolution kernels

Mastering CNNs will lay a solid foundation for your further study of computer vision!