PyTorch Convolutional Neural Networks

Introduction to Convolutional Neural Networks

The Convolutional Neural Network (CNN) is one of the most important architectures in deep learning and is especially well suited to image processing tasks. By combining convolutional layers, pooling layers, and fully connected layers, a CNN automatically learns hierarchical image features.

python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# Basic CNN components
conv_layer = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
pool_layer = nn.MaxPool2d(kernel_size=2, stride=2)
fc_layer = nn.Linear(in_features=1024, out_features=10)

Convolutional Layers in Detail

1. Basic Convolution Operation

python
# 2D convolution layer
conv2d = nn.Conv2d(
    in_channels=3,      # Input channels
    out_channels=64,    # Output channels (number of filters)
    kernel_size=3,      # Filter size
    stride=1,           # Stride
    padding=1,          # Padding
    dilation=1,         # Dilation
    groups=1,           # Grouped convolution
    bias=True           # Whether to use bias
)

# 1D convolution (for sequence data)
conv1d = nn.Conv1d(in_channels=100, out_channels=128, kernel_size=3)

# 3D convolution (for video data)
conv3d = nn.Conv3d(in_channels=3, out_channels=64, kernel_size=3)

# Transposed convolution (deconvolution)
conv_transpose = nn.ConvTranspose2d(64, 3, kernel_size=3, stride=2, padding=1)
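
A quick shape check confirms how these layers transform a tensor. Note that with stride=2 and padding=1 the transposed convolution above produces an output of size 2H - 1, so output_padding=1 is typically added when an exact 2x upsampling is wanted. A small sketch using the layers defined above:

python
# Standard convolution with padding=1 preserves spatial size
print(conv2d(torch.randn(1, 3, 32, 32)).shape)   # [1, 64, 32, 32]

# Transposed convolution: (H - 1) * stride - 2 * padding + kernel_size = 2H - 1
y = torch.randn(1, 64, 16, 16)
print(conv_transpose(y).shape)                    # [1, 3, 31, 31]

# Adding output_padding=1 gives an exact 2x upsampling
up = nn.ConvTranspose2d(64, 3, kernel_size=3, stride=2, padding=1, output_padding=1)
print(up(y).shape)                                # [1, 3, 32, 32]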

2. Convolution Output Size and Parameter Calculation

python
def conv_output_size(input_size, kernel_size, stride=1, padding=0, dilation=1):
    """Calculate convolution output size"""
    return (input_size + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1

# Example calculation
input_h, input_w = 32, 32
kernel_size = 3
stride = 1
padding = 1

output_h = conv_output_size(input_h, kernel_size, stride, padding)
output_w = conv_output_size(input_w, kernel_size, stride, padding)
print(f"Output size: {output_h} x {output_w}")  # 32 x 32

3. Different Types of Convolution

python
# Depthwise separable convolution
class DepthwiseSeparableConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        # Depthwise convolution
        self.depthwise = nn.Conv2d(
            in_channels, in_channels, kernel_size, stride, padding, groups=in_channels
        )
        # Pointwise convolution
        self.pointwise = nn.Conv2d(in_channels, out_channels, 1)
    
    def forward(self, x):
        x = self.depthwise(x)
        x = self.pointwise(x)
        return x

# Dilated convolution
dilated_conv = nn.Conv2d(64, 64, kernel_size=3, padding=2, dilation=2)

# Grouped convolution
group_conv = nn.Conv2d(64, 64, kernel_size=3, padding=1, groups=4)
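
Depthwise separable convolution is popular in lightweight networks largely because it needs far fewer parameters than a standard convolution with the same channel configuration. A quick comparison using the class defined above:

python
standard = nn.Conv2d(64, 128, kernel_size=3, padding=1)
separable = DepthwiseSeparableConv(64, 128)

count = lambda m: sum(p.numel() for p in m.parameters())
print(f"Standard conv:  {count(standard)}")   # 73856
print(f"Separable conv: {count(separable)}")  # 8960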

Pooling Layer

1. Common Pooling Operations

python
# Max pooling
max_pool = nn.MaxPool2d(kernel_size=2, stride=2)

# Average pooling
avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)

# Adaptive pooling (output fixed size)
adaptive_avg_pool = nn.AdaptiveAvgPool2d((7, 7))
adaptive_max_pool = nn.AdaptiveMaxPool2d((1, 1))  # Global pooling

# Test pooling effect
x = torch.randn(1, 64, 32, 32)
pooled = max_pool(x)
print(f"Before pooling: {x.shape}")      # [1, 64, 32, 32]
print(f"After pooling: {pooled.shape}")  # [1, 64, 16, 16]

2. Custom Pooling

python
class StochasticPool2d(nn.Module):
    """Stochastic pooling (Zeiler & Fergus, 2013): during training, one
    activation per window is sampled with probability proportional to its
    value; at test time the probability-weighted average is used."""
    def __init__(self, kernel_size, stride=None):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride or kernel_size
    
    def forward(self, x):
        k, s = self.kernel_size, self.stride
        # Unfold each pooling window into the last dimension: (N, C, H', W', k*k)
        patches = x.unfold(2, k, s).unfold(3, k, s).contiguous()
        n, c, h, w = patches.shape[:4]
        patches = patches.view(n, c, h, w, -1)
        # Sampling probabilities proportional to the (non-negative) activations
        probs = patches.clamp(min=0) + 1e-8
        probs = probs / probs.sum(dim=-1, keepdim=True)
        if self.training:
            # Sample one activation per window during training
            idx = torch.multinomial(probs.view(-1, k * k), 1).view(n, c, h, w, 1)
            return patches.gather(-1, idx).squeeze(-1)
        else:
            # Probability-weighted averaging during testing
            return (patches * probs).sum(dim=-1)
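
A brief check of the pooling behavior (the output shape is the same in both modes; only how values are selected differs):

python
pool = StochasticPool2d(kernel_size=2)
x = torch.rand(1, 8, 8, 8)

pool.train()
print(pool(x).shape)   # [1, 8, 4, 4], one activation sampled per window
pool.eval()
print(pool(x).shape)   # [1, 8, 4, 4], probability-weighted average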

Classic CNN Architectures

1. LeNet-5

python
class LeNet5(nn.Module):
    def __init__(self, num_classes=10):
        super(LeNet5, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5),
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, kernel_size=5),
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2, stride=2)
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(16 * 5 * 5, 120),
            nn.Tanh(),
            nn.Linear(120, 84),
            nn.Tanh(),
            nn.Linear(84, num_classes)
        )
    
    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x
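
LeNet-5 expects 32 x 32 grayscale input (MNIST's 28 x 28 images are commonly padded to 32 x 32), which is what makes the 16 * 5 * 5 flatten size work. A quick forward pass to verify:

python
lenet = LeNet5(num_classes=10)
print(lenet(torch.randn(1, 1, 32, 32)).shape)   # [1, 10]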

2. AlexNet

python
class AlexNet(nn.Module):
    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )
    
    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

3. VGG Network

python
class VGG(nn.Module):
    def __init__(self, features, num_classes=1000):
        super(VGG, self).__init__()
        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )
    
    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

def make_vgg_layers(cfg, batch_norm=False):
    """Build VGG feature layers"""
    layers = []
    in_channels = 3
    
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    
    return nn.Sequential(*layers)

# VGG configurations
vgg_configs = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}

def vgg16(num_classes=1000, batch_norm=True):
    return VGG(make_vgg_layers(vgg_configs['VGG16'], batch_norm), num_classes)
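
A quick sanity check on a standard 224 x 224 ImageNet-sized input (VGG16 has roughly 138 million parameters, most of them in the fully connected head):

python
model = vgg16(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
print(model(x).shape)                                # [1, 1000]
print(sum(p.numel() for p in model.parameters()))    # roughly 138 million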

Modern CNN Architectures

1. ResNet (Residual Network)

python
class BasicBlock(nn.Module):
    """ResNet basic block"""
    expansion = 1
    
    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes)
            )
    
    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)  # Residual connection
        out = F.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64
        
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)
    
    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)
    
    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])
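
This is the CIFAR-style ResNet-18 variant (3 x 3 stem, no initial max pooling), so it expects 32 x 32 inputs; the final F.avg_pool2d(out, 4) assumes the last feature map is 4 x 4:

python
resnet18 = ResNet18()
print(resnet18(torch.randn(1, 3, 32, 32)).shape)   # [1, 10]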

2. DenseNet (Densely Connected Network)

python
class DenseBlock(nn.Module):
    def __init__(self, in_channels, growth_rate, num_layers):
        super(DenseBlock, self).__init__()
        self.layers = nn.ModuleList()
        
        for i in range(num_layers):
            layer = nn.Sequential(
                nn.BatchNorm2d(in_channels + i * growth_rate),
                nn.ReLU(inplace=True),
                nn.Conv2d(in_channels + i * growth_rate, growth_rate, kernel_size=3, padding=1)
            )
            self.layers.append(layer)
    
    def forward(self, x):
        features = [x]
        for layer in self.layers:
            new_feature = layer(torch.cat(features, 1))
            features.append(new_feature)
        return torch.cat(features, 1)

class TransitionLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(TransitionLayer, self).__init__()
        self.transition = nn.Sequential(
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, out_channels, kernel_size=1),
            nn.AvgPool2d(kernel_size=2, stride=2)
        )
    
    def forward(self, x):
        return self.transition(x)
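
Each layer in a DenseBlock adds growth_rate channels, so the block outputs in_channels + num_layers * growth_rate channels; a transition layer then compresses the channels and halves the spatial resolution. A small shape check:

python
block = DenseBlock(in_channels=64, growth_rate=32, num_layers=4)
transition = TransitionLayer(in_channels=64 + 4 * 32, out_channels=128)

x = torch.randn(1, 64, 32, 32)
out = block(x)
print(out.shape)               # [1, 192, 32, 32]
print(transition(out).shape)   # [1, 128, 16, 16]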

Attention Mechanisms

1. Channel Attention (SE Module)

python
class SEBlock(nn.Module):
    """Squeeze-and-Excitation Block"""
    def __init__(self, channels, reduction=16):
        super(SEBlock, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels, bias=False),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y.expand_as(x)
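
The SE block is a drop-in module: it rescales each channel without changing the tensor shape, so it can be inserted after any convolution:

python
se = SEBlock(channels=64, reduction=16)
x = torch.randn(2, 64, 16, 16)
print(se(x).shape)   # [2, 64, 16, 16], same shape, channels reweighted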

2. Spatial Attention

python
class SpatialAttention(nn.Module):
    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()
        self.conv = nn.Conv2d(2, 1, kernel_size, padding=kernel_size//2, bias=False)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        avg_out = torch.mean(x, dim=1, keepdim=True)
        max_out, _ = torch.max(x, dim=1, keepdim=True)
        attention = torch.cat([avg_out, max_out], dim=1)
        attention = self.conv(attention)
        return x * self.sigmoid(attention)
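
Channel and spatial attention are often applied in sequence (channel first, then spatial), the pattern popularized by CBAM. A minimal sketch combining the two blocks defined above (CBAM proper also uses max pooling in its channel branch, so this is a simplification):

python
class CBAMBlock(nn.Module):
    """Sketch: channel attention followed by spatial attention."""
    def __init__(self, channels, reduction=16, kernel_size=7):
        super().__init__()
        self.channel_att = SEBlock(channels, reduction)
        self.spatial_att = SpatialAttention(kernel_size)
    
    def forward(self, x):
        return self.spatial_att(self.channel_att(x))

x = torch.randn(2, 64, 16, 16)
print(CBAMBlock(64)(x).shape)   # [2, 64, 16, 16]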

Practical Application Examples

1. CIFAR-10 Image Classification

python
# Data preprocessing
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

# Load data
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)

# Define model
class CIFAR10CNN(nn.Module):
    def __init__(self, num_classes=10):
        super(CIFAR10CNN, self).__init__()
        self.features = nn.Sequential(
            # First convolution block
            nn.Conv2d(3, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Dropout(0.25),
            
            # Second convolution block
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Dropout(0.25),
            
            # Third convolution block
            nn.Conv2d(128, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Dropout(0.25),
        )
        
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(256, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )
    
    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

# Train model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CIFAR10CNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
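
The snippet above only sets up the model, loss, and optimizer; below is a minimal training and evaluation loop sketch (the helper names, epoch count, and print format are illustrative choices, not fixed requirements):

python
def train_one_epoch(model, loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    return running_loss / len(loader.dataset)

@torch.no_grad()
def evaluate(model, loader, device):
    model.eval()
    correct = 0
    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        correct += (model(inputs).argmax(dim=1) == targets).sum().item()
    return correct / len(loader.dataset)

num_epochs = 20  # illustrative; adjust as needed
for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, trainloader, criterion, optimizer, device)
    test_acc = evaluate(model, testloader, device)
    print(f"Epoch {epoch + 1}: loss={train_loss:.4f}, test acc={test_acc:.4f}")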

Visualization and Analysis

1. Feature Map Visualization

python
def visualize_feature_maps(model, input_tensor, layer_name):
    """Visualize feature maps"""
    activation = {}
    
    def get_activation(name):
        def hook(model, input, output):
            activation[name] = output.detach()
        return hook
    
    # Register a hook on the requested layer and keep the handle
    handle = None
    for name, module in model.named_modules():
        if name == layer_name:
            handle = module.register_forward_hook(get_activation(name))
    
    # Forward pass
    model.eval()
    with torch.no_grad():
        _ = model(input_tensor)
    
    # Remove the hook so repeated calls do not accumulate hooks
    if handle is not None:
        handle.remove()
    
    # Get feature maps
    feature_maps = activation[layer_name]
    
    # Visualization
    fig, axes = plt.subplots(4, 8, figsize=(16, 8))
    for i in range(min(32, feature_maps.shape[1])):
        row, col = i // 8, i % 8
        axes[row, col].imshow(feature_maps[0, i].cpu(), cmap='viridis')
        axes[row, col].axis('off')
        axes[row, col].set_title(f'Channel {i}')
    
    plt.tight_layout()
    plt.show()

# Usage example
sample_input = torch.randn(1, 3, 32, 32).to(device)
visualize_feature_maps(model, sample_input, 'features.0')

2. Convolution Kernel Visualization

python
def visualize_conv_filters(model, layer_name):
    """Visualize convolution kernels"""
    for name, module in model.named_modules():
        if name == layer_name and isinstance(module, nn.Conv2d):
            weights = module.weight.data
            
            # Show only first 16 filters
            num_filters = min(16, weights.shape[0])
            fig, axes = plt.subplots(4, 4, figsize=(8, 8))
            
            for i in range(num_filters):
                row, col = i // 4, i % 4
                # If RGB input, show all channels
                if weights.shape[1] == 3:
                    filter_img = weights[i].permute(1, 2, 0)
                    filter_img = (filter_img - filter_img.min()) / (filter_img.max() - filter_img.min())
                    axes[row, col].imshow(filter_img.cpu())
                else:
                    axes[row, col].imshow(weights[i, 0].cpu(), cmap='gray')
                
                axes[row, col].axis('off')
                axes[row, col].set_title(f'Filter {i}')
            
            plt.tight_layout()
            plt.show()
            break

# Usage example
visualize_conv_filters(model, 'features.0')

Summary

Convolutional Neural Networks are the foundation of computer vision. This chapter introduced:

  1. Basic Concepts: Principles and implementations of convolution and pooling layers
  2. Classic Architectures: Important networks like LeNet, AlexNet, VGG, ResNet
  3. Modern Techniques: Advanced techniques like attention mechanisms and dense connections
  4. Practical Applications: Complete image classification project implementation
  5. Visualization Analysis: Methods for visualizing feature maps and convolution kernels

Mastering CNNs will lay a solid foundation for your further study of computer vision!
