PyTorch Convolutional Neural Networks
Introduction to Convolutional Neural Networks
Convolutional Neural Network (CNN) is one of the most important architectures in deep learning, especially suitable for image processing tasks. CNN can automatically learn hierarchical features of images through the combination of convolutional layers, pooling layers, and fully connected layers.
python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
# Basic CNN components
# 3x3 convolution: 3 input channels (RGB) -> 16 feature maps; padding=1 preserves H and W
conv_layer = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
# 2x2 max pooling with stride 2 halves the spatial resolution
pool_layer = nn.MaxPool2d(kernel_size=2, stride=2)
fc_layer = nn.Linear(in_features=1024, out_features=10)
Detailed Convolutional Layer
1. Basic Convolution Operation
python
# 2D convolution layer
conv2d = nn.Conv2d(
in_channels=3, # Input channels
out_channels=64, # Output channels (number of filters)
kernel_size=3, # Filter size
stride=1, # Stride
padding=1, # Padding
dilation=1, # Dilation (spacing between kernel taps; 1 = standard conv)
groups=1, # Grouped convolution (1 = every filter sees all input channels)
bias=True # Whether to use bias
)
# 1D convolution (for sequence data)
conv1d = nn.Conv1d(in_channels=100, out_channels=128, kernel_size=3)
# 3D convolution (for video data)
conv3d = nn.Conv3d(in_channels=3, out_channels=64, kernel_size=3)
# Transposed convolution (deconvolution)
# (upsamples: with stride=2 the output is roughly twice the input size)
conv_transpose = nn.ConvTranspose2d(64, 3, kernel_size=3, stride=2, padding=1)
2. Convolution Parameter Calculation
python
def conv_output_size(input_size, kernel_size, stride=1, padding=0, dilation=1):
    """Return the spatial output size of a convolution along one dimension.

    Implements the standard PyTorch formula:
    floor((input + 2*padding - dilation*(kernel - 1) - 1) / stride) + 1
    """
    # The "effective" kernel span once dilation spreads the taps apart.
    effective_kernel = dilation * (kernel_size - 1) + 1
    return (input_size + 2 * padding - effective_kernel) // stride + 1
# Example calculation
# A 3x3 kernel with stride 1 and padding 1 preserves the 32x32 input size.
input_h, input_w = 32, 32
kernel_size = 3
stride = 1
padding = 1
output_h = conv_output_size(input_h, kernel_size, stride, padding)
output_w = conv_output_size(input_w, kernel_size, stride, padding)
print(f"Output size: {output_h} x {output_w}") # 32 x 32
3. Different Types of Convolution
python
# Depthwise separable convolution
class DepthwiseSeparableConv(nn.Module):
    """Depthwise-separable convolution: a per-channel (depthwise) spatial
    conv followed by a 1x1 (pointwise) conv that mixes channels.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        # groups=in_channels -> each input channel is filtered independently.
        self.depthwise = nn.Conv2d(
            in_channels, in_channels, kernel_size, stride, padding, groups=in_channels
        )
        # The 1x1 conv combines per-channel outputs into out_channels maps.
        self.pointwise = nn.Conv2d(in_channels, out_channels, 1)

    def forward(self, x):
        return self.pointwise(self.depthwise(x))
# Dilated convolution
# dilation=2 spreads the 3x3 taps apart (effective 5x5 receptive field);
# padding=2 keeps the output the same size as the input.
dilated_conv = nn.Conv2d(64, 64, kernel_size=3, padding=2, dilation=2)
# Grouped convolution
# (groups=4 splits the 64 channels into 4 independent groups of 16)
group_conv = nn.Conv2d(64, 64, kernel_size=3, padding=1, groups=4)
Pooling Layer
1. Common Pooling Operations
python
# Max pooling
max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
# Average pooling
avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
# Adaptive pooling (output fixed size)
# Adaptive variants pick kernel/stride automatically to hit the target size.
adaptive_avg_pool = nn.AdaptiveAvgPool2d((7, 7))
adaptive_max_pool = nn.AdaptiveMaxPool2d((1, 1)) # Global pooling
# Test pooling effect
x = torch.randn(1, 64, 32, 32)
pooled = max_pool(x)
print(f"Before pooling: {x.shape}") # [1, 64, 32, 32]
print(f"After pooling: {pooled.shape}") # [1, 64, 16, 16]
2. Custom Pooling
python
class StochasticPool2d(nn.Module):
"""Stochastic pooling"""
def __init__(self, kernel_size, stride=None):
super().__init__()
self.kernel_size = kernel_size
self.stride = stride or kernel_size
def forward(self, x):
if self.training:
# Use stochastic pooling during training
return F.adaptive_avg_pool2d(x,
(x.size(2) // self.stride, x.size(3) // self.stride))
else:
# Use average pooling during testing
return F.avg_pool2d(x, self.kernel_size, self.stride)Classic CNN Architectures
1. LeNet-5
python
class LeNet5(nn.Module):
def __init__(self, num_classes=10):
super(LeNet5, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(1, 6, kernel_size=5),
nn.Tanh(),
nn.AvgPool2d(kernel_size=2, stride=2),
nn.Conv2d(6, 16, kernel_size=5),
nn.Tanh(),
nn.AvgPool2d(kernel_size=2, stride=2)
)
self.classifier = nn.Sequential(
nn.Linear(16 * 5 * 5, 120),
nn.Tanh(),
nn.Linear(120, 84),
nn.Tanh(),
nn.Linear(84, num_classes)
)
def forward(self, x):
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
return x2. AlexNet
python
class AlexNet(nn.Module):
def __init__(self, num_classes=1000):
super(AlexNet, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(64, 192, kernel_size=5, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(192, 384, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(384, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
)
self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
self.classifier = nn.Sequential(
nn.Dropout(),
nn.Linear(256 * 6 * 6, 4096),
nn.ReLU(inplace=True),
nn.Dropout(),
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Linear(4096, num_classes),
)
def forward(self, x):
x = self.features(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.classifier(x)
return x3. VGG Network
python
class VGG(nn.Module):
    """Generic VGG: a configurable conv feature extractor (`features`)
    followed by the standard three-layer fully connected head."""

    def __init__(self, features, num_classes=1000):
        super(VGG, self).__init__()
        self.features = features
        # Force a 7x7 spatial size so the head works for any input resolution.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        out = self.features(x)
        out = self.avgpool(out)
        out = torch.flatten(out, 1)
        return self.classifier(out)
def make_vgg_layers(cfg, batch_norm=False):
    """Build the VGG feature extractor from a config list.

    Each cfg entry is either 'M' (a 2x2 max pool) or an int (a 3x3 conv to
    that many channels, optionally followed by BatchNorm, then ReLU).
    """
    layers = []
    channels = 3  # VGG takes RGB input
    for spec in cfg:
        if spec == 'M':
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        layers.append(nn.Conv2d(channels, spec, kernel_size=3, padding=1))
        if batch_norm:
            layers.append(nn.BatchNorm2d(spec))
        layers.append(nn.ReLU(inplace=True))
        channels = spec
    return nn.Sequential(*layers)
# VGG configurations
# Each list describes the feature extractor: ints are 3x3-conv output
# channel counts, 'M' marks a 2x2 max pool that halves the spatial size.
vgg_configs = {
'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}
def vgg16(num_classes=1000, batch_norm=True):
return VGG(make_vgg_layers(vgg_configs['VGG16'], batch_norm), num_classes)Modern CNN Architectures
1. ResNet (Residual Network)
python
class BasicBlock(nn.Module):
    """ResNet basic residual block: two 3x3 convs plus a skip connection."""

    # Output channels = planes * expansion (1 for the basic block).
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        # Identity shortcut, replaced by a 1x1 projection when the
        # shape (stride or channel count) changes.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes),
            )

    def forward(self, x):
        residual = self.shortcut(x)  # skip path
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Add the residual, then apply the final activation.
        return F.relu(out + residual)
class ResNet(nn.Module):
    """CIFAR-style ResNet: 3x3 stem, four residual stages, 4x4 average
    pool, then a linear head.

    `block` is the residual block class (e.g. BasicBlock) and `num_blocks`
    gives the number of blocks in each of the four stages.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64
        # Stem: 3x3 conv, stride 1 (CIFAR images are too small for the
        # ImageNet-style 7x7/stride-2 stem).
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        # Only the first block of a stage downsamples; the rest use stride 1.
        layers = []
        for s in [stride] + [1] * (num_blocks - 1):
            layers.append(block(self.in_planes, planes, s))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        return self.linear(out)
def ResNet18():
return ResNet(BasicBlock, [2, 2, 2, 2])2. DenseNet (Densely Connected Network)
python
class DenseBlock(nn.Module):
    """DenseNet block: every layer receives the concatenation of all
    previous feature maps and contributes `growth_rate` new channels."""

    def __init__(self, in_channels, growth_rate, num_layers):
        super(DenseBlock, self).__init__()
        self.layers = nn.ModuleList()
        for i in range(num_layers):
            # Layer i sees in_channels + i * growth_rate input channels.
            self.layers.append(nn.Sequential(
                nn.BatchNorm2d(in_channels + i * growth_rate),
                nn.ReLU(inplace=True),
                nn.Conv2d(in_channels + i * growth_rate, growth_rate, kernel_size=3, padding=1),
            ))

    def forward(self, x):
        features = [x]
        for layer in self.layers:
            features.append(layer(torch.cat(features, 1)))
        # Output channels: in_channels + num_layers * growth_rate.
        return torch.cat(features, 1)
class TransitionLayer(nn.Module):
def __init__(self, in_channels, out_channels):
super(TransitionLayer, self).__init__()
self.transition = nn.Sequential(
nn.BatchNorm2d(in_channels),
nn.ReLU(inplace=True),
nn.Conv2d(in_channels, out_channels, kernel_size=1),
nn.AvgPool2d(kernel_size=2, stride=2)
)
def forward(self, x):
return self.transition(x)Attention Mechanisms
1. Channel Attention (SE Module)
python
class SEBlock(nn.Module):
"""Squeeze-and-Excitation Block"""
def __init__(self, channels, reduction=16):
super(SEBlock, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Linear(channels, channels // reduction, bias=False),
nn.ReLU(inplace=True),
nn.Linear(channels // reduction, channels, bias=False),
nn.Sigmoid()
)
def forward(self, x):
b, c, _, _ = x.size()
y = self.avg_pool(x).view(b, c)
y = self.fc(y).view(b, c, 1, 1)
return x * y.expand_as(x)2. Spatial Attention
python
class SpatialAttention(nn.Module):
def __init__(self, kernel_size=7):
super(SpatialAttention, self).__init__()
self.conv = nn.Conv2d(2, 1, kernel_size, padding=kernel_size//2, bias=False)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
avg_out = torch.mean(x, dim=1, keepdim=True)
max_out, _ = torch.max(x, dim=1, keepdim=True)
attention = torch.cat([avg_out, max_out], dim=1)
attention = self.conv(attention)
return x * self.sigmoid(attention)Practical Application Examples
1. CIFAR-10 Image Classification
python
# Data preprocessing
# Training: random crop + horizontal flip for augmentation, then
# normalization with the standard CIFAR-10 per-channel mean and std.
transform_train = transforms.Compose([
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])
# Test: no augmentation, same normalization as training.
transform_test = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])
# Load data
# download=True fetches CIFAR-10 into ./data on first run (network access).
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)
# Test split: larger effective order stability — no shuffling needed.
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)
# Define model
class CIFAR10CNN(nn.Module):
    """VGG-style CNN for CIFAR-10: three conv blocks (64/128/256 channels),
    each halving the spatial size, then a global-average-pooled FC head."""

    def __init__(self, num_classes=10):
        super(CIFAR10CNN, self).__init__()
        self.features = nn.Sequential(
            # Block 1: 32x32 -> 16x16, 64 channels
            nn.Conv2d(3, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Dropout(0.25),
            # Block 2: 16x16 -> 8x8, 128 channels
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Dropout(0.25),
            # Block 3: 8x8 -> 4x4, 256 channels
            nn.Conv2d(128, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Dropout(0.25),
        )
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),  # global average pool -> 256x1x1
            nn.Flatten(),
            nn.Linear(256, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        return self.classifier(self.features(x))
# Train model
# Select GPU when available; fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CIFAR10CNN().to(device)
# CrossEntropyLoss expects raw logits; the model applies no softmax.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
Visualization and Analysis
1. Feature Map Visualization
python
def visualize_feature_maps(model, input_tensor, layer_name):
    """Plot up to 32 channels of the activation produced by `layer_name`.

    Runs a single no-grad forward pass with a temporary forward hook on the
    named module, then shows the captured channels in a 4x8 grid.

    Args:
        model: the nn.Module to inspect.
        input_tensor: one input batch for the forward pass.
        layer_name: dotted module name as reported by `model.named_modules()`.

    Raises:
        KeyError: if `layer_name` does not match any module in `model`.
    """
    activation = {}

    def capture(module, inputs, output):
        # detach so the stored tensor does not keep the autograd graph alive
        activation[layer_name] = output.detach()

    target = dict(model.named_modules()).get(layer_name)
    if target is None:
        # Fail loudly up front instead of at the dict lookup below.
        raise KeyError(f"layer {layer_name!r} not found in model")

    # Register the hook temporarily and always remove it: the original code
    # leaked the hook, so repeated calls stacked duplicate hooks on the layer.
    handle = target.register_forward_hook(capture)
    try:
        model.eval()
        with torch.no_grad():
            _ = model(input_tensor)
    finally:
        handle.remove()

    feature_maps = activation[layer_name]
    fig, axes = plt.subplots(4, 8, figsize=(16, 8))
    for i in range(min(32, feature_maps.shape[1])):
        row, col = i // 8, i % 8
        axes[row, col].imshow(feature_maps[0, i].cpu(), cmap='viridis')
        axes[row, col].axis('off')
        axes[row, col].set_title(f'Channel {i}')
    plt.tight_layout()
    plt.show()
# Usage example
# 'features.0' names the first Conv2d layer inside CIFAR10CNN.features.
sample_input = torch.randn(1, 3, 32, 32).to(device)
visualize_feature_maps(model, sample_input, 'features.0')
2. Convolution Kernel Visualization
python
def visualize_conv_filters(model, layer_name):
    """Show up to 16 kernels of the named Conv2d layer in a 4x4 grid.

    Kernels with 3 input channels are rendered in color after min-max
    normalization; otherwise only the first input channel is shown in gray.
    """
    for name, module in model.named_modules():
        if name != layer_name or not isinstance(module, nn.Conv2d):
            continue
        weights = module.weight.data
        num_filters = min(16, weights.shape[0])  # first 16 filters only
        fig, axes = plt.subplots(4, 4, figsize=(8, 8))
        for i in range(num_filters):
            ax = axes[i // 4, i % 4]
            if weights.shape[1] == 3:
                # Color kernel: permute to HWC and rescale to [0, 1] for display.
                img = weights[i].permute(1, 2, 0)
                img = (img - img.min()) / (img.max() - img.min())
                ax.imshow(img.cpu())
            else:
                ax.imshow(weights[i, 0].cpu(), cmap='gray')
            ax.axis('off')
            ax.set_title(f'Filter {i}')
        plt.tight_layout()
        plt.show()
        break
# Usage example
visualize_conv_filters(model, 'features.0')
Summary
Convolutional Neural Networks are the foundation of computer vision. This chapter introduced:
- Basic Concepts: Principles and implementations of convolution and pooling layers
- Classic Architectures: Important networks like LeNet, AlexNet, VGG, ResNet
- Modern Techniques: Advanced techniques like attention mechanisms and dense connections
- Practical Applications: Complete image classification project implementation
- Visualization Analysis: Methods for visualizing feature maps and convolution kernels
Mastering CNN will lay a solid foundation for your further learning in computer vision!