#PyTorch Neural Network Fundamentals
#Introduction to torch.nn Module
torch.nn is the core module in PyTorch for building neural networks, providing various components such as layers, activation functions, and loss functions.
import torch
import torch.nn as nn
import torch.nn.functional as F
# A few of the core building blocks offered by torch.nn
linear = nn.Linear(in_features=10, out_features=5)  # fully connected: 10 inputs -> 5 outputs
conv = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)  # 3 -> 16 channels, 3x3 kernel
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)  # 2 stacked layers, hidden size 20
#Building Your First Neural Network
#1. Using nn.Sequential
import torch
import torch.nn as nn
# Quickest way to define a network: chain the layers inside nn.Sequential
model = nn.Sequential(
    nn.Linear(in_features=784, out_features=128),  # input -> first hidden layer
    nn.ReLU(),                                      # non-linearity
    nn.Linear(in_features=128, out_features=64),   # second hidden layer
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=10),    # hidden -> output scores
)
# Smoke test: push one random batch (32 samples, 784 features each) through it
x = torch.randn((32, 784))
output = model(x)
print(f"Output shape: {output.shape}")  # [32, 10]
#2. Custom nn.Module
class MLP(nn.Module):
    """Three-layer perceptron: two ReLU hidden layers with dropout, then a linear output.

    Args:
        input_size: Number of features in each input sample.
        hidden_size: Width of both hidden layers.
        output_size: Number of values produced per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Map a batch ``x`` whose last dimension is ``input_size`` to ``output_size`` values."""
        # Both hidden layers share the same pattern: linear -> ReLU -> dropout
        for hidden_layer in (self.fc1, self.fc2):
            x = self.dropout(self.relu(hidden_layer(x)))
        # The output layer is purely linear (no activation, no dropout)
        return self.fc3(x)
# Instantiate the network and print its layer layout
model = MLP(784, 128, 10)
print(model)
# Count parameters: every element, then only the elements that receive gradients
total_params = sum(map(torch.Tensor.numel, model.parameters()))
trainable_params = sum(
    p.numel() for p in filter(lambda p: p.requires_grad, model.parameters())
)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")
#Common Neural Network Layers
#1. Linear Layer
# Dense (fully connected) layer with an explicit bias term
linear = nn.Linear(100, 50, bias=True)
# Inspect the learnable tensors
print(f"Weight shape: {linear.weight.shape}")  # [50, 100]
print(f"Bias shape: {linear.bias.shape}")  # [50]
# Replace the default initialization in place
nn.init.xavier_uniform_(linear.weight)
nn.init.constant_(linear.bias, 0.0)
#2. Convolutional Layer
# 2D convolution: 3 -> 16 channels; a 3x3 kernel with stride 1 and padding 1
# keeps the spatial size unchanged
conv2d = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=True)
# 1D convolution (for sequential data)
conv1d = nn.Conv1d(10, 20, 3)
# Transposed convolution (often called deconvolution)
conv_transpose = nn.ConvTranspose2d(
    in_channels=16,
    out_channels=3,
    kernel_size=3,
    stride=2,
    padding=1,
)
# Run a random image batch through the 2D convolution
x = torch.randn((32, 3, 64, 64))  # [batch, channels, height, width]
output = conv2d(x)
print(f"Convolution output shape: {output.shape}")  # [32, 16, 64, 64]
#3. Pooling Layer
# Max pooling: 2x2 window, stride 2 (halves height and width)
maxpool = nn.MaxPool2d(2, 2)
# Average pooling with the same window and stride
avgpool = nn.AvgPool2d(2, 2)
# Adaptive pooling: the output size is fixed regardless of the input size
adaptive_pool = nn.AdaptiveAvgPool2d(output_size=(7, 7))
# Global average pooling: one value per channel
global_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
# Try max pooling on a random feature map
x = torch.randn((32, 16, 64, 64))
pooled = maxpool(x)
print(f"After pooling shape: {pooled.shape}")  # [32, 16, 32, 32]
#4. Recurrent Layers
# LSTM: 100 input features, 128 hidden units, 2 stacked layers.
# batch_first=True means inputs are (batch, seq, feature); dropout=0.1 is the
# dropout probability; the layer is unidirectional.
lstm = nn.LSTM(100, 128, 2, batch_first=True, dropout=0.1, bidirectional=False)
# GRU with the same sizes
gru = nn.GRU(100, 128, 2, batch_first=True)
# Plain (Elman) RNN with the same sizes
rnn = nn.RNN(100, 128, 2, batch_first=True)
# Run a random sequence batch through the LSTM
x = torch.randn((32, 50, 100))  # [batch, seq_len, input_size]
output, (hidden, cell) = lstm(x)
print(f"LSTM output shape: {output.shape}")  # [32, 50, 128]
print(f"Hidden state shape: {hidden.shape}")  # [2, 32, 128]
#5. Attention Mechanism
# Multi-head attention: 512-dim embeddings split across 8 heads, batch-first inputs
attention = nn.MultiheadAttention(512, 8, dropout=0.1, batch_first=True)
# Random query / key / value tensors, each [batch, seq_len, embed_dim]
query, key, value = (torch.randn(32, 50, 512) for _ in range(3))
attn_output, attn_weights = attention(query, key, value)
print(f"Attention output shape: {attn_output.shape}")  # [32, 50, 512]
#Activation Functions
# Module-style activations (usable as layers inside a model)
relu = nn.ReLU()
leaky_relu = nn.LeakyReLU(0.01)  # slope applied to negative inputs
elu = nn.ELU()
gelu = nn.GELU()
swish = nn.SiLU()  # SiLU is also known as Swish
tanh = nn.Tanh()
sigmoid = nn.Sigmoid()
softmax = nn.Softmax(-1)  # normalize over the last dimension
# The same operations through the stateless functional API
x = torch.randn((10,))
y1 = F.relu(input=x)
y2 = F.gelu(input=x)
y3 = F.softmax(input=x, dim=0)
print(f"ReLU output: {y1}")
print(f"GELU output: {y2}")
print(f"Softmax output: {y3}")
#Regularization Techniques
#1. Dropout
# Dropout layer: in training mode each element is zeroed with probability p and
# the survivors are scaled by 1 / (1 - p); in evaluation mode it is the identity.
dropout = nn.Dropout(p=0.5)
# Sample input, created here so that the snippet runs on its own
x = torch.randn(10)
# Training/evaluation mode belongs to the module being called, so it has to be
# switched on the dropout layer itself (or on a parent module that contains it).
# Calling train()/eval() on an unrelated model would leave this layer unchanged.
dropout.train()  # Training mode, dropout is active
output_train = dropout(x)
dropout.eval()  # Evaluation mode, dropout is a no-op
output_eval = dropout(x)
#2. Batch Normalization
# Batch norm over 128 features (typically placed after fully connected layers)
bn1d = nn.BatchNorm1d(128)
# Batch norm over 64 channels (typically placed after convolutional layers)
bn2d = nn.BatchNorm2d(64)
# Layer norm over a trailing dimension of size 128
ln = nn.LayerNorm(128)
# Group norm: 64 channels split into 8 groups
gn = nn.GroupNorm(8, 64)
# Instance norm over 64 channels
in_norm = nn.InstanceNorm2d(64)
#3. Weight Initialization
def init_weights(m):
    """Initialize the parameters of a single module in place.

    Intended to be passed to ``Module.apply``. Modules of any other type are
    left untouched.

    - ``nn.Linear``: Xavier-uniform weights, zero bias.
    - ``nn.Conv2d``: Kaiming-normal weights (fan_out, ReLU gain), zero bias.
    - ``nn.BatchNorm2d``: weight set to one, bias set to zero.

    Args:
        m: The module to initialize.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        # bias is None when the layer was built with bias=False
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.BatchNorm2d):
        # weight and bias are both None when the layer was built with affine=False
        if m.weight is not None:
            nn.init.ones_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
# Apply initialization: hand init_weights to model.apply so that it is invoked on
# the model's modules (init_weights itself ignores module types it does not handle)
model.apply(init_weights)
#Complex Network Architectures
#1. Residual Connection
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers wrapped by a skip connection.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the block.
        stride: Stride of the first convolution (and of the projection shortcut).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # The skip path is the identity unless the main path changes the tensor's
        # shape; in that case a 1x1 conv + batch norm projects the input to match.
        shape_changes = stride != 1 or in_channels != out_channels
        if shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        skip = self.shortcut(x)
        main = self.conv1(x)
        main = F.relu(self.bn1(main))
        main = self.bn2(self.conv2(main))
        # Residual connection: add the skip path before the final activation
        return F.relu(main + skip)
#2. Attention Mechanism
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        embed_dim: Size of the last dimension of the input and of the output.
        num_heads: Number of attention heads; must divide ``embed_dim`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embed_dim``. Without this check the mismatch would only surface
            later as a reshape error inside ``forward``.
    """

    def __init__(self, embed_dim, num_heads):
        super(SelfAttention, self).__init__()
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.out = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Attend over the sequence dimension of ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, embed_dim).

        Returns:
            Tensor of shape (batch, seq_len, embed_dim).
        """
        batch_size, seq_len, embed_dim = x.size()
        # Compute Q, K, V and split the embedding into heads:
        # (batch, seq_len, embed_dim) -> (batch, num_heads, seq_len, head_dim)
        Q = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        # Attention scores, scaled by sqrt(head_dim): (batch, num_heads, seq_len, seq_len)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = F.softmax(scores, dim=-1)
        # Weighted sum of the values, then merge the heads back together;
        # contiguous() is needed because view() cannot reshape the transposed tensor
        attn_output = torch.matmul(attn_weights, V)
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            batch_size, seq_len, embed_dim
        )
        return self.out(attn_output)
#Model Management
#1. Parameter Access and Modification
# List every parameter together with its shape
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")
# Parameters of one particular layer
linear_layer = model.fc1
print(f"Weight: {linear_layer.weight.shape}")
print(f"Bias: {linear_layer.bias.shape}")
# Freeze the whole model
for param in model.parameters():
    param.requires_grad_(False)
# Train only the last layer: parameters whose name contains 'fc3' get gradients,
# all others stay frozen
for name, param in model.named_parameters():
    param.requires_grad = 'fc3' in name
#2. Model State Management
# Switch between the two module modes
model.train()  # training mode: dropout is active, batch norm uses batch statistics
model.eval()  # evaluation mode: dropout is off, batch norm uses its running statistics
# Query the current mode
print(f"Is model in training mode: {model.training}")
# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)
# Confirm where the parameters now live
print(f"Model device: {next(model.parameters()).device}")
#3. Model Saving and Loading
# Save entire model (pickles the whole module object)
torch.save(model, 'model.pth')
# Save only parameters (recommended)
torch.save(model.state_dict(), 'model_params.pth')
# Load entire model. The file contains an arbitrary pickled object, which
# torch.load refuses to read when weights_only=True (the default since
# PyTorch 2.6), so it has to be disabled explicitly.
# SECURITY: unpickling can execute arbitrary code - only load files you trust.
model = torch.load('model.pth', weights_only=False)
# Load parameters
model = MLP(784, 128, 10)  # First create model architecture
# map_location='cpu' makes the load work even if the tensors were saved from a
# GPU; weights_only=True restricts unpickling to tensors and plain containers.
model.load_state_dict(
    torch.load('model_params.pth', map_location='cpu', weights_only=True)
)
# Stand-in training state so that this snippet runs on its own; in a real
# training loop these values come from the loop itself.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
epoch = 0
loss = 0.0
# Save training state
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch,
    'loss': loss,
}
torch.save(checkpoint, 'checkpoint.pth')
#Practical Application Examples
#1. Image Classification Network
class ImageClassifier(nn.Module):
    """Small CNN for 3-channel images: three conv stages followed by an MLP head.

    Args:
        num_classes: Number of output scores per image.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Each stage is conv(3x3, padding 1) -> batch norm -> ReLU -> 2x2 max pool;
        # the channel count grows 3 -> 32 -> 64 -> 128.
        stage_layers = []
        for c_in, c_out in ((3, 32), (32, 64), (64, 128)):
            stage_layers += [
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=2, stride=2),
            ]
        self.features = nn.Sequential(*stage_layers)
        # Head: global average pool to one value per channel, then a 2-layer MLP
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return class scores for an image batch ``x`` of shape (batch, 3, H, W)."""
        return self.classifier(self.features(x))
# Smoke test: classify a random batch of CIFAR-10-sized images (3 x 32 x 32)
model = ImageClassifier(10)
x = torch.randn((32, 3, 32, 32))
output = model(x)
print(f"Classification output shape: {output.shape}")  # [32, 10]
#2. Sequence-to-Sequence Model
class Seq2Seq(nn.Module):
    """LSTM encoder-decoder: the encoder's final state initializes the decoder.

    Args:
        input_size: Feature size of each source step.
        hidden_size: Hidden size shared by encoder and decoder.
        output_size: Feature size of each target step and of each output step.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        dropout: Dropout probability between stacked LSTM layers. It is ignored
            when ``num_layers`` is 1, where nn.LSTM would otherwise warn that
            the setting has no effect.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=2, dropout=0.1):
        super(Seq2Seq, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Inter-layer dropout only exists when there are at least two layers
        rnn_dropout = dropout if num_layers > 1 else 0.0
        # Encoder
        self.encoder = nn.LSTM(input_size, hidden_size, num_layers,
                               batch_first=True, dropout=rnn_dropout)
        # Decoder (consumes target steps, which have output_size features)
        self.decoder = nn.LSTM(output_size, hidden_size, num_layers,
                               batch_first=True, dropout=rnn_dropout)
        # Output layer
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, src, tgt):
        """Encode ``src`` and decode ``tgt`` from the encoder's final state.

        Args:
            src: Source batch of shape (batch, src_len, input_size).
            tgt: Target batch of shape (batch, tgt_len, output_size), fed to the
                decoder as its input sequence.

        Returns:
            Tensor of shape (batch, tgt_len, output_size).
        """
        # Encode: only the final (hidden, cell) state is needed
        _, (hidden, cell) = self.encoder(src)
        # Decode, starting from the encoder state
        decoder_output, _ = self.decoder(tgt, (hidden, cell))
        # Project every decoder step to the output size
        return self.out(decoder_output)
#Debugging Tips
#1. Inspect Model Architecture
# Option A: the third-party torchsummary package
from torchsummary import summary
# Print model summary (input_size here has three entries; compare with the
# four-entry shape passed to torchinfo below - presumably this one omits the
# batch dimension, confirm against the torchsummary docs)
summary(model, input_size=(3, 32, 32))
# Or use torchinfo
# NOTE: this import rebinds the name `summary`, so from this line on `summary`
# refers to torchinfo's function, not torchsummary's
from torchinfo import summary
summary(model, input_size=(32, 3, 32, 32))
#2. Gradient Check
# Check if gradients are normal
def check_gradients(model):
    """Print the gradient norm of every named parameter of ``model``.

    Parameters whose ``grad`` is ``None`` are reported as having no gradient.
    """
    for name, param in model.named_parameters():
        gradient = param.grad
        if gradient is None:
            print(f"{name}: No gradient")
            continue
        grad_norm = gradient.norm()
        print(f"{name}: Gradient norm = {grad_norm:.6f}")
#3. Visualize Network
import torch.nn as nn
import matplotlib.pyplot as plt
def visualize_weights(model, layer_name):
    """Show the first filter of the Conv2d layer named ``layer_name``.

    Does nothing when ``model`` has no module with that name or when the named
    module is not an ``nn.Conv2d``.

    Args:
        model: Module whose ``named_modules()`` are searched.
        layer_name: Name of the layer as reported by ``named_modules()``.
    """
    target = dict(model.named_modules()).get(layer_name)
    if not isinstance(target, nn.Conv2d):
        return
    # First input-channel slice of the first output filter, moved to the CPU
    first_filter = target.weight.data[0, 0].cpu()
    plt.imshow(first_filter, cmap='gray')
    plt.title(f'{layer_name} - First Filter')
    plt.show()
#Summary
Neural networks are the core of deep learning, and PyTorch provides rich tools to build various network architectures:
- Basic Components: Master various layers, activation functions, and regularization techniques
- Model Building: Learn to use Sequential and custom Module
- Advanced Architectures: Understand residual connections, attention mechanisms, and modern techniques
- Model Management: Master parameter access, state management, and saving/loading
- Debugging Tips: Learn to inspect model architecture, gradients, weights, etc.
This knowledge lays a solid foundation for subsequent model training and optimization!