PyTorch Recurrent Neural Networks
Introduction to Recurrent Neural Networks
Recurrent Neural Networks (RNNs) are neural network architectures designed specifically for processing sequential data. Unlike traditional feedforward networks, an RNN maintains a hidden state that acts as memory, which lets it handle variable-length sequences and capture temporal dependencies.
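To make the recurrence concrete: at each timestep a vanilla RNN updates its hidden state as h_t = tanh(W_ih·x_t + W_hh·h_(t-1) + b). The loop below is a minimal illustrative sketch of that update (all sizes are arbitrary choices for demonstration):

python
import torch

# Hypothetical sizes, chosen only for illustration
input_size, hidden_size, seq_len = 4, 8, 5

# The learnable parameters of a single vanilla RNN cell
W_ih = torch.randn(hidden_size, input_size) * 0.1   # input-to-hidden weights
W_hh = torch.randn(hidden_size, hidden_size) * 0.1  # hidden-to-hidden weights
b = torch.zeros(hidden_size)

x = torch.randn(seq_len, input_size)  # one sequence, no batch dimension
h = torch.zeros(hidden_size)          # initial hidden state (the "memory")

for t in range(seq_len):
    # h carries information from all previous timesteps into the current one
    h = torch.tanh(W_ih @ x[t] + W_hh @ h + b)
print(h.shape)  # torch.Size([8])

PyTorch ships optimized, batched implementations of this recurrence and its gated variants (LSTM, GRU):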
python
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
# Basic RNN components
rnn = nn.RNN(input_size=100, hidden_size=128, num_layers=2, batch_first=True)
lstm = nn.LSTM(input_size=100, hidden_size=128, num_layers=2, batch_first=True)
gru = nn.GRU(input_size=100, hidden_size=128, num_layers=2, batch_first=True)

Basic RNN
1. Simple RNN Implementation
python
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # RNN layer
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            nonlinearity='tanh'  # 'tanh' or 'relu'
        )
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        # x shape: (batch_size, seq_len, input_size)
        batch_size = x.size(0)
        # Initialize hidden state on the same device as the input
        if hidden is None:
            hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size,
                                 device=x.device)
        # RNN forward pass
        rnn_out, hidden = self.rnn(x, hidden)
        # Use only the last timestep output
        output = self.fc(rnn_out[:, -1, :])
        return output, hidden

# Test simple RNN
input_size, hidden_size, output_size = 10, 20, 5
seq_len, batch_size = 15, 32
model = SimpleRNN(input_size, hidden_size, output_size)
x = torch.randn(batch_size, seq_len, input_size)
output, hidden = model(x)
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"Hidden state shape: {hidden.shape}")

2. Bidirectional RNN
python
class BiRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(BiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Bidirectional RNN
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True
        )
        # Output layer (note: bidirectional RNN output dimension is 2*hidden_size)
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        # Bidirectional RNN forward pass
        rnn_out, _ = self.rnn(x)
        # Use the last timestep output
        output = self.fc(rnn_out[:, -1, :])
        return output

# Test bidirectional RNN
bi_model = BiRNN(input_size, hidden_size, output_size)
bi_output = bi_model(x)
print(f"Bidirectional RNN output shape: {bi_output.shape}")
LSTM Networks

1. Basic LSTM Implementation
python
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,  # inter-layer dropout needs >1 layer
            bidirectional=False
        )
        # Dropout layer
        self.dropout = nn.Dropout(dropout)
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        batch_size = x.size(0)
        # Initialize hidden state and cell state on the input's device
        if hidden is None:
            h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size,
                             device=x.device)
            c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size,
                             device=x.device)
            hidden = (h0, c0)
        # LSTM forward pass
        lstm_out, hidden = self.lstm(x, hidden)
        # Apply dropout
        lstm_out = self.dropout(lstm_out)
        # Output layer
        output = self.fc(lstm_out[:, -1, :])  # Use the last timestep
        return output, hidden

# Test LSTM
lstm_model = LSTMModel(input_size, hidden_size, output_size)
lstm_output, lstm_hidden = lstm_model(x)
print(f"LSTM output shape: {lstm_output.shape}")
print(f"LSTM hidden state shapes: {lstm_hidden[0].shape}, {lstm_hidden[1].shape}")
2. Many-to-Many LSTM (Sequence to Sequence)

python
class Seq2SeqLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(Seq2SeqLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # x shape: (batch_size, seq_len, input_size)
        lstm_out, _ = self.lstm(x)
        # Predict at every timestep
        output = self.fc(lstm_out)  # (batch_size, seq_len, output_size)
        return output

# Test sequence-to-sequence LSTM
seq2seq_model = Seq2SeqLSTM(input_size, hidden_size, output_size)
seq2seq_output = seq2seq_model(x)
print(f"Seq2Seq output shape: {seq2seq_output.shape}")
GRU Networks

1. GRU Implementation
python
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2, dropout=0.2):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # GRU layer
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        batch_size = x.size(0)
        # GRU keeps a single hidden state (no cell state, unlike LSTM)
        if hidden is None:
            hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size,
                                 device=x.device)
        gru_out, hidden = self.gru(x, hidden)
        gru_out = self.dropout(gru_out)
        output = self.fc(gru_out[:, -1, :])
        return output, hidden

# Test GRU
gru_model = GRUModel(input_size, hidden_size, output_size)
gru_output, gru_hidden = gru_model(x)
print(f"GRU output shape: {gru_output.shape}")
Attention Mechanisms

1. Basic Attention
python
class AttentionRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(AttentionRNN, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        # Attention mechanism: one score per timestep
        self.attention = nn.Linear(hidden_size, 1)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # LSTM output
        lstm_out, _ = self.lstm(x)  # (batch_size, seq_len, hidden_size)
        # Compute attention weights (softmax over the time dimension)
        attention_weights = torch.softmax(self.attention(lstm_out), dim=1)  # (batch_size, seq_len, 1)
        # Weighted sum over timesteps
        context_vector = torch.sum(attention_weights * lstm_out, dim=1)  # (batch_size, hidden_size)
        # Output
        output = self.fc(context_vector)
        return output, attention_weights

# Test attention RNN
attention_model = AttentionRNN(input_size, hidden_size, output_size)
attention_output, attention_weights = attention_model(x)
print(f"Attention RNN output shape: {attention_output.shape}")
print(f"Attention weights shape: {attention_weights.shape}")
2. Self-Attention Mechanism

python
class SelfAttentionRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_heads=8):
        super(SelfAttentionRNN, self).__init__()
        # embed_dim must be divisible by num_heads
        assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        # Multi-head self-attention
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=num_heads,
            batch_first=True
        )
        # Layer normalization
        self.layer_norm = nn.LayerNorm(hidden_size)
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # LSTM processing
        lstm_out, _ = self.lstm(x)
        # Self-attention (query, key, and value are all the LSTM outputs)
        attn_out, attn_weights = self.multihead_attn(lstm_out, lstm_out, lstm_out)
        # Residual connection and layer normalization
        out = self.layer_norm(lstm_out + attn_out)
        # Global average pooling over the time dimension
        out = torch.mean(out, dim=1)
        # Output
        output = self.fc(out)
        return output, attn_weights

# Test self-attention RNN (hidden_size is 20, so num_heads must divide 20)
self_attn_model = SelfAttentionRNN(input_size, hidden_size, output_size, num_heads=4)
self_attn_output, self_attn_weights = self_attn_model(x)
print(f"Self-attention RNN output shape: {self_attn_output.shape}")

Practical Application Examples
1. Text Classification
python
class TextClassificationRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes, num_layers=2):
        super(TextClassificationRNN, self).__init__()
        # Word embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.3,
            bidirectional=True
        )
        # Classification head
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        # Word embedding
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        # LSTM processing
        lstm_out, _ = self.lstm(embedded)
        # Use the last timestep output
        output = self.classifier(lstm_out[:, -1, :])
        return output

# Create text classification model
vocab_size, embed_dim, num_classes = 10000, 128, 5
text_model = TextClassificationRNN(vocab_size, embed_dim, hidden_size, num_classes)
# Test
text_input = torch.randint(0, vocab_size, (32, 50))  # 32 samples, each with 50 words
text_output = text_model(text_input)
print(f"Text classification output shape: {text_output.shape}")

2. Time Series Prediction
python
class TimeSeriesPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0.2):
        super(TimeSeriesPredictor, self).__init__()
        # Three stacked LSTM layers with dropout between them
        self.lstm1 = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout1 = nn.Dropout(dropout)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.dropout2 = nn.Dropout(dropout)
        self.lstm3 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.dropout3 = nn.Dropout(dropout)
        # Output layer
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # First LSTM layer
        out, _ = self.lstm1(x)
        out = self.dropout1(out)
        # Second LSTM layer
        out, _ = self.lstm2(out)
        out = self.dropout2(out)
        # Third LSTM layer
        out, _ = self.lstm3(out)
        out = self.dropout3(out)
        # Predict the next timestep from the last hidden output
        prediction = self.fc(out[:, -1, :])
        return prediction

# Create time series prediction model
ts_model = TimeSeriesPredictor(input_size=1, hidden_size=64)

# Generate example time series data
def generate_sine_wave(seq_len, num_samples):
    x = np.linspace(0, 4 * np.pi, seq_len)
    data = []
    for _ in range(num_samples):
        phase = np.random.uniform(0, 2 * np.pi)
        amplitude = np.random.uniform(0.5, 2.0)
        noise = np.random.normal(0, 0.1, seq_len)
        y = amplitude * np.sin(x + phase) + noise
        data.append(y)
    return np.array(data)

# Test time series prediction
ts_data = generate_sine_wave(50, 32)
ts_input = torch.FloatTensor(ts_data).unsqueeze(-1)  # (32, 50, 1)
ts_output = ts_model(ts_input)
print(f"Time series prediction output shape: {ts_output.shape}")
3. Sequence to Sequence Translation

python
class Seq2SeqTranslator(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dim, hidden_size):
        super(Seq2SeqTranslator, self).__init__()
        # Encoder
        self.src_embedding = nn.Embedding(src_vocab_size, embed_dim)
        self.encoder = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        # Decoder
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embed_dim)
        self.decoder = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        # Output layer
        self.output_projection = nn.Linear(hidden_size, tgt_vocab_size)

    def forward(self, src, tgt):
        # Encode the source sequence into a final (hidden, cell) state
        src_embedded = self.src_embedding(src)
        encoder_out, (hidden, cell) = self.encoder(src_embedded)
        # Decode, conditioned on the encoder state (teacher forcing:
        # the ground-truth target tokens are fed in during training)
        tgt_embedded = self.tgt_embedding(tgt)
        decoder_out, _ = self.decoder(tgt_embedded, (hidden, cell))
        # Project to target vocabulary logits
        output = self.output_projection(decoder_out)
        return output

# Create translation model
src_vocab_size, tgt_vocab_size = 5000, 4000
translator = Seq2SeqTranslator(src_vocab_size, tgt_vocab_size, embed_dim, hidden_size)
# Test
src_seq = torch.randint(0, src_vocab_size, (32, 20))  # Source sequences
tgt_seq = torch.randint(0, tgt_vocab_size, (32, 25))  # Target sequences
translation_output = translator(src_seq, tgt_seq)
print(f"Translation output shape: {translation_output.shape}")
Training Techniques

1. Gradient Clipping
python
def train_rnn_with_gradient_clipping(model, dataloader, criterion, optimizer, max_norm=1.0):
    model.train()
    total_loss = 0
    for batch_idx, (data, target) in enumerate(dataloader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)
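Wiring the loop up is straightforward. A hypothetical setup using the BiRNN defined earlier (which returns a single tensor) and random classification data:

python
# Sketch: hypothetical usage of the clipped training loop
from torch.utils.data import TensorDataset

features = torch.randn(128, seq_len, input_size)
labels = torch.randint(0, output_size, (128,))
loader = DataLoader(TensorDataset(features, labels), batch_size=32, shuffle=True)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(bi_model.parameters(), lr=1e-3)
avg_loss = train_rnn_with_gradient_clipping(bi_model, loader, criterion, optimizer)
print(f"Average epoch loss: {avg_loss:.4f}")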
2. Learning Rate Warmup

python
class WarmupScheduler:
    """Noam-style schedule: linear warmup, then inverse-square-root decay."""
    def __init__(self, optimizer, warmup_steps, d_model):
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.d_model = d_model
        self.step_num = 0

    def step(self):
        self.step_num += 1
        lr = self.d_model ** (-0.5) * min(
            self.step_num ** (-0.5),
            self.step_num * self.warmup_steps ** (-1.5)
        )
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
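The scheduler overwrites the optimizer's learning rate on every call, so call step() once per training batch; the lr passed to the optimizer is just a placeholder:

python
# Sketch: stepping the warmup scheduler once per batch
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.0)  # lr is set by the scheduler
scheduler = WarmupScheduler(optimizer, warmup_steps=4000, d_model=512)

for step in range(1, 6):
    # ... forward pass, loss.backward(), optimizer.step() would go here ...
    scheduler.step()
    print(f"Step {step}: lr = {optimizer.param_groups[0]['lr']:.2e}")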
3. PackedSequence

python
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class PackedRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(PackedRNN, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths):
        # Pack the padded batch so the LSTM skips padding positions
        packed_x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        # LSTM processing
        packed_out, (hidden, cell) = self.lstm(packed_x)
        # Unpack back to a padded tensor
        lstm_out, _ = pad_packed_sequence(packed_out, batch_first=True)
        # Get the last valid output of each sequence
        # (equivalently, hidden[-1] already holds these final states)
        last_outputs = []
        for i, length in enumerate(lengths):
            last_outputs.append(lstm_out[i, length - 1, :])
        last_outputs = torch.stack(last_outputs)
        output = self.fc(last_outputs)
        return output

# Collate function that pads variable-length sequences into a batch
def collate_fn(batch):
    # Assume batch is [(seq1, label1), (seq2, label2), ...]
    sequences, labels = zip(*batch)
    lengths = [len(seq) for seq in sequences]
    # Pad sequences to the same length
    max_len = max(lengths)
    padded_sequences = []
    for seq in sequences:
        padded = torch.zeros(max_len, seq.size(-1))
        padded[:len(seq)] = seq
        padded_sequences.append(padded)
    return torch.stack(padded_sequences), torch.tensor(lengths), torch.tensor(labels)
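With collate_fn plugged into a DataLoader, each batch arrives already padded, along with the true lengths the packed model needs. A hypothetical end-to-end check with random variable-length sequences:

python
# Sketch: feeding variable-length sequences through the packed model
raw_dataset = [
    (torch.randn(torch.randint(5, 16, (1,)).item(), input_size), 0)
    for _ in range(16)
]  # hypothetical (sequence, label) pairs of varying lengths
loader = DataLoader(raw_dataset, batch_size=4, collate_fn=collate_fn)

packed_model = PackedRNN(input_size, hidden_size, output_size)
padded, lengths, labels = next(iter(loader))
out = packed_model(padded, lengths)
print(f"Packed RNN output shape: {out.shape}")  # (4, output_size)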
Visualization and Analysis

1. Attention Weights Visualization
python
def visualize_attention(attention_weights, input_tokens, figsize=(10, 8)):
    """Visualize attention weights as a heatmap"""
    attention_weights = attention_weights.squeeze().detach().cpu().numpy()
    plt.figure(figsize=figsize)
    plt.imshow(attention_weights.T, cmap='Blues', aspect='auto')
    plt.colorbar()
    plt.xlabel('Time Step')
    plt.ylabel('Input Position')
    plt.title('Attention Weights Heatmap')
    if input_tokens:
        plt.yticks(range(len(input_tokens)), input_tokens)
    plt.tight_layout()
    plt.show()

# Usage example
# visualize_attention(attention_weights, ['word1', 'word2', 'word3', ...])

2. Hidden States Visualization
python
def visualize_hidden_states(model, input_sequence, layer_idx=0):
    """Visualize RNN hidden state evolution (assumes the model exposes an .lstm attribute)"""
    model.eval()
    hidden_states = []
    # Get the hidden state at each timestep
    with torch.no_grad():
        hidden = None
        for t in range(input_sequence.size(1)):
            input_t = input_sequence[:, t:t+1, :]
            output, hidden = model.lstm(input_t, hidden)
            if isinstance(hidden, tuple):  # LSTM: (hidden, cell)
                hidden_states.append(hidden[0][layer_idx, 0, :].cpu().numpy())
            else:  # RNN/GRU: hidden state only
                hidden_states.append(hidden[layer_idx, 0, :].cpu().numpy())
    hidden_states = np.array(hidden_states)
    # Visualization
    plt.figure(figsize=(12, 8))
    plt.imshow(hidden_states.T, cmap='viridis', aspect='auto')
    plt.colorbar()
    plt.xlabel('Time Step')
    plt.ylabel('Hidden Units')
    plt.title(f'Layer {layer_idx} Hidden State Evolution')
    plt.tight_layout()
    plt.show()
    return hidden_states

Performance Optimization
1. Batch Processing Optimization
python
class OptimizedRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(OptimizedRNN, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.1 if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_size, output_size)
        # Compact the weights into one contiguous chunk for cuDNN
        self.lstm.flatten_parameters()

    def forward(self, x):
        # Ensure parameters are contiguous (cuDNN optimization; avoids a
        # warning after the module has been moved between devices)
        self.lstm.flatten_parameters()
        lstm_out, _ = self.lstm(x)
        output = self.fc(lstm_out[:, -1, :])
        return output

2. Memory Optimization
python
def train_with_checkpointing(model, dataloader, criterion, optimizer):
    """Use gradient checkpointing to trade compute for memory"""
    from torch.utils.checkpoint import checkpoint
    model.train()
    total_loss = 0
    for data, target in dataloader:
        optimizer.zero_grad()
        # Recompute activations during backward instead of storing them;
        # use_reentrant=False lets gradients reach the parameters even when
        # the input itself does not require grad
        output = checkpoint(model, data, use_reentrant=False)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

Summary
Recurrent neural networks are important tools for processing sequential data. This chapter introduced:
- Basic Architectures: principles and implementations of RNN, LSTM, and GRU
- Advanced Techniques: attention mechanisms, bidirectional RNNs, and sequence-to-sequence models
- Practical Applications: text classification, time series prediction, and machine translation
- Training Techniques: gradient clipping, sequence packing, and learning rate scheduling
- Visualization and Analysis: methods for visualizing attention weights and hidden states
- Performance Optimization: batching and memory optimization techniques
Mastering RNNs lays a solid foundation for work in natural language processing, time series analysis, and other sequence-modeling fields!