
PyTorch Recurrent Neural Networks

Introduction to Recurrent Neural Networks

Recurrent Neural Networks (RNNs) are neural network architectures designed for processing sequential data. Unlike traditional feedforward networks, RNNs maintain a hidden state that acts as memory, which lets them handle variable-length sequences and capture temporal dependencies.

python
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset

# Basic RNN components
rnn = nn.RNN(input_size=100, hidden_size=128, num_layers=2, batch_first=True)
lstm = nn.LSTM(input_size=100, hidden_size=128, num_layers=2, batch_first=True)
gru = nn.GRU(input_size=100, hidden_size=128, num_layers=2, batch_first=True)
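
With batch_first=True, these modules expect input of shape (batch_size, seq_len, input_size). A quick forward-pass sanity check (a minimal sketch on random data):

python
demo_x = torch.randn(4, 7, 100)      # (batch_size, seq_len, input_size)
out, h = rnn(demo_x)                 # out: (4, 7, 128), h: (2, 4, 128) for 2 layers
out, (h, c) = lstm(demo_x)           # LSTM additionally returns a cell state c: (2, 4, 128)
out, h = gru(demo_x)                 # GRU returns a single hidden state, like the plain RNN
print(out.shape, h.shape)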

Basic RNN

1. Simple RNN Implementation

python
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # RNN layer
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            nonlinearity='tanh'  # 'tanh' or 'relu'
        )
        
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden=None):
        # x shape: (batch_size, seq_len, input_size)
        batch_size = x.size(0)
        
        # Initialize hidden state
        if hidden is None:
            hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size,
                                 device=x.device)
        
        # RNN forward pass
        rnn_out, hidden = self.rnn(x, hidden)
        
        # Use only the last timestep output
        output = self.fc(rnn_out[:, -1, :])
        
        return output, hidden

# Test simple RNN
input_size, hidden_size, output_size = 10, 20, 5
seq_len, batch_size = 15, 32

model = SimpleRNN(input_size, hidden_size, output_size)
x = torch.randn(batch_size, seq_len, input_size)
output, hidden = model(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"Hidden state shape: {hidden.shape}")

2. Bidirectional RNN

python
class BiRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(BiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Bidirectional RNN
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True
        )
        
        # Output layer (note: bidirectional RNN output dimension is 2*hidden_size)
        self.fc = nn.Linear(hidden_size * 2, output_size)
    
    def forward(self, x):
        # Bidirectional RNN forward pass
        rnn_out, _ = self.rnn(x)
        
        # Use the last timestep output (note: at the final timestep the backward
        # direction has only seen that one element; concatenating the final hidden
        # states of both directions is a common alternative)
        output = self.fc(rnn_out[:, -1, :])
        
        return output

# Test bidirectional RNN
bi_model = BiRNN(input_size, hidden_size, output_size)
bi_output = bi_model(x)
print(f"Bidirectional RNN output shape: {bi_output.shape}")

LSTM Networks

1. Basic LSTM Implementation

python
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=False
        )
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout)
        
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden=None):
        batch_size = x.size(0)
        
        # Initialize hidden state and cell state
        if hidden is None:
            h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=x.device)
            c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=x.device)
            hidden = (h0, c0)
        
        # LSTM forward pass
        lstm_out, hidden = self.lstm(x, hidden)
        
        # Apply dropout
        lstm_out = self.dropout(lstm_out)
        
        # Output layer
        output = self.fc(lstm_out[:, -1, :])  # Use the last timestep
        
        return output, hidden

# Test LSTM
lstm_model = LSTMModel(input_size, hidden_size, output_size)
lstm_output, lstm_hidden = lstm_model(x)
print(f"LSTM output shape: {lstm_output.shape}")
print(f"LSTM hidden state shape: {lstm_hidden[0].shape}, {lstm_hidden[1].shape}")

2. Many-to-Many LSTM (Sequence to Sequence)

python
class Seq2SeqLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(Seq2SeqLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # x shape: (batch_size, seq_len, input_size)
        lstm_out, _ = self.lstm(x)
        
        # Predict for each timestep
        output = self.fc(lstm_out)  # (batch_size, seq_len, output_size)
        
        return output

# Test sequence-to-sequence LSTM
seq2seq_model = Seq2SeqLSTM(input_size, hidden_size, output_size)
seq2seq_output = seq2seq_model(x)
print(f"Seq2Seq output shape: {seq2seq_output.shape}")

GRU Networks

1. GRU Implementation

python
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2, dropout=0.2):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # GRU layer
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden=None):
        batch_size = x.size(0)
        
        if hidden is None:
            hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size,
                                 device=x.device)
        
        gru_out, hidden = self.gru(x, hidden)
        gru_out = self.dropout(gru_out)
        output = self.fc(gru_out[:, -1, :])
        
        return output, hidden

# Test GRU
gru_model = GRUModel(input_size, hidden_size, output_size)
gru_output, gru_hidden = gru_model(x)
print(f"GRU output shape: {gru_output.shape}")

Attention Mechanisms

1. Basic Attention

python
class AttentionRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(AttentionRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        
        # Attention mechanism
        self.attention = nn.Linear(hidden_size, 1)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # LSTM output
        lstm_out, _ = self.lstm(x)  # (batch_size, seq_len, hidden_size)
        
        # Compute attention weights
        attention_weights = torch.softmax(self.attention(lstm_out), dim=1)  # (batch_size, seq_len, 1)
        
        # Weighted sum
        context_vector = torch.sum(attention_weights * lstm_out, dim=1)  # (batch_size, hidden_size)
        
        # Output
        output = self.fc(context_vector)
        
        return output, attention_weights

# Test attention RNN
attention_model = AttentionRNN(input_size, hidden_size, output_size)
attention_output, attention_weights = attention_model(x)
print(f"Attention RNN output shape: {attention_output.shape}")
print(f"Attention weights shape: {attention_weights.shape}")

2. Self-Attention Mechanism

python
class SelfAttentionRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_heads=8):
        super(SelfAttentionRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        # nn.MultiheadAttention requires embed_dim to be divisible by num_heads
        assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"
        self.head_dim = hidden_size // num_heads
        
        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        
        # Multi-head self-attention
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=num_heads,
            batch_first=True
        )
        
        # Layer normalization
        self.layer_norm = nn.LayerNorm(hidden_size)
        
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # LSTM processing
        lstm_out, _ = self.lstm(x)
        
        # Self-attention
        attn_out, attn_weights = self.multihead_attn(lstm_out, lstm_out, lstm_out)
        
        # Residual connection and layer normalization
        out = self.layer_norm(lstm_out + attn_out)
        
        # Global average pooling
        out = torch.mean(out, dim=1)
        
        # Output
        output = self.fc(out)
        
        return output, attn_weights

# Test self-attention RNN (hidden_size=20 is not divisible by the default 8 heads, so use 4)
self_attn_model = SelfAttentionRNN(input_size, hidden_size, output_size, num_heads=4)
self_attn_output, self_attn_weights = self_attn_model(x)
print(f"Self-attention RNN output shape: {self_attn_output.shape}")

Practical Application Examples

1. Text Classification

python
class TextClassificationRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes, num_layers=2):
        super(TextClassificationRNN, self).__init__()
        
        # Word embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        
        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.3,
            bidirectional=True
        )
        
        # Classification layer
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, num_classes)
        )
    
    def forward(self, x):
        # Word embedding
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        
        # LSTM processing
        lstm_out, _ = self.lstm(embedded)
        
        # Use the last timestep output
        output = self.classifier(lstm_out[:, -1, :])
        
        return output

# Create text classification model
vocab_size, embed_dim, num_classes = 10000, 128, 5
text_model = TextClassificationRNN(vocab_size, embed_dim, hidden_size, num_classes)

# Test
text_input = torch.randint(0, vocab_size, (32, 50))  # 32 samples, each with 50 words
text_output = text_model(text_input)
print(f"Text classification output shape: {text_output.shape}")

2. Time Series Prediction

python
class TimeSeriesPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0.2):
        super(TimeSeriesPredictor, self).__init__()
        
        # Three stacked LSTM layers with dropout in between
        self.lstm1 = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout1 = nn.Dropout(dropout)
        
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.dropout2 = nn.Dropout(dropout)
        
        self.lstm3 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.dropout3 = nn.Dropout(dropout)
        
        # Output layer
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        # First LSTM layer
        out, _ = self.lstm1(x)
        out = self.dropout1(out)
        
        # Second LSTM layer
        out, _ = self.lstm2(out)
        out = self.dropout2(out)
        
        # Third LSTM layer
        out, _ = self.lstm3(out)
        out = self.dropout3(out)
        
        # Predict next timestep
        prediction = self.fc(out[:, -1, :])
        
        return prediction

# Create time series prediction model
ts_model = TimeSeriesPredictor(input_size=1, hidden_size=64)

# Generate example time series data
def generate_sine_wave(seq_len, num_samples):
    x = np.linspace(0, 4*np.pi, seq_len)
    data = []
    for _ in range(num_samples):
        phase = np.random.uniform(0, 2*np.pi)
        amplitude = np.random.uniform(0.5, 2.0)
        noise = np.random.normal(0, 0.1, seq_len)
        y = amplitude * np.sin(x + phase) + noise
        data.append(y)
    return np.array(data)

# Test time series prediction
ts_data = generate_sine_wave(50, 32)
ts_input = torch.FloatTensor(ts_data).unsqueeze(-1)  # (32, 50, 1)
ts_output = ts_model(ts_input)
print(f"Time series prediction output shape: {ts_output.shape}")

3. Sequence to Sequence Translation

python
class Seq2SeqTranslator(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dim, hidden_size):
        super(Seq2SeqTranslator, self).__init__()
        
        # Encoder
        self.src_embedding = nn.Embedding(src_vocab_size, embed_dim)
        self.encoder = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        
        # Decoder
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embed_dim)
        self.decoder = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        
        # Output layer
        self.output_projection = nn.Linear(hidden_size, tgt_vocab_size)
    
    def forward(self, src, tgt):
        # Encode
        src_embedded = self.src_embedding(src)
        encoder_out, (hidden, cell) = self.encoder(src_embedded)
        
        # Decode
        tgt_embedded = self.tgt_embedding(tgt)
        decoder_out, _ = self.decoder(tgt_embedded, (hidden, cell))
        
        # Output projection
        output = self.output_projection(decoder_out)
        
        return output

# Create translation model
src_vocab_size, tgt_vocab_size = 5000, 4000
translator = Seq2SeqTranslator(src_vocab_size, tgt_vocab_size, embed_dim, hidden_size)

# Test
src_seq = torch.randint(0, src_vocab_size, (32, 20))  # Source sequence
tgt_seq = torch.randint(0, tgt_vocab_size, (32, 25))  # Target sequence
translation_output = translator(src_seq, tgt_seq)
print(f"Translation output shape: {translation_output.shape}")

Training Techniques

1. Gradient Clipping

python
def train_rnn_with_gradient_clipping(model, dataloader, criterion, optimizer, max_norm=1.0):
    model.train()
    total_loss = 0
    
    for batch_idx, (data, target) in enumerate(dataloader):
        optimizer.zero_grad()
        
        output = model(data)
        loss = criterion(output, target)
        
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)
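
The helper can be exercised end-to-end on a small synthetic dataset (a minimal sketch; BiRNN is used because it returns a single tensor, which is what this training loop expects):

python
from torch.utils.data import TensorDataset

fake_x = torch.randn(128, seq_len, input_size)
fake_y = torch.randint(0, output_size, (128,))
clip_loader = DataLoader(TensorDataset(fake_x, fake_y), batch_size=16, shuffle=True)

clip_model = BiRNN(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(clip_model.parameters(), lr=1e-3)

avg_loss = train_rnn_with_gradient_clipping(clip_model, clip_loader, criterion, optimizer, max_norm=1.0)
print(f"Average loss: {avg_loss:.4f}")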

2. Learning Rate Warmup

python
class WarmupScheduler:
    def __init__(self, optimizer, warmup_steps, d_model):
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.d_model = d_model
        self.step_num = 0
    
    def step(self):
        self.step_num += 1
        lr = self.d_model ** (-0.5) * min(
            self.step_num ** (-0.5),
            self.step_num * self.warmup_steps ** (-1.5)
        )
        
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
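
A short usage sketch: the scheduler is stepped once per optimization step, and the initial lr passed to the optimizer is immediately overwritten.

python
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.0)  # lr is set by the scheduler
scheduler = WarmupScheduler(optimizer, warmup_steps=4000, d_model=512)

for _ in range(3):
    # ... forward / backward / optimizer.step() would go here ...
    scheduler.step()
    print(f"step {scheduler.step_num}: lr = {optimizer.param_groups[0]['lr']:.3e}")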

3. PackedSequence

python
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class PackedRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(PackedRNN, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, lengths):
        # Pack sequence
        packed_x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        
        # LSTM processing
        packed_out, (hidden, cell) = self.lstm(packed_x)
        
        # Unpack sequence
        lstm_out, _ = pad_packed_sequence(packed_out, batch_first=True)
        
        # Get last valid output for each sequence
        batch_size = x.size(0)
        last_outputs = []
        for i, length in enumerate(lengths):
            last_outputs.append(lstm_out[i, length-1, :])
        
        last_outputs = torch.stack(last_outputs)
        output = self.fc(last_outputs)
        
        return output

# Use packed sequences
def collate_fn(batch):
    # Assume batch is [(seq1, label1), (seq2, label2), ...]
    sequences, labels = zip(*batch)
    lengths = [len(seq) for seq in sequences]
    
    # Pad sequences to same length
    max_len = max(lengths)
    padded_sequences = []
    for seq in sequences:
        padded = torch.zeros(max_len, seq.size(-1))
        padded[:len(seq)] = seq
        padded_sequences.append(padded)
    
    return torch.stack(padded_sequences), torch.tensor(lengths), torch.tensor(labels)
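
Putting the pieces together (a minimal sketch: a plain Python list of (sequence, label) pairs works as a map-style dataset, and collate_fn pads each batch on the fly):

python
packed_model = PackedRNN(input_size, hidden_size, output_size)
var_data = [(torch.randn(np.random.randint(5, 16), input_size), 0) for _ in range(8)]
var_loader = DataLoader(var_data, batch_size=4, collate_fn=collate_fn)

for padded, lengths, labels in var_loader:
    out = packed_model(padded, lengths)
    print(f"padded: {tuple(padded.shape)}, lengths: {lengths.tolist()}, output: {tuple(out.shape)}")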

Visualization and Analysis

1. Attention Weights Visualization

python
def visualize_attention(attention_weights, input_tokens, figsize=(10, 8)):
    """Visualize attention weights"""
    attention_weights = attention_weights.squeeze().detach().cpu().numpy()
    
    plt.figure(figsize=figsize)
    plt.imshow(attention_weights.T, cmap='Blues', aspect='auto')
    plt.colorbar()
    plt.xlabel('Time Step')
    plt.ylabel('Input Position')
    plt.title('Attention Weights Heatmap')
    
    if input_tokens:
        plt.yticks(range(len(input_tokens)), input_tokens)
    
    plt.tight_layout()
    plt.show()

# Usage example
# visualize_attention(attention_weights, ['word1', 'word2', 'word3', ...])

2. Hidden States Visualization

python
def visualize_hidden_states(model, input_sequence, layer_idx=0):
    """Visualize RNN hidden state evolution (assumes the model exposes its recurrent layer as model.lstm)"""
    model.eval()
    
    hidden_states = []
    
    # Get hidden state at each timestep
    with torch.no_grad():
        hidden = None
        for t in range(input_sequence.size(1)):
            input_t = input_sequence[:, t:t+1, :]
            output, hidden = model.lstm(input_t, hidden)
            if isinstance(hidden, tuple):  # LSTM
                hidden_states.append(hidden[0][layer_idx, 0, :].cpu().numpy())
            else:  # RNN/GRU
                hidden_states.append(hidden[layer_idx, 0, :].cpu().numpy())
    
    hidden_states = np.array(hidden_states)
    
    # Visualization
    plt.figure(figsize=(12, 8))
    plt.imshow(hidden_states.T, cmap='viridis', aspect='auto')
    plt.colorbar()
    plt.xlabel('Time Step')
    plt.ylabel('Hidden Units')
    plt.title(f'Layer {layer_idx} Hidden State Evolution')
    plt.tight_layout()
    plt.show()
    
    return hidden_states
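
For example, the hidden-state trajectory of a single sequence through the LSTMModel defined earlier can be inspected like this:

python
states = visualize_hidden_states(lstm_model, x[:1])
print(f"Collected hidden states: {states.shape}")  # (seq_len, hidden_size)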

Performance Optimization

1. Batch Processing Optimization

python
class OptimizedRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(OptimizedRNN, self).__init__()
        
        # Use more efficient LSTM implementation
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.1 if num_layers > 1 else 0
        )
        
        self.fc = nn.Linear(hidden_size, output_size)
        
        # Enable cuDNN optimization
        self.lstm.flatten_parameters()
    
    def forward(self, x):
        # Ensure parameters are contiguous (cuDNN optimization)
        self.lstm.flatten_parameters()
        
        lstm_out, _ = self.lstm(x)
        output = self.fc(lstm_out[:, -1, :])
        
        return output
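
A brief sanity check, optionally enabling cuDNN autotuning (benchmark mode tends to pay off when input shapes stay constant across batches):

python
torch.backends.cudnn.benchmark = True  # optional; only has an effect on CUDA

opt_model = OptimizedRNN(input_size, hidden_size, output_size)
opt_output = opt_model(x)
print(f"Optimized RNN output shape: {opt_output.shape}")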

2. Memory Optimization

python
def train_with_checkpointing(model, dataloader, criterion, optimizer):
    """Use gradient checkpointing to save memory"""
    from torch.utils.checkpoint import checkpoint
    
    model.train()
    total_loss = 0
    
    for data, target in dataloader:
        optimizer.zero_grad()
        
        # Use gradient checkpointing (use_reentrant=False is the mode recommended in recent PyTorch releases)
        def run_function(x):
            return model(x)
        
        output = checkpoint(run_function, data, use_reentrant=False)
        loss = criterion(output, target)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

Summary

Recurrent neural networks are important tools for processing sequential data. This chapter introduced:

  1. Basic Architectures: Principles and implementations of RNN, LSTM, and GRU
  2. Advanced Techniques: Attention mechanisms, bidirectional RNN, sequence-to-sequence models
  3. Practical Applications: Text classification, time series prediction, machine translation
  4. Training Techniques: Gradient clipping, sequence packing, learning rate scheduling
  5. Visualization Analysis: Attention weights and hidden state visualization methods
  6. Performance Optimization: Batch processing and memory optimization techniques

Mastering RNNs lays a solid foundation for applications in natural language processing, time series analysis, and other sequence-modeling tasks.
