
Recurrent Neural Networks (RNN)

Recurrent Neural Networks (RNN) are a class of neural networks specifically designed to process sequence data. Unlike traditional feedforward neural networks, RNNs have memory capabilities and can handle variable-length sequence inputs.

RNN Basic Concepts

What is RNN?

RNN is a neural network with recurrent connections that can process sequence data such as text, time series, speech, etc. The core idea of RNN is to introduce recurrent connections in the network, allowing it to maintain memory of previous information.

RNN Structure

python
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

# Basic structure of RNN
def simple_rnn_example():
    """Build a tiny stacked SimpleRNN regressor.

    Two recurrent layers followed by a single linear output unit.
    Accepts sequences of any length with one feature per time step.
    """
    model = keras.Sequential()
    # The first recurrent layer must emit the whole sequence so the
    # second recurrent layer has a time axis to consume.
    model.add(keras.layers.SimpleRNN(32, return_sequences=True, input_shape=(None, 1)))
    model.add(keras.layers.SimpleRNN(32))
    model.add(keras.layers.Dense(1))
    return model

# View model structure
model = simple_rnn_example()
model.summary()

Types of RNN

1. Simple RNN

python
# Simple RNN example
def create_simple_rnn(units=50, input_shape=(None, 1)):
    """Return a single-layer SimpleRNN binary classifier.

    Args:
        units: number of recurrent units.
        input_shape: (time_steps, features); None allows variable length.
    """
    layer_stack = [
        keras.layers.SimpleRNN(units, input_shape=input_shape),
        keras.layers.Dense(1, activation='sigmoid'),
    ]
    return keras.Sequential(layer_stack)

# Create model
simple_model = create_simple_rnn()
simple_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

2. LSTM (Long Short-Term Memory Network)

python
# LSTM example
def create_lstm_model(units=50, input_shape=(None, 1)):
    """Return a two-layer LSTM regressor with dropout after each layer.

    Args:
        units: number of LSTM units per recurrent layer.
        input_shape: (time_steps, features); None allows variable length.
    """
    model = keras.Sequential()
    model.add(keras.layers.LSTM(units, return_sequences=True, input_shape=input_shape))
    model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.LSTM(units))
    model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.Dense(1))
    return model

# Create LSTM model
lstm_model = create_lstm_model()
lstm_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

3. GRU (Gated Recurrent Unit)

python
# GRU example
def create_gru_model(units=50, input_shape=(None, 1)):
    """Return a two-layer GRU regressor with dropout after each layer.

    Same topology as the LSTM variant above, with GRU cells instead.
    """
    stack = [
        keras.layers.GRU(units, return_sequences=True, input_shape=input_shape),
        keras.layers.Dropout(0.2),
        keras.layers.GRU(units),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1),
    ]
    return keras.Sequential(stack)

# Create GRU model
gru_model = create_gru_model()
gru_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

Sequence Data Processing

Data Preprocessing

python
def prepare_sequence_data(data, sequence_length):
    """Slice a series into overlapping input windows and next-step targets.

    Args:
        data: sequence of observations (anything ``np.asarray`` accepts).
        sequence_length: number of consecutive observations per input window.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n, sequence_length, ...)``
        and ``y[i]`` is the observation immediately following window ``X[i]``.
        When the series is too short to form a single window, both arrays
        are empty but keep the expected trailing dimensions (the original
        returned shape-(0,) arrays, which broke downstream reshapes).
    """
    data = np.asarray(data)
    n_windows = len(data) - sequence_length
    if n_windows <= 0:
        # Properly shaped empties so callers can still reshape/iterate.
        empty_X = np.empty((0, sequence_length) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + data.shape[1:], dtype=data.dtype)
        return empty_X, empty_y
    X = np.array([data[i:i + sequence_length] for i in range(n_windows)])
    # The target for the window starting at i is the element right after it,
    # so the targets are simply the tail of the series.
    y = np.array(data[sequence_length:])
    return X, y

# Example: Time series data
# Generate example data: a sine wave sampled every 0.1 with Gaussian noise
time_steps = np.arange(0, 100, 0.1)
noise = np.random.normal(0, 0.1, len(time_steps))
data = np.sin(time_steps) + noise

# Prepare training data
sequence_length = 10
X, y = prepare_sequence_data(data, sequence_length)
# RNN layers expect (samples, time_steps, features) — add the feature axis
X = X.reshape((X.shape[0], X.shape[1], 1))

print(f"Input shape: {X.shape}")
print(f"Output shape: {y.shape}")

Text Sequence Processing

Text Preprocessing and Word Embeddings

python
# Text sequence processing example
def create_text_rnn_model(vocab_size, embedding_dim=100, max_length=100):
    """Embedding -> LSTM -> sigmoid binary text classifier.

    Args:
        vocab_size: number of tokens the embedding table covers.
        embedding_dim: size of each token embedding vector.
        max_length: fixed (padded) length of input sequences.
    """
    model = keras.Sequential()
    model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
    # Heavy dropout on both inputs and recurrent state to fight overfitting.
    model.add(keras.layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    return model

# Text preprocessing
def preprocess_text_data(texts, max_words=10000, max_length=100):
    """Tokenize texts and pad them to a fixed length.

    Returns the padded integer sequences together with the fitted
    tokenizer, which is needed later to transform unseen texts the
    same way.
    """
    tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)

    # Convert to integer sequences, then pad/truncate to max_length
    padded_sequences = keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_length,
    )

    return padded_sequences, tokenizer

# Sample text data
sample_texts = [
    "这是一个正面的评论",
    "这个产品很糟糕",
    "我很喜欢这个服务",
    "完全不推荐",
]

# Preprocess text
sequences, tokenizer = preprocess_text_data(sample_texts)
print(f"Sequence shape: {sequences.shape}")

Bidirectional RNN

python
def create_bidirectional_rnn(units=50, input_shape=(None, 1)):
    """Build a two-layer bidirectional LSTM binary classifier.

    Each Bidirectional wrapper runs its LSTM forwards and backwards and
    concatenates both passes, so every position sees left and right context.
    """
    model = keras.Sequential()
    model.add(keras.layers.Bidirectional(
        keras.layers.LSTM(units, return_sequences=True),
        input_shape=input_shape,
    ))
    model.add(keras.layers.Bidirectional(keras.layers.LSTM(units)))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    return model

# Create bidirectional model
bidirectional_model = create_bidirectional_rnn()
bidirectional_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

Sequence-to-Sequence Models

python
def create_seq2seq_model(input_vocab_size, output_vocab_size,
                        embedding_dim=256, units=512):
    """Create a sequence-to-sequence model (encoder-decoder architecture).

    Args:
        input_vocab_size: size of the source-side vocabulary.
        output_vocab_size: size of the target-side vocabulary.
        embedding_dim: dimensionality of both embedding layers.
        units: number of LSTM units in encoder and decoder.

    Returns:
        A ``keras.Model`` taking ``[encoder_inputs, decoder_inputs]`` and
        producing per-step softmax distributions over the output vocabulary.
    """
    # Encoder: only the final hidden/cell states matter here; the per-step
    # outputs are deliberately discarded (the original bound them to an
    # unused variable).
    encoder_inputs = keras.layers.Input(shape=(None,))
    encoder_embedding = keras.layers.Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
    encoder_lstm = keras.layers.LSTM(units, return_state=True)
    _, state_h, state_c = encoder_lstm(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder: seeded with the encoder's final states.
    decoder_inputs = keras.layers.Input(shape=(None,))
    decoder_embedding = keras.layers.Embedding(output_vocab_size, embedding_dim)
    decoder_lstm = keras.layers.LSTM(units, return_sequences=True, return_state=True)
    decoder_dense = keras.layers.Dense(output_vocab_size, activation='softmax')

    decoder_embedding_output = decoder_embedding(decoder_inputs)
    # Decoder states are unused during training (teacher forcing); an
    # inference loop would feed them back step by step.
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding_output, initial_state=encoder_states)
    decoder_outputs = decoder_dense(decoder_outputs)

    # Create model
    model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

    return model

Attention Mechanism

python
class AttentionLayer(keras.layers.Layer):
    """Bahdanau-style additive attention layer.

    Scores each time step of ``values`` against ``query``, softmaxes the
    scores into weights, and returns the weighted sum (context vector)
    together with the weights themselves.
    """
    def __init__(self, units, **kwargs):
        # Forward **kwargs (name, dtype, ...) to the base Layer as Keras
        # subclassing convention requires; the original swallowed them.
        super(AttentionLayer, self).__init__(**kwargs)
        self.units = units
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)

    def get_config(self):
        # Added so models containing this layer can be saved and reloaded.
        config = super(AttentionLayer, self).get_config()
        config.update({'units': self.units})
        return config

    def call(self, query, values):
        # Additive (Bahdanau) scoring: score each time step of `values`
        score = self.V(tf.nn.tanh(self.W1(query) + self.W2(values)))

        # Normalize scores across the time axis into attention weights
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum over time yields the context vector
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

def create_attention_model(vocab_size, embedding_dim=100, units=128, max_length=100):
    """Build a text classifier: Embedding -> LSTM -> attention -> sigmoid."""
    inputs = keras.layers.Input(shape=(max_length,))
    embedded = keras.layers.Embedding(vocab_size, embedding_dim)(inputs)

    # Keep per-step outputs so the attention layer can weigh every time step.
    sequence_output = keras.layers.LSTM(units, return_sequences=True)(embedded)

    # Self-attention: the LSTM sequence attends over itself.
    context_vector, attention_weights = AttentionLayer(units)(sequence_output, sequence_output)

    # Classify from the attention-pooled context vector.
    prediction = keras.layers.Dense(1, activation='sigmoid')(context_vector)

    return keras.Model(inputs=inputs, outputs=prediction)

Practical Application Examples

Stock Price Prediction

python
def stock_price_prediction_example():
    """
    Stock price prediction example
    """
    from sklearn.preprocessing import MinMaxScaler

    # Generate simulated stock data: a random walk around 100
    np.random.seed(42)
    days = 1000
    prices = 100 + np.cumsum(np.random.randn(days) * 0.5)

    # Scale prices into [0, 1] before feeding the network
    scaler = MinMaxScaler()
    scaled_prices = scaler.fit_transform(prices.reshape(-1, 1)).flatten()

    # Build supervised windows: 60 past prices -> next price
    sequence_length = 60
    X, y = prepare_sequence_data(scaled_prices, sequence_length)
    X = X.reshape((X.shape[0], X.shape[1], 1))

    # Chronological 80/20 train/test split (no shuffling for time series)
    split_idx = int(0.8 * len(X))
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    # Three stacked LSTM layers, dropout after each, linear output
    model = keras.Sequential()
    model.add(keras.layers.LSTM(50, return_sequences=True, input_shape=(sequence_length, 1)))
    model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.LSTM(50, return_sequences=True))
    model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.LSTM(50))
    model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.Dense(1))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_test, y_test),
        verbose=1,
    )

    return model, history, scaler

# Run example
# model, history, scaler = stock_price_prediction_example()

Sentiment Analysis

python
def sentiment_analysis_example():
    """
    Sentiment analysis example
    """
    # Sample data (in practice should use larger dataset)
    texts = [
        "这个电影真的很棒!",
        "我不喜欢这个产品",
        "服务质量很好",
        "完全浪费时间",
        "强烈推荐给大家",
    ]
    labels = [1, 0, 1, 0, 1]  # 1: positive, 0: negative

    # Preprocessing settings
    max_words = 1000
    max_length = 50

    # Fit a tokenizer on the corpus, then pad everything to max_length
    tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    X = keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(texts), maxlen=max_length
    )
    y = np.array(labels)

    # Embedding -> LSTM -> sigmoid binary classifier
    model = keras.Sequential()
    model.add(keras.layers.Embedding(max_words, 100, input_length=max_length))
    model.add(keras.layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model, X, y, tokenizer

# Run example
# model, X, y, tokenizer = sentiment_analysis_example()

Pros and Cons of RNN

Advantages

  • Can handle variable-length sequences
  • Has memory capability
  • Parameters are shared across time steps, keeping the model relatively compact

Disadvantages

  • Vanishing gradient problem
  • Slower training speed
  • Difficult to parallelize

Solutions

  • Use LSTM or GRU to solve vanishing gradients
  • Use attention mechanism to improve performance
  • Consider using Transformer instead of RNN

Best Practices

  1. Choose appropriate RNN type:

    • Use SimpleRNN for simple tasks
    • Use LSTM or GRU for long sequences
    • Use Bidirectional RNN when bidirectional information is needed
  2. Data preprocessing:

    • Appropriate sequence length
    • Data normalization
    • Handle variable-length sequences
  3. Model optimization:

    • Use Dropout to prevent overfitting
    • Appropriate learning rate
    • Batch size tuning
  4. Monitor training:

    • Use validation set to monitor performance
    • Early stopping mechanism
    • Learning rate scheduling

Summary

RNN is an important tool for processing sequence data. Although it has been surpassed by newer architectures like Transformer in some tasks, it remains very effective in many applications. Understanding RNN principles and implementation is essential for deep learning practitioners.

In the next chapter, we will learn about Transformer models, which have become the mainstream choice for many NLP tasks.

Content is for learning and research only.