Recurrent Neural Networks (RNN)
Recurrent Neural Networks (RNN) are a class of neural networks specifically designed to process sequence data. Unlike traditional feedforward neural networks, RNNs have memory capabilities and can handle variable-length sequence inputs.
RNN Basic Concepts
What is RNN?
RNN is a neural network with recurrent connections that can process sequence data such as text, time series, speech, etc. The core idea of RNN is to introduce recurrent connections in the network, allowing it to maintain memory of previous information.
RNN Structure
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
# Basic structure of RNN
def simple_rnn_example():
    """Build a minimal stacked SimpleRNN regressor.

    Returns:
        An uncompiled ``keras.Sequential`` mapping variable-length
        univariate sequences of shape (batch, time, 1) to one value.
    """
    rnn_stack = keras.Sequential()
    # The first recurrent layer must return the whole sequence so that a
    # second recurrent layer can be stacked on top of it.
    rnn_stack.add(keras.layers.SimpleRNN(32, return_sequences=True,
                                         input_shape=(None, 1)))
    rnn_stack.add(keras.layers.SimpleRNN(32))
    rnn_stack.add(keras.layers.Dense(1))
    return rnn_stack


# View model structure
model = simple_rnn_example()
model.summary()

Types of RNN
1. Simple RNN
# Simple RNN example
def create_simple_rnn(units=50, input_shape=(None, 1)):
    """Build a one-layer SimpleRNN binary classifier.

    Args:
        units: Number of recurrent units in the hidden layer.
        input_shape: (time_steps, features); ``None`` allows any length.

    Returns:
        An uncompiled ``keras.Sequential`` with a sigmoid output.
    """
    classifier = keras.Sequential()
    classifier.add(keras.layers.SimpleRNN(units, input_shape=input_shape))
    classifier.add(keras.layers.Dense(1, activation='sigmoid'))
    return classifier


# Create model
simple_model = create_simple_rnn()
simple_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

2. LSTM (Long Short-Term Memory Network)
# LSTM example
def create_lstm_model(units=50, input_shape=(None, 1)):
    """Build a two-layer stacked LSTM regressor with dropout.

    Args:
        units: Recurrent units per LSTM layer.
        input_shape: (time_steps, features); ``None`` allows any length.

    Returns:
        An uncompiled ``keras.Sequential`` ending in a linear Dense(1).
    """
    net = keras.Sequential()
    # return_sequences=True feeds the full sequence to the next LSTM.
    net.add(keras.layers.LSTM(units, return_sequences=True,
                              input_shape=input_shape))
    net.add(keras.layers.Dropout(0.2))
    net.add(keras.layers.LSTM(units))
    net.add(keras.layers.Dropout(0.2))
    net.add(keras.layers.Dense(1))
    return net


# Create LSTM model
lstm_model = create_lstm_model()
lstm_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

3. GRU (Gated Recurrent Unit)
# GRU example
def create_gru_model(units=50, input_shape=(None, 1)):
    """Build a two-layer stacked GRU regressor with dropout.

    Structurally identical to ``create_lstm_model`` but uses GRU cells,
    which have fewer parameters per unit.

    Returns:
        An uncompiled ``keras.Sequential`` ending in a linear Dense(1).
    """
    net = keras.Sequential()
    net.add(keras.layers.GRU(units, return_sequences=True,
                             input_shape=input_shape))
    net.add(keras.layers.Dropout(0.2))
    net.add(keras.layers.GRU(units))
    net.add(keras.layers.Dropout(0.2))
    net.add(keras.layers.Dense(1))
    return net


# Create GRU model
gru_model = create_gru_model()
gru_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

Sequence Data Processing
Data Preprocessing
def prepare_sequence_data(data, sequence_length):
    """Slice a 1-D series into overlapping windows and next-step targets.

    Args:
        data: 1-D sequence (list or ndarray) of values.
        sequence_length: Number of past steps per input window.

    Returns:
        (X, y): ``X`` has shape (n, sequence_length) where
        n = len(data) - sequence_length, and ``y[i]`` is the value that
        immediately follows window ``X[i]``.
    """
    n_windows = len(data) - sequence_length
    windows = [data[start:start + sequence_length] for start in range(n_windows)]
    targets = [data[start + sequence_length] for start in range(n_windows)]
    return np.array(windows), np.array(targets)


# Example: Time series data
# Generate example data: noisy sine wave
time_steps = np.arange(0, 100, 0.1)
data = np.sin(time_steps) + np.random.normal(0, 0.1, len(time_steps))
# Prepare training data
sequence_length = 10
X, y = prepare_sequence_data(data, sequence_length)
# Add a trailing feature axis: RNN layers expect (batch, time, features).
X = X.reshape((X.shape[0], X.shape[1], 1))
print(f"Input shape: {X.shape}")
print(f"Output shape: {y.shape}")

Text Sequence Processing
Text Preprocessing and Word Embeddings
# Text sequence processing example
def create_text_rnn_model(vocab_size, embedding_dim=100, max_length=100):
    """Build an Embedding -> LSTM -> sigmoid binary text classifier.

    Args:
        vocab_size: Size of the tokenizer vocabulary.
        embedding_dim: Dimensionality of the learned word vectors.
        max_length: Fixed (padded) length of input token sequences.

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    net = keras.Sequential()
    net.add(keras.layers.Embedding(vocab_size, embedding_dim,
                                   input_length=max_length))
    # NOTE(review): recurrent_dropout > 0 forces the non-cuDNN LSTM
    # implementation, which is much slower on GPU — confirm intentional.
    net.add(keras.layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5))
    net.add(keras.layers.Dense(1, activation='sigmoid'))
    return net
# Text preprocessing
def preprocess_text_data(texts, max_words=10000, max_length=100):
    """Fit a tokenizer on texts and return padded integer sequences.

    Args:
        texts: Iterable of raw strings.
        max_words: Vocabulary cap passed to the tokenizer.
        max_length: Pad/truncate every sequence to this length.

    Returns:
        (padded_sequences, tokenizer): the encoded 2-D int array and the
        fitted tokenizer (needed later to encode new texts).
    """
    # Create tokenizer and learn the word index from the corpus.
    tok = keras.preprocessing.text.Tokenizer(num_words=max_words)
    tok.fit_on_texts(texts)
    # Convert to sequences, then pad so all rows share one length.
    encoded = tok.texts_to_sequences(texts)
    padded = keras.preprocessing.sequence.pad_sequences(
        encoded, maxlen=max_length
    )
    return padded, tok


# Sample text data
sample_texts = [
    "这是一个正面的评论",
    "这个产品很糟糕",
    "我很喜欢这个服务",
    "完全不推荐",
]
# Preprocess text
sequences, tokenizer = preprocess_text_data(sample_texts)
print(f"Sequence shape: {sequences.shape}")

Bidirectional RNN
def create_bidirectional_rnn(units=50, input_shape=(None, 1)):
"""
Create bidirectional RNN model
"""
model = keras.Sequential([
keras.layers.Bidirectional(
keras.layers.LSTM(units, return_sequences=True),
input_shape=input_shape
),
keras.layers.Bidirectional(keras.layers.LSTM(units)),
keras.layers.Dense(1, activation='sigmoid')
])
return model
# Create bidirectional model
bidirectional_model = create_bidirectional_rnn()
bidirectional_model.compile(
optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy']
)Sequence-to-Sequence Models
def create_seq2seq_model(input_vocab_size, output_vocab_size,
embedding_dim=256, units=512):
"""
Create sequence-to-sequence model (encoder-decoder architecture)
"""
# Encoder
encoder_inputs = keras.layers.Input(shape=(None,))
encoder_embedding = keras.layers.Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = keras.layers.LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]
# Decoder
decoder_inputs = keras.layers.Input(shape=(None,))
decoder_embedding = keras.layers.Embedding(output_vocab_size, embedding_dim)
decoder_lstm = keras.layers.LSTM(units, return_sequences=True, return_state=True)
decoder_dense = keras.layers.Dense(output_vocab_size, activation='softmax')
decoder_embedding_output = decoder_embedding(decoder_inputs)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding_output, initial_state=encoder_states)
decoder_outputs = decoder_dense(decoder_outputs)
# Create model
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
return modelAttention Mechanism
class AttentionLayer(keras.layers.Layer):
    """Additive (Bahdanau-style) attention layer.

    Scores each time step of ``values`` against ``query`` with a small
    feed-forward network, softmaxes the scores over the time axis, and
    returns the weighted sum of ``values`` plus the weights themselves.
    """

    def __init__(self, units):
        super().__init__()
        self.units = units
        # Two projections feed the additive score; V collapses to a scalar.
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)

    def call(self, query, values):
        # Calculate attention scores: one scalar per time step.
        score = self.V(tf.nn.tanh(self.W1(query) + self.W2(values)))
        # Normalize over the time axis to get attention weights.
        attention_weights = tf.nn.softmax(score, axis=1)
        # Context vector = attention-weighted sum of the values.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights
def create_attention_model(vocab_size, embedding_dim=100, units=128, max_length=100):
"""
Create RNN model with attention mechanism
"""
inputs = keras.layers.Input(shape=(max_length,))
embedding = keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
# LSTM layer
lstm_output = keras.layers.LSTM(units, return_sequences=True)(embedding)
# Attention layer
attention = AttentionLayer(units)
context_vector, attention_weights = attention(lstm_output, lstm_output)
# Output layer
output = keras.layers.Dense(1, activation='sigmoid')(context_vector)
model = keras.Model(inputs=inputs, outputs=output)
return modelPractical Application Examples
Stock Price Prediction
def stock_price_prediction_example():
    """End-to-end LSTM regression demo on a simulated price series.

    Returns:
        (model, history, scaler): the trained Keras model, its training
        history, and the fitted MinMaxScaler (needed to invert the
        normalized predictions back to price units).
    """
    # Generate simulated stock data: a seeded random walk around 100.
    np.random.seed(42)
    days = 1000
    prices = 100 + np.cumsum(np.random.randn(days) * 0.5)

    # Normalize to [0, 1]; LSTMs train poorly on raw price levels.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaled_prices = scaler.fit_transform(prices.reshape(-1, 1)).flatten()

    # Window the series: 60 past steps predict the next one.
    sequence_length = 60
    X, y = prepare_sequence_data(scaled_prices, sequence_length)
    X = X.reshape((X.shape[0], X.shape[1], 1))

    # Chronological 80/20 split — no shuffling for time series.
    split_idx = int(0.8 * len(X))
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    # Three stacked LSTM layers with dropout and a linear output head.
    model = keras.Sequential()
    model.add(keras.layers.LSTM(50, return_sequences=True,
                                input_shape=(sequence_length, 1)))
    model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.LSTM(50, return_sequences=True))
    model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.LSTM(50))
    model.add(keras.layers.Dropout(0.2))
    model.add(keras.layers.Dense(1))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_test, y_test),
        verbose=1,
    )
    return model, history, scaler
# Run example
# model, history, scaler = stock_price_prediction_example()

Sentiment Analysis
def sentiment_analysis_example():
    """Build an LSTM sentiment classifier over a tiny demo corpus.

    Returns:
        (model, X, y, tokenizer): a compiled Keras model, the padded
        token-id matrix, the label vector, and the fitted tokenizer.
        Training is left to the caller.
    """
    # Sample data (in practice should use larger dataset)
    texts = [
        "这个电影真的很棒!",
        "我不喜欢这个产品",
        "服务质量很好",
        "完全浪费时间",
        "强烈推荐给大家",
    ]
    labels = [1, 0, 1, 0, 1]  # 1: positive, 0: negative

    # Preprocessing: word-index encode, then pad to a fixed length.
    max_words = 1000
    max_length = 50
    tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    encoded = tokenizer.texts_to_sequences(texts)
    X = keras.preprocessing.sequence.pad_sequences(encoded, maxlen=max_length)
    y = np.array(labels)

    # Embedding -> LSTM -> sigmoid binary classifier.
    model = keras.Sequential()
    model.add(keras.layers.Embedding(max_words, 100, input_length=max_length))
    # NOTE(review): recurrent_dropout > 0 disables the cuDNN fast path.
    model.add(keras.layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model, X, y, tokenizer
# Run example
# model, X, y, tokenizer = sentiment_analysis_example()

Pros and Cons of RNN
Advantages
- Can handle variable-length sequences
- Has memory capability
- Parameter sharing, relatively simple model
Disadvantages
- Vanishing gradient problem
- Slower training speed
- Difficult to parallelize
Solutions
- Use LSTM or GRU to solve vanishing gradients
- Use attention mechanism to improve performance
- Consider using Transformer instead of RNN
Best Practices
Choose appropriate RNN type:
- Use SimpleRNN for simple tasks
- Use LSTM or GRU for long sequences
- Use Bidirectional RNN when bidirectional information is needed
Data preprocessing:
- Appropriate sequence length
- Data normalization
- Handle variable-length sequences
Model optimization:
- Use Dropout to prevent overfitting
- Appropriate learning rate
- Batch size tuning
Monitor training:
- Use validation set to monitor performance
- Early stopping mechanism
- Learning rate scheduling
Summary
RNN is an important tool for processing sequence data. Although it has been surpassed by newer architectures like Transformer in some tasks, it remains very effective in many applications. Understanding RNN principles and implementation is essential for deep learning practitioners.
In the next chapter, we will learn about Transformer models, which have become the mainstream choice for many NLP tasks.