Model Deployment
Model deployment is the final and most critical step of a machine learning project. This chapter will detail how to deploy TensorFlow models to production environments, including various deployment methods and best practices.
Deployment Overview
Deployment Method Comparison
| Deployment Method | Use Cases | Advantages | Disadvantages |
|---|---|---|---|
| TensorFlow Serving | High-performance services | High throughput, low latency | Complex configuration |
| Flask/FastAPI | Rapid prototyping | Simple and easy to use, flexible | Limited performance |
| TensorFlow Lite | Mobile/edge devices | Small model size, fast inference | Limited functionality |
| TensorFlow.js | Browser/Node.js | Client-side inference | Model size limitations |
| Docker Containers | Cloud deployment | Environment consistency | Resource overhead |
| Kubernetes | Large-scale deployment | Auto-scaling | High complexity |
python
import tensorflow as tf
from tensorflow import keras
import numpy as np
import json
import os
from pathlib import Path
import requests
import time
print(f"TensorFlow version: {tf.__version__}")

TensorFlow Serving
Model Preparation and Export
python
def create_sample_model():
    """Build and compile a small dense MNIST-style classifier for deployment demos.

    Returns a compiled keras.Sequential taking flat (784,) inputs and
    producing a 10-way softmax.
    """
    stack = [
        keras.layers.Dense(128, activation='relu', input_shape=(784,)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(10, activation='softmax'),
    ]
    net = keras.Sequential(stack)
    net.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net
def export_model_for_serving(model, export_path, version=1):
    """Export `model` as a SavedModel under `export_path/<version>`.

    TensorFlow Serving discovers versions from numbered subdirectories,
    hence the version suffix. Returns the versioned directory path.
    """
    versioned_path = os.path.join(export_path, str(version))
    tf.saved_model.save(model, versioned_path)
    print(f"Model exported to: {versioned_path}")
    # Reload immediately to confirm the export is loadable and show its signatures.
    reloaded = tf.saved_model.load(versioned_path)
    print("Model signatures:")
    print(list(reloaded.signatures.keys()))
    return versioned_path
def create_model_with_preprocessing():
    """Functional-API classifier that normalizes raw 28x28 images inside the graph.

    Embedding the flatten + scale-to-[0,1] steps in the model means clients
    can send raw pixel values without preprocessing on their side.
    """
    image_in = keras.layers.Input(shape=(28, 28), name='image')
    # In-graph preprocessing: flatten, then cast and scale pixels to [0, 1].
    h = keras.layers.Reshape((784,))(image_in)
    h = keras.layers.Lambda(lambda t: tf.cast(t, tf.float32) / 255.0)(h)
    # Classifier head.
    h = keras.layers.Dense(128, activation='relu')(h)
    h = keras.layers.Dropout(0.2)(h)
    h = keras.layers.Dense(64, activation='relu')(h)
    h = keras.layers.Dropout(0.2)(h)
    probs = keras.layers.Dense(10, activation='softmax', name='predictions')(h)
    net = keras.Model(inputs=image_in, outputs=probs)
    net.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net
# Create and export model
sample_model = create_sample_model()
model_with_preprocessing = create_model_with_preprocessing()
# Export model
# The variant with in-graph preprocessing is exported as version 1 below.
export_path = './models/mnist_classifier'
export_model_for_serving(model_with_preprocessing, export_path, version=1)

TensorFlow Serving Configuration
python
def create_serving_config(model_name, model_base_path, model_platform='tensorflow'):
    """Write a model-config description to `<model_name>_config.json`.

    Keeps the two latest model versions available.
    NOTE(review): TF Serving's --model_config_file flag actually expects
    protobuf text format, not JSON — confirm before pointing the server
    at this file. Returns the path of the written file.
    """
    version_policy = {"latest": {"num_versions": 2}}
    model_entry = {
        "name": model_name,
        "base_path": model_base_path,
        "model_platform": model_platform,
        "model_version_policy": version_policy,
    }
    config = {"model_config_list": [model_entry]}
    config_path = f"{model_name}_config.json"
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)
    print(f"Configuration file saved to: {config_path}")
    return config_path
def create_docker_compose_serving():
    """Write a docker-compose file that runs TensorFlow Serving.

    Exposes the REST API on 8501 and gRPC on 8500, mounting ./models
    into the container.
    """
    compose_yaml = """\
version: '3.8'
services:
  tensorflow-serving:
    image: tensorflow/serving:latest
    ports:
      - "8501:8501"  # REST API
      - "8500:8500"  # gRPC API
    volumes:
      - ./models:/models
    environment:
      - MODEL_NAME=mnist_classifier
      - MODEL_BASE_PATH=/models/mnist_classifier
    command: >
      tensorflow_model_server
      --rest_api_port=8501
      --model_name=mnist_classifier
      --model_base_path=/models/mnist_classifier
      --monitoring_config_file=""
"""
    with open('docker-compose-serving.yml', 'w') as f:
        f.write(compose_yaml)
    print("Docker Compose file created: docker-compose-serving.yml")
# Create configuration files
# Writes mnist_classifier_config.json into the current working directory.
create_serving_config('mnist_classifier', '/models/mnist_classifier')
create_docker_compose_serving()

Client Calls
python
class TensorFlowServingClient:
    """Thin REST client for a TensorFlow Serving model endpoint.

    Wraps the v1 predict and metadata routes; all network failures are
    reported to stdout and surfaced as a None return value.
    """

    def __init__(self, server_url='http://localhost:8501', model_name='mnist_classifier'):
        self.server_url = server_url
        self.model_name = model_name
        self.predict_url = f"{server_url}/v1/models/{model_name}:predict"
        self.metadata_url = f"{server_url}/v1/models/{model_name}/metadata"

    def get_model_metadata(self):
        """Fetch the model's metadata; return parsed JSON, or None on failure."""
        try:
            resp = requests.get(self.metadata_url)
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.RequestException as e:
            print(f"Failed to get model metadata: {e}")
            return None

    def predict(self, instances):
        """POST `instances` to the predict endpoint; return parsed JSON, or None on failure."""
        payload = {"instances": instances}
        try:
            resp = requests.post(
                self.predict_url,
                json=payload,
                headers={'Content-Type': 'application/json'},
            )
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.RequestException as e:
            print(f"Prediction request failed: {e}")
            return None

    def predict_batch(self, instances, batch_size=32):
        """Predict in chunks of `batch_size`; failed chunks are reported and skipped."""
        collected = []
        for start in range(0, len(instances), batch_size):
            chunk = instances[start:start + batch_size]
            outcome = self.predict(chunk)
            if outcome and 'predictions' in outcome:
                collected.extend(outcome['predictions'])
            else:
                print(f"Batch {start//batch_size + 1} prediction failed")
        return collected

    def benchmark(self, instances, num_requests=100):
        """Send `num_requests` sequential predictions and print latency/QPS stats."""
        print(f"Starting performance test, sending {num_requests} requests...")
        started = time.time()
        ok_count = 0
        for n in range(num_requests):
            if self.predict(instances):
                ok_count += 1
            # Progress marker every 10 requests.
            if (n + 1) % 10 == 0:
                print(f"Completed {n + 1}/{num_requests} requests")
        elapsed = time.time() - started
        print(f"\nPerformance test results:")
        print(f"Total requests: {num_requests}")
        print(f"Successful requests: {ok_count}")
        print(f"Total time: {elapsed:.2f} seconds")
        print(f"Average latency: {elapsed/num_requests*1000:.2f} ms")
        print(f"QPS: {ok_count/elapsed:.2f}")
# Usage example
def test_serving_client():
    """Smoke-test the serving client: metadata, prediction, and a short benchmark.

    Requires a running TensorFlow Serving instance on the default port.
    """
    client = TensorFlowServingClient()
    # Five fake 28x28 images with raw pixel values in [0, 255).
    test_data = np.random.randint(0, 255, (5, 28, 28)).tolist()
    # Show what the server knows about the model.
    metadata = client.get_model_metadata()
    if metadata:
        print("Model metadata:")
        print(json.dumps(metadata, indent=2))
    # One batched prediction call.
    result = client.predict(test_data)
    if result:
        print("\nPrediction results:")
        print(json.dumps(result, indent=2))
    # Latency check with a single repeated instance.
    single_instance = [test_data[0]]
    client.benchmark(single_instance, num_requests=50)
# Note: Need to start TensorFlow Serving service first
# test_serving_client()

Flask/FastAPI Deployment
Flask Deployment
python
def create_flask_app(model_path):
    """Build a Flask app that serves predictions from the Keras model at `model_path`.

    Routes:
      GET  /health         -- liveness probe
      POST /predict        -- raw probability vectors for {"instances": [...]}
      POST /predict_class  -- argmax class, confidence, and full distribution

    Returns the configured Flask application.
    """
    from flask import Flask, request, jsonify
    # Fix: removed unused `import pickle` from the original.

    app = Flask(__name__)
    # Load once at construction time so every request reuses the same model.
    model = keras.models.load_model(model_path)

    @app.route('/health', methods=['GET'])
    def health():
        return jsonify({'status': 'healthy', 'model_loaded': True})

    @app.route('/predict', methods=['POST'])
    def predict():
        try:
            data = request.get_json()
            if 'instances' not in data:
                return jsonify({'error': 'Missing instances field'}), 400
            instances = np.array(data['instances'])
            predictions = model.predict(instances)
            # tolist() makes the numpy array JSON-serializable.
            return jsonify({
                'predictions': predictions.tolist(),
                'model_version': '1.0'
            })
        except Exception as e:
            return jsonify({'error': str(e)}), 500

    @app.route('/predict_class', methods=['POST'])
    def predict_class():
        try:
            data = request.get_json()
            instances = np.array(data['instances'])
            predictions = model.predict(instances)
            predicted_classes = np.argmax(predictions, axis=1)
            confidences = np.max(predictions, axis=1)
            results = [
                {
                    'predicted_class': int(cls),
                    'confidence': float(conf),
                    'all_probabilities': probs.tolist(),
                }
                for cls, conf, probs in zip(predicted_classes, confidences, predictions)
            ]
            return jsonify({'results': results})
        except Exception as e:
            return jsonify({'error': str(e)}), 500

    return app
def create_flask_with_monitoring():
    """Build a Flask app that records per-endpoint request counts, latency, and errors.

    Exposes GET /metrics with the aggregated statistics. Returns the
    configured Flask application.
    """
    from flask import Flask, request, jsonify
    import time
    import logging
    from collections import defaultdict

    app = Flask(__name__)
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # In-process metrics store; fine for a single worker, not shared across processes.
    metrics = {
        'request_count': defaultdict(int),
        'response_times': [],
        'error_count': 0
    }

    @app.before_request
    def before_request():
        # Stamp the request so after_request can compute latency.
        request.start_time = time.time()

    @app.after_request
    def after_request(response):
        if hasattr(request, 'start_time'):
            response_time = time.time() - request.start_time
            metrics['response_times'].append(response_time)
            # Bound memory: keep only the most recent 1000 samples.
            if len(metrics['response_times']) > 1000:
                metrics['response_times'] = metrics['response_times'][-1000:]
        metrics['request_count'][request.endpoint] += 1
        # Fix: error_count was declared but never updated, and the logger was
        # never used — count server errors here and log them.
        if response.status_code >= 500:
            metrics['error_count'] += 1
            logger.error("Server error %s on %s", response.status_code, request.path)
        return response

    @app.route('/metrics', methods=['GET'])
    def get_metrics():
        avg_response_time = np.mean(metrics['response_times']) if metrics['response_times'] else 0
        return jsonify({
            'request_count': dict(metrics['request_count']),
            'average_response_time': avg_response_time,
            'error_count': metrics['error_count'],
            'total_requests': sum(metrics['request_count'].values())
        })

    return app
# Create Flask application
# flask_app = create_flask_app('./models/mnist_classifier/1')
# flask_app.run(host='0.0.0.0', port=5000, debug=False)

FastAPI Deployment
python
def create_fastapi_app():
    """Build a FastAPI app exposing /health, /predict, and /predict_class.

    The model is loaded in the startup hook; until then prediction
    endpoints return 503. Returns the configured FastAPI application.
    """
    from fastapi import FastAPI, HTTPException
    from pydantic import BaseModel
    from typing import List
    # Fix: removed unused `import uvicorn` (only the commented-out caller uses it).

    app = FastAPI(title="ML Model API", version="1.0.0")

    # Request/response schemas.
    class PredictionRequest(BaseModel):
        instances: List[List[float]]

    class PredictionResponse(BaseModel):
        predictions: List[List[float]]
        model_version: str

    class ClassificationResponse(BaseModel):
        predicted_class: int
        confidence: float
        all_probabilities: List[float]

    # Load model (should load at startup in real applications)
    model = None

    @app.on_event("startup")
    async def startup_event():
        # Fix: was `global model`, which would bind a module-level name and
        # leave this closure's `model` (read by the endpoints) permanently None.
        nonlocal model
        # model = keras.models.load_model('./models/mnist_classifier/1')
        print("Model loaded")

    @app.get("/health")
    async def health():
        return {"status": "healthy", "model_loaded": model is not None}

    @app.post("/predict", response_model=PredictionResponse)
    async def predict(request: PredictionRequest):
        try:
            if model is None:
                raise HTTPException(status_code=503, detail="Model not loaded")
            instances = np.array(request.instances)
            predictions = model.predict(instances)
            return PredictionResponse(
                predictions=predictions.tolist(),
                model_version="1.0"
            )
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))

    @app.post("/predict_class")
    async def predict_class(request: PredictionRequest):
        try:
            if model is None:
                raise HTTPException(status_code=503, detail="Model not loaded")
            instances = np.array(request.instances)
            predictions = model.predict(instances)
            results = []
            for pred in predictions:
                results.append(ClassificationResponse(
                    predicted_class=int(np.argmax(pred)),
                    confidence=float(np.max(pred)),
                    all_probabilities=pred.tolist()
                ))
            return {"results": results}
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))

    return app
# Create FastAPI application
# fastapi_app = create_fastapi_app()
# uvicorn.run(fastapi_app, host="0.0.0.0", port=8000)

TensorFlow Lite Deployment
Model Conversion
python
def convert_to_tflite(model, optimization=True, quantization=False):
    """Convert a Keras model to a TFLite flatbuffer.

    optimization: enable the default TFLite optimizations.
    quantization: full int8 quantization calibrated on the representative dataset.
    Returns the serialized model bytes.
    """
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    if optimization:
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
    if quantization:
        # Int8 quantization needs sample inputs to calibrate activation ranges.
        converter.representative_dataset = representative_dataset_gen
        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
        converter.inference_input_type = tf.int8
        converter.inference_output_type = tf.int8
    return converter.convert()
def representative_dataset_gen():
    """Yield 100 random float32 samples shaped (1, 28, 28) for quantization calibration."""
    for _ in range(100):
        yield [np.random.random((1, 28, 28)).astype(np.float32)]
def save_tflite_model(tflite_model, model_path):
    """Write the serialized TFLite model bytes to `model_path`."""
    with open(model_path, 'wb') as out:
        out.write(tflite_model)
    print(f"TensorFlow Lite model saved to: {model_path}")
def analyze_tflite_model(model_path):
    """Load a TFLite model, print its I/O details and on-disk size.

    Returns (interpreter, input_details, output_details).
    """
    interpreter = tf.lite.Interpreter(model_path=model_path)
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    print("Model analysis:")
    print(f"Input details: {input_details}")
    print(f"Output details: {output_details}")
    # Report file size to compare float vs quantized exports.
    size_bytes = os.path.getsize(model_path)
    print(f"Model size: {size_bytes / 1024:.2f} KB")
    return interpreter, input_details, output_details
# Convert model
sample_model = create_sample_model()
# Standard conversion
# Float model with default optimizations only.
tflite_model = convert_to_tflite(sample_model, optimization=True)
save_tflite_model(tflite_model, 'model.tflite')
# Quantized conversion
# Full int8 quantization using the representative dataset generator.
tflite_quantized_model = convert_to_tflite(sample_model, optimization=True, quantization=True)
save_tflite_model(tflite_quantized_model, 'model_quantized.tflite')
# Analyze model
interpreter, input_details, output_details = analyze_tflite_model('model.tflite')

TensorFlow Lite Inference
python
class TFLitePredictor:
    """Wraps a tf.lite.Interpreter for single-input, single-output inference."""

    def __init__(self, model_path):
        self.interpreter = tf.lite.Interpreter(model_path=model_path)
        self.interpreter.allocate_tensors()
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()
        print(f"Model loaded: {model_path}")
        print(f"Input shape: {self.input_details[0]['shape']}")
        print(f"Output shape: {self.output_details[0]['shape']}")

    def predict(self, input_data):
        """Run one inference pass.

        `input_data` must match the model's input shape and dtype exactly.
        Returns the raw output tensor.
        """
        in_index = self.input_details[0]['index']
        out_index = self.output_details[0]['index']
        self.interpreter.set_tensor(in_index, input_data)
        self.interpreter.invoke()
        return self.interpreter.get_tensor(out_index)

    def benchmark(self, input_data, num_runs=1000):
        """Time repeated inference (after a 10-run warmup) and print latency/QPS."""
        print(f"Starting TensorFlow Lite performance test, running {num_runs} times...")
        # Warmup runs are excluded from timing.
        for _ in range(10):
            self.predict(input_data)
        started = time.time()
        for _ in range(num_runs):
            self.predict(input_data)
        total_time = time.time() - started
        avg_time = total_time / num_runs
        print(f"Total time: {total_time:.4f} seconds")
        print(f"Average inference time: {avg_time*1000:.4f} ms")
        print(f"QPS: {num_runs/total_time:.2f}")
# Use TensorFlow Lite predictor
tflite_predictor = TFLitePredictor('model.tflite')
# Create test data
# Shape (1, 784) float32 matches the flattened-MNIST model input.
test_input = np.random.random((1, 784)).astype(np.float32)
# Make prediction
prediction = tflite_predictor.predict(test_input)
print(f"Prediction result: {prediction}")
# Performance test
tflite_predictor.benchmark(test_input, num_runs=1000)

Summary
This chapter detailed various TensorFlow model deployment methods:
Key Points:
- Deployment Selection: Choose appropriate deployment method based on requirements
- TensorFlow Serving: High-performance production environment deployment
- Containerization: Use Docker to ensure environment consistency
- Kubernetes: Large-scale automated deployment and management
- Monitoring and Logging: Comprehensive monitoring and logging systems
- Performance Optimization: Model optimization and inference acceleration
- Security Considerations: API security and access control
Best Practices:
- Choose appropriate deployment architecture
- Implement comprehensive monitoring and logging
- Consider security and access control
- Perform performance testing and optimization
- Establish CI/CD workflows
- Prepare disaster recovery plans
- Regular updates and maintenance
Model deployment is a complex engineering problem that requires considering multiple aspects including performance, reliability, security, and maintainability.