Skip to content

Model Deployment

Model deployment is the final and most critical step of a machine learning project. This chapter will detail how to deploy TensorFlow models to production environments, including various deployment methods and best practices.

Deployment Overview

Deployment Method Comparison

| Deployment Method | Use Cases | Advantages | Disadvantages |
| --- | --- | --- | --- |
| TensorFlow Serving | High-performance services | High throughput, low latency | Complex configuration |
| Flask/FastAPI | Rapid prototyping | Simple and easy to use, flexible | Limited performance |
| TensorFlow Lite | Mobile/edge devices | Small model size, fast inference | Limited functionality |
| TensorFlow.js | Browser/Node.js | Client-side inference | Model size limitations |
| Docker Containers | Cloud deployment | Environment consistency | Resource overhead |
| Kubernetes | Large-scale deployment | Auto-scaling | High complexity |
python
import tensorflow as tf
from tensorflow import keras
import numpy as np
import json
import os
from pathlib import Path
import requests
import time

print(f"TensorFlow version: {tf.__version__}")

TensorFlow Serving

Model Preparation and Export

python
def create_sample_model():
    """Build and compile a small dense MNIST-style classifier.

    Returns:
        A compiled Keras Sequential model mapping a flat 784-feature
        input to 10 softmax class probabilities.
    """
    classifier = keras.Sequential([
        keras.layers.Dense(128, activation='relu', input_shape=(784,)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(10, activation='softmax'),
    ])
    classifier.compile(optimizer='adam',
                       loss='sparse_categorical_crossentropy',
                       metrics=['accuracy'])
    return classifier

def export_model_for_serving(model, export_path, version=1):
    """Save *model* in SavedModel format under a numbered version directory.

    TensorFlow Serving expects each model version in its own
    subdirectory named after the version number.

    Args:
        model: Keras/TF model to export.
        export_path: Base directory for the model.
        version: Integer version number (becomes the subdirectory name).

    Returns:
        The path of the versioned directory the model was written to.
    """
    versioned_path = os.path.join(export_path, str(version))
    tf.saved_model.save(model, versioned_path)
    print(f"Model exported to: {versioned_path}")

    # Reload the export to confirm it produced usable serving signatures.
    reloaded = tf.saved_model.load(versioned_path)
    print("Model signatures:")
    print(list(reloaded.signatures.keys()))

    return versioned_path

def create_model_with_preprocessing():
    """Build a classifier whose graph embeds input preprocessing.

    Accepts raw 28x28 images; flattening and scaling to [0, 1] happen
    inside the model, so clients can send unnormalized pixel data.

    Returns:
        A compiled Keras functional model.
    """
    inputs = keras.layers.Input(shape=(28, 28), name='image')

    # Preprocessing: flatten to 784 features, then normalize pixels.
    hidden = keras.layers.Reshape((784,))(inputs)
    hidden = keras.layers.Lambda(lambda t: tf.cast(t, tf.float32) / 255.0)(hidden)

    # Classification head.
    hidden = keras.layers.Dense(128, activation='relu')(hidden)
    hidden = keras.layers.Dropout(0.2)(hidden)
    hidden = keras.layers.Dense(64, activation='relu')(hidden)
    hidden = keras.layers.Dropout(0.2)(hidden)
    outputs = keras.layers.Dense(10, activation='softmax', name='predictions')(hidden)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Instantiate both demo models: a plain classifier and one that embeds
# input preprocessing (reshape + pixel scaling) inside the graph.
sample_model = create_sample_model()
model_with_preprocessing = create_model_with_preprocessing()

# Export the preprocessing variant as SavedModel version 1 so TensorFlow
# Serving can pick it up from ./models/mnist_classifier/1.
export_path = './models/mnist_classifier'
export_model_for_serving(model_with_preprocessing, export_path, version=1)

TensorFlow Serving Configuration

python
def create_serving_config(model_name, model_base_path, model_platform='tensorflow'):
    """Write a TensorFlow Serving model-server configuration file.

    TensorFlow Serving's ``--model_config_file`` flag expects the
    ModelServerConfig message in protobuf *text* format, not JSON, so
    the config is emitted as a text proto with a ``.config`` extension.
    (The original wrote a JSON file, which the model server rejects.)

    Args:
        model_name: Name the model is served under.
        model_base_path: Base directory containing version subdirectories.
        model_platform: Serving platform (default 'tensorflow').

    Returns:
        Path of the written config file.
    """
    config_text = (
        "model_config_list {\n"
        "  config {\n"
        f"    name: '{model_name}'\n"
        f"    base_path: '{model_base_path}'\n"
        f"    model_platform: '{model_platform}'\n"
        "    model_version_policy {\n"
        "      latest {\n"
        "        num_versions: 2\n"
        "      }\n"
        "    }\n"
        "  }\n"
        "}\n"
    )

    config_path = f"{model_name}.config"
    with open(config_path, 'w') as f:
        f.write(config_text)

    print(f"Configuration file saved to: {config_path}")
    return config_path

def create_docker_compose_serving():
    """Write a docker-compose file that runs TensorFlow Serving.

    The original template indented every line by one extra space, which
    makes the YAML invalid (top-level keys must start at column 0); the
    content below is written flush-left so `docker compose` can parse it.
    """
    docker_compose_content = """\
version: '3.8'

services:
  tensorflow-serving:
    image: tensorflow/serving:latest
    ports:
      - "8501:8501"  # REST API
      - "8500:8500"  # gRPC API
    volumes:
      - ./models:/models
    environment:
      - MODEL_NAME=mnist_classifier
      - MODEL_BASE_PATH=/models/mnist_classifier
    command: >
      tensorflow_model_server
      --rest_api_port=8501
      --model_name=mnist_classifier
      --model_base_path=/models/mnist_classifier
      --monitoring_config_file=""
"""

    with open('docker-compose-serving.yml', 'w') as f:
        f.write(docker_compose_content)

    print("Docker Compose file created: docker-compose-serving.yml")

# Generate the Serving model config and a docker-compose file for
# running tensorflow/serving locally against ./models.
create_serving_config('mnist_classifier', '/models/mnist_classifier')
create_docker_compose_serving()

Client Calls

python
class TensorFlowServingClient:
    """Thin REST client for a TensorFlow Serving model server.

    Wraps the v1 predict and metadata endpoints and adds simple
    batching and benchmarking helpers.
    """
    def __init__(self, server_url='http://localhost:8501',
                 model_name='mnist_classifier', timeout=10):
        """
        Args:
            server_url: Base URL of the serving REST API.
            model_name: Name the model is served under.
            timeout: Per-request timeout in seconds. New keyword with a
                default, so existing callers are unaffected; without it
                requests could hang indefinitely on an unreachable server.
        """
        self.server_url = server_url
        self.model_name = model_name
        self.timeout = timeout
        self.predict_url = f"{server_url}/v1/models/{model_name}:predict"
        self.metadata_url = f"{server_url}/v1/models/{model_name}/metadata"

    def get_model_metadata(self):
        """Fetch model metadata; returns parsed JSON or None on failure."""
        try:
            response = requests.get(self.metadata_url, timeout=self.timeout)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Failed to get model metadata: {e}")
            return None

    def predict(self, instances):
        """POST *instances* to the predict endpoint.

        Args:
            instances: JSON-serializable list of input instances.

        Returns:
            The parsed JSON response dict, or None on any request error.
        """
        data = {
            "instances": instances
        }

        try:
            response = requests.post(
                self.predict_url,
                json=data,
                headers={'Content-Type': 'application/json'},
                timeout=self.timeout
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Prediction request failed: {e}")
            return None

    def predict_batch(self, instances, batch_size=32):
        """Predict in chunks of *batch_size*; returns flattened predictions.

        Failed batches are skipped with a message, so the result may be
        shorter than *instances*.
        """
        results = []

        for i in range(0, len(instances), batch_size):
            batch = instances[i:i + batch_size]
            result = self.predict(batch)

            if result and 'predictions' in result:
                results.extend(result['predictions'])
            else:
                print(f"Batch {i//batch_size + 1} prediction failed")

        return results

    def benchmark(self, instances, num_requests=100):
        """Send *num_requests* sequential predict calls and print stats."""
        print(f"Starting performance test, sending {num_requests} requests...")

        start_time = time.time()
        successful_requests = 0

        for i in range(num_requests):
            result = self.predict(instances)
            if result:
                successful_requests += 1

            if (i + 1) % 10 == 0:
                print(f"Completed {i + 1}/{num_requests} requests")

        total_time = time.time() - start_time

        print(f"\nPerformance test results:")
        print(f"Total requests: {num_requests}")
        print(f"Successful requests: {successful_requests}")
        print(f"Total time: {total_time:.2f} seconds")
        # Guard against division by zero when the elapsed time rounds
        # to zero (coarse clock or very few requests).
        if total_time > 0:
            print(f"Average latency: {total_time/num_requests*1000:.2f} ms")
            print(f"QPS: {successful_requests/total_time:.2f}")

# Usage example
def test_serving_client():
    """Smoke-test TensorFlowServingClient against a local serving instance."""
    client = TensorFlowServingClient()

    # Five random uint8 "images" as nested lists (JSON-serializable).
    test_data = np.random.randint(0, 255, (5, 28, 28)).tolist()

    # Show the served model's metadata, if the server is reachable.
    metadata = client.get_model_metadata()
    if metadata:
        print("Model metadata:")
        print(json.dumps(metadata, indent=2))

    # One batch prediction over all five instances.
    result = client.predict(test_data)
    if result:
        print("\nPrediction results:")
        print(json.dumps(result, indent=2))

    # Benchmark with a single instance per request.
    client.benchmark([test_data[0]], num_requests=50)

# Note: Need to start TensorFlow Serving service first
# test_serving_client()

Flask/FastAPI Deployment

Flask Deployment

python
def create_flask_app(model_path):
    """Build a Flask app serving predictions from a saved Keras model.

    Fixes over the original: removes the unused ``import pickle``, and
    uses ``get_json(silent=True)`` with an explicit None check so a
    request with missing/malformed JSON returns 400 instead of crashing
    into the generic 500 handler.

    Args:
        model_path: Path loadable via keras.models.load_model.

    Returns:
        A Flask application with /health, /predict and /predict_class.
    """
    from flask import Flask, request, jsonify

    app = Flask(__name__)

    # Load once at startup; reloading per request would be far too slow.
    model = keras.models.load_model(model_path)

    @app.route('/health', methods=['GET'])
    def health():
        """Liveness probe for load balancers / orchestrators."""
        return jsonify({'status': 'healthy', 'model_loaded': True})

    @app.route('/predict', methods=['POST'])
    def predict():
        """Return raw class-probability vectors for the posted instances."""
        try:
            # silent=True yields None (instead of raising) on bad or
            # missing JSON, so malformed requests get a clean 400.
            data = request.get_json(silent=True)

            if not data or 'instances' not in data:
                return jsonify({'error': 'Missing instances field'}), 400

            instances = np.array(data['instances'])
            predictions = model.predict(instances)

            return jsonify({
                'predictions': predictions.tolist(),
                'model_version': '1.0'
            })

        except Exception as e:
            return jsonify({'error': str(e)}), 500

    @app.route('/predict_class', methods=['POST'])
    def predict_class():
        """Return argmax class, confidence and full distribution per instance."""
        try:
            data = request.get_json(silent=True)

            if not data or 'instances' not in data:
                return jsonify({'error': 'Missing instances field'}), 400

            instances = np.array(data['instances'])
            predictions = model.predict(instances)
            predicted_classes = np.argmax(predictions, axis=1)
            confidences = np.max(predictions, axis=1)

            results = [
                {
                    'predicted_class': int(cls),
                    'confidence': float(conf),
                    'all_probabilities': probs.tolist()
                }
                for cls, conf, probs in zip(predicted_classes, confidences, predictions)
            ]

            return jsonify({'results': results})

        except Exception as e:
            return jsonify({'error': str(e)}), 500

    return app

def create_flask_with_monitoring():
    """Build a Flask app that records simple request metrics.

    Tracks per-endpoint request counts, a rolling window of response
    times, and a server-error counter, exposed via GET /metrics.

    Fixes over the original: ``error_count`` is now actually
    incremented (it was declared but never updated, so /metrics always
    reported zero errors), unmatched routes (where ``request.endpoint``
    is None) are still counted, and the mean is converted to a plain
    float so the /metrics payload stays JSON-serializable.
    """
    from flask import Flask, request, jsonify
    import time
    import logging
    from collections import defaultdict

    app = Flask(__name__)

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    metrics = {
        'request_count': defaultdict(int),
        'response_times': [],   # rolling window, capped at 1000 entries
        'error_count': 0
    }

    @app.before_request
    def before_request():
        # Stash the start time on the request object for after_request.
        request.start_time = time.time()

    @app.after_request
    def after_request(response):
        if hasattr(request, 'start_time'):
            response_time = time.time() - request.start_time
            metrics['response_times'].append(response_time)

            # Cap the window so memory stays bounded.
            if len(metrics['response_times']) > 1000:
                metrics['response_times'] = metrics['response_times'][-1000:]

        # request.endpoint is None for unmatched routes (e.g. 404s);
        # fall back to the raw path so those requests are still counted.
        metrics['request_count'][request.endpoint or request.path] += 1

        # Count server-side failures so /metrics reflects real errors.
        if response.status_code >= 500:
            metrics['error_count'] += 1

        return response

    @app.route('/metrics', methods=['GET'])
    def get_metrics():
        # float() is needed because np.mean returns np.float64, which
        # some JSON encoders reject.
        if metrics['response_times']:
            avg_response_time = float(np.mean(metrics['response_times']))
        else:
            avg_response_time = 0.0

        return jsonify({
            'request_count': dict(metrics['request_count']),
            'average_response_time': avg_response_time,
            'error_count': metrics['error_count'],
            'total_requests': sum(metrics['request_count'].values())
        })

    return app

# Create Flask application
# flask_app = create_flask_app('./models/mnist_classifier/1')
# flask_app.run(host='0.0.0.0', port=5000, debug=False)

FastAPI Deployment

python
def create_fastapi_app():
    """Build a FastAPI app with typed prediction endpoints.

    Fixes over the original: ``startup_event`` used ``global model``,
    but ``model`` is a local of create_fastapi_app — ``global`` would
    have bound an unrelated module-level name and left the endpoints'
    closed-over ``model`` permanently None; it must be ``nonlocal``.
    The 503 "model not loaded" check is also moved outside the
    ``try`` block, since HTTPException is an Exception subclass and was
    being caught and re-raised as a 500. The unused ``import uvicorn``
    is removed.

    Returns:
        The configured FastAPI application.
    """
    from fastapi import FastAPI, HTTPException
    from pydantic import BaseModel
    from typing import List

    app = FastAPI(title="ML Model API", version="1.0.0")

    # Request/response schemas.
    class PredictionRequest(BaseModel):
        instances: List[List[float]]

    class PredictionResponse(BaseModel):
        predictions: List[List[float]]
        model_version: str

    class ClassificationResponse(BaseModel):
        predicted_class: int
        confidence: float
        all_probabilities: List[float]

    # Closure variable shared with the endpoint handlers below.
    model = None

    @app.on_event("startup")
    async def startup_event():
        # Rebind the closure variable, not a module global.
        nonlocal model
        # model = keras.models.load_model('./models/mnist_classifier/1')
        print("Model loaded")

    @app.get("/health")
    async def health():
        return {"status": "healthy", "model_loaded": model is not None}

    @app.post("/predict", response_model=PredictionResponse)
    async def predict(request: PredictionRequest):
        # Outside the try block so the 503 isn't rewrapped as a 500.
        if model is None:
            raise HTTPException(status_code=503, detail="Model not loaded")
        try:
            instances = np.array(request.instances)
            predictions = model.predict(instances)

            return PredictionResponse(
                predictions=predictions.tolist(),
                model_version="1.0"
            )
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))

    @app.post("/predict_class")
    async def predict_class(request: PredictionRequest):
        if model is None:
            raise HTTPException(status_code=503, detail="Model not loaded")
        try:
            instances = np.array(request.instances)
            predictions = model.predict(instances)

            results = []
            for pred in predictions:
                results.append(ClassificationResponse(
                    predicted_class=int(np.argmax(pred)),
                    confidence=float(np.max(pred)),
                    all_probabilities=pred.tolist()
                ))

            return {"results": results}
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))

    return app

# Create FastAPI application
# fastapi_app = create_fastapi_app()
# uvicorn.run(fastapi_app, host="0.0.0.0", port=8000)

TensorFlow Lite Deployment

Model Conversion

python
def convert_to_tflite(model, optimization=True, quantization=False):
    """Convert a Keras model to a TFLite flatbuffer.

    Args:
        model: Compiled Keras model.
        optimization: Apply default size/latency optimizations.
        quantization: Perform full-integer int8 quantization using the
            module-level representative_dataset_gen for calibration.

    Returns:
        Serialized TFLite model bytes.
    """
    converter = tf.lite.TFLiteConverter.from_keras_model(model)

    if optimization:
        converter.optimizations = [tf.lite.Optimize.DEFAULT]

    if quantization:
        # Full-integer quantization needs sample inputs to calibrate
        # activation ranges.
        converter.representative_dataset = representative_dataset_gen
        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
        converter.inference_input_type = tf.int8
        converter.inference_output_type = tf.int8

    return converter.convert()

def representative_dataset_gen():
    """Yield 100 random float32 samples of shape (1, 28, 28).

    Used by the TFLite converter to calibrate int8 quantization ranges.
    """
    for _ in range(100):
        sample = np.random.random((1, 28, 28)).astype(np.float32)
        yield [sample]

def save_tflite_model(tflite_model, model_path):
    """Persist a converted TFLite model (flatbuffer bytes) to *model_path*."""
    with open(model_path, 'wb') as out_file:
        out_file.write(tflite_model)
    print(f"TensorFlow Lite model saved to: {model_path}")

def analyze_tflite_model(model_path):
    """Print I/O tensor details and on-disk size of a .tflite model.

    Args:
        model_path: Path to a TFLite flatbuffer file.

    Returns:
        Tuple of (interpreter, input_details, output_details) for
        further inspection or inference.
    """
    interpreter = tf.lite.Interpreter(model_path=model_path)
    interpreter.allocate_tensors()

    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    print("Model analysis:")
    print(f"Input details: {input_details}")
    print(f"Output details: {output_details}")

    # Report the flatbuffer's file size in kilobytes.
    size_bytes = os.path.getsize(model_path)
    print(f"Model size: {size_bytes / 1024:.2f} KB")

    return interpreter, input_details, output_details

# Build a fresh demo model to convert.
sample_model = create_sample_model()

# Standard conversion with default (size/latency) optimizations.
tflite_model = convert_to_tflite(sample_model, optimization=True)
save_tflite_model(tflite_model, 'model.tflite')

# Full-integer (int8) quantized conversion, calibrated with the
# representative dataset generator.
tflite_quantized_model = convert_to_tflite(sample_model, optimization=True, quantization=True)
save_tflite_model(tflite_quantized_model, 'model_quantized.tflite')

# Inspect I/O tensor details and on-disk size of the converted model.
interpreter, input_details, output_details = analyze_tflite_model('model.tflite')

TensorFlow Lite Inference

python
class TFLitePredictor:
    """Convenience wrapper around tf.lite.Interpreter for inference."""

    def __init__(self, model_path):
        """Load the .tflite file and pre-allocate tensor buffers."""
        self.interpreter = tf.lite.Interpreter(model_path=model_path)
        self.interpreter.allocate_tensors()
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()

        print(f"Model loaded: {model_path}")
        print(f"Input shape: {self.input_details[0]['shape']}")
        print(f"Output shape: {self.output_details[0]['shape']}")

    def predict(self, input_data):
        """Run one inference pass and return the first output tensor."""
        in_index = self.input_details[0]['index']
        out_index = self.output_details[0]['index']

        self.interpreter.set_tensor(in_index, input_data)
        self.interpreter.invoke()
        return self.interpreter.get_tensor(out_index)

    def benchmark(self, input_data, num_runs=1000):
        """Time repeated predictions and print latency/QPS statistics."""
        print(f"Starting TensorFlow Lite performance test, running {num_runs} times...")

        # Warm up so one-time setup costs are excluded from timing.
        for _ in range(10):
            self.predict(input_data)

        started = time.time()
        for _ in range(num_runs):
            self.predict(input_data)
        elapsed = time.time() - started

        per_run = elapsed / num_runs
        print(f"Total time: {elapsed:.4f} seconds")
        print(f"Average inference time: {per_run*1000:.4f} ms")
        print(f"QPS: {num_runs/elapsed:.2f}")

# Load the standard (non-quantized) converted model.
tflite_predictor = TFLitePredictor('model.tflite')

# One random flat-784 float32 input matching the model's expected shape.
test_input = np.random.random((1, 784)).astype(np.float32)

# Single inference pass.
prediction = tflite_predictor.predict(test_input)
print(f"Prediction result: {prediction}")

# Measure average latency and QPS over 1000 runs.
tflite_predictor.benchmark(test_input, num_runs=1000)

Summary

This chapter detailed various TensorFlow model deployment methods:

Key Points:

  1. Deployment Selection: Choose appropriate deployment method based on requirements
  2. TensorFlow Serving: High-performance production environment deployment
  3. Containerization: Use Docker to ensure environment consistency
  4. Kubernetes: Large-scale automated deployment and management
  5. Monitoring and Logging: Comprehensive monitoring and logging systems
  6. Performance Optimization: Model optimization and inference acceleration
  7. Security Considerations: API security and access control

Best Practices:

  • Choose appropriate deployment architecture
  • Implement comprehensive monitoring and logging
  • Consider security and access control
  • Perform performance testing and optimization
  • Establish CI/CD workflows
  • Prepare disaster recovery plans
  • Regular updates and maintenance

Model deployment is a complex engineering problem that requires considering multiple aspects including performance, reliability, security, and maintainability.

Content is for learning and research only.