C++ Performance Optimization

Overview

Performance optimization is an important aspect of C++ programming. This chapter introduces key technologies such as memory management, algorithm optimization, compiler optimization, and performance measurement to help developers write efficient C++ programs.

📊 Performance Measurement

Benchmarking

cpp

#include <iostream>
#include <chrono>
#include <vector>
#include <algorithm>

class Timer {
private:
    std::chrono::high_resolution_clock::time_point start_;
    std::string name_;
    
public:
    Timer(const std::string& name) : name_(name) {
        start_ = std::chrono::high_resolution_clock::now();
    }
    
    ~Timer() {
        auto end = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start_);
        std::cout << name_ << " elapsed: " << duration.count() << " microseconds" << std::endl;
    }
};

void memoryAccessTest() {
    const size_t SIZE = 1024 * 1024;
    std::vector<int> data(SIZE, 1);
    
    // Sequential access
    {
        Timer timer("Sequential access");
        int sum = 0;
        for (size_t i = 0; i < SIZE; ++i) {
            sum += data[i];
        }
    }
    
    // Strided access
    {
        Timer timer("Strided access");
        int sum = 0;
        for (size_t i = 0; i < SIZE; i += 16) {
            sum += data[i];
        }
    }
}

🧠 Memory Optimization

Object Pool Pattern

cpp

#include <vector>
#include <memory>

template<typename T>
class ObjectPool {
private:
    std::vector<std::unique_ptr<T>> pool_;
    std::vector<T*> available_;
    
public:
    T* acquire() {
        if (available_.empty()) {
            pool_.push_back(std::make_unique<T>());
            return pool_.back().get();
        }
        T* obj = available_.back();
        available_.pop_back();
        return obj;
    }
    
    void release(T* obj) {
        available_.push_back(obj);
    }
};

// Usage example
void objectPoolDemo() {
    struct ExpensiveObject {
        std::vector<int> data;
        ExpensiveObject() : data(1000, 42) {}
    };
    
    ObjectPool<ExpensiveObject> pool;
    
    // Object pool version
    {
        Timer timer("Object pool");
        for (int i = 0; i < 1000; ++i) {
            auto* obj = pool.acquire();
            // Use object
            pool.release(obj);
        }
    }
    
    // Direct creation version
    {
        Timer timer("Direct creation");
        for (int i = 0; i < 1000; ++i) {
            auto obj = std::make_unique<ExpensiveObject>();
            // Use object
        }
    }
}

Memory Pool Allocator

cpp

class MemoryPool {
private:
    char* memory_;
    size_t block_size_;
    std::vector<void*> free_blocks_;
    
public:
    MemoryPool(size_t block_size, size_t num_blocks) 
        : block_size_(block_size) {
        memory_ = new char[block_size * num_blocks];
        
        for (size_t i = 0; i < num_blocks; ++i) {
            free_blocks_.push_back(memory_ + i * block_size);
        }
    }
    
    ~MemoryPool() {
        delete[] memory_;
    }
    
    void* allocate() {
        if (free_blocks_.empty()) return nullptr;
        void* block = free_blocks_.back();
        free_blocks_.pop_back();
        return block;
    }
    
    void deallocate(void* ptr) {
        free_blocks_.push_back(ptr);
    }
};

⚡ Algorithm Optimization

Container Selection

cpp

#include <set>
#include <unordered_set>
#include <vector>

void containerComparison() {
    const size_t SIZE = 100000;
    const int SEARCH_VALUE = SIZE / 2;
    
    std::vector<int> vec(SIZE);
    std::iota(vec.begin(), vec.end(), 0);
    
    std::set<int> ordered_set(vec.begin(), vec.end());
    std::unordered_set<int> hash_set(vec.begin(), vec.end());
    
    // Linear search
    {
        Timer timer("vector linear search");
        auto it = std::find(vec.begin(), vec.end(), SEARCH_VALUE);
    }
    
    // Binary search
    {
        Timer timer("vector binary search");
        bool found = std::binary_search(vec.begin(), vec.end(), SEARCH_VALUE);
    }
    
    // Ordered set search
    {
        Timer timer("set search");
        auto it = ordered_set.find(SEARCH_VALUE);
    }
    
    // Hash table search
    {
        Timer timer("unordered_set search");
        auto it = hash_set.find(SEARCH_VALUE);
    }
}

Loop Optimization

cpp

void loopOptimization() {
    const size_t SIZE = 1000;
    std::vector<std::vector<int>> matrix(SIZE, std::vector<int>(SIZE, 1));
    
    // Row-wise access (cache friendly)
    {
        Timer timer("Row-wise access");
        long long sum = 0;
        for (size_t i = 0; i < SIZE; ++i) {
            for (size_t j = 0; j < SIZE; ++j) {
                sum += matrix[i][j];
            }
        }
    }
    
    // Column-wise access (cache unfriendly)
    {
        Timer timer("Column-wise access");
        long long sum = 0;
        for (size_t j = 0; j < SIZE; ++j) {
            for (size_t i = 0; i < SIZE; ++i) {
                sum += matrix[i][j];
            }
        }
    }
    
    // Loop unrolling
    {
        Timer timer("Loop unrolling");
        long long sum = 0;
        for (size_t i = 0; i < SIZE; ++i) {
            size_t j;
            for (j = 0; j + 4 <= SIZE; j += 4) {
                sum += matrix[i][j] + matrix[i][j+1] + 
                       matrix[i][j+2] + matrix[i][j+3];
            }
            for (; j < SIZE; ++j) {
                sum += matrix[i][j];
            }
        }
    }
}

🔧 Compiler Optimization

Inline and Constant Optimization

cpp

// Inline function
inline int fastAdd(int a, int b) {
    return a + b;
}

// Constant expression
constexpr int factorial(int n) {
    return n <= 1 ? 1 : n * factorial(n - 1);
}

void compilerOptimizations() {
    const int ITERATIONS = 10000000;
    
    // Function call
    {
        Timer timer("Function call");
        int sum = 0;
        for (int i = 0; i < ITERATIONS; ++i) {
            sum = fastAdd(sum, i);
        }
    }
    
    // Direct calculation
    {
        Timer timer("Direct calculation");
        int sum = 0;
        for (int i = 0; i < ITERATIONS; ++i) {
            sum += i;
        }
    }
    
    // Compile-time calculation
    constexpr int result = factorial(10);
    std::cout << "Compile-time calculation result: " << result << std::endl;
}

Branch Prediction Optimization

cpp

void branchPrediction() {
    const size_t SIZE = 1000000;
    std::vector<int> data(SIZE);
    
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(0, 100);
    for (auto& val : data) {
        val = dis(gen);
    }
    
    const int THRESHOLD = 50;
    
    // Random data (hard to predict branches)
    {
        Timer timer("Unpredictable branches");
        int count = 0;
        for (int value : data) {
            if (value > THRESHOLD) {
                count++;
            }
        }
    }
    
    // Sorted data (predictable branches)
    std::sort(data.begin(), data.end());
    {
        Timer timer("Predictable branches");
        int count = 0;
        for (int value : data) {
            if (value > THRESHOLD) {
                count++;
            }
        }
    }
    
    // Branchless version
    {
        Timer timer("Branchless version");
        int count = 0;
        for (int value : data) {
            count += (value > THRESHOLD);
        }
    }
}

🚀 Advanced Optimization

Parallel Algorithms

cpp

#include <execution>
#include <numeric>

void parallelOptimization() {
    const size_t SIZE = 10000000;
    std::vector<int> data(SIZE);
    std::iota(data.begin(), data.end(), 1);
    
    // Serial sum
    {
        Timer timer("Serial sum");
        long long sum = std::accumulate(data.begin(), data.end(), 0LL);
    }
    
    // Parallel sum (C++17)
    #ifdef __cpp_lib_parallel_algorithm
    {
        Timer timer("Parallel sum");
        long long sum = std::reduce(std::execution::par, data.begin(), data.end(), 0LL);
    }
    #endif
}

Cache Optimization

cpp

// Array of structures vs Structure of arrays
struct Point3D_AoS {
    float x, y, z;
};

struct Points3D_SoA {
    std::vector<float> x, y, z;
    Points3D_SoA(size_t size) : x(size), y(size), z(size) {}
};

void cacheOptimization() {
    const size_t SIZE = 1000000;
    
    std::vector<Point3D_AoS> aos_points(SIZE);
    Points3D_SoA soa_points(SIZE);
    
    // AoS access (potentially cache unfriendly)
    {
        Timer timer("AoS access");
        float sum = 0;
        for (const auto& point : aos_points) {
            sum += point.x;
        }
    }
    
    // SoA access (cache friendly)
    {
        Timer timer("SoA access");
        float sum = 0;
        for (float x : soa_points.x) {
            sum += x;
        }
    }
}

Demo Program

cpp

int main() {
    std::cout << "=== C++ Performance Optimization Demo ===" << std::endl;
    
    std::cout << "\n1. Memory access patterns:" << std::endl;
    memoryAccessTest();
    
    std::cout << "\n2. Object pool optimization:" << std::endl;
    objectPoolDemo();
    
    std::cout << "\n3. Container selection:" << std::endl;
    containerComparison();
    
    std::cout << "\n4. Loop optimization:" << std::endl;
    loopOptimization();
    
    std::cout << "\n5. Compiler optimization:" << std::endl;
    compilerOptimizations();
    
    std::cout << "\n6. Branch prediction:" << std::endl;
    branchPrediction();
    
    std::cout << "\n7. Parallel optimization:" << std::endl;
    parallelOptimization();
    
    std::cout << "\n8. Cache optimization:" << std::endl;
    cacheOptimization();
    
    return 0;
}

Summary

Optimization Strategies

Measure first: Measure before optimizing
Algorithm is king: Choose correct algorithms and data structures
Memory awareness: Consider cache friendliness and memory access patterns
Compiler cooperation: Utilize compiler optimization features

Performance Points

Technique	Impact	Use Cases
Cache optimization	High	Large data processing
Memory pools	Medium	Frequent allocation
Parallel algorithms	High	CPU-intensive tasks
Branch prediction	Medium	Condition-heavy tasks

Best Practices

Choose appropriate containers and algorithms
Consider memory layout and access patterns
Utilize compiler optimization options
Perform detailed optimization on critical paths
Balance code readability and performance

Performance optimization requires deep understanding of computer architecture and compiler working principles, and is an important skill for advanced C++ development.

C++ Performance Optimization ​

Overview ​

📊 Performance Measurement ​

Benchmarking ​

🧠 Memory Optimization ​

Object Pool Pattern ​

Memory Pool Allocator ​

⚡ Algorithm Optimization ​

Container Selection ​

Loop Optimization ​

🔧 Compiler Optimization ​

Inline and Constant Optimization ​

Branch Prediction Optimization ​

🚀 Advanced Optimization ​

Parallel Algorithms ​

Cache Optimization ​

Demo Program ​

Summary ​

Optimization Strategies ​

Performance Points ​

Best Practices ​