C++ Performance Optimization
Overview
Performance optimization is an important aspect of C++ programming. This chapter introduces key technologies such as memory management, algorithm optimization, compiler optimization, and performance measurement to help developers write efficient C++ programs.
📊 Performance Measurement
Benchmarking
cpp
#include <iostream>
#include <chrono>
#include <vector>
#include <algorithm>
class Timer {
private:
std::chrono::high_resolution_clock::time_point start_;
std::string name_;
public:
Timer(const std::string& name) : name_(name) {
start_ = std::chrono::high_resolution_clock::now();
}
~Timer() {
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start_);
std::cout << name_ << " elapsed: " << duration.count() << " microseconds" << std::endl;
}
};
void memoryAccessTest() {
const size_t SIZE = 1024 * 1024;
std::vector<int> data(SIZE, 1);
// Sequential access
{
Timer timer("Sequential access");
int sum = 0;
for (size_t i = 0; i < SIZE; ++i) {
sum += data[i];
}
}
// Strided access
{
Timer timer("Strided access");
int sum = 0;
for (size_t i = 0; i < SIZE; i += 16) {
sum += data[i];
}
}
}🧠 Memory Optimization
Object Pool Pattern
cpp
#include <vector>
#include <memory>
template<typename T>
class ObjectPool {
private:
std::vector<std::unique_ptr<T>> pool_;
std::vector<T*> available_;
public:
T* acquire() {
if (available_.empty()) {
pool_.push_back(std::make_unique<T>());
return pool_.back().get();
}
T* obj = available_.back();
available_.pop_back();
return obj;
}
void release(T* obj) {
available_.push_back(obj);
}
};
// Usage example
void objectPoolDemo() {
struct ExpensiveObject {
std::vector<int> data;
ExpensiveObject() : data(1000, 42) {}
};
ObjectPool<ExpensiveObject> pool;
// Object pool version
{
Timer timer("Object pool");
for (int i = 0; i < 1000; ++i) {
auto* obj = pool.acquire();
// Use object
pool.release(obj);
}
}
// Direct creation version
{
Timer timer("Direct creation");
for (int i = 0; i < 1000; ++i) {
auto obj = std::make_unique<ExpensiveObject>();
// Use object
}
}
}Memory Pool Allocator
cpp
class MemoryPool {
private:
char* memory_;
size_t block_size_;
std::vector<void*> free_blocks_;
public:
MemoryPool(size_t block_size, size_t num_blocks)
: block_size_(block_size) {
memory_ = new char[block_size * num_blocks];
for (size_t i = 0; i < num_blocks; ++i) {
free_blocks_.push_back(memory_ + i * block_size);
}
}
~MemoryPool() {
delete[] memory_;
}
void* allocate() {
if (free_blocks_.empty()) return nullptr;
void* block = free_blocks_.back();
free_blocks_.pop_back();
return block;
}
void deallocate(void* ptr) {
free_blocks_.push_back(ptr);
}
};⚡ Algorithm Optimization
Container Selection
cpp
#include <set>
#include <unordered_set>
#include <vector>
void containerComparison() {
const size_t SIZE = 100000;
const int SEARCH_VALUE = SIZE / 2;
std::vector<int> vec(SIZE);
std::iota(vec.begin(), vec.end(), 0);
std::set<int> ordered_set(vec.begin(), vec.end());
std::unordered_set<int> hash_set(vec.begin(), vec.end());
// Linear search
{
Timer timer("vector linear search");
auto it = std::find(vec.begin(), vec.end(), SEARCH_VALUE);
}
// Binary search
{
Timer timer("vector binary search");
bool found = std::binary_search(vec.begin(), vec.end(), SEARCH_VALUE);
}
// Ordered set search
{
Timer timer("set search");
auto it = ordered_set.find(SEARCH_VALUE);
}
// Hash table search
{
Timer timer("unordered_set search");
auto it = hash_set.find(SEARCH_VALUE);
}
}Loop Optimization
cpp
void loopOptimization() {
const size_t SIZE = 1000;
std::vector<std::vector<int>> matrix(SIZE, std::vector<int>(SIZE, 1));
// Row-wise access (cache friendly)
{
Timer timer("Row-wise access");
long long sum = 0;
for (size_t i = 0; i < SIZE; ++i) {
for (size_t j = 0; j < SIZE; ++j) {
sum += matrix[i][j];
}
}
}
// Column-wise access (cache unfriendly)
{
Timer timer("Column-wise access");
long long sum = 0;
for (size_t j = 0; j < SIZE; ++j) {
for (size_t i = 0; i < SIZE; ++i) {
sum += matrix[i][j];
}
}
}
// Loop unrolling
{
Timer timer("Loop unrolling");
long long sum = 0;
for (size_t i = 0; i < SIZE; ++i) {
size_t j;
for (j = 0; j + 4 <= SIZE; j += 4) {
sum += matrix[i][j] + matrix[i][j+1] +
matrix[i][j+2] + matrix[i][j+3];
}
for (; j < SIZE; ++j) {
sum += matrix[i][j];
}
}
}
}🔧 Compiler Optimization
Inline and Constant Optimization
cpp
// Inline function
inline int fastAdd(int a, int b) {
return a + b;
}
// Constant expression
constexpr int factorial(int n) {
return n <= 1 ? 1 : n * factorial(n - 1);
}
void compilerOptimizations() {
const int ITERATIONS = 10000000;
// Function call
{
Timer timer("Function call");
int sum = 0;
for (int i = 0; i < ITERATIONS; ++i) {
sum = fastAdd(sum, i);
}
}
// Direct calculation
{
Timer timer("Direct calculation");
int sum = 0;
for (int i = 0; i < ITERATIONS; ++i) {
sum += i;
}
}
// Compile-time calculation
constexpr int result = factorial(10);
std::cout << "Compile-time calculation result: " << result << std::endl;
}Branch Prediction Optimization
cpp
void branchPrediction() {
const size_t SIZE = 1000000;
std::vector<int> data(SIZE);
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(0, 100);
for (auto& val : data) {
val = dis(gen);
}
const int THRESHOLD = 50;
// Random data (hard to predict branches)
{
Timer timer("Unpredictable branches");
int count = 0;
for (int value : data) {
if (value > THRESHOLD) {
count++;
}
}
}
// Sorted data (predictable branches)
std::sort(data.begin(), data.end());
{
Timer timer("Predictable branches");
int count = 0;
for (int value : data) {
if (value > THRESHOLD) {
count++;
}
}
}
// Branchless version
{
Timer timer("Branchless version");
int count = 0;
for (int value : data) {
count += (value > THRESHOLD);
}
}
}🚀 Advanced Optimization
Parallel Algorithms
cpp
#include <execution>
#include <numeric>
void parallelOptimization() {
const size_t SIZE = 10000000;
std::vector<int> data(SIZE);
std::iota(data.begin(), data.end(), 1);
// Serial sum
{
Timer timer("Serial sum");
long long sum = std::accumulate(data.begin(), data.end(), 0LL);
}
// Parallel sum (C++17)
#ifdef __cpp_lib_parallel_algorithm
{
Timer timer("Parallel sum");
long long sum = std::reduce(std::execution::par, data.begin(), data.end(), 0LL);
}
#endif
}Cache Optimization
cpp
// Array of structures vs Structure of arrays
struct Point3D_AoS {
float x, y, z;
};
struct Points3D_SoA {
std::vector<float> x, y, z;
Points3D_SoA(size_t size) : x(size), y(size), z(size) {}
};
void cacheOptimization() {
const size_t SIZE = 1000000;
std::vector<Point3D_AoS> aos_points(SIZE);
Points3D_SoA soa_points(SIZE);
// AoS access (potentially cache unfriendly)
{
Timer timer("AoS access");
float sum = 0;
for (const auto& point : aos_points) {
sum += point.x;
}
}
// SoA access (cache friendly)
{
Timer timer("SoA access");
float sum = 0;
for (float x : soa_points.x) {
sum += x;
}
}
}Demo Program
cpp
int main() {
std::cout << "=== C++ Performance Optimization Demo ===" << std::endl;
std::cout << "\n1. Memory access patterns:" << std::endl;
memoryAccessTest();
std::cout << "\n2. Object pool optimization:" << std::endl;
objectPoolDemo();
std::cout << "\n3. Container selection:" << std::endl;
containerComparison();
std::cout << "\n4. Loop optimization:" << std::endl;
loopOptimization();
std::cout << "\n5. Compiler optimization:" << std::endl;
compilerOptimizations();
std::cout << "\n6. Branch prediction:" << std::endl;
branchPrediction();
std::cout << "\n7. Parallel optimization:" << std::endl;
parallelOptimization();
std::cout << "\n8. Cache optimization:" << std::endl;
cacheOptimization();
return 0;
}Summary
Optimization Strategies
- Measure first: Measure before optimizing
- Algorithm is king: Choose correct algorithms and data structures
- Memory awareness: Consider cache friendliness and memory access patterns
- Compiler cooperation: Utilize compiler optimization features
Performance Points
| Technique | Impact | Use Cases |
|---|---|---|
| Cache optimization | High | Large data processing |
| Memory pools | Medium | Frequent allocation |
| Parallel algorithms | High | CPU-intensive tasks |
| Branch prediction | Medium | Condition-heavy tasks |
Best Practices
- Choose appropriate containers and algorithms
- Consider memory layout and access patterns
- Utilize compiler optimization options
- Perform detailed optimization on critical paths
- Balance code readability and performance
Performance optimization requires deep understanding of computer architecture and compiler working principles, and is an important skill for advanced C++ development.