Pandas Advanced Features
Overview
This chapter explores advanced Pandas features in depth, including MultiIndex (hierarchical indexing), time series analysis, pivot tables, window functions, and custom function applications. These advanced features help you handle more complex data analysis tasks and improve data processing efficiency and flexibility.
1. MultiIndex (Hierarchical Indexing)
1.1 Creating MultiIndex
python
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
print("=== MultiIndex Basics ===")
# Several methods to create MultiIndex
def create_multiindex_examples():
    """Demonstrate three ways to build a MultiIndex and return a MultiIndexed DataFrame.

    Returns:
        pd.DataFrame: ``df_multi`` with a two-level ('C', 'A') index and a
        single remaining column 'B'.
    """
    # Method 1: zip explicit per-level arrays into tuples
    arrays = [['A', 'A', 'B', 'B'], ['one', 'two', 'one', 'two']]
    tuples = list(zip(*arrays))
    index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
    s1 = pd.Series(np.random.randn(4), index=index)
    print("Method 1 - Create using arrays:")
    print(s1)
    print()
    # Method 2: Cartesian product of the level values.
    # pd.MultiIndex.from_product computes the product itself, so the
    # previously imported-but-unused itertools.product was removed.
    index2 = pd.MultiIndex.from_product([['A', 'B'], ['one', 'two']],
                                        names=['first', 'second'])
    s2 = pd.Series(np.random.randn(4), index=index2)
    print("Method 2 - Create using product:")
    print(s2)
    print()
    # Method 3: promote existing DataFrame columns to index levels
    df = pd.DataFrame({
        'A': [1, 2, 3, 4, 5, 6],
        'B': [10, 20, 30, 40, 50, 60],
        'C': ['X', 'Y', 'X', 'Y', 'X', 'Y']
    })
    # Set MultiIndex from columns 'C' (outer) and 'A' (inner)
    df_multi = df.set_index(['C', 'A'])
    print("Method 3 - DataFrame MultiIndex:")
    print(df_multi)
    print()
    return df_multi
df_multi = create_multiindex_examples()

1.2 MultiIndex Operations
python
print("=== MultiIndex Operations ===")
def multiindex_operations():
    """Build a (Date, Region, Product) MultiIndexed sales frame and show
    common selection and level operations; returns the DataFrame."""
    # Reproducible sample data: 12 month-ends x 3 regions x 2 products
    np.random.seed(42)
    dates = pd.date_range('2023-01-01', periods=12, freq='M')
    regions = ['Beijing', 'Shanghai', 'Guangzhou']
    products = ['Product A', 'Product B']
    idx = pd.MultiIndex.from_product([dates, regions, products],
                                     names=['Date', 'Region', 'Product'])
    n_rows = len(idx)
    df = pd.DataFrame({
        'Sales': np.random.randint(1000, 10000, n_rows),
        'Quantity': np.random.randint(10, 100, n_rows),
        'Profit_Rate': np.random.uniform(0.1, 0.3, n_rows),
    }, index=idx)
    print("MultiIndex DataFrame:")
    print(df.head(10))
    print()
    # pd.IndexSlice is the readable spelling of slice(None) tuples
    ix = pd.IndexSlice
    print("1. Select data for specific region:")
    beijing_data = df.loc[ix[:, 'Beijing', :], :]
    print(beijing_data.head())
    print()
    print("2. Select data for specific product:")
    product_a = df.loc[ix[:, :, 'Product A'], :]
    print(product_a.head())
    print()
    print("3. Cross selection:")
    # A fully-specified key tuple selects a single row
    cross_selection = df.loc[('2023-01-31', 'Shanghai', 'Product B'), :]
    print(cross_selection)
    print()
    print("4. Swap index levels:")
    df_swapped = df.swaplevel('Region', 'Product')
    print(df_swapped.head())
    print()
    print("5. Reset index:")
    df_reset = df.reset_index()
    print(df_reset.head())
    print()
    return df
sales_df = multiindex_operations()

1.3 MultiIndex Aggregation Analysis
python
print("=== MultiIndex Aggregation Analysis ===")
def multiindex_aggregation(df):
    """MultiIndex aggregation analysis.

    Groups the MultiIndexed sales frame by its index levels
    ('Date', 'Region', 'Product'), prints several aggregate views, and
    draws a 2x2 summary figure (bar, pie, line, heatmap).

    Args:
        df: DataFrame indexed by a ('Date', 'Region', 'Product') MultiIndex
            with 'Sales', 'Quantity' and 'Profit_Rate' columns.

    Returns:
        pd.DataFrame: Sales/Quantity sums grouped by ('Region', 'Product').
    """
    print("1. Aggregate by region:")
    # groupby(level=...) aggregates over a named index level
    region_agg = df.groupby(level='Region').agg({
        'Sales': ['sum', 'mean'],
        'Quantity': 'sum',
        'Profit_Rate': 'mean'
    })
    print(region_agg)
    print()
    print("2. Aggregate by product:")
    product_agg = df.groupby(level='Product').agg({
        'Sales': 'sum',
        'Quantity': 'sum',
        'Profit_Rate': 'mean'
    })
    print(product_agg)
    print()
    print("3. Aggregate by region and product:")
    region_product_agg = df.groupby(level=['Region', 'Product']).agg({
        'Sales': 'sum',
        'Quantity': 'sum'
    })
    print(region_product_agg)
    print()
    print("4. Time series aggregation:")
    # NOTE(review): .sum() also sums Profit_Rate per date, which is only
    # meaningful as a total, not as a rate — confirm intent.
    monthly_trend = df.groupby(level='Date').sum()
    print(monthly_trend.head())
    print()
    # Visualize MultiIndex data (plt/sns come from module-level imports)
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    # Regional sales comparison
    region_sales = df.groupby(level='Region')['Sales'].sum()
    region_sales.plot(kind='bar', ax=axes[0,0], color='skyblue')
    axes[0,0].set_title('Sales Comparison by Region')
    axes[0,0].set_ylabel('Sales')
    # Product sales comparison
    product_sales = df.groupby(level='Product')['Sales'].sum()
    product_sales.plot(kind='pie', ax=axes[0,1], autopct='%1.1f%%')
    axes[0,1].set_title('Product Sales Proportion')
    # Monthly sales trend
    monthly_sales = df.groupby(level='Date')['Sales'].sum()
    monthly_sales.plot(ax=axes[1,0], marker='o')
    axes[1,0].set_title('Monthly Sales Trend')
    axes[1,0].set_ylabel('Sales')
    axes[1,0].tick_params(axis='x', rotation=45)
    # Region-product heatmap; the string names here resolve to index levels
    pivot_data = df.groupby(['Region', 'Product'])['Sales'].sum().unstack()
    sns.heatmap(pivot_data, annot=True, fmt='.0f', ax=axes[1,1], cmap='YlOrRd')
    axes[1,1].set_title('Region-Product Sales Heatmap')
    plt.tight_layout()
    plt.show()
    return region_product_agg
agg_result = multiindex_aggregation(sales_df)

2. Time Series Analysis
2.1 Basic Time Series Operations
python
print("\n=== Time Series Analysis ===")
def time_series_basics():
    """Create a synthetic daily price/volume series and demonstrate time-based selection.

    Returns:
        pd.DataFrame: 365 daily rows indexed by date with 'Price',
        'Volume' and 'Return' columns.
    """
    dates = pd.date_range('2023-01-01', periods=365, freq='D')
    # Simulate a random-walk price series (seeded for reproducibility)
    np.random.seed(42)
    price_changes = np.random.normal(0, 1, 365)
    prices = 100 + np.cumsum(price_changes)
    # Simulate trading volume
    volumes = np.random.randint(1000, 10000, 365)
    ts_df = pd.DataFrame({
        'Price': prices,
        'Volume': volumes,
        'Return': price_changes
    }, index=dates)
    print("Time series data:")
    print(ts_df.head())
    print()
    print("1. Select data for specific month:")
    # BUGFIX: ts_df['2023-01'] (partial-string __getitem__) was removed for
    # DataFrames in pandas 2.0 and raises KeyError; use .loc instead.
    jan_data = ts_df.loc['2023-01']
    print(jan_data.head())
    print()
    print("2. Select date range:")
    # Row slicing with date strings is still supported via __getitem__
    range_data = ts_df['2023-01-01':'2023-01-31']
    print(f"January data points: {len(range_data)}")
    print()
    print("3. Filter by weekday:")
    weekday_data = ts_df[ts_df.index.weekday < 5]  # 0-4 represents Monday to Friday
    print(f"Weekday data points: {len(weekday_data)}")
    print()
    return ts_df
ts_data = time_series_basics()

2.2 Time Series Resampling and Frequency Conversion
python
print("=== Time Series Resampling ===")
def time_series_resampling(ts_df):
    """Resample daily data to weekly/monthly and compare missing-value fills.

    Args:
        ts_df: DataFrame with a daily DatetimeIndex and 'Price', 'Volume',
            'Return' columns.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (weekly_data, monthly_data).
    """
    print("1. Convert daily data to weekly:")
    weekly_data = ts_df.resample('W').agg({
        'Price': 'last',   # Weekend price
        'Volume': 'sum',   # Weekly volume sum
        'Return': 'sum'    # Weekly return
    })
    print(weekly_data.head())
    print()
    print("2. Convert daily data to monthly:")
    # NOTE: the 'M' alias is deprecated in pandas >= 2.2 in favor of 'ME';
    # kept here for compatibility with older pandas versions.
    monthly_data = ts_df.resample('M').agg({
        'Price': 'last',
        'Volume': 'mean',
        'Return': ['sum', 'std']
    })
    print(monthly_data.head())
    print()
    print("3. Forward fill and backward fill:")
    # Create data with missing values
    ts_missing = ts_df.copy()
    ts_missing.loc['2023-01-15':'2023-01-20', 'Price'] = np.nan
    # BUGFIX: fillna(method='ffill'/'bfill') is deprecated since pandas 2.1
    # and removed in pandas 3.0; use the dedicated ffill()/bfill() methods.
    forward_fill = ts_missing['Price'].ffill()
    backward_fill = ts_missing['Price'].bfill()
    # Linear interpolation
    interpolated = ts_missing['Price'].interpolate()
    print("Missing value handling comparison:")
    comparison = pd.DataFrame({
        'Original': ts_missing['Price']['2023-01-10':'2023-01-25'],
        'Forward_Fill': forward_fill['2023-01-10':'2023-01-25'],
        'Backward_Fill': backward_fill['2023-01-10':'2023-01-25'],
        'Interpolation': interpolated['2023-01-10':'2023-01-25']
    })
    print(comparison)
    print()
    return weekly_data, monthly_data
weekly, monthly = time_series_resampling(ts_data)

2.3 Rolling Windows and Rolling Statistics
python
print("=== Rolling Window Analysis ===")
def rolling_window_analysis(ts_df):
    """Rolling window analysis.

    Adds moving averages, rolling volatility and Bollinger Bands as new
    columns, plots the indicators, prints the latest values, and returns
    the augmented DataFrame.

    Args:
        ts_df: DataFrame with a DatetimeIndex and 'Price'/'Return' columns.
            NOTE(review): mutated in place — the caller's frame gains the
            new indicator columns.

    Returns:
        pd.DataFrame: the same ts_df with indicator columns added.
    """
    # Calculate moving averages over 5/20/50-day windows
    ts_df['MA_5'] = ts_df['Price'].rolling(window=5).mean()
    ts_df['MA_20'] = ts_df['Price'].rolling(window=20).mean()
    ts_df['MA_50'] = ts_df['Price'].rolling(window=50).mean()
    # Calculate rolling standard deviation (volatility)
    ts_df['Volatility_20'] = ts_df['Return'].rolling(window=20).std()
    # Bollinger Bands: 20-day MA +/- 2 rolling standard deviations of Price
    ts_df['Bollinger_Upper'] = ts_df['MA_20'] + 2 * ts_df['Price'].rolling(window=20).std()
    ts_df['Bollinger_Lower'] = ts_df['MA_20'] - 2 * ts_df['Price'].rolling(window=20).std()
    print("Moving averages and Bollinger Bands:")
    # head(60).tail(10) shows rows 50-59, where every window (incl. MA_50) is filled
    print(ts_df[['Price', 'MA_5', 'MA_20', 'MA_50', 'Bollinger_Upper', 'Bollinger_Lower']].head(60).tail(10))
    print()
    # Visualize technical indicators (plt comes from module-level import)
    fig, axes = plt.subplots(3, 1, figsize=(15, 12))
    # Price and moving averages
    ts_df[['Price', 'MA_5', 'MA_20', 'MA_50']].plot(ax=axes[0])
    axes[0].set_title('Price and Moving Averages')
    axes[0].set_ylabel('Price')
    axes[0].legend()
    # Bollinger Bands with shaded channel
    axes[1].plot(ts_df.index, ts_df['Price'], label='Price', alpha=0.7)
    axes[1].plot(ts_df.index, ts_df['Bollinger_Upper'], label='Upper Band', linestyle='--')
    axes[1].plot(ts_df.index, ts_df['Bollinger_Lower'], label='Lower Band', linestyle='--')
    axes[1].fill_between(ts_df.index, ts_df['Bollinger_Upper'], ts_df['Bollinger_Lower'], alpha=0.2)
    axes[1].set_title('Bollinger Bands')
    axes[1].set_ylabel('Price')
    axes[1].legend()
    # Volatility
    ts_df['Volatility_20'].plot(ax=axes[2], color='red')
    axes[2].set_title('20-Day Rolling Volatility')
    axes[2].set_ylabel('Volatility')
    plt.tight_layout()
    plt.show()
    # Snapshot of the latest indicator values
    print("Technical indicator statistics:")
    technical_stats = {
        'Current Price': ts_df['Price'].iloc[-1],
        '5-Day Average': ts_df['MA_5'].iloc[-1],
        '20-Day Average': ts_df['MA_20'].iloc[-1],
        '50-Day Average': ts_df['MA_50'].iloc[-1],
        'Current Volatility': ts_df['Volatility_20'].iloc[-1],
        'Price vs 20-Day Avg (%)': (ts_df['Price'].iloc[-1] / ts_df['MA_20'].iloc[-1] - 1) * 100
    }
    for indicator, value in technical_stats.items():
        print(f"{indicator}: {value:.2f}")
    print()
    return ts_df
ts_with_indicators = rolling_window_analysis(ts_data)

3. Advanced Pivot Table Applications
3.1 Creating Complex Pivot Tables
python
print("\n=== Advanced Pivot Tables ===")
def advanced_pivot_tables():
    """Build a synthetic sales dataset and demonstrate pivot-table variants.

    Returns:
        tuple: (sales_data, pivot1, pivot2) — the raw row-level data, the
        Region x Category Sales pivot, and the multi-metric pivot.
    """
    # Create complex sales data (seeded for reproducibility)
    np.random.seed(42)
    n_records = 1000
    data = {
        'Date': pd.date_range('2023-01-01', periods=n_records, freq='D'),
        'Salesperson': np.random.choice(['John', 'Alice', 'Bob', 'Carol'], n_records),
        'Region': np.random.choice(['North', 'East', 'South', 'West'], n_records),
        'Category': np.random.choice(['Electronics', 'Clothing', 'Food', 'Furniture'], n_records),
        'Product': np.random.choice(['Product A', 'Product B', 'Product C', 'Product D', 'Product E'], n_records),
        'Sales': np.random.randint(100, 5000, n_records),
        'Quantity': np.random.randint(1, 50, n_records),
        'Cost': np.random.randint(50, 3000, n_records)
    }
    sales_data = pd.DataFrame(data)
    # Derived metrics and calendar buckets
    sales_data['Profit'] = sales_data['Sales'] - sales_data['Cost']
    sales_data['Profit_Rate'] = sales_data['Profit'] / sales_data['Sales']
    sales_data['Month'] = sales_data['Date'].dt.to_period('M')
    sales_data['Quarter'] = sales_data['Date'].dt.to_period('Q')
    print("Sales data sample:")
    print(sales_data.head())
    print()
    # 1. Basic pivot table
    print("1. Region-Category pivot table:")
    pivot1 = pd.pivot_table(sales_data,
                            values='Sales',
                            index='Region',
                            columns='Category',
                            aggfunc='sum',
                            fill_value=0)
    print(pivot1)
    print()
    # 2. Multi-value pivot table
    print("2. Multi-metric pivot table:")
    pivot2 = pd.pivot_table(sales_data,
                            values=['Sales', 'Profit', 'Quantity'],
                            index='Region',
                            columns='Category',
                            aggfunc={'Sales': 'sum', 'Profit': 'sum', 'Quantity': 'sum'},
                            fill_value=0)
    print(pivot2.head())
    print()
    # 3. Multi-level pivot table
    print("3. Multi-level index pivot table:")
    pivot3 = pd.pivot_table(sales_data,
                            values='Sales',
                            index=['Region', 'Salesperson'],
                            columns=['Category', 'Quarter'],
                            aggfunc='sum',
                            fill_value=0)
    print(pivot3.head())
    print()
    # 4. Custom aggregation functions
    print("4. Custom aggregation function pivot table:")

    def profit_margin_avg(x):
        """Average profit rate of a group (0 for an empty group)."""
        # BUGFIX: the original helper returned x.sum() (contradicting its
        # name) and was never used; it now computes the mean and is wired
        # into the aggfunc below, numerically equivalent to 'mean'.
        return x.mean() if len(x) > 0 else 0

    pivot4 = pd.pivot_table(sales_data,
                            values=['Sales', 'Profit_Rate'],
                            index='Salesperson',
                            columns='Region',
                            aggfunc={'Sales': 'sum', 'Profit_Rate': profit_margin_avg},
                            fill_value=0)
    print(pivot4)
    print()
    return sales_data, pivot1, pivot2
sales_data, pivot1, pivot2 = advanced_pivot_tables()

3.2 Pivot Table Visualization and Analysis
python
print("=== Pivot Table Visualization Analysis ===")
def pivot_visualization(sales_data, pivot1):
    """Pivot table visualization analysis.

    Draws a 2x2 figure (stacked bars, heatmap, salesperson bars, monthly
    trend) from the sales data, then prints a short textual report of the
    best/worst regions and categories and the top salesperson.

    Args:
        sales_data: row-level sales DataFrame with 'Salesperson', 'Month',
            'Sales', 'Profit' and 'Quantity' columns.
        pivot1: Region x Category pivot of summed Sales.
    """
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    # 1. Regional sales stacked bar chart
    pivot1.plot(kind='bar', stacked=True, ax=axes[0,0], colormap='Set3')
    axes[0,0].set_title('Regional Category Sales Distribution')
    axes[0,0].set_ylabel('Sales')
    axes[0,0].legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')
    # 2. Heatmap (sns comes from module-level import)
    sns.heatmap(pivot1, annot=True, fmt='.0f', ax=axes[0,1], cmap='YlOrRd')
    axes[0,1].set_title('Region-Category Sales Heatmap')
    # 3. Salesperson performance comparison
    salesperson_performance = sales_data.groupby('Salesperson').agg({
        'Sales': 'sum',
        'Profit': 'sum',
        'Quantity': 'sum'
    })
    salesperson_performance['Sales'].plot(kind='bar', ax=axes[1,0], color='skyblue')
    axes[1,0].set_title('Salesperson Performance Comparison')
    axes[1,0].set_ylabel('Sales')
    axes[1,0].tick_params(axis='x', rotation=45)
    # 4. Monthly trend analysis
    monthly_trend = sales_data.groupby('Month').agg({
        'Sales': 'sum',
        'Profit': 'sum'
    })
    monthly_trend.plot(ax=axes[1,1], marker='o')
    axes[1,1].set_title('Monthly Sales and Profit Trend')
    axes[1,1].set_ylabel('Amount')
    axes[1,1].legend()
    plt.tight_layout()
    plt.show()
    # Generate pivot table analysis report
    print("Pivot Table Analysis Report:")
    print("=" * 50)
    # Regional analysis: row sums of the pivot, descending
    region_total = pivot1.sum(axis=1).sort_values(ascending=False)
    print(f"Best sales region: {region_total.index[0]} (Sales: {region_total.iloc[0]:,.0f})")
    print(f"Worst sales region: {region_total.index[-1]} (Sales: {region_total.iloc[-1]:,.0f})")
    # Category analysis: column sums of the pivot, descending
    category_total = pivot1.sum(axis=0).sort_values(ascending=False)
    print(f"Best category: {category_total.index[0]} (Sales: {category_total.iloc[0]:,.0f})")
    print(f"Worst category: {category_total.index[-1]} (Sales: {category_total.iloc[-1]:,.0f})")
    # Salesperson analysis
    salesperson_ranking = sales_data.groupby('Salesperson')['Sales'].sum().sort_values(ascending=False)
    print(f"Top salesperson: {salesperson_ranking.index[0]} (Sales: {salesperson_ranking.iloc[0]:,.0f})")
    print("=" * 50)
pivot_visualization(sales_data, pivot1)

4. Window Functions and Rolling Calculations
4.1 Rolling Window Functions
python
print("\n=== Advanced Window Functions ===")
def advanced_window_functions():
    """Demonstrate rolling, expanding, and custom window computations on a
    synthetic 100-day sales series; returns the augmented DataFrame."""
    # Synthetic daily data: noisy sales with a slow sinusoidal component
    dates = pd.date_range('2023-01-01', periods=100, freq='D')
    np.random.seed(42)
    df = pd.DataFrame({
        'Date': dates,
        'Sales': np.random.randint(1000, 5000, 100) + np.sin(np.arange(100) * 0.1) * 500,
        'Visitors': np.random.randint(100, 1000, 100),
        'Conversion_Rate': np.random.uniform(0.01, 0.1, 100)
    })
    df.set_index('Date', inplace=True)
    print("Original data:")
    print(df.head())
    print()
    print("1. Basic rolling statistics:")
    # A single Rolling object serves all four aggregations
    seven_day = df['Sales'].rolling(window=7)
    df['Sales_7D_Mean'] = seven_day.mean()
    df['Sales_7D_Std'] = seven_day.std()
    df['Sales_7D_Max'] = seven_day.max()
    df['Sales_7D_Min'] = seven_day.min()
    print(df[['Sales', 'Sales_7D_Mean', 'Sales_7D_Std']].head(10))
    print()
    print("2. Expanding window statistics:")
    expanding_sales = df['Sales'].expanding()
    df['Cumulative_Sales'] = expanding_sales.sum()
    df['Cumulative_Avg_Sales'] = expanding_sales.mean()
    df['Cumulative_Max_Sales'] = expanding_sales.max()
    print(df[['Sales', 'Cumulative_Sales', 'Cumulative_Avg_Sales']].head(10))
    print()
    print("3. Custom window functions:")

    def _coeff_variation(window):
        """Coefficient of variation: std relative to the mean."""
        mean = window.mean()
        return window.std() / mean if mean != 0 else 0

    df['Sales_7D_CV'] = df['Sales'].rolling(window=7).apply(_coeff_variation)
    # 75th percentile of each 7-day window
    df['Sales_7D_P75'] = df['Sales'].rolling(window=7).apply(lambda w: w.quantile(0.75))
    print(df[['Sales', 'Sales_7D_CV', 'Sales_7D_P75']].head(15))
    print()
    print("4. Multi-column window calculations:")
    df['Sales_Efficiency'] = df['Sales'] / df['Visitors']
    df['Sales_Efficiency_7D_Mean'] = df['Sales_Efficiency'].rolling(window=7).mean()
    # Rolling correlation between sales and visitor counts
    df['Sales_Visitors_7D_Corr'] = df['Sales'].rolling(window=7).corr(df['Visitors'])
    print(df[['Sales_Efficiency', 'Sales_Efficiency_7D_Mean', 'Sales_Visitors_7D_Corr']].head(15))
    print()
    return df
window_df = advanced_window_functions()

4.2 Window Function Visualization
python
print("=== Window Function Visualization ===")
def visualize_window_functions(df):
    """Window function result visualization.

    Plots six panels of the rolling/expanding columns produced by
    advanced_window_functions(), then prints the latest indicator values
    and the recent 7-day trend.

    Args:
        df: DataFrame returned by advanced_window_functions(); must contain
            the Sales_7D_*, Cumulative_*, Sales_Efficiency* and
            Sales_Visitors_7D_Corr columns.
    """
    fig, axes = plt.subplots(3, 2, figsize=(16, 15))
    # 1. Original data vs rolling average
    df[['Sales', 'Sales_7D_Mean']].plot(ax=axes[0,0])
    axes[0,0].set_title('Sales vs 7-Day Rolling Average')
    axes[0,0].set_ylabel('Sales')
    axes[0,0].legend()
    # 2. Rolling standard deviation (volatility)
    df['Sales_7D_Std'].plot(ax=axes[0,1], color='red')
    axes[0,1].set_title('7-Day Rolling Standard Deviation (Volatility)')
    axes[0,1].set_ylabel('Standard Deviation')
    # 3. Cumulative statistics
    df[['Cumulative_Sales', 'Cumulative_Avg_Sales']].plot(ax=axes[1,0])
    axes[1,0].set_title('Cumulative Statistics')
    axes[1,0].legend()
    # 4. Coefficient of variation
    df['Sales_7D_CV'].plot(ax=axes[1,1], color='green')
    axes[1,1].set_title('7-Day Coefficient of Variation')
    axes[1,1].set_ylabel('CV')
    # 5. Sales efficiency
    df[['Sales_Efficiency', 'Sales_Efficiency_7D_Mean']].plot(ax=axes[2,0])
    axes[2,0].set_title('Sales Efficiency and Rolling Average')
    axes[2,0].set_ylabel('Sales Efficiency')
    axes[2,0].legend()
    # 6. Rolling correlation with a zero reference line
    df['Sales_Visitors_7D_Corr'].plot(ax=axes[2,1], color='purple')
    axes[2,1].set_title('7-Day Rolling Correlation (Sales vs Visitors)')
    axes[2,1].set_ylabel('Correlation')
    axes[2,1].axhline(y=0, color='black', linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()
    # Window function analysis summary
    print("Window Function Analysis Summary:")
    print("=" * 50)
    latest_data = df.iloc[-1]
    print(f"Latest sales: {latest_data['Sales']:,.0f}")
    print(f"7-day average sales: {latest_data['Sales_7D_Mean']:,.0f}")
    print(f"7-day sales volatility: {latest_data['Sales_7D_Std']:,.2f}")
    print(f"Cumulative total sales: {latest_data['Cumulative_Sales']:,.0f}")
    print(f"Current sales efficiency: {latest_data['Sales_Efficiency']:,.2f}")
    print(f"Sales-Visitors correlation: {latest_data['Sales_Visitors_7D_Corr']:,.3f}")
    # Trend analysis: mean day-over-day % change of the 7-day average
    recent_trend = df['Sales_7D_Mean'].iloc[-7:].pct_change().mean()
    print(f"Recent trend (7-day avg growth rate): {recent_trend:.2%}")
    print("=" * 50)
visualize_window_functions(window_df)

5. Custom Function Applications
5.1 apply, map, and applymap Functions
python
print("\n=== Custom Function Applications ===")
def custom_function_applications():
    """Demonstrate apply/map/element-wise function application on a small HR table.

    Returns:
        pd.DataFrame: employee table augmented with Years_of_Service,
        Salary_Grade, Performance_Score and Department_Full columns.
    """
    df = pd.DataFrame({
        'Name': ['John', 'Alice', 'Bob', 'Carol', 'David'],
        'Age': [25, 30, 35, 28, 32],
        'Salary': [8000, 12000, 15000, 9500, 11000],
        'Department': ['Tech', 'Sales', 'Tech', 'HR', 'Sales'],
        'Hire_Date': ['2020-01-15', '2019-03-20', '2018-07-10', '2021-02-28', '2020-09-05']
    })
    df['Hire_Date'] = pd.to_datetime(df['Hire_Date'])
    print("Original data:")
    print(df)
    print()
    # 1. apply function usage
    print("1. apply function usage:")

    def calculate_work_years(hire_date):
        """Years elapsed since hire_date (uses the current clock — not deterministic)."""
        return (pd.Timestamp.now() - hire_date).days / 365.25

    df['Years_of_Service'] = df['Hire_Date'].apply(calculate_work_years)

    def salary_grade(salary):
        """Bucket a salary into Junior (<9000) / Mid-level (<12000) / Senior."""
        if salary < 9000:
            return 'Junior'
        elif salary < 12000:
            return 'Mid-level'
        else:
            return 'Senior'

    df['Salary_Grade'] = df['Salary'].apply(salary_grade)
    print(df[['Name', 'Salary', 'Salary_Grade', 'Years_of_Service']])
    print()
    # 2. Multi-column apply usage (axis=1 passes each row as a Series)
    print("2. Multi-column apply usage:")

    def performance_score(row):
        """Comprehensive performance score: base 60 plus salary, tenure
        and age bonuses, capped at 100."""
        base_score = 60
        # Salary bonus
        if row['Salary'] > 12000:
            base_score += 20
        elif row['Salary'] > 10000:
            base_score += 10
        # Years of service bonus
        if row['Years_of_Service'] > 3:
            base_score += 15
        elif row['Years_of_Service'] > 1:
            base_score += 10
        # Age adjustment
        if 25 <= row['Age'] <= 35:
            base_score += 5
        return min(base_score, 100)  # Max 100 points

    df['Performance_Score'] = df.apply(performance_score, axis=1)
    print(df[['Name', 'Performance_Score']])
    print()
    # 3. map function usage
    print("3. map function usage:")
    # Department mapping (short code -> full name)
    department_mapping = {
        'Tech': 'Technology',
        'Sales': 'Sales',
        'HR': 'Human Resources'
    }
    df['Department_Full'] = df['Department'].map(department_mapping)
    print(df[['Name', 'Department', 'Department_Full']])
    print()
    # 4. Complex data processing
    print("4. Complex data processing:")
    complex_df = pd.DataFrame({
        'A': [1, 2, 3, 4, 5],
        'B': [10.5, 20.3, 30.7, 40.2, 50.9],
        'C': ['apple', 'banana', 'cherry', 'date', 'elderberry']
    })

    def format_value(x):
        """Format numbers with one decimal place; upper-case everything else."""
        if isinstance(x, (int, float)):
            return f"{x:.1f}"
        else:
            return x.upper()

    # BUGFIX: DataFrame.applymap is deprecated since pandas 2.1 (removed in
    # 3.0) in favor of DataFrame.map; fall back for older pandas versions.
    elementwise = complex_df.map if hasattr(complex_df, 'map') else complex_df.applymap
    formatted_df = elementwise(format_value)
    print("Formatted DataFrame:")
    print(formatted_df)
    print()
    return df
employee_df = custom_function_applications()

5.2 Vectorization and Performance Optimization
python
print("=== Vectorization and Performance Optimization ===")
def vectorization_examples(n=100000):
    """Compare loop vs vectorized computation and show vectorized business logic.

    Args:
        n: number of rows in the benchmark DataFrame. Defaults to 100000,
           matching the original behavior; smaller values speed up the demo.

    Returns:
        pd.DataFrame: the 1000-row business DataFrame with derived columns.
    """
    import time
    np.random.seed(42)
    large_df = pd.DataFrame({
        'A': np.random.randint(1, 100, n),
        'B': np.random.randint(1, 100, n),
        'C': np.random.uniform(0, 1, n)
    })
    print(f"Test dataset size: {len(large_df):,} rows")
    print()
    print("Performance comparison test:")
    # Method 1: Python-level loop. BUGFIX: the original used four .iloc row
    # lookups per iteration, which is pathologically slow (tens of seconds
    # at n=100000); itertuples() keeps it a Python loop for the comparison
    # while avoiding per-row positional indexing.
    start_time = time.time()
    result1 = []
    for row in large_df.itertuples(index=False):
        if row.A > row.B:
            result1.append(row.A * row.C)
        else:
            result1.append(row.B * row.C)
    loop_time = time.time() - start_time
    # Method 2: NumPy vectorized selection — a single C-level pass
    start_time = time.time()
    result2 = np.where(large_df['A'] > large_df['B'],
                       large_df['A'] * large_df['C'],
                       large_df['B'] * large_df['C'])
    vectorized_time = time.time() - start_time
    # Method 3: pandas row-wise apply (convenient, but still Python-level)
    start_time = time.time()
    result3 = large_df.apply(lambda row: row['A'] * row['C'] if row['A'] > row['B']
                             else row['B'] * row['C'], axis=1)
    pandas_apply_time = time.time() - start_time
    print(f"Loop method time: {loop_time:.4f} seconds")
    print(f"NumPy vectorization time: {vectorized_time:.4f} seconds")
    print(f"Pandas apply time: {pandas_apply_time:.4f} seconds")
    # Guard against division by zero when the vectorized pass is below
    # timer resolution (possible for small n)
    print(f"Vectorization faster than loop: {loop_time / max(vectorized_time, 1e-12):.1f}x")
    print()
    # Complex vectorization operation examples
    print("Complex vectorization operation examples:")
    df = pd.DataFrame({
        'Sales': np.random.randint(1000, 10000, 1000),
        'Cost': np.random.randint(500, 8000, 1000),
        'Region': np.random.choice(['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen'], 1000),
        'Product_Type': np.random.choice(['A', 'B', 'C'], 1000)
    })
    # Complex business logic vectorized with np.select: first matching
    # condition wins, default for the rest
    conditions = [
        (df['Sales'] > 8000) & (df['Region'].isin(['Beijing', 'Shanghai'])),
        (df['Sales'] > 5000) & (df['Region'].isin(['Guangzhou', 'Shenzhen'])),
        df['Sales'] > 3000
    ]
    choices = ['Excellent', 'Good', 'Average']
    df['Performance_Grade'] = np.select(conditions, choices, default='Needs Improvement')
    # Calculate complex metrics
    df['Profit'] = df['Sales'] - df['Cost']
    df['Profit_Rate'] = df['Profit'] / df['Sales']
    # Regional adjustment coefficient applied via Series.map
    region_multiplier = {'Beijing': 1.2, 'Shanghai': 1.15, 'Guangzhou': 1.1, 'Shenzhen': 1.05}
    df['Adjusted_Sales'] = df['Sales'] * df['Region'].map(region_multiplier)
    print("Vectorization processing result:")
    print(df.head())
    print()
    # Performance analysis summary
    performance_summary = df.groupby(['Region', 'Performance_Grade']).agg({
        'Sales': ['count', 'mean', 'sum'],
        'Profit_Rate': 'mean'
    }).round(2)
    print("Performance analysis summary:")
    print(performance_summary)
    return df
vectorized_df = vectorization_examples()

Chapter Summary
This chapter provided an in-depth introduction to Pandas advanced features:
Core Content Review
- MultiIndex: Creation, operations, and aggregation analysis
- Time Series Analysis: Resampling, frequency conversion, rolling windows
- Advanced Pivot Tables: Complex pivots, multi-dimensional analysis, visualization
- Window Functions: Rolling calculations, expanding windows, custom functions
- Custom Function Applications: apply, map, applymap, and vectorization optimization
Key Skills
- Handling complex hierarchical data structures
- Conducting professional time series analysis
- Creating multi-dimensional pivot and cross-tabulation analysis
- Implementing efficient rolling calculations and technical indicators
- Writing high-performance data processing code
Best Practices
- Prioritize vectorized operations for performance
- Choose appropriate window sizes and aggregation functions
- Use MultiIndex to simplify complex data operations
- Combine visualization for enhanced data insights
- Monitor memory usage and computational efficiency
Practical Application Scenarios
- Financial data analysis and technical indicator calculation
- Multi-dimensional business data cross-analysis
- Time series forecasting and trend analysis
- Complex report and dashboard development
- Big data processing and performance optimization
Mastering these advanced features helps you:
- Handle more complex data analysis tasks
- Improve data processing efficiency and quality
- Build professional data analysis solutions
- Deeply mine business value from data
Practice Exercises
- Create a sales data analysis system with MultiIndex
- Implement a complete set of stock technical analysis indicators
- Design a multi-dimensional business data pivot analysis tool
- Develop a high-performance big data processing workflow
- Build a time series forecasting model
In the next chapter, we will learn about Pandas performance optimization techniques and explore how to handle large-scale data and improve computational efficiency.