Pandas 高级功能

概述

本章将深入探讨 Pandas 的高级功能，包括多级索引（MultiIndex）、时间序列分析、数据透视表、窗口函数、自定义函数应用等。这些高级特性能够帮助你处理更复杂的数据分析任务，提升数据处理的效率和灵活性。

1. 多级索引（MultiIndex）

1.1 创建多级索引

python

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

print("=== 多级索引基础 ===")

# 创建多级索引的几种方法
def create_multiindex_examples():
    """创建多级索引示例"""
    
    # 方法1: 使用arrays创建
    arrays = [['A', 'A', 'B', 'B'], ['one', 'two', 'one', 'two']]
    tuples = list(zip(*arrays))
    index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
    
    s1 = pd.Series(np.random.randn(4), index=index)
    print("方法1 - 使用arrays创建:")
    print(s1)
    print()
    
    # 方法2: 使用product创建
    from itertools import product
    index2 = pd.MultiIndex.from_product([['A', 'B'], ['one', 'two']], 
                                       names=['first', 'second'])
    s2 = pd.Series(np.random.randn(4), index=index2)
    print("方法2 - 使用product创建:")
    print(s2)
    print()
    
    # 方法3: 直接在DataFrame中创建
    df = pd.DataFrame({
        'A': [1, 2, 3, 4, 5, 6],
        'B': [10, 20, 30, 40, 50, 60],
        'C': ['X', 'Y', 'X', 'Y', 'X', 'Y']
    })
    
    # 设置多级索引
    df_multi = df.set_index(['C', 'A'])
    print("方法3 - DataFrame多级索引:")
    print(df_multi)
    print()
    
    return df_multi

df_multi = create_multiindex_examples()

1.2 多级索引操作

python

print("=== 多级索引操作 ===")

def multiindex_operations():
    """多级索引操作示例"""
    
    # 创建示例数据
    np.random.seed(42)
    dates = pd.date_range('2023-01-01', periods=12, freq='M')
    regions = ['北京', '上海', '广州']
    products = ['产品A', '产品B']
    
    # 创建多级索引DataFrame
    index = pd.MultiIndex.from_product([dates, regions, products], 
                                     names=['日期', '地区', '产品'])
    
    data = {
        '销售额': np.random.randint(1000, 10000, len(index)),
        '销量': np.random.randint(10, 100, len(index)),
        '利润率': np.random.uniform(0.1, 0.3, len(index))
    }
    
    df = pd.DataFrame(data, index=index)
    print("多级索引DataFrame:")
    print(df.head(10))
    print()
    
    # 索引选择
    print("1. 选择特定地区的数据:")
    beijing_data = df.loc[(slice(None), '北京', slice(None)), :]
    print(beijing_data.head())
    print()
    
    print("2. 选择特定产品的数据:")
    product_a = df.loc[(slice(None), slice(None), '产品A'), :]
    print(product_a.head())
    print()
    
    print("3. 交叉选择:")
    cross_selection = df.loc[('2023-01-31', '上海', '产品B'), :]
    print(cross_selection)
    print()
    
    # 索引层级操作
    print("4. 交换索引层级:")
    df_swapped = df.swaplevel('地区', '产品')
    print(df_swapped.head())
    print()
    
    print("5. 重置索引:")
    df_reset = df.reset_index()
    print(df_reset.head())
    print()
    
    return df

sales_df = multiindex_operations()

1.3 多级索引聚合分析

python

print("=== 多级索引聚合分析 ===")

def multiindex_aggregation(df):
    """多级索引聚合分析"""
    
    print("1. 按地区聚合:")
    region_agg = df.groupby(level='地区').agg({
        '销售额': ['sum', 'mean'],
        '销量': 'sum',
        '利润率': 'mean'
    })
    print(region_agg)
    print()
    
    print("2. 按产品聚合:")
    product_agg = df.groupby(level='产品').agg({
        '销售额': 'sum',
        '销量': 'sum',
        '利润率': 'mean'
    })
    print(product_agg)
    print()
    
    print("3. 按地区和产品聚合:")
    region_product_agg = df.groupby(level=['地区', '产品']).agg({
        '销售额': 'sum',
        '销量': 'sum'
    })
    print(region_product_agg)
    print()
    
    print("4. 时间序列聚合:")
    monthly_trend = df.groupby(level='日期').sum()
    print(monthly_trend.head())
    print()
    
    # 可视化多级索引数据
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    
    # 地区销售额对比
    region_sales = df.groupby(level='地区')['销售额'].sum()
    region_sales.plot(kind='bar', ax=axes[0,0], color='skyblue')
    axes[0,0].set_title('各地区销售额对比')
    axes[0,0].set_ylabel('销售额')
    
    # 产品销售额对比
    product_sales = df.groupby(level='产品')['销售额'].sum()
    product_sales.plot(kind='pie', ax=axes[0,1], autopct='%1.1f%%')
    axes[0,1].set_title('产品销售额占比')
    
    # 月度销售趋势
    monthly_sales = df.groupby(level='日期')['销售额'].sum()
    monthly_sales.plot(ax=axes[1,0], marker='o')
    axes[1,0].set_title('月度销售趋势')
    axes[1,0].set_ylabel('销售额')
    axes[1,0].tick_params(axis='x', rotation=45)
    
    # 地区产品热力图
    pivot_data = df.groupby(['地区', '产品'])['销售额'].sum().unstack()
    sns.heatmap(pivot_data, annot=True, fmt='.0f', ax=axes[1,1], cmap='YlOrRd')
    axes[1,1].set_title('地区-产品销售额热力图')
    
    plt.tight_layout()
    plt.show()
    
    return region_product_agg

agg_result = multiindex_aggregation(sales_df)

2. 时间序列分析

2.1 时间序列基础操作

python

print("\n=== 时间序列分析 ===")

def time_series_basics():
    """时间序列基础操作"""
    
    # 创建时间序列数据
    dates = pd.date_range('2023-01-01', periods=365, freq='D')
    
    # 模拟股价数据
    np.random.seed(42)
    price_changes = np.random.normal(0, 1, 365)
    prices = 100 + np.cumsum(price_changes)
    
    # 模拟交易量
    volumes = np.random.randint(1000, 10000, 365)
    
    # 创建时间序列DataFrame
    ts_df = pd.DataFrame({
        '价格': prices,
        '交易量': volumes,
        '收益率': price_changes
    }, index=dates)
    
    print("时间序列数据:")
    print(ts_df.head())
    print()
    
    # 时间序列索引操作
    print("1. 选择特定月份数据:")
    jan_data = ts_df['2023-01']
    print(jan_data.head())
    print()
    
    print("2. 选择日期范围:")
    range_data = ts_df['2023-01-01':'2023-01-31']
    print(f"1月份数据点数: {len(range_data)}")
    print()
    
    print("3. 按工作日筛选:")
    weekday_data = ts_df[ts_df.index.weekday < 5]  # 0-4表示周一到周五
    print(f"工作日数据点数: {len(weekday_data)}")
    print()
    
    return ts_df

ts_data = time_series_basics()

2.2 时间序列重采样和频率转换

python

print("=== 时间序列重采样 ===")

def time_series_resampling(ts_df):
    """时间序列重采样示例"""
    
    print("1. 日数据转换为周数据:")
    weekly_data = ts_df.resample('W').agg({
        '价格': 'last',      # 周末价格
        '交易量': 'sum',     # 周交易量总和
        '收益率': 'sum'      # 周收益率
    })
    print(weekly_data.head())
    print()
    
    print("2. 日数据转换为月数据:")
    monthly_data = ts_df.resample('M').agg({
        '价格': 'last',
        '交易量': 'mean',
        '收益率': ['sum', 'std']
    })
    print(monthly_data.head())
    print()
    
    print("3. 向前填充和向后填充:")
    # 创建有缺失值的数据
    ts_missing = ts_df.copy()
    ts_missing.loc['2023-01-15':'2023-01-20', '价格'] = np.nan
    
    # 向前填充
    forward_fill = ts_missing['价格'].fillna(method='ffill')
    # 向后填充
    backward_fill = ts_missing['价格'].fillna(method='bfill')
    # 线性插值
    interpolated = ts_missing['价格'].interpolate()
    
    print("缺失值处理对比:")
    comparison = pd.DataFrame({
        '原始': ts_missing['价格']['2023-01-10':'2023-01-25'],
        '向前填充': forward_fill['2023-01-10':'2023-01-25'],
        '向后填充': backward_fill['2023-01-10':'2023-01-25'],
        '线性插值': interpolated['2023-01-10':'2023-01-25']
    })
    print(comparison)
    print()
    
    return weekly_data, monthly_data

weekly, monthly = time_series_resampling(ts_data)

2.3 移动窗口和滚动统计

python

print("=== 移动窗口分析 ===")

def rolling_window_analysis(ts_df):
    """移动窗口分析"""
    
    # 计算移动平均
    ts_df['MA_5'] = ts_df['价格'].rolling(window=5).mean()
    ts_df['MA_20'] = ts_df['价格'].rolling(window=20).mean()
    ts_df['MA_50'] = ts_df['价格'].rolling(window=50).mean()
    
    # 计算移动标准差（波动率）
    ts_df['波动率_20'] = ts_df['收益率'].rolling(window=20).std()
    
    # 计算布林带
    ts_df['布林上轨'] = ts_df['MA_20'] + 2 * ts_df['价格'].rolling(window=20).std()
    ts_df['布林下轨'] = ts_df['MA_20'] - 2 * ts_df['价格'].rolling(window=20).std()
    
    print("移动平均和布林带:")
    print(ts_df[['价格', 'MA_5', 'MA_20', 'MA_50', '布林上轨', '布林下轨']].head(60).tail(10))
    print()
    
    # 可视化技术指标
    fig, axes = plt.subplots(3, 1, figsize=(15, 12))
    
    # 价格和移动平均线
    ts_df[['价格', 'MA_5', 'MA_20', 'MA_50']].plot(ax=axes[0])
    axes[0].set_title('价格和移动平均线')
    axes[0].set_ylabel('价格')
    axes[0].legend()
    
    # 布林带
    axes[1].plot(ts_df.index, ts_df['价格'], label='价格', alpha=0.7)
    axes[1].plot(ts_df.index, ts_df['布林上轨'], label='布林上轨', linestyle='--')
    axes[1].plot(ts_df.index, ts_df['布林下轨'], label='布林下轨', linestyle='--')
    axes[1].fill_between(ts_df.index, ts_df['布林上轨'], ts_df['布林下轨'], alpha=0.2)
    axes[1].set_title('布林带')
    axes[1].set_ylabel('价格')
    axes[1].legend()
    
    # 波动率
    ts_df['波动率_20'].plot(ax=axes[2], color='red')
    axes[2].set_title('20日滚动波动率')
    axes[2].set_ylabel('波动率')
    
    plt.tight_layout()
    plt.show()
    
    # 计算更多技术指标
    print("技术指标统计:")
    technical_stats = {
        '当前价格': ts_df['价格'].iloc[-1],
        '5日均价': ts_df['MA_5'].iloc[-1],
        '20日均价': ts_df['MA_20'].iloc[-1],
        '50日均价': ts_df['MA_50'].iloc[-1],
        '当前波动率': ts_df['波动率_20'].iloc[-1],
        '价格相对20日均价': (ts_df['价格'].iloc[-1] / ts_df['MA_20'].iloc[-1] - 1) * 100
    }
    
    for indicator, value in technical_stats.items():
        print(f"{indicator}: {value:.2f}")
    print()
    
    return ts_df

ts_with_indicators = rolling_window_analysis(ts_data)

3. 数据透视表高级应用

3.1 复杂透视表创建

python

print("\n=== 高级数据透视表 ===")

def advanced_pivot_tables():
    """高级数据透视表应用"""
    
    # 创建复杂的销售数据
    np.random.seed(42)
    n_records = 1000
    
    data = {
        '日期': pd.date_range('2023-01-01', periods=n_records, freq='D'),
        '销售员': np.random.choice(['张三', '李四', '王五', '赵六'], n_records),
        '地区': np.random.choice(['华北', '华东', '华南', '华西'], n_records),
        '产品类别': np.random.choice(['电子产品', '服装', '食品', '家具'], n_records),
        '产品': np.random.choice(['产品A', '产品B', '产品C', '产品D', '产品E'], n_records),
        '销售额': np.random.randint(100, 5000, n_records),
        '数量': np.random.randint(1, 50, n_records),
        '成本': np.random.randint(50, 3000, n_records)
    }
    
    sales_data = pd.DataFrame(data)
    sales_data['利润'] = sales_data['销售额'] - sales_data['成本']
    sales_data['利润率'] = sales_data['利润'] / sales_data['销售额']
    sales_data['月份'] = sales_data['日期'].dt.to_period('M')
    sales_data['季度'] = sales_data['日期'].dt.to_period('Q')
    
    print("销售数据样本:")
    print(sales_data.head())
    print()
    
    # 1. 基础透视表
    print("1. 地区-产品类别透视表:")
    pivot1 = pd.pivot_table(sales_data, 
                           values='销售额', 
                           index='地区', 
                           columns='产品类别', 
                           aggfunc='sum',
                           fill_value=0)
    print(pivot1)
    print()
    
    # 2. 多值透视表
    print("2. 多指标透视表:")
    pivot2 = pd.pivot_table(sales_data,
                           values=['销售额', '利润', '数量'],
                           index='地区',
                           columns='产品类别',
                           aggfunc={'销售额': 'sum', '利润': 'sum', '数量': 'sum'},
                           fill_value=0)
    print(pivot2.head())
    print()
    
    # 3. 多级透视表
    print("3. 多级索引透视表:")
    pivot3 = pd.pivot_table(sales_data,
                           values='销售额',
                           index=['地区', '销售员'],
                           columns=['产品类别', '季度'],
                           aggfunc='sum',
                           fill_value=0)
    print(pivot3.head())
    print()
    
    # 4. 自定义聚合函数
    print("4. 自定义聚合函数透视表:")
    def profit_margin_avg(x):
        return (x.sum() if len(x) > 0 else 0)
    
    pivot4 = pd.pivot_table(sales_data,
                           values=['销售额', '利润率'],
                           index='销售员',
                           columns='地区',
                           aggfunc={'销售额': 'sum', '利润率': 'mean'},
                           fill_value=0)
    print(pivot4)
    print()
    
    return sales_data, pivot1, pivot2

sales_data, pivot1, pivot2 = advanced_pivot_tables()

3.2 透视表可视化和分析

python

print("=== 透视表可视化分析 ===")

def pivot_visualization(sales_data, pivot1):
    """透视表可视化分析"""
    
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # 1. 地区销售额堆叠柱状图
    pivot1.plot(kind='bar', stacked=True, ax=axes[0,0], colormap='Set3')
    axes[0,0].set_title('各地区产品类别销售额分布')
    axes[0,0].set_ylabel('销售额')
    axes[0,0].legend(title='产品类别', bbox_to_anchor=(1.05, 1), loc='upper left')
    
    # 2. 热力图
    sns.heatmap(pivot1, annot=True, fmt='.0f', ax=axes[0,1], cmap='YlOrRd')
    axes[0,1].set_title('地区-产品类别销售额热力图')
    
    # 3. 销售员业绩对比
    salesperson_performance = sales_data.groupby('销售员').agg({
        '销售额': 'sum',
        '利润': 'sum',
        '数量': 'sum'
    })
    
    salesperson_performance['销售额'].plot(kind='bar', ax=axes[1,0], color='skyblue')
    axes[1,0].set_title('销售员业绩对比')
    axes[1,0].set_ylabel('销售额')
    axes[1,0].tick_params(axis='x', rotation=45)
    
    # 4. 月度趋势分析
    monthly_trend = sales_data.groupby('月份').agg({
        '销售额': 'sum',
        '利润': 'sum'
    })
    
    monthly_trend.plot(ax=axes[1,1], marker='o')
    axes[1,1].set_title('月度销售和利润趋势')
    axes[1,1].set_ylabel('金额')
    axes[1,1].legend()
    
    plt.tight_layout()
    plt.show()
    
    # 生成透视表分析报告
    print("透视表分析报告:")
    print("=" * 50)
    
    # 地区分析
    region_total = pivot1.sum(axis=1).sort_values(ascending=False)
    print(f"最佳销售地区: {region_total.index[0]} (销售额: {region_total.iloc[0]:,.0f})")
    print(f"最差销售地区: {region_total.index[-1]} (销售额: {region_total.iloc[-1]:,.0f})")
    
    # 产品类别分析
    category_total = pivot1.sum(axis=0).sort_values(ascending=False)
    print(f"最佳产品类别: {category_total.index[0]} (销售额: {category_total.iloc[0]:,.0f})")
    print(f"最差产品类别: {category_total.index[-1]} (销售额: {category_total.iloc[-1]:,.0f})")
    
    # 销售员分析
    salesperson_ranking = sales_data.groupby('销售员')['销售额'].sum().sort_values(ascending=False)
    print(f"最佳销售员: {salesperson_ranking.index[0]} (销售额: {salesperson_ranking.iloc[0]:,.0f})")
    
    print("=" * 50)

pivot_visualization(sales_data, pivot1)

4. 窗口函数和滚动计算

4.1 滚动窗口函数

python

print("\n=== 窗口函数高级应用 ===")

def advanced_window_functions():
    """高级窗口函数应用"""
    
    # 创建时间序列数据
    dates = pd.date_range('2023-01-01', periods=100, freq='D')
    np.random.seed(42)
    
    df = pd.DataFrame({
        '日期': dates,
        '销售额': np.random.randint(1000, 5000, 100) + np.sin(np.arange(100) * 0.1) * 500,
        '访客数': np.random.randint(100, 1000, 100),
        '转化率': np.random.uniform(0.01, 0.1, 100)
    })
    
    df.set_index('日期', inplace=True)
    
    print("原始数据:")
    print(df.head())
    print()
    
    # 1. 基础滚动统计
    print("1. 基础滚动统计:")
    df['销售额_7日均值'] = df['销售额'].rolling(window=7).mean()
    df['销售额_7日标准差'] = df['销售额'].rolling(window=7).std()
    df['销售额_7日最大值'] = df['销售额'].rolling(window=7).max()
    df['销售额_7日最小值'] = df['销售额'].rolling(window=7).min()
    
    print(df[['销售额', '销售额_7日均值', '销售额_7日标准差']].head(10))
    print()
    
    # 2. 扩展窗口（累积统计）
    print("2. 扩展窗口统计:")
    df['累积销售额'] = df['销售额'].expanding().sum()
    df['累积平均销售额'] = df['销售额'].expanding().mean()
    df['累积最大销售额'] = df['销售额'].expanding().max()
    
    print(df[['销售额', '累积销售额', '累积平均销售额']].head(10))
    print()
    
    # 3. 自定义窗口函数
    print("3. 自定义窗口函数:")
    
    def coefficient_of_variation(x):
        """变异系数"""
        return x.std() / x.mean() if x.mean() != 0 else 0
    
    def percentile_75(x):
        """75分位数"""
        return x.quantile(0.75)
    
    df['销售额_7日变异系数'] = df['销售额'].rolling(window=7).apply(coefficient_of_variation)
    df['销售额_7日_75分位'] = df['销售额'].rolling(window=7).apply(percentile_75)
    
    print(df[['销售额', '销售额_7日变异系数', '销售额_7日_75分位']].head(15))
    print()
    
    # 4. 多列窗口计算
    print("4. 多列窗口计算:")
    df['销售效率'] = df['销售额'] / df['访客数']
    df['销售效率_7日均值'] = df['销售效率'].rolling(window=7).mean()
    
    # 相关性滚动计算
    df['销售额访客数_7日相关性'] = df['销售额'].rolling(window=7).corr(df['访客数'])
    
    print(df[['销售效率', '销售效率_7日均值', '销售额访客数_7日相关性']].head(15))
    print()
    
    return df

window_df = advanced_window_functions()

4.2 窗口函数可视化

python

print("=== 窗口函数可视化 ===")

def visualize_window_functions(df):
    """窗口函数结果可视化"""
    
    fig, axes = plt.subplots(3, 2, figsize=(16, 15))
    
    # 1. 原始数据vs滚动平均
    df[['销售额', '销售额_7日均值']].plot(ax=axes[0,0])
    axes[0,0].set_title('销售额 vs 7日滚动平均')
    axes[0,0].set_ylabel('销售额')
    axes[0,0].legend()
    
    # 2. 滚动标准差（波动性）
    df['销售额_7日标准差'].plot(ax=axes[0,1], color='red')
    axes[0,1].set_title('7日滚动标准差（波动性）')
    axes[0,1].set_ylabel('标准差')
    
    # 3. 累积统计
    df[['累积销售额', '累积平均销售额']].plot(ax=axes[1,0])
    axes[1,0].set_title('累积统计')
    axes[1,0].legend()
    
    # 4. 变异系数
    df['销售额_7日变异系数'].plot(ax=axes[1,1], color='green')
    axes[1,1].set_title('7日变异系数')
    axes[1,1].set_ylabel('变异系数')
    
    # 5. 销售效率
    df[['销售效率', '销售效率_7日均值']].plot(ax=axes[2,0])
    axes[2,0].set_title('销售效率及其滚动平均')
    axes[2,0].set_ylabel('销售效率')
    axes[2,0].legend()
    
    # 6. 滚动相关性
    df['销售额访客数_7日相关性'].plot(ax=axes[2,1], color='purple')
    axes[2,1].set_title('销售额与访客数7日滚动相关性')
    axes[2,1].set_ylabel('相关系数')
    axes[2,1].axhline(y=0, color='black', linestyle='--', alpha=0.5)
    
    plt.tight_layout()
    plt.show()
    
    # 窗口函数分析总结
    print("窗口函数分析总结:")
    print("=" * 50)
    
    latest_data = df.iloc[-1]
    print(f"最新销售额: {latest_data['销售额']:,.0f}")
    print(f"7日平均销售额: {latest_data['销售额_7日均值']:,.0f}")
    print(f"7日销售波动性: {latest_data['销售额_7日标准差']:,.2f}")
    print(f"累积总销售额: {latest_data['累积销售额']:,.0f}")
    print(f"当前销售效率: {latest_data['销售效率']:,.2f}")
    print(f"销售额与访客数相关性: {latest_data['销售额访客数_7日相关性']:,.3f}")
    
    # 趋势分析
    recent_trend = df['销售额_7日均值'].iloc[-7:].pct_change().mean()
    print(f"近期趋势 (7日平均增长率): {recent_trend:.2%}")
    
    print("=" * 50)

visualize_window_functions(window_df)

5. 自定义函数应用

5.1 apply、map、applymap函数

python

print("\n=== 自定义函数应用 ===")

def custom_function_applications():
    """自定义函数应用示例"""
    
    # 创建示例数据
    df = pd.DataFrame({
        '姓名': ['张三', '李四', '王五', '赵六', '钱七'],
        '年龄': [25, 30, 35, 28, 32],
        '工资': [8000, 12000, 15000, 9500, 11000],
        '部门': ['技术', '销售', '技术', '人事', '销售'],
        '入职日期': ['2020-01-15', '2019-03-20', '2018-07-10', '2021-02-28', '2020-09-05']
    })
    
    df['入职日期'] = pd.to_datetime(df['入职日期'])
    
    print("原始数据:")
    print(df)
    print()
    
    # 1. apply函数应用
    print("1. apply函数应用:")
    
    # 计算工作年限
    def calculate_work_years(hire_date):
        return (pd.Timestamp.now() - hire_date).days / 365.25
    
    df['工作年限'] = df['入职日期'].apply(calculate_work_years)
    
    # 工资等级分类
    def salary_grade(salary):
        if salary < 9000:
            return '初级'
        elif salary < 12000:
            return '中级'
        else:
            return '高级'
    
    df['工资等级'] = df['工资'].apply(salary_grade)
    
    print(df[['姓名', '工资', '工资等级', '工作年限']])
    print()
    
    # 2. 多列apply应用
    print("2. 多列apply应用:")
    
    def performance_score(row):
        """综合绩效评分"""
        base_score = 60
        
        # 工资加分
        if row['工资'] > 12000:
            base_score += 20
        elif row['工资'] > 10000:
            base_score += 10
        
        # 工作年限加分
        if row['工作年限'] > 3:
            base_score += 15
        elif row['工作年限'] > 1:
            base_score += 10
        
        # 年龄调整
        if 25 <= row['年龄'] <= 35:
            base_score += 5
        
        return min(base_score, 100)  # 最高100分
    
    df['绩效评分'] = df.apply(performance_score, axis=1)
    
    print(df[['姓名', '绩效评分']])
    print()
    
    # 3. map函数应用
    print("3. map函数应用:")
    
    # 部门映射
    department_mapping = {
        '技术': 'Technology',
        '销售': 'Sales',
        '人事': 'HR'
    }
    
    df['部门_英文'] = df['部门'].map(department_mapping)
    
    print(df[['姓名', '部门', '部门_英文']])
    print()
    
    # 4. 复杂数据处理
    print("4. 复杂数据处理:")
    
    # 创建包含复杂数据的DataFrame
    complex_df = pd.DataFrame({
        'A': [1, 2, 3, 4, 5],
        'B': [10.5, 20.3, 30.7, 40.2, 50.9],
        'C': ['apple', 'banana', 'cherry', 'date', 'elderberry']
    })
    
    # applymap应用于整个DataFrame
    def format_value(x):
        if isinstance(x, (int, float)):
            return f"{x:.1f}"
        else:
            return x.upper()
    
    formatted_df = complex_df.applymap(format_value)
    print("格式化后的DataFrame:")
    print(formatted_df)
    print()
    
    return df

employee_df = custom_function_applications()

5.2 向量化操作和性能优化

python

print("=== 向量化操作和性能优化 ===")

def vectorization_examples():
    """向量化操作示例"""
    
    # 创建大型数据集进行性能测试
    n = 100000
    np.random.seed(42)
    
    large_df = pd.DataFrame({
        'A': np.random.randint(1, 100, n),
        'B': np.random.randint(1, 100, n),
        'C': np.random.uniform(0, 1, n)
    })
    
    print(f"测试数据集大小: {len(large_df):,} 行")
    print()
    
    # 性能对比：循环 vs 向量化
    import time
    
    print("性能对比测试:")
    
    # 方法1: 使用循环（慢）
    start_time = time.time()
    result1 = []
    for i in range(len(large_df)):
        if large_df.iloc[i]['A'] > large_df.iloc[i]['B']:
            result1.append(large_df.iloc[i]['A'] * large_df.iloc[i]['C'])
        else:
            result1.append(large_df.iloc[i]['B'] * large_df.iloc[i]['C'])
    loop_time = time.time() - start_time
    
    # 方法2: 使用向量化操作（快）
    start_time = time.time()
    result2 = np.where(large_df['A'] > large_df['B'], 
                      large_df['A'] * large_df['C'], 
                      large_df['B'] * large_df['C'])
    vectorized_time = time.time() - start_time
    
    # 方法3: 使用pandas向量化
    start_time = time.time()
    result3 = large_df.apply(lambda row: row['A'] * row['C'] if row['A'] > row['B'] 
                           else row['B'] * row['C'], axis=1)
    pandas_apply_time = time.time() - start_time
    
    print(f"循环方法耗时: {loop_time:.4f} 秒")
    print(f"NumPy向量化耗时: {vectorized_time:.4f} 秒")
    print(f"Pandas apply耗时: {pandas_apply_time:.4f} 秒")
    print(f"向量化比循环快: {loop_time/vectorized_time:.1f} 倍")
    print()
    
    # 复杂向量化操作示例
    print("复杂向量化操作示例:")
    
    # 创建示例数据
    df = pd.DataFrame({
        '销售额': np.random.randint(1000, 10000, 1000),
        '成本': np.random.randint(500, 8000, 1000),
        '地区': np.random.choice(['北京', '上海', '广州', '深圳'], 1000),
        '产品类型': np.random.choice(['A', 'B', 'C'], 1000)
    })
    
    # 复杂的业务逻辑向量化
    conditions = [
        (df['销售额'] > 8000) & (df['地区'].isin(['北京', '上海'])),
        (df['销售额'] > 5000) & (df['地区'].isin(['广州', '深圳'])),
        df['销售额'] > 3000
    ]
    
    choices = ['优秀', '良好', '一般']
    
    df['业绩等级'] = np.select(conditions, choices, default='待改进')
    
    # 计算复杂指标
    df['利润'] = df['销售额'] - df['成本']
    df['利润率'] = df['利润'] / df['销售额']
    
    # 地区调整系数
    region_multiplier = {'北京': 1.2, '上海': 1.15, '广州': 1.1, '深圳': 1.05}
    df['调整后销售额'] = df['销售额'] * df['地区'].map(region_multiplier)
    
    print("向量化处理结果:")
    print(df.head())
    print()
    
    # 性能分析总结
    performance_summary = df.groupby(['地区', '业绩等级']).agg({
        '销售额': ['count', 'mean', 'sum'],
        '利润率': 'mean'
    }).round(2)
    
    print("业绩分析汇总:")
    print(performance_summary)
    
    return df

vectorized_df = vectorization_examples()

本章小结

本章深入介绍了 Pandas 的高级功能：

核心内容回顾

多级索引（MultiIndex）：创建、操作和聚合分析
时间序列分析：重采样、频率转换、移动窗口
高级数据透视表：复杂透视、多维分析、可视化
窗口函数：滚动计算、扩展窗口、自定义函数
自定义函数应用：apply、map、applymap和向量化优化

关键技能

处理复杂的层次化数据结构
进行专业的时间序列分析
创建多维数据透视和交叉分析
实现高效的滚动计算和技术指标
编写高性能的数据处理代码

最佳实践

优先使用向量化操作提升性能
合理选择窗口大小和聚合函数
利用多级索引简化复杂数据操作
结合可视化增强数据洞察
注意内存使用和计算效率

实际应用场景

金融数据分析和技术指标计算
多维业务数据的交叉分析
时间序列预测和趋势分析
复杂报表和仪表板开发
大数据处理和性能优化

掌握这些高级功能，能够帮助你：

处理更复杂的数据分析任务
提升数据处理的效率和质量
构建专业的数据分析解决方案
深入挖掘数据中的商业价值

练习题

创建一个包含多级索引的销售数据分析系统
实现一套完整的股票技术分析指标
设计一个多维度的业务数据透视分析工具
开发高性能的大数据处理流程
构建一个时间序列预测模型

下一章我们将学习 Pandas 的性能优化技巧，探索如何处理大规模数据和提升计算效率。

Pandas 高级功能 ​

概述 ​

1. 多级索引（MultiIndex） ​

1.1 创建多级索引 ​

1.2 多级索引操作 ​

1.3 多级索引聚合分析 ​

2. 时间序列分析 ​

2.1 时间序列基础操作 ​

2.2 时间序列重采样和频率转换 ​

2.3 移动窗口和滚动统计 ​

3. 数据透视表高级应用 ​

3.1 复杂透视表创建 ​

3.2 透视表可视化和分析 ​

4. 窗口函数和滚动计算 ​

4.1 滚动窗口函数 ​

4.2 窗口函数可视化 ​

5. 自定义函数应用 ​

5.1 apply、map、applymap函数 ​

5.2 向量化操作和性能优化 ​

本章小结 ​

核心内容回顾 ​

关键技能 ​

最佳实践 ​

实际应用场景 ​

练习题 ​

Pandas 高级功能

概述

1. 多级索引（MultiIndex）

1.1 创建多级索引

1.2 多级索引操作

1.3 多级索引聚合分析

2. 时间序列分析

2.1 时间序列基础操作

2.2 时间序列重采样和频率转换

2.3 移动窗口和滚动统计

3. 数据透视表高级应用

3.1 复杂透视表创建

3.2 透视表可视化和分析

4. 窗口函数和滚动计算

4.1 滚动窗口函数

4.2 窗口函数可视化

5. 自定义函数应用

5.1 apply、map、applymap函数

5.2 向量化操作和性能优化

本章小结

核心内容回顾

关键技能

最佳实践

实际应用场景

练习题