Skip to content

Pandas JSON 处理

JSON(JavaScript Object Notation)是现代Web应用和API中最常用的数据交换格式。Pandas 提供了强大的JSON处理功能,可以轻松读取、写入和操作JSON数据。本章将详细介绍如何使用 Pandas 处理各种JSON数据格式。

1. JSON 数据格式概述

1.1 JSON 基础知识

JSON是一种轻量级的数据交换格式,具有以下特点:

  • 易于人阅读和编写
  • 易于机器解析和生成
  • 基于JavaScript语法,但独立于语言
  • 支持对象、数组、字符串、数字、布尔值和null

1.2 常见的JSON数据结构

python
import pandas as pd
import json
import numpy as np
from datetime import datetime

# 简单的JSON对象
simple_json = {
    "name": "张三",
    "age": 30,
    "city": "北京",
    "married": True
}

# JSON数组
json_array = [
    {"id": 1, "name": "张三", "score": 85},
    {"id": 2, "name": "李四", "score": 92},
    {"id": 3, "name": "王五", "score": 78}
]

# 嵌套JSON
nested_json = {
    "user": {
        "id": 1,
        "profile": {
            "name": "张三",
            "age": 30,
            "address": {
                "city": "北京",
                "district": "朝阳区"
            }
        },
        "orders": [
            {"order_id": "ORD001", "amount": 299.99, "date": "2023-01-15"},
            {"order_id": "ORD002", "amount": 159.50, "date": "2023-02-20"}
        ]
    }
}

print("简单JSON:", simple_json)
print("JSON数组:", json_array)
print("嵌套JSON:", nested_json)

2. 读取JSON数据

2.1 从字符串读取JSON

python
# 从JSON字符串创建DataFrame
json_string = '''
[
    {"name": "张三", "age": 25, "department": "IT", "salary": 8000},
    {"name": "李四", "age": 30, "department": "HR", "salary": 7000},
    {"name": "王五", "age": 28, "department": "Finance", "salary": 9000}
]
'''

# 使用pd.read_json()读取
df_from_string = pd.read_json(json_string)
print("从JSON字符串创建的DataFrame:")
print(df_from_string)
print("\n数据类型:")
print(df_from_string.dtypes)

2.2 从文件读取JSON

python
# 创建示例JSON文件
employees_data = [
    {"id": 1, "name": "张三", "age": 25, "department": "IT", "salary": 8000, "join_date": "2020-01-15"},
    {"id": 2, "name": "李四", "age": 30, "department": "HR", "salary": 7000, "join_date": "2019-03-20"},
    {"id": 3, "name": "王五", "age": 28, "department": "Finance", "salary": 9000, "join_date": "2021-05-10"},
    {"id": 4, "name": "赵六", "age": 32, "department": "IT", "salary": 9500, "join_date": "2018-12-01"}
]

# 保存为JSON文件
with open('employees.json', 'w', encoding='utf-8') as f:
    json.dump(employees_data, f, ensure_ascii=False, indent=2)

# 从文件读取
df_from_file = pd.read_json('employees.json')
print("从JSON文件读取的DataFrame:")
print(df_from_file)

# 处理日期列
df_from_file['join_date'] = pd.to_datetime(df_from_file['join_date'])
print("\n处理日期后的数据类型:")
print(df_from_file.dtypes)

2.3 读取不同格式的JSON

python
# 不同的JSON格式

# 1. 记录格式(默认)- 每行是一个记录
records_format = '''
[
    {"name": "张三", "age": 25, "city": "北京"},
    {"name": "李四", "age": 30, "city": "上海"}
]
'''

# 2. 索引格式 - 以索引为键
index_format = '''
{
    "0": {"name": "张三", "age": 25, "city": "北京"},
    "1": {"name": "李四", "age": 30, "city": "上海"}
}
'''

# 3. 列格式 - 以列名为键
columns_format = '''
{
    "name": {"0": "张三", "1": "李四"},
    "age": {"0": 25, "1": 30},
    "city": {"0": "北京", "1": "上海"}
}
'''

# 4. 值格式 - 只有值的数组
values_format = '''
[
    ["张三", 25, "北京"],
    ["李四", 30, "上海"]
]
'''

# Read each format. Since pandas 2.1, passing a literal JSON string to
# read_json is deprecated -- wrap it in StringIO so it is treated as a
# file-like buffer (warning-free and future-proof).
from io import StringIO

df_records = pd.read_json(StringIO(records_format), orient='records')
df_index = pd.read_json(StringIO(index_format), orient='index')
df_columns = pd.read_json(StringIO(columns_format), orient='columns')
df_values = pd.read_json(StringIO(values_format), orient='values')

print("记录格式:")
print(df_records)
print("\n索引格式:")
print(df_index)
print("\n列格式:")
print(df_columns)
print("\n值格式(需要手动设置列名):")
df_values.columns = ['name', 'age', 'city']
print(df_values)

2.4 处理嵌套JSON

python
# 处理嵌套JSON数据
nested_data = '''
[
    {
        "id": 1,
        "name": "张三",
        "profile": {
            "age": 25,
            "address": {
                "city": "北京",
                "district": "朝阳区"
            }
        },
        "skills": ["Python", "SQL", "Machine Learning"]
    },
    {
        "id": 2,
        "name": "李四",
        "profile": {
            "age": 30,
            "address": {
                "city": "上海",
                "district": "浦东新区"
            }
        },
        "skills": ["Java", "Spring", "MySQL"]
    }
]
'''

# 使用json_normalize处理嵌套数据
from pandas import json_normalize

# 解析JSON字符串
data = json.loads(nested_data)

# 标准化嵌套JSON
df_normalized = json_normalize(data)
print("标准化后的DataFrame:")
print(df_normalized)
print("\n列名:")
print(df_normalized.columns.tolist())

# 指定分隔符
df_custom_sep = json_normalize(data, sep='_')
print("\n使用自定义分隔符:")
print(df_custom_sep.columns.tolist())

# 处理数组字段
df_with_meta = json_normalize(data, record_path='skills', 
                              meta=['id', 'name', ['profile', 'age']], 
                              meta_prefix='user_')
print("\n展开技能数组:")
print(df_with_meta)

3. 写入JSON数据

3.1 基本写入操作

python
# 创建示例DataFrame
df = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['张三', '李四', '王五', '赵六'],
    'age': [25, 30, 28, 32],
    'department': ['IT', 'HR', 'Finance', 'IT'],
    'salary': [8000, 7000, 9000, 9500],
    'join_date': pd.to_datetime(['2020-01-15', '2019-03-20', '2021-05-10', '2018-12-01'])
})

print("原始DataFrame:")
print(df)

# 转换为JSON字符串
json_string = df.to_json(orient='records', force_ascii=False, indent=2)
print("\nJSON字符串:")
print(json_string)

# 保存到文件
df.to_json('output.json', orient='records', force_ascii=False, indent=2)
print("\n已保存到 output.json 文件")

3.2 不同的输出格式

python
# Demonstrate the different orient options of DataFrame.to_json.
# BUG FIX: for a DataFrame the *default* orient is 'columns', not
# 'records' -- the labels below are corrected accordingly.
print("1. records格式:")
print(df.to_json(orient='records', force_ascii=False, indent=2)[:200] + "...")

print("\n2. index格式:")
print(df.to_json(orient='index', force_ascii=False, indent=2)[:200] + "...")

print("\n3. columns格式(默认):")
print(df.to_json(orient='columns', force_ascii=False, indent=2)[:200] + "...")

print("\n4. values格式:")
print(df.to_json(orient='values', force_ascii=False, indent=2)[:200] + "...")

print("\n5. split格式:")
print(df.to_json(orient='split', force_ascii=False, indent=2)[:200] + "...")

print("\n6. table格式:")
print(df.to_json(orient='table', force_ascii=False, indent=2)[:200] + "...")

3.3 处理日期和时间

python
# Date handling options for DataFrame.to_json.
print("默认日期格式(epoch毫秒):")
print(df[['name', 'join_date']].to_json(orient='records'))

print("\nISO格式日期:")
print(df[['name', 'join_date']].to_json(orient='records', date_format='iso'))

# BUG FIX: to_json only accepts date_format='epoch' or 'iso'; passing a
# strftime pattern such as '%Y-%m-%d' raises ValueError.  For a custom
# format, pre-format the column with Series.dt.strftime before serializing.
print("\n自定义日期格式:")
custom = df[['name', 'join_date']].copy()
custom['join_date'] = custom['join_date'].dt.strftime('%Y-%m-%d')
print(custom.to_json(orient='records', force_ascii=False))

# Missing values: to_json always renders NaN/None as JSON null.
df_with_nan = df.copy()
df_with_nan.loc[0, 'salary'] = np.nan

print("\n包含NaN的数据(默认null):")
print(df_with_nan[['name', 'salary']].to_json(orient='records'))

# BUG FIX: to_json has no na_value parameter (that keyword raises
# TypeError).  Substitute a placeholder with fillna before serializing.
print("\n自定义NaN值:")
print(df_with_nan[['name', 'salary']].fillna('N/A').to_json(orient='records'))

4. JSON数据操作和转换

4.1 JSON数据的清洗和预处理

python
# 处理不规范的JSON数据
irregular_json = '''
[
    {"name": "张三", "age": "25", "salary": "8000", "active": "true"},
    {"name": "李四", "age": "30", "salary": "7000", "active": "false"},
    {"name": "王五", "age": null, "salary": "9000", "active": "true"},
    {"name": "赵六", "age": "32", "salary": null, "active": "1"}
]
'''

df_irregular = pd.read_json(irregular_json)
print("原始不规范数据:")
print(df_irregular)
print("\n数据类型:")
print(df_irregular.dtypes)

# 数据类型转换和清洗
df_cleaned = df_irregular.copy()

# 转换数值类型
df_cleaned['age'] = pd.to_numeric(df_cleaned['age'], errors='coerce')
df_cleaned['salary'] = pd.to_numeric(df_cleaned['salary'], errors='coerce')

# 转换布尔类型
def convert_to_bool(value):
    """Coerce a raw JSON value to bool.

    Missing values (NaN/None) map to False; strings are matched
    case-insensitively against the truthy tokens 'true', '1', 'yes';
    any other type falls back to Python truthiness.
    """
    if pd.isna(value):
        return False
    if not isinstance(value, str):
        return bool(value)
    return value.lower() in ('true', '1', 'yes')

df_cleaned['active'] = df_cleaned['active'].apply(convert_to_bool)

print("\n清洗后的数据:")
print(df_cleaned)
print("\n清洗后的数据类型:")
print(df_cleaned.dtypes)

4.2 JSON数据的合并和拆分

python
# 合并多个JSON文件的数据
json_file1 = '''
[
    {"id": 1, "name": "张三", "department": "IT"},
    {"id": 2, "name": "李四", "department": "HR"}
]
'''

json_file2 = '''
[
    {"id": 1, "salary": 8000, "join_date": "2020-01-15"},
    {"id": 2, "salary": 7000, "join_date": "2019-03-20"}
]
'''

df1 = pd.read_json(json_file1)
df2 = pd.read_json(json_file2)

# 合并数据
df_merged = pd.merge(df1, df2, on='id')
print("合并后的数据:")
print(df_merged)

# 拆分复杂字段
complex_json = '''
[
    {"id": 1, "name": "张三", "contact": "email:zhang@example.com;phone:13800138000"},
    {"id": 2, "name": "李四", "contact": "email:li@example.com;phone:13900139000"}
]
'''

df_complex = pd.read_json(complex_json)
print("\n原始复杂数据:")
print(df_complex)

# 拆分contact字段
def parse_contact(contact_str):
    """Parse a contact string like 'email:a@b.com;phone:123' into a dict.

    Splits on ';' into key:value fragments.  BUG FIX: each fragment is
    split on the *first* ':' only, so values that themselves contain
    ':' (e.g. URLs) no longer raise "too many values to unpack";
    fragments with no ':' at all are skipped instead of crashing.
    """
    result = {}
    for part in contact_str.split(';'):
        key, sep, value = part.partition(':')
        if sep:  # skip malformed fragments without a separator
            result[key] = value
    return result

contact_data = df_complex['contact'].apply(parse_contact)
contact_df = pd.json_normalize(contact_data)

# 合并拆分后的数据
df_expanded = pd.concat([df_complex[['id', 'name']], contact_df], axis=1)
print("\n拆分后的数据:")
print(df_expanded)

4.3 处理JSON数组字段

python
# 处理包含数组的JSON数据
array_json = '''
[
    {
        "id": 1,
        "name": "张三",
        "skills": ["Python", "SQL", "Machine Learning"],
        "projects": [
            {"name": "项目A", "status": "完成"},
            {"name": "项目B", "status": "进行中"}
        ]
    },
    {
        "id": 2,
        "name": "李四",
        "skills": ["Java", "Spring", "MySQL"],
        "projects": [
            {"name": "项目C", "status": "完成"}
        ]
    }
]
'''

data = json.loads(array_json)

# 展开技能数组
skills_df = json_normalize(data, record_path='skills', 
                          meta=['id', 'name'], 
                          record_prefix='skill_')
print("展开的技能数据:")
print(skills_df)

# 展开项目数组
projects_df = json_normalize(data, record_path='projects', 
                            meta=['id', 'name'])
print("\n展开的项目数据:")
print(projects_df)

# 统计每个人的技能数量
skills_count = skills_df.groupby(['id', 'name']).size().reset_index(name='skills_count')
print("\n技能数量统计:")
print(skills_count)

5. 处理API响应数据

5.1 模拟API响应处理

python
# 模拟API响应数据
api_response = '''
{
    "status": "success",
    "message": "数据获取成功",
    "data": {
        "total": 100,
        "page": 1,
        "per_page": 10,
        "users": [
            {
                "id": 1,
                "username": "zhang_san",
                "profile": {
                    "name": "张三",
                    "age": 25,
                    "email": "zhang@example.com"
                },
                "last_login": "2023-12-01T10:30:00Z",
                "is_active": true
            },
            {
                "id": 2,
                "username": "li_si",
                "profile": {
                    "name": "李四",
                    "age": 30,
                    "email": "li@example.com"
                },
                "last_login": "2023-12-02T14:20:00Z",
                "is_active": false
            }
        ]
    }
}
'''

# 解析API响应
response_data = json.loads(api_response)

# 提取用户数据
if response_data['status'] == 'success':
    users_data = response_data['data']['users']
    df_users = json_normalize(users_data)
    
    print("API用户数据:")
    print(df_users)
    
    # 处理时间戳
    df_users['last_login'] = pd.to_datetime(df_users['last_login'])
    
    print("\n处理时间戳后:")
    print(df_users[['username', 'last_login', 'is_active']])
else:
    print(f"API请求失败: {response_data['message']}")

5.2 批量处理API数据

python
# 模拟批量API数据处理
def process_api_batch(api_responses):
    """Process a batch of raw API response strings into one DataFrame.

    Each response is parsed as JSON; successful responses contribute
    their ``data.users`` records, annotated with a 1-based ``batch_id``
    and a ``processed_at`` timestamp.  Failures (bad JSON, non-success
    status, missing keys) are reported and skipped.  Returns an empty
    DataFrame when nothing was collected.
    """
    collected = []

    for idx, raw in enumerate(api_responses, start=1):
        try:
            payload = json.loads(raw)
        except json.JSONDecodeError as e:
            print(f"批次 {idx} JSON解析错误: {e}")
            continue

        try:
            if payload['status'] != 'success':
                print(f"批次 {idx} 处理失败: {payload['message']}")
                continue
            users = payload['data']['users']
            for user in users:
                user['batch_id'] = idx
                user['processed_at'] = datetime.now().isoformat()
            collected.extend(users)
        except Exception as e:
            print(f"批次 {idx} 处理错误: {e}")

    return json_normalize(collected) if collected else pd.DataFrame()

# 模拟多个API响应
api_responses = [api_response, api_response.replace('zhang_san', 'wang_wu')]
df_batch = process_api_batch(api_responses)

print("批量处理结果:")
print(df_batch[['username', 'profile.name', 'batch_id']])

6. JSON数据分析案例

6.1 电商订单数据分析

python
# 电商订单JSON数据
ecommerce_data = '''
[
    {
        "order_id": "ORD001",
        "customer": {
            "id": 1001,
            "name": "张三",
            "city": "北京"
        },
        "items": [
            {"product_id": "P001", "name": "笔记本电脑", "price": 5999, "quantity": 1},
            {"product_id": "P002", "name": "鼠标", "price": 99, "quantity": 2}
        ],
        "order_date": "2023-12-01",
        "status": "已完成"
    },
    {
        "order_id": "ORD002",
        "customer": {
            "id": 1002,
            "name": "李四",
            "city": "上海"
        },
        "items": [
            {"product_id": "P003", "name": "手机", "price": 3999, "quantity": 1}
        ],
        "order_date": "2023-12-02",
        "status": "已发货"
    }
]
'''

orders_data = json.loads(ecommerce_data)

# 分析订单数据
# 1. 基本订单信息
orders_df = json_normalize(orders_data)
print("订单基本信息:")
print(orders_df[['order_id', 'customer.name', 'customer.city', 'order_date', 'status']])

# 2. 展开商品明细
items_df = json_normalize(orders_data, record_path='items', 
                         meta=['order_id', ['customer', 'name'], ['customer', 'city']])
print("\n商品明细:")
print(items_df)

# 3. 计算订单总金额
items_df['total_price'] = items_df['price'] * items_df['quantity']
order_totals = items_df.groupby('order_id')['total_price'].sum().reset_index()
print("\n订单总金额:")
print(order_totals)

# 4. 城市销售统计
city_sales = items_df.groupby('customer.city')['total_price'].sum().reset_index()
print("\n城市销售统计:")
print(city_sales)

6.2 社交媒体数据分析

python
# 社交媒体帖子数据
social_media_data = '''
[
    {
        "post_id": "POST001",
        "author": {
            "username": "user123",
            "followers": 1500,
            "verified": true
        },
        "content": "今天天气真好!",
        "timestamp": "2023-12-01T10:30:00Z",
        "engagement": {
            "likes": 45,
            "comments": 12,
            "shares": 8
        },
        "hashtags": ["#天气", "#心情"],
        "mentions": ["@friend1", "@friend2"]
    },
    {
        "post_id": "POST002",
        "author": {
            "username": "user456",
            "followers": 3200,
            "verified": false
        },
        "content": "分享一个有趣的技术文章",
        "timestamp": "2023-12-01T15:45:00Z",
        "engagement": {
            "likes": 89,
            "comments": 23,
            "shares": 15
        },
        "hashtags": ["#技术", "#分享", "#学习"],
        "mentions": []
    }
]
'''

social_data = json.loads(social_media_data)

# 分析社交媒体数据
posts_df = json_normalize(social_data)
print("帖子数据:")
print(posts_df[['post_id', 'author.username', 'author.followers', 'content']])

# 计算参与度指标
posts_df['total_engagement'] = (posts_df['engagement.likes'] + 
                               posts_df['engagement.comments'] + 
                               posts_df['engagement.shares'])

posts_df['engagement_rate'] = posts_df['total_engagement'] / posts_df['author.followers']

print("\n参与度分析:")
print(posts_df[['post_id', 'author.username', 'total_engagement', 'engagement_rate']])

# 展开标签数据
hashtags_df = json_normalize(social_data, record_path='hashtags', 
                            meta=['post_id', 'author.username'], 
                            record_prefix='hashtag_')
print("\n标签使用统计:")
hashtag_counts = hashtags_df['hashtag_0'].value_counts()
print(hashtag_counts)

7. 性能优化和最佳实践

7.1 大型JSON文件处理

python
# 处理大型JSON文件的策略
def process_large_json(file_path, chunk_size=1000):
    """
    Stream-parse a large JSON file in chunks to bound memory usage.

    Uses ijson (third-party: pip install ijson) to walk the file event
    by event, accumulating records into chunks of ``chunk_size``,
    building one DataFrame per chunk and concatenating at the end.

    NOTE: this simple event loop only supports an array of *flat*
    objects; nested maps/arrays would need record_path-style handling.

    Parameters
    ----------
    file_path : str
        Path to a JSON file containing an array of flat objects.
    chunk_size : int, default 1000
        Number of records per intermediate DataFrame.

    Returns
    -------
    pandas.DataFrame (empty if the file held no records)
    """
    import ijson  # 需要安装: pip install ijson

    results = []
    current_chunk = []
    current_item = {}   # defensive init: avoid NameError on odd event order
    current_key = None

    with open(file_path, 'rb') as file:
        # Stream events instead of loading the whole document.
        for prefix, event, value in ijson.parse(file):
            if event == 'start_map':
                current_item = {}
            elif event == 'map_key':
                current_key = value
            # BUG FIX: the original matched only 'string'/'number',
            # silently dropping boolean and null fields from every record.
            elif event in ('string', 'number', 'boolean', 'null'):
                if current_key is not None:
                    current_item[current_key] = value
            elif event == 'end_map':
                current_chunk.append(current_item)
                if len(current_chunk) >= chunk_size:
                    results.append(pd.DataFrame(current_chunk))
                    current_chunk = []

        # Flush the final partial chunk.
        if current_chunk:
            results.append(pd.DataFrame(current_chunk))

    # BUG FIX: pd.concat([]) raises ValueError on an empty input file.
    if not results:
        return pd.DataFrame()
    return pd.concat(results, ignore_index=True)

# 内存优化技巧
def optimize_json_dataframe(df):
    """
    Reduce the memory footprint of a DataFrame loaded from JSON.

    Object columns are converted to numeric dtypes where possible;
    otherwise, columns whose ratio of unique values is below 50% are
    converted to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place and also returned for convenience.

    Returns
    -------
    pandas.DataFrame
    """
    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        # BUG FIX: pd.to_numeric(errors='ignore') is deprecated (removed
        # in pandas 3.0); use EAFP -- try the conversion and fall back.
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Mostly-repeated values compress well as category.
            # Guard len(df) == 0 to avoid ZeroDivisionError.
            if len(df) and df[col].nunique() / len(df) < 0.5:
                df[col] = df[col].astype('category')

    return df

7.2 JSON数据验证

python
# JSON数据验证
def validate_json_data(df, schema):
    """
    Validate that a DataFrame conforms to an expected schema.

    Parameters
    ----------
    df : pandas.DataFrame
    schema : dict with optional keys:
        'required' : list of columns that must exist and be non-null
        'types'    : {column: 'numeric' | 'datetime'}
        'ranges'   : {column: (min, max)} inclusive value bounds

    Returns
    -------
    list[str]
        Human-readable error messages; empty when validation passes.
    """
    validation_errors = []

    # Required fields: present and fully populated.
    for field in schema.get('required', []):
        if field not in df.columns:
            validation_errors.append(f"缺少必需字段: {field}")
        elif df[field].isnull().any():
            validation_errors.append(f"字段 {field} 包含空值")

    # Type checks.
    for field, expected_type in schema.get('types', {}).items():
        if field not in df.columns:
            continue
        if expected_type == 'numeric':
            if not pd.api.types.is_numeric_dtype(df[field]):
                validation_errors.append(f"字段 {field} 应为数值类型")
        elif expected_type == 'datetime':
            # BUG FIX: narrowed the bare `except:` (it swallowed even
            # KeyboardInterrupt/SystemExit) to the conversion errors
            # that pd.to_datetime actually raises.
            try:
                pd.to_datetime(df[field])
            except (ValueError, TypeError):
                validation_errors.append(f"字段 {field} 应为日期时间类型")

    # Range checks (inclusive bounds).
    for field, range_config in schema.get('ranges', {}).items():
        if field in df.columns:
            min_val, max_val = range_config
            if (df[field] < min_val).any() or (df[field] > max_val).any():
                validation_errors.append(f"字段 {field} 值超出范围 [{min_val}, {max_val}]")

    return validation_errors

# 定义数据模式
data_schema = {
    'required': ['id', 'name', 'age'],
    'types': {
        'id': 'numeric',
        'age': 'numeric',
        'join_date': 'datetime'
    },
    'ranges': {
        'age': (0, 120),
        'salary': (0, 1000000)
    }
}

# 验证数据
validation_errors = validate_json_data(df, data_schema)
if validation_errors:
    print("数据验证错误:")
    for error in validation_errors:
        print(f"- {error}")
else:
    print("数据验证通过")

8. 实际应用场景

8.1 配置文件管理

python
# 应用配置管理
config_json = '''
{
    "database": {
        "host": "localhost",
        "port": 5432,
        "name": "myapp",
        "credentials": {
            "username": "admin",
            "password": "secret123"
        }
    },
    "api": {
        "base_url": "https://api.example.com",
        "timeout": 30,
        "retry_count": 3
    },
    "features": {
        "enable_logging": true,
        "debug_mode": false,
        "max_file_size": 10485760
    }
}
'''

config_data = json.loads(config_json)
config_df = json_normalize(config_data)
print("配置数据:")
print(config_df)

# 配置项管理
def get_config_value(config_df, key_path):
    """Return the value at a dotted key path, or None when absent.

    ``json_normalize`` flattens nested config into columns such as
    'database.host'; this simply looks that column up in the
    single-row configuration frame.
    """
    if key_path not in config_df.columns:
        return None
    return config_df[key_path].iloc[0]

db_host = get_config_value(config_df, 'database.host')
api_timeout = get_config_value(config_df, 'api.timeout')
print(f"\n数据库主机: {db_host}")
print(f"API超时时间: {api_timeout}")

8.2 日志数据分析

python
# 应用日志数据
log_data = '''
[
    {
        "timestamp": "2023-12-01T10:30:15.123Z",
        "level": "INFO",
        "message": "用户登录成功",
        "user_id": 1001,
        "ip_address": "192.168.1.100",
        "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "response_time": 245
    },
    {
        "timestamp": "2023-12-01T10:31:22.456Z",
        "level": "ERROR",
        "message": "数据库连接失败",
        "error_code": "DB_CONNECTION_ERROR",
        "ip_address": "192.168.1.101",
        "response_time": 5000
    },
    {
        "timestamp": "2023-12-01T10:32:10.789Z",
        "level": "WARN",
        "message": "API调用频率过高",
        "user_id": 1002,
        "ip_address": "192.168.1.102",
        "api_endpoint": "/api/users",
        "response_time": 1200
    }
]
'''

logs_df = pd.read_json(log_data)
logs_df['timestamp'] = pd.to_datetime(logs_df['timestamp'])

print("日志数据分析:")
print(logs_df[['timestamp', 'level', 'message', 'response_time']])

# 日志级别统计
log_level_counts = logs_df['level'].value_counts()
print("\n日志级别统计:")
print(log_level_counts)

# 响应时间分析
print("\n响应时间统计:")
print(logs_df['response_time'].describe())

# 错误日志分析
error_logs = logs_df[logs_df['level'] == 'ERROR']
print("\n错误日志:")
print(error_logs[['timestamp', 'message', 'error_code']])

9. 常见问题和解决方案

9.1 编码问题

python
# 处理编码问题
def safe_read_json(file_path, encodings=('utf-8', 'gbk', 'gb2312')):
    """
    Read a JSON file whose encoding is unknown, trying candidates in order.

    Parameters
    ----------
    file_path : str
        Path to the JSON file.
    encodings : sequence of str, default ('utf-8', 'gbk', 'gb2312')
        Candidate encodings tried in order.  BUG FIX: the default was a
        mutable list -- the classic shared-default pitfall -- now a tuple.

    Returns
    -------
    pandas.DataFrame
        Parsed content flattened via json_normalize.

    Raises
    ------
    ValueError
        If no candidate encoding yields decodable, valid JSON.
    """
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                data = json.load(f)
            return pd.json_normalize(data)
        except UnicodeDecodeError:
            continue  # wrong encoding -- try the next candidate
        except json.JSONDecodeError as e:
            print(f"JSON解析错误 ({encoding}): {e}")
            continue

    raise ValueError(f"无法使用任何编码读取文件: {file_path}")

# 处理特殊字符
def clean_json_string(json_str):
    """Strip a leading BOM and normalize problematic control characters.

    NUL characters are removed outright; vertical tabs and form feeds
    are replaced with plain spaces so the string parses cleanly.
    """
    # Drop a UTF-8 BOM if one leads the string.
    json_str = json_str[1:] if json_str.startswith('\ufeff') else json_str

    # Single C-level pass instead of chained .replace() calls.
    table = str.maketrans({
        '\u0000': '',   # NUL
        '\u000b': ' ',  # vertical tab
        '\u000c': ' ',  # form feed
    })
    return json_str.translate(table)

9.2 性能问题

python
# 性能优化技巧
import time

def benchmark_json_methods(data, iterations=1000):
    """
    Compare the performance of three JSON -> DataFrame approaches.

    Parameters
    ----------
    data : list[dict]
        Records serialized once, then repeatedly parsed by each method.
    iterations : int, default 1000
        Number of repetitions per method.

    Prints the elapsed time of each method; returns None.
    """
    from io import StringIO

    json_str = json.dumps(data)

    # BUG FIX: use time.perf_counter() -- monotonic and high-resolution,
    # the documented choice for benchmarking -- instead of time.time(),
    # which can jump with wall-clock adjustments.

    # Method 1: pd.read_json (StringIO avoids the pandas 2.1 literal-
    # string deprecation).
    start_time = time.perf_counter()
    for _ in range(iterations):
        df1 = pd.read_json(StringIO(json_str))
    time1 = time.perf_counter() - start_time

    # Method 2: json.loads + DataFrame constructor.
    start_time = time.perf_counter()
    for _ in range(iterations):
        data_parsed = json.loads(json_str)
        df2 = pd.DataFrame(data_parsed)
    time2 = time.perf_counter() - start_time

    # Method 3: json_normalize (also flattens nested records).
    start_time = time.perf_counter()
    for _ in range(iterations):
        data_parsed = json.loads(json_str)
        df3 = json_normalize(data_parsed)
    time3 = time.perf_counter() - start_time

    print(f"pd.read_json: {time1:.4f}秒")
    print(f"json.loads + DataFrame: {time2:.4f}秒")
    print(f"json_normalize: {time3:.4f}秒")

# 测试性能
test_data = [{'id': i, 'name': f'用户{i}', 'value': i*10} for i in range(100)]
benchmark_json_methods(test_data, 100)

本章小结

本章详细介绍了Pandas处理JSON数据的各种方法和技巧:

  1. JSON基础:了解JSON格式和常见数据结构
  2. 读取JSON:从字符串、文件和不同格式读取JSON数据
  3. 处理嵌套数据:使用json_normalize处理复杂的嵌套JSON
  4. 写入JSON:将DataFrame导出为不同格式的JSON
  5. 数据操作:清洗、转换和分析JSON数据
  6. API数据处理:处理API响应和批量数据
  7. 实际应用:电商、社交媒体、配置管理等场景
  8. 性能优化:大文件处理和内存优化技巧
  9. 问题解决:编码问题和性能问题的解决方案

JSON作为现代数据交换的标准格式,掌握其处理技能对数据分析工作至关重要。通过本章的学习,你应该能够熟练处理各种JSON数据格式,并在实际项目中应用这些技能。

练习题

  1. 创建一个包含嵌套结构的JSON数据,并使用不同方法将其转换为DataFrame
  2. 编写一个函数来处理API分页数据,合并多页结果
  3. 实现一个JSON数据验证器,检查数据完整性和格式正确性
  4. 处理一个包含时间序列数据的JSON文件,进行时间分析
  5. 优化一个大型JSON文件的处理流程,提高处理效率

下一章我们将学习Pandas的常用函数,这些函数将帮助我们更高效地处理和分析数据。