Pandas JSON 处理
JSON(JavaScript Object Notation)是现代Web应用和API中最常用的数据交换格式。Pandas 提供了强大的JSON处理功能,可以轻松读取、写入和操作JSON数据。本章将详细介绍如何使用 Pandas 处理各种JSON数据格式。
1. JSON 数据格式概述
1.1 JSON 基础知识
JSON是一种轻量级的数据交换格式,具有以下特点:
- 易于人阅读和编写
- 易于机器解析和生成
- 基于JavaScript语法,但独立于语言
- 支持对象、数组、字符串、数字、布尔值和null
1.2 常见的JSON数据结构
python
import pandas as pd
import json
import numpy as np
from datetime import datetime
# 简单的JSON对象
simple_json = {
"name": "张三",
"age": 30,
"city": "北京",
"married": True
}
# JSON数组
json_array = [
{"id": 1, "name": "张三", "score": 85},
{"id": 2, "name": "李四", "score": 92},
{"id": 3, "name": "王五", "score": 78}
]
# 嵌套JSON
nested_json = {
"user": {
"id": 1,
"profile": {
"name": "张三",
"age": 30,
"address": {
"city": "北京",
"district": "朝阳区"
}
},
"orders": [
{"order_id": "ORD001", "amount": 299.99, "date": "2023-01-15"},
{"order_id": "ORD002", "amount": 159.50, "date": "2023-02-20"}
]
}
}
print("简单JSON:", simple_json)
print("JSON数组:", json_array)
print("嵌套JSON:", nested_json)2. 读取JSON数据
2.1 从字符串读取JSON
python
# 从JSON字符串创建DataFrame
json_string = '''
[
{"name": "张三", "age": 25, "department": "IT", "salary": 8000},
{"name": "李四", "age": 30, "department": "HR", "salary": 7000},
{"name": "王五", "age": 28, "department": "Finance", "salary": 9000}
]
'''
# 使用pd.read_json()读取
df_from_string = pd.read_json(json_string)
print("从JSON字符串创建的DataFrame:")
print(df_from_string)
print("\n数据类型:")
print(df_from_string.dtypes)
2.2 从文件读取JSON
python
# 创建示例JSON文件
employees_data = [
{"id": 1, "name": "张三", "age": 25, "department": "IT", "salary": 8000, "join_date": "2020-01-15"},
{"id": 2, "name": "李四", "age": 30, "department": "HR", "salary": 7000, "join_date": "2019-03-20"},
{"id": 3, "name": "王五", "age": 28, "department": "Finance", "salary": 9000, "join_date": "2021-05-10"},
{"id": 4, "name": "赵六", "age": 32, "department": "IT", "salary": 9500, "join_date": "2018-12-01"}
]
# 保存为JSON文件
with open('employees.json', 'w', encoding='utf-8') as f:
json.dump(employees_data, f, ensure_ascii=False, indent=2)
# 从文件读取
df_from_file = pd.read_json('employees.json')
print("从JSON文件读取的DataFrame:")
print(df_from_file)
# 处理日期列
df_from_file['join_date'] = pd.to_datetime(df_from_file['join_date'])
print("\n处理日期后的数据类型:")
print(df_from_file.dtypes)
2.3 读取不同格式的JSON
python
# 不同的JSON格式
# 1. 记录格式(默认)- 每行是一个记录
records_format = '''
[
{"name": "张三", "age": 25, "city": "北京"},
{"name": "李四", "age": 30, "city": "上海"}
]
'''
# 2. 索引格式 - 以索引为键
index_format = '''
{
"0": {"name": "张三", "age": 25, "city": "北京"},
"1": {"name": "李四", "age": 30, "city": "上海"}
}
'''
# 3. 列格式 - 以列名为键
columns_format = '''
{
"name": {"0": "张三", "1": "李四"},
"age": {"0": 25, "1": 30},
"city": {"0": "北京", "1": "上海"}
}
'''
# 4. 值格式 - 只有值的数组
values_format = '''
[
["张三", 25, "北京"],
["李四", 30, "上海"]
]
'''
# 读取不同格式
df_records = pd.read_json(records_format, orient='records')
df_index = pd.read_json(index_format, orient='index')
df_columns = pd.read_json(columns_format, orient='columns')
df_values = pd.read_json(values_format, orient='values')
print("记录格式:")
print(df_records)
print("\n索引格式:")
print(df_index)
print("\n列格式:")
print(df_columns)
print("\n值格式(需要手动设置列名):")
df_values.columns = ['name', 'age', 'city']
print(df_values)
2.4 处理嵌套JSON
python
# 处理嵌套JSON数据
nested_data = '''
[
{
"id": 1,
"name": "张三",
"profile": {
"age": 25,
"address": {
"city": "北京",
"district": "朝阳区"
}
},
"skills": ["Python", "SQL", "Machine Learning"]
},
{
"id": 2,
"name": "李四",
"profile": {
"age": 30,
"address": {
"city": "上海",
"district": "浦东新区"
}
},
"skills": ["Java", "Spring", "MySQL"]
}
]
'''
# 使用json_normalize处理嵌套数据
from pandas import json_normalize
# 解析JSON字符串
data = json.loads(nested_data)
# 标准化嵌套JSON
df_normalized = json_normalize(data)
print("标准化后的DataFrame:")
print(df_normalized)
print("\n列名:")
print(df_normalized.columns.tolist())
# 指定分隔符
df_custom_sep = json_normalize(data, sep='_')
print("\n使用自定义分隔符:")
print(df_custom_sep.columns.tolist())
# 处理数组字段
df_with_meta = json_normalize(data, record_path='skills',
meta=['id', 'name', ['profile', 'age']],
meta_prefix='user_')
print("\n展开技能数组:")
print(df_with_meta)
3. 写入JSON数据
3.1 基本写入操作
python
# 创建示例DataFrame
df = pd.DataFrame({
'id': [1, 2, 3, 4],
'name': ['张三', '李四', '王五', '赵六'],
'age': [25, 30, 28, 32],
'department': ['IT', 'HR', 'Finance', 'IT'],
'salary': [8000, 7000, 9000, 9500],
'join_date': pd.to_datetime(['2020-01-15', '2019-03-20', '2021-05-10', '2018-12-01'])
})
print("原始DataFrame:")
print(df)
# 转换为JSON字符串
json_string = df.to_json(orient='records', force_ascii=False, indent=2)
print("\nJSON字符串:")
print(json_string)
# 保存到文件
df.to_json('output.json', orient='records', force_ascii=False, indent=2)
print("\n已保存到 output.json 文件")3.2 不同的输出格式
python
# 演示不同的orient参数
print("1. records格式(默认):")
print(df.to_json(orient='records', force_ascii=False, indent=2)[:200] + "...")
print("\n2. index格式:")
print(df.to_json(orient='index', force_ascii=False, indent=2)[:200] + "...")
print("\n3. columns格式:")
print(df.to_json(orient='columns', force_ascii=False, indent=2)[:200] + "...")
print("\n4. values格式:")
print(df.to_json(orient='values', force_ascii=False, indent=2)[:200] + "...")
print("\n5. split格式:")
print(df.to_json(orient='split', force_ascii=False, indent=2)[:200] + "...")
print("\n6. table格式:")
print(df.to_json(orient='table', force_ascii=False, indent=2)[:200] + "...")
3.3 处理日期和时间
python
# 日期处理选项
print("默认日期格式(epoch毫秒):")
print(df[['name', 'join_date']].to_json(orient='records'))
print("\nISO格式日期:")
print(df[['name', 'join_date']].to_json(orient='records', date_format='iso'))
print("\n自定义日期格式:")
print(df[['name', 'join_date']].to_json(orient='records', date_format='%Y-%m-%d'))
# 处理NaN值
df_with_nan = df.copy()
df_with_nan.loc[0, 'salary'] = np.nan
print("\n包含NaN的数据(默认null):")
print(df_with_nan[['name', 'salary']].to_json(orient='records'))
print("\n自定义NaN值:")
print(df_with_nan[['name', 'salary']].to_json(orient='records', na_value='N/A'))4. JSON数据操作和转换
4.1 JSON数据的清洗和预处理
python
# 处理不规范的JSON数据
irregular_json = '''
[
{"name": "张三", "age": "25", "salary": "8000", "active": "true"},
{"name": "李四", "age": "30", "salary": "7000", "active": "false"},
{"name": "王五", "age": null, "salary": "9000", "active": "true"},
{"name": "赵六", "age": "32", "salary": null, "active": "1"}
]
'''
df_irregular = pd.read_json(irregular_json)
print("原始不规范数据:")
print(df_irregular)
print("\n数据类型:")
print(df_irregular.dtypes)
# 数据类型转换和清洗
df_cleaned = df_irregular.copy()
# 转换数值类型
df_cleaned['age'] = pd.to_numeric(df_cleaned['age'], errors='coerce')
df_cleaned['salary'] = pd.to_numeric(df_cleaned['salary'], errors='coerce')
# 转换布尔类型
def convert_to_bool(value):
    """Coerce a raw JSON value into a bool.

    NaN/None become False; strings are matched case-insensitively
    against the truthy set {'true', '1', 'yes'}; everything else goes
    through Python's standard truthiness.
    """
    if pd.isna(value):
        return False
    if not isinstance(value, str):
        return bool(value)
    return value.lower() in ('true', '1', 'yes')
df_cleaned['active'] = df_cleaned['active'].apply(convert_to_bool)
print("\n清洗后的数据:")
print(df_cleaned)
print("\n清洗后的数据类型:")
print(df_cleaned.dtypes)
4.2 JSON数据的合并和拆分
python
# 合并多个JSON文件的数据
json_file1 = '''
[
{"id": 1, "name": "张三", "department": "IT"},
{"id": 2, "name": "李四", "department": "HR"}
]
'''
json_file2 = '''
[
{"id": 1, "salary": 8000, "join_date": "2020-01-15"},
{"id": 2, "salary": 7000, "join_date": "2019-03-20"}
]
'''
df1 = pd.read_json(json_file1)
df2 = pd.read_json(json_file2)
# 合并数据
df_merged = pd.merge(df1, df2, on='id')
print("合并后的数据:")
print(df_merged)
# 拆分复杂字段
complex_json = '''
[
{"id": 1, "name": "张三", "contact": "email:zhang@example.com;phone:13800138000"},
{"id": 2, "name": "李四", "contact": "email:li@example.com;phone:13900139000"}
]
'''
df_complex = pd.read_json(complex_json)
print("\n原始复杂数据:")
print(df_complex)
# 拆分contact字段
def parse_contact(contact_str):
    """Parse a packed contact string into a dict.

    Input looks like 'email:a@b.com;phone:13800138000': entries are
    separated by ';' and each entry is 'key:value'.

    BUG FIX: split each entry on the FIRST ':' only, so values that
    themselves contain ':' (e.g. URLs like 'http://...') survive
    instead of raising ValueError on unpacking.
    """
    result = {}
    for part in contact_str.split(';'):
        key, value = part.split(':', 1)
        result[key] = value
    return result
contact_data = df_complex['contact'].apply(parse_contact)
contact_df = pd.json_normalize(contact_data)
# 合并拆分后的数据
df_expanded = pd.concat([df_complex[['id', 'name']], contact_df], axis=1)
print("\n拆分后的数据:")
print(df_expanded)
4.3 处理JSON数组字段
python
# 处理包含数组的JSON数据
array_json = '''
[
{
"id": 1,
"name": "张三",
"skills": ["Python", "SQL", "Machine Learning"],
"projects": [
{"name": "项目A", "status": "完成"},
{"name": "项目B", "status": "进行中"}
]
},
{
"id": 2,
"name": "李四",
"skills": ["Java", "Spring", "MySQL"],
"projects": [
{"name": "项目C", "status": "完成"}
]
}
]
'''
data = json.loads(array_json)
# 展开技能数组
skills_df = json_normalize(data, record_path='skills',
meta=['id', 'name'],
record_prefix='skill_')
print("展开的技能数据:")
print(skills_df)
# 展开项目数组
projects_df = json_normalize(data, record_path='projects',
meta=['id', 'name'])
print("\n展开的项目数据:")
print(projects_df)
# 统计每个人的技能数量
skills_count = skills_df.groupby(['id', 'name']).size().reset_index(name='skills_count')
print("\n技能数量统计:")
print(skills_count)
5. 处理API响应数据
5.1 模拟API响应处理
python
# 模拟API响应数据
api_response = '''
{
"status": "success",
"message": "数据获取成功",
"data": {
"total": 100,
"page": 1,
"per_page": 10,
"users": [
{
"id": 1,
"username": "zhang_san",
"profile": {
"name": "张三",
"age": 25,
"email": "zhang@example.com"
},
"last_login": "2023-12-01T10:30:00Z",
"is_active": true
},
{
"id": 2,
"username": "li_si",
"profile": {
"name": "李四",
"age": 30,
"email": "li@example.com"
},
"last_login": "2023-12-02T14:20:00Z",
"is_active": false
}
]
}
}
'''
# 解析API响应
response_data = json.loads(api_response)
# 提取用户数据
if response_data['status'] == 'success':
users_data = response_data['data']['users']
df_users = json_normalize(users_data)
print("API用户数据:")
print(df_users)
# 处理时间戳
df_users['last_login'] = pd.to_datetime(df_users['last_login'])
print("\n处理时间戳后:")
print(df_users[['username', 'last_login', 'is_active']])
else:
print(f"API请求失败: {response_data['message']}")5.2 批量处理API数据
python
# 模拟批量API数据处理
def process_api_batch(api_responses):
    """Flatten a batch of API response payloads into one DataFrame.

    Each response is a JSON string shaped like
    {"status": ..., "message": ..., "data": {"users": [...]}}.
    Users from successful responses are tagged with a 1-based batch_id
    and a processing timestamp, then flattened via json_normalize.
    Malformed or unsuccessful responses are reported and skipped;
    an empty DataFrame is returned when nothing was collected.
    """
    collected = []
    for index, raw in enumerate(api_responses):
        batch_no = index + 1
        try:
            payload = json.loads(raw)
            if payload['status'] == 'success':
                users = payload['data']['users']
                for user in users:
                    user['batch_id'] = batch_no
                    user['processed_at'] = datetime.now().isoformat()
                collected.extend(users)
            else:
                print(f"批次 {batch_no} 处理失败: {payload['message']}")
        except json.JSONDecodeError as e:
            print(f"批次 {batch_no} JSON解析错误: {e}")
        except Exception as e:
            # Boundary catch-all: report the batch and keep going.
            print(f"批次 {batch_no} 处理错误: {e}")
    return json_normalize(collected) if collected else pd.DataFrame()
# 模拟多个API响应
api_responses = [api_response, api_response.replace('zhang_san', 'wang_wu')]
df_batch = process_api_batch(api_responses)
print("批量处理结果:")
print(df_batch[['username', 'profile.name', 'batch_id']])
6. JSON数据分析案例
6.1 电商订单数据分析
python
# 电商订单JSON数据
ecommerce_data = '''
[
{
"order_id": "ORD001",
"customer": {
"id": 1001,
"name": "张三",
"city": "北京"
},
"items": [
{"product_id": "P001", "name": "笔记本电脑", "price": 5999, "quantity": 1},
{"product_id": "P002", "name": "鼠标", "price": 99, "quantity": 2}
],
"order_date": "2023-12-01",
"status": "已完成"
},
{
"order_id": "ORD002",
"customer": {
"id": 1002,
"name": "李四",
"city": "上海"
},
"items": [
{"product_id": "P003", "name": "手机", "price": 3999, "quantity": 1}
],
"order_date": "2023-12-02",
"status": "已发货"
}
]
'''
orders_data = json.loads(ecommerce_data)
# 分析订单数据
# 1. 基本订单信息
orders_df = json_normalize(orders_data)
print("订单基本信息:")
print(orders_df[['order_id', 'customer.name', 'customer.city', 'order_date', 'status']])
# 2. 展开商品明细
items_df = json_normalize(orders_data, record_path='items',
meta=['order_id', ['customer', 'name'], ['customer', 'city']])
print("\n商品明细:")
print(items_df)
# 3. 计算订单总金额
items_df['total_price'] = items_df['price'] * items_df['quantity']
order_totals = items_df.groupby('order_id')['total_price'].sum().reset_index()
print("\n订单总金额:")
print(order_totals)
# 4. 城市销售统计
city_sales = items_df.groupby('customer.city')['total_price'].sum().reset_index()
print("\n城市销售统计:")
print(city_sales)
6.2 社交媒体数据分析
python
# 社交媒体帖子数据
social_media_data = '''
[
{
"post_id": "POST001",
"author": {
"username": "user123",
"followers": 1500,
"verified": true
},
"content": "今天天气真好!",
"timestamp": "2023-12-01T10:30:00Z",
"engagement": {
"likes": 45,
"comments": 12,
"shares": 8
},
"hashtags": ["#天气", "#心情"],
"mentions": ["@friend1", "@friend2"]
},
{
"post_id": "POST002",
"author": {
"username": "user456",
"followers": 3200,
"verified": false
},
"content": "分享一个有趣的技术文章",
"timestamp": "2023-12-01T15:45:00Z",
"engagement": {
"likes": 89,
"comments": 23,
"shares": 15
},
"hashtags": ["#技术", "#分享", "#学习"],
"mentions": []
}
]
'''
social_data = json.loads(social_media_data)
# 分析社交媒体数据
posts_df = json_normalize(social_data)
print("帖子数据:")
print(posts_df[['post_id', 'author.username', 'author.followers', 'content']])
# 计算参与度指标
posts_df['total_engagement'] = (posts_df['engagement.likes'] +
posts_df['engagement.comments'] +
posts_df['engagement.shares'])
posts_df['engagement_rate'] = posts_df['total_engagement'] / posts_df['author.followers']
print("\n参与度分析:")
print(posts_df[['post_id', 'author.username', 'total_engagement', 'engagement_rate']])
# 展开标签数据
hashtags_df = json_normalize(social_data, record_path='hashtags',
meta=['post_id', 'author.username'],
record_prefix='hashtag_')
print("\n标签使用统计:")
hashtag_counts = hashtags_df['hashtag_0'].value_counts()
print(hashtag_counts)
7. 性能优化和最佳实践
7.1 大型JSON文件处理
python
# 处理大型JSON文件的策略
def process_large_json(file_path, chunk_size=1000):
    """Stream-parse a large JSON array of flat objects into a DataFrame.

    Uses ijson (third-party: pip install ijson) so the whole file never
    needs to fit in memory. Completed records accumulate in a buffer that
    is folded into a DataFrame every `chunk_size` items; the per-chunk
    frames are concatenated once at the end.

    NOTE(review): this event loop only handles a flat array of flat
    objects — a nested object would restart `current_item` and lose the
    outer record. Confirm the input shape before using on nested data.
    """
    import ijson  # third-party streaming JSON parser

    frames = []
    current_chunk = []
    current_item = {}
    current_key = None
    with open(file_path, 'rb') as file:
        for prefix, event, value in ijson.parse(file):
            if event == 'start_map':
                current_item = {}
            elif event == 'map_key':
                current_key = value
            elif event in ('string', 'number', 'boolean', 'null'):
                # BUG FIX: the original only handled 'string'/'number'
                # events, silently dropping boolean and null fields.
                current_item[current_key] = value
            elif event == 'end_map':
                current_chunk.append(current_item)
                if len(current_chunk) >= chunk_size:
                    frames.append(pd.DataFrame(current_chunk))
                    current_chunk = []
    # Flush the final partial chunk.
    if current_chunk:
        frames.append(pd.DataFrame(current_chunk))
    # BUG FIX: pd.concat raises ValueError on an empty list; return an
    # empty DataFrame for an empty input file instead.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
# 内存优化技巧
def optimize_json_dataframe(df):
"""
优化JSON DataFrame的内存使用
"""
# 转换数据类型
for col in df.columns:
if df[col].dtype == 'object':
# 尝试转换为数值类型
numeric_series = pd.to_numeric(df[col], errors='ignore')
if numeric_series.dtype != 'object':
df[col] = numeric_series
else:
# 转换为category类型(如果重复值较多)
if df[col].nunique() / len(df) < 0.5:
df[col] = df[col].astype('category')
return df7.2 JSON数据验证
python
# JSON数据验证
def validate_json_data(df, schema):
    """Validate a DataFrame against a lightweight schema dict.

    Supported schema keys:
      required: list of column names that must exist and contain no nulls
      types:    {column: 'numeric' | 'datetime'} expected dtypes
      ranges:   {column: (min, max)} inclusive numeric bounds

    Returns a list of human-readable error strings (empty list = valid).
    """
    validation_errors = []
    # Required columns: present and fully populated.
    for field in schema.get('required', []):
        if field not in df.columns:
            validation_errors.append(f"缺少必需字段: {field}")
        elif df[field].isnull().any():
            validation_errors.append(f"字段 {field} 包含空值")
    # Expected dtypes.
    for field, expected_type in schema.get('types', {}).items():
        if field not in df.columns:
            continue
        if expected_type == 'numeric':
            if not pd.api.types.is_numeric_dtype(df[field]):
                validation_errors.append(f"字段 {field} 应为数值类型")
        elif expected_type == 'datetime':
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; catch parse errors only.
            try:
                pd.to_datetime(df[field])
            except (ValueError, TypeError):
                validation_errors.append(f"字段 {field} 应为日期时间类型")
    # Numeric ranges. NOTE: NaN compares False on both sides, so missing
    # values pass the range check silently by design.
    for field, range_config in schema.get('ranges', {}).items():
        if field in df.columns:
            min_val, max_val = range_config
            if (df[field] < min_val).any() or (df[field] > max_val).any():
                validation_errors.append(f"字段 {field} 值超出范围 [{min_val}, {max_val}]")
    return validation_errors
# 定义数据模式
data_schema = {
'required': ['id', 'name', 'age'],
'types': {
'id': 'numeric',
'age': 'numeric',
'join_date': 'datetime'
},
'ranges': {
'age': (0, 120),
'salary': (0, 1000000)
}
}
# 验证数据
validation_errors = validate_json_data(df, data_schema)
if validation_errors:
print("数据验证错误:")
for error in validation_errors:
print(f"- {error}")
else:
print("数据验证通过")8. 实际应用场景
8.1 配置文件管理
python
# 应用配置管理
config_json = '''
{
"database": {
"host": "localhost",
"port": 5432,
"name": "myapp",
"credentials": {
"username": "admin",
"password": "secret123"
}
},
"api": {
"base_url": "https://api.example.com",
"timeout": 30,
"retry_count": 3
},
"features": {
"enable_logging": true,
"debug_mode": false,
"max_file_size": 10485760
}
}
'''
config_data = json.loads(config_json)
config_df = json_normalize(config_data)
print("配置数据:")
print(config_df)
# 配置项管理
def get_config_value(config_df, key_path):
    """Look up a flattened configuration value by its dotted key path.

    `config_df` is a one-row DataFrame produced by json_normalize, so
    nested keys appear as dotted column names (e.g. 'database.host').
    Returns None when the key path is not a known column.
    """
    if key_path not in config_df.columns:
        return None
    return config_df[key_path].iloc[0]
db_host = get_config_value(config_df, 'database.host')
api_timeout = get_config_value(config_df, 'api.timeout')
print(f"\n数据库主机: {db_host}")
print(f"API超时时间: {api_timeout}")8.2 日志数据分析
python
# 应用日志数据
log_data = '''
[
{
"timestamp": "2023-12-01T10:30:15.123Z",
"level": "INFO",
"message": "用户登录成功",
"user_id": 1001,
"ip_address": "192.168.1.100",
"user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"response_time": 245
},
{
"timestamp": "2023-12-01T10:31:22.456Z",
"level": "ERROR",
"message": "数据库连接失败",
"error_code": "DB_CONNECTION_ERROR",
"ip_address": "192.168.1.101",
"response_time": 5000
},
{
"timestamp": "2023-12-01T10:32:10.789Z",
"level": "WARN",
"message": "API调用频率过高",
"user_id": 1002,
"ip_address": "192.168.1.102",
"api_endpoint": "/api/users",
"response_time": 1200
}
]
'''
logs_df = pd.read_json(log_data)
logs_df['timestamp'] = pd.to_datetime(logs_df['timestamp'])
print("日志数据分析:")
print(logs_df[['timestamp', 'level', 'message', 'response_time']])
# 日志级别统计
log_level_counts = logs_df['level'].value_counts()
print("\n日志级别统计:")
print(log_level_counts)
# 响应时间分析
print("\n响应时间统计:")
print(logs_df['response_time'].describe())
# 错误日志分析
error_logs = logs_df[logs_df['level'] == 'ERROR']
print("\n错误日志:")
print(error_logs[['timestamp', 'message', 'error_code']])
9. 常见问题和解决方案
9.1 编码问题
python
# 处理编码问题
def safe_read_json(file_path, encodings=('utf-8', 'gbk', 'gb2312')):
    """Read a JSON file whose text encoding is uncertain.

    Tries each candidate encoding in order; the first one that both
    decodes and parses wins, and the data is returned flattened via
    pd.json_normalize. Undecodable attempts are skipped silently;
    decode-OK-but-invalid-JSON attempts are reported and skipped.

    Raises ValueError when no candidate encoding works.

    BUG FIX: the default was a mutable list shared across calls; an
    immutable tuple avoids accidental cross-call mutation.
    """
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                data = json.load(f)
            return pd.json_normalize(data)
        except UnicodeDecodeError:
            continue
        except json.JSONDecodeError as e:
            print(f"JSON解析错误 ({encoding}): {e}")
            continue
    raise ValueError(f"无法使用任何编码读取文件: {file_path}")
# 处理特殊字符
def clean_json_string(json_str):
"""
清理JSON字符串中的特殊字符
"""
# 移除BOM标记
if json_str.startswith('\ufeff'):
json_str = json_str[1:]
# 替换常见的问题字符
replacements = {
'\u0000': '', # NULL字符
'\u000b': ' ', # 垂直制表符
'\u000c': ' ', # 换页符
}
for old, new in replacements.items():
json_str = json_str.replace(old, new)
return json_str9.2 性能问题
python
# 性能优化技巧
import time
def benchmark_json_methods(data, iterations=1000):
    """Compare the runtime of three JSON-to-DataFrame approaches.

    Serializes `data` once, then parses it `iterations` times via
    (1) pd.read_json, (2) json.loads + DataFrame, (3) json_normalize,
    printing the elapsed wall-clock time of each method.
    """
    from io import StringIO  # local import: read_json needs a file-like source

    json_str = json.dumps(data)
    # Method 1: pd.read_json
    # BUG FIX: passing a literal JSON string to read_json is deprecated
    # (pandas >= 2.1); wrap it in StringIO on every call.
    start_time = time.time()
    for _ in range(iterations):
        df1 = pd.read_json(StringIO(json_str))
    time1 = time.time() - start_time
    # Method 2: json.loads + DataFrame
    start_time = time.time()
    for _ in range(iterations):
        df2 = pd.DataFrame(json.loads(json_str))
    time2 = time.time() - start_time
    # Method 3: json_normalize
    start_time = time.time()
    for _ in range(iterations):
        df3 = json_normalize(json.loads(json_str))
    time3 = time.time() - start_time
    print(f"pd.read_json: {time1:.4f}秒")
    print(f"json.loads + DataFrame: {time2:.4f}秒")
    print(f"json_normalize: {time3:.4f}秒")
# 测试性能
test_data = [{'id': i, 'name': f'用户{i}', 'value': i*10} for i in range(100)]
benchmark_json_methods(test_data, 100)
本章小结
本章详细介绍了Pandas处理JSON数据的各种方法和技巧:
- JSON基础:了解JSON格式和常见数据结构
- 读取JSON:从字符串、文件和不同格式读取JSON数据
- 处理嵌套数据:使用json_normalize处理复杂的嵌套JSON
- 写入JSON:将DataFrame导出为不同格式的JSON
- 数据操作:清洗、转换和分析JSON数据
- API数据处理:处理API响应和批量数据
- 实际应用:电商、社交媒体、配置管理等场景
- 性能优化:大文件处理和内存优化技巧
- 问题解决:编码问题和性能问题的解决方案
JSON作为现代数据交换的标准格式,掌握其处理技能对数据分析工作至关重要。通过本章的学习,你应该能够熟练处理各种JSON数据格式,并在实际项目中应用这些技能。
练习题
- 创建一个包含嵌套结构的JSON数据,并使用不同方法将其转换为DataFrame
- 编写一个函数来处理API分页数据,合并多页结果
- 实现一个JSON数据验证器,检查数据完整性和格式正确性
- 处理一个包含时间序列数据的JSON文件,进行时间分析
- 优化一个大型JSON文件的处理流程,提高处理效率
下一章我们将学习Pandas的常用函数,这些函数将帮助我们更高效地处理和分析数据。