Pandas 数据结构 Series

Series 是 Pandas 中最基础的一维数据结构，可以理解为带标签的数组或字典。本章将全面介绍 Series 的创建、操作和应用。

📚 Series 概述

什么是 Series

Series 是一个一维的标记数组，能够保存任何数据类型（整数、字符串、浮点数、Python 对象等）。它由两个主要部分组成：

数据（values）：实际存储的数据
索引（index）：数据的标签

Series 的特点

✅ 一维结构：类似于数组或列表
✅ 带标签索引：每个元素都有对应的标签
✅ 同质数据：整个 Series 共用一个 dtype；混合存放不同类型的元素时，dtype 为 object
✅ 大小不可变：创建后长度固定
✅ 数据可变：可以修改元素值

🔨 创建 Series

从列表创建

python

import pandas as pd
import numpy as np

# 从列表创建 Series
data = [10, 20, 30, 40, 50]
s1 = pd.Series(data)
print(s1)
# 输出：
# 0    10
# 1    20
# 2    30
# 3    40
# 4    50
# dtype: int64

# 指定索引
s2 = pd.Series(data, index=['a', 'b', 'c', 'd', 'e'])
print(s2)
# 输出：
# a    10
# b    20
# c    30
# d    40
# e    50
# dtype: int64

从字典创建

python

# 从字典创建 Series
data_dict = {
    '北京': 2154,
    '上海': 2424,
    '广州': 1491,
    '深圳': 1344,
    '杭州': 1036
}

population = pd.Series(data_dict)
print(population)
# 输出：
# 北京    2154
# 上海    2424
# 广州    1491
# 深圳    1344
# 杭州    1036
# dtype: int64

从 NumPy 数组创建

python

# 从 NumPy 数组创建
arr = np.array([1.1, 2.2, 3.3, 4.4, 5.5])
s3 = pd.Series(arr, index=['A', 'B', 'C', 'D', 'E'])
print(s3)
# 输出：
# A    1.1
# B    2.2
# C    3.3
# D    4.4
# E    5.5
# dtype: float64

从标量值创建

python

# 从标量值创建（需要指定索引）
s4 = pd.Series(100, index=['x', 'y', 'z'])
print(s4)
# 输出：
# x    100
# y    100
# z    100
# dtype: int64

创建特殊 Series

python

# 创建空 Series
empty_series = pd.Series(dtype=float)
print(f"空 Series: {empty_series}")

# 创建日期序列
date_range = pd.date_range('2024-01-01', periods=5, freq='D')
date_series = pd.Series(range(1, 6), index=date_range)
print(date_series)

# 创建分类数据
categories = pd.Categorical(['A', 'B', 'A', 'C', 'B'])
cat_series = pd.Series(categories)
print(cat_series)

🔍 Series 属性

基本属性

python

# 创建示例 Series
scores = pd.Series([85, 92, 78, 96, 88], 
                  index=['张三', '李四', '王五', '赵六', '钱七'])

# 查看基本属性
print(f"数据类型: {scores.dtype}")           # int64
print(f"形状: {scores.shape}")              # (5,)
print(f"大小: {scores.size}")               # 5
print(f"维度: {scores.ndim}")               # 1
print(f"索引: {scores.index}")              # Index(['张三', '李四', '王五', '赵六', '钱七'])
print(f"值: {scores.values}")               # [85 92 78 96 88]
print(f"名称: {scores.name}")               # None

设置名称

python

# 设置 Series 和索引的名称
scores.name = '考试成绩'
scores.index.name = '学生姓名'
print(scores)
# 输出：
# 学生姓名
# 张三    85
# 李四    92
# 王五    78
# 赵六    96
# 钱七    88
# Name: 考试成绩, dtype: int64

内存使用

python

# 查看内存使用情况
print(f"内存使用: {scores.memory_usage()} bytes")
print(f"内存使用（深度）: {scores.memory_usage(deep=True)} bytes")

🎯 索引和选择

位置索引

python

# Build sample data: five fruit names labelled 'A' through 'E'.
fruits = pd.Series(['苹果', '香蕉', '橙子', '葡萄', '草莓'],
                  index=['A', 'B', 'C', 'D', 'E'])

# Positional access (0-based). Use .iloc explicitly: on a Series whose index
# is not integer-based, fruits[0] relied on a positional fallback that has been
# deprecated since pandas 2.1 and is no longer supported in pandas 3, where an
# integer key passed to [] is always treated as a label.
print(fruits.iloc[0])        # 苹果
print(fruits.iloc[2])        # 橙子
print(fruits.iloc[-1])       # 草莓

# Positional slices; the end position is excluded.
print(fruits.iloc[1:4])      # B through D (E is excluded)
print(fruits.iloc[:3])       # first three elements
print(fruits.iloc[2:])       # from the third element onward

标签索引

python

# 标签索引
print(fruits['A'])      # 苹果
print(fruits['C'])      # 橙子

# 多个标签
print(fruits[['A', 'C', 'E']])
# 输出：
# A    苹果
# C    橙子
# E    草莓
# dtype: object

布尔索引

python

# 创建数值 Series
temperatures = pd.Series([22, 25, 19, 30, 27, 24], 
                        index=['周一', '周二', '周三', '周四', '周五', '周六'])

# 布尔索引
hot_days = temperatures > 25
print(hot_days)
# 输出：
# 周一    False
# 周二    False
# 周三    False
# 周四     True
# 周五     True
# 周六    False
# dtype: bool

# 筛选高温天气
print(temperatures[hot_days])
# 输出：
# 周四    30
# 周五    27
# dtype: int64

# 复合条件
comfortable = temperatures[(temperatures >= 20) & (temperatures <= 25)]
print(comfortable)

高级索引方法

python

# iloc：基于位置的索引
print(fruits.iloc[0])       # 第1个元素
print(fruits.iloc[1:3])     # 第2到第3个元素

# loc：基于标签的索引
print(fruits.loc['A'])      # 标签为'A'的元素
print(fruits.loc['B':'D'])  # 标签从'B'到'D'

# at 和 iat：快速访问单个元素
print(fruits.at['A'])       # 等同于 fruits['A']
print(fruits.iat[0])        # 等同于 fruits.iloc[0]

🔧 Series 操作

数学运算

python

# 创建数值 Series
prices = pd.Series([10.5, 20.3, 15.8, 25.2, 18.7], 
                  index=['商品A', '商品B', '商品C', '商品D', '商品E'])

# 标量运算
print("原价格:")
print(prices)

print("\n打9折后:")
print(prices * 0.9)

print("\n加税后（+10%）:")
print(prices * 1.1)

print("\n每件加5元:")
print(prices + 5)

Series 间运算

python

# 创建两个 Series
q1_sales = pd.Series([100, 150, 200, 120], 
                    index=['产品A', '产品B', '产品C', '产品D'])
q2_sales = pd.Series([120, 180, 190, 140], 
                    index=['产品A', '产品B', '产品C', '产品D'])

# 加法运算
total_sales = q1_sales + q2_sales
print("总销量:")
print(total_sales)

# 增长率
growth_rate = (q2_sales - q1_sales) / q1_sales * 100
print("\n增长率（%）:")
print(growth_rate)

不同索引的运算

python

# 不同索引的 Series 运算
s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([4, 5, 6, 7], index=['a', 'b', 'd', 'e'])

result = s1 + s2
print(result)
# 输出：
# a    5.0
# b    7.0
# c    NaN
# d    NaN
# e    NaN
# dtype: float64

# 使用 fill_value 处理缺失值
result_filled = s1.add(s2, fill_value=0)
print(result_filled)

📊 统计方法

描述性统计

python

# 创建成绩数据
student_scores = pd.Series([85, 92, 78, 96, 88, 91, 83, 89, 94, 87])

# 基本统计信息
print(f"平均分: {student_scores.mean():.2f}")
print(f"中位数: {student_scores.median():.2f}")
print(f"标准差: {student_scores.std():.2f}")
print(f"方差: {student_scores.var():.2f}")
print(f"最小值: {student_scores.min()}")
print(f"最大值: {student_scores.max()}")
print(f"总和: {student_scores.sum()}")
print(f"计数: {student_scores.count()}")

# 分位数
print(f"25%分位数: {student_scores.quantile(0.25)}")
print(f"75%分位数: {student_scores.quantile(0.75)}")

# 完整描述
print("\n完整统计描述:")
print(student_scores.describe())

排序和排名

python

# 按值排序
sorted_scores = student_scores.sort_values(ascending=False)
print("成绩从高到低:")
print(sorted_scores)

# 按索引排序
sorted_by_index = student_scores.sort_index()
print("\n按索引排序:")
print(sorted_by_index)

# 排名
ranks = student_scores.rank(ascending=False)
print("\n成绩排名:")
print(ranks)

唯一值和计数

python

# 创建包含重复值的 Series
grades = pd.Series(['A', 'B', 'A', 'C', 'B', 'A', 'D', 'C', 'B'])

# 唯一值
print(f"唯一值: {grades.unique()}")
print(f"唯一值数量: {grades.nunique()}")

# 值计数
print("\n各等级计数:")
print(grades.value_counts())

# 按索引排序的计数
print("\n按等级排序的计数:")
print(grades.value_counts().sort_index())

🔄 数据处理

缺失值处理

python

# Build a Series that contains missing values (NaN).
data_with_nan = pd.Series([1, 2, np.nan, 4, 5, np.nan, 7])
print("原始数据:")
print(data_with_nan)

# Detect missing values.
print(f"\n缺失值检测: {data_with_nan.isnull()}")
print(f"非缺失值检测: {data_with_nan.notnull()}")
print(f"缺失值数量: {data_with_nan.isnull().sum()}")

# Drop missing values.
print("\n删除缺失值后:")
print(data_with_nan.dropna())

# Fill missing values with a constant or with the mean of the present values.
print("\n用0填充:")
print(data_with_nan.fillna(0))

print("\n用平均值填充:")
print(data_with_nan.fillna(data_with_nan.mean()))

# Forward / backward fill. Series.ffill() and Series.bfill() replace
# fillna(method=...), whose `method` argument is deprecated since pandas 2.1.
print("\n前向填充:")
print(data_with_nan.ffill())

print("\n后向填充:")
print(data_with_nan.bfill())

数据转换

python

# Build a Series of strings.
names = pd.Series(['张三', '李四', '王五', '赵六'])

# Vectorized string methods are exposed through the .str accessor.
print("原始姓名:")
print(names)

# This call upper-cases the strings; it was previously mislabelled as
# "adding a prefix". Chinese characters have no case, so the output is unchanged.
print("\n转换为大写:")
print(names.str.upper())

# Concatenating a plain string on the left actually adds a prefix.
print("\n添加前缀:")
print('学生' + names)

print("\n添加后缀:")
print(names + '同学')

print("\n字符串长度:")
print(names.str.len())

# Parse numeric strings into numbers.
number_strings = pd.Series(['1', '2', '3', '4', '5'])
print("\n字符串转数值:")
print(pd.to_numeric(number_strings))

# Cast floats to integers; astype(int) truncates the fractional part.
float_series = pd.Series([1.1, 2.2, 3.3, 4.4])
print("\n浮点转整数:")
print(float_series.astype(int))

应用函数

python

# 创建数值 Series
numbers = pd.Series([1, 4, 9, 16, 25])

# 应用内置函数
print("平方根:")
print(numbers.apply(np.sqrt))

# 应用自定义函数
def classify_number(x):
    """Label a number by size: '小' below 10, '中' below 20, otherwise '大'."""
    if x < 10:
        return '小'
    # Everything from here on is not below 10; split the remainder at 20.
    return '中' if x < 20 else '大'

print("\n数值分类:")
print(numbers.apply(classify_number))

# 使用 lambda 函数
print("\n平方:")
print(numbers.apply(lambda x: x ** 2))

🔗 Series 合并和连接

连接 Series

python

# 创建多个 Series
s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([4, 5, 6], index=['d', 'e', 'f'])
s3 = pd.Series([7, 8, 9], index=['g', 'h', 'i'])

# 连接 Series
concatenated = pd.concat([s1, s2, s3])
print("连接后的 Series:")
print(concatenated)

# 重置索引
reset_index = pd.concat([s1, s2, s3], ignore_index=True)
print("\n重置索引后:")
print(reset_index)

追加元素

python

# Append a single element. Series.append() was deprecated and later removed
# from pandas, so wrap the new value in a one-element Series and join the two
# with pd.concat instead.
original = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
new_element = pd.Series([4], index=['d'])
appended = pd.concat([original, new_element])
print("追加元素后:")
print(appended)

🎨 实际应用示例

示例1：股票价格分析

python

# 模拟股票价格数据
stock_prices = pd.Series([
    100.5, 102.3, 98.7, 105.2, 107.8, 103.4, 109.1, 106.5, 111.2, 108.9
], index=pd.date_range('2024-01-01', periods=10, freq='D'))

stock_prices.name = '股票价格'
stock_prices.index.name = '日期'

print("股票价格数据:")
print(stock_prices)

# 计算日收益率
daily_returns = stock_prices.pct_change() * 100
print(f"\n平均日收益率: {daily_returns.mean():.2f}%")
print(f"收益率标准差: {daily_returns.std():.2f}%")

# 找出最大涨幅和跌幅的日期
max_gain_date = daily_returns.idxmax()
max_loss_date = daily_returns.idxmin()
print(f"\n最大涨幅日期: {max_gain_date}, 涨幅: {daily_returns[max_gain_date]:.2f}%")
print(f"最大跌幅日期: {max_loss_date}, 跌幅: {daily_returns[max_loss_date]:.2f}%")

示例2：销售数据分析

python

# 月度销售数据
monthly_sales = pd.Series([
    120000, 135000, 142000, 158000, 163000, 171000,
    185000, 192000, 178000, 165000, 155000, 148000
], index=['1月', '2月', '3月', '4月', '5月', '6月',
         '7月', '8月', '9月', '10月', '11月', '12月'])

monthly_sales.name = '月度销售额'

print("月度销售数据:")
print(monthly_sales)

# 销售统计
print(f"\n年度总销售额: {monthly_sales.sum():,} 元")
print(f"月均销售额: {monthly_sales.mean():,.0f} 元")
print(f"销售额中位数: {monthly_sales.median():,.0f} 元")

# 找出销售最好和最差的月份
best_month = monthly_sales.idxmax()
worst_month = monthly_sales.idxmin()
print(f"\n销售最好月份: {best_month} ({monthly_sales[best_month]:,} 元)")
print(f"销售最差月份: {worst_month} ({monthly_sales[worst_month]:,} 元)")

# 计算环比增长率
month_over_month = monthly_sales.pct_change() * 100
print("\n环比增长率:")
print(month_over_month.dropna().round(2))

示例3：学生成绩分析

python

# 学生成绩数据
student_grades = pd.Series({
    '张三': 85, '李四': 92, '王五': 78, '赵六': 96, '钱七': 88,
    '孙八': 91, '周九': 83, '吴十': 89, '郑十一': 94, '王十二': 87
})

student_grades.name = '期末成绩'
student_grades.index.name = '学生姓名'

print("学生成绩:")
print(student_grades)

# 成绩等级划分
def grade_level(score):
    """Map a numeric score to a letter grade.

    Scores of at least 90, 80, 70 and 60 earn 'A', 'B', 'C' and 'D'
    respectively; any score that reaches none of these floors earns 'F'.
    """
    # Floors are checked from highest to lowest, so the first match wins.
    grade_floors = ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D'))
    for floor, letter in grade_floors:
        if score >= floor:
            return letter
    return 'F'

grade_levels = student_grades.apply(grade_level)
print("\n成绩等级:")
print(grade_levels)

# 等级统计
print("\n等级分布:")
print(grade_levels.value_counts().sort_index())

# 优秀学生（90分以上）
excellent_students = student_grades[student_grades >= 90]
print(f"\n优秀学生 (≥90分): {len(excellent_students)} 人")
print(excellent_students.sort_values(ascending=False))

🔍 Series 与其他数据结构的转换

转换为其他类型

python

# 创建示例 Series
data = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])

# 转换为列表
print(f"转换为列表: {data.tolist()}")

# 转换为数组
print(f"转换为数组: {data.values}")

# 转换为字典
print(f"转换为字典: {data.to_dict()}")

# 转换为 DataFrame
df = data.to_frame(name='数值')
print("\n转换为 DataFrame:")
print(df)

📈 性能优化技巧

向量化操作

python

import time

# Build a large Series of one million standard-normal samples.
large_series = pd.Series(np.random.randn(1000000))

# Compare an element-wise Python loop with a vectorized expression.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations, whereas the wall
# clock can be coarse enough to report 0.0 for the fast path and make the
# speed-up ratio below divide by zero.

# Approach 1: Python-level loop (slow).
start_time = time.perf_counter()
result1 = pd.Series([x**2 if x > 0 else 0 for x in large_series])
loop_time = time.perf_counter() - start_time

# Approach 2: vectorized (fast). Non-positive values are replaced by 0 first,
# then the whole Series is squared in one operation.
start_time = time.perf_counter()
result2 = large_series.where(large_series > 0, 0) ** 2
vectorized_time = time.perf_counter() - start_time

print(f"循环方法耗时: {loop_time:.4f} 秒")
print(f"向量化方法耗时: {vectorized_time:.4f} 秒")
print(f"性能提升: {loop_time/vectorized_time:.1f} 倍")

内存优化

python

# 选择合适的数据类型
# 默认整数类型
default_int = pd.Series([1, 2, 3, 4, 5])
print(f"默认整数类型: {default_int.dtype}, 内存: {default_int.memory_usage()} bytes")

# 优化为较小的整数类型
optimized_int = pd.Series([1, 2, 3, 4, 5], dtype='int8')
print(f"优化整数类型: {optimized_int.dtype}, 内存: {optimized_int.memory_usage()} bytes")

# 分类数据优化
colors = pd.Series(['红', '绿', '蓝'] * 1000)
print(f"字符串类型内存: {colors.memory_usage(deep=True)} bytes")

colors_cat = colors.astype('category')
print(f"分类类型内存: {colors_cat.memory_usage(deep=True)} bytes")
print(f"内存节省: {(1 - colors_cat.memory_usage(deep=True)/colors.memory_usage(deep=True))*100:.1f}%")

📝 本章小结

通过本章学习，您应该已经掌握：

✅ Series 基础概念：理解 Series 的结构和特点
✅ 创建 Series：掌握多种创建 Series 的方法
✅ 索引和选择：熟练使用各种索引方式
✅ 数据操作：进行数学运算和数据处理
✅ 统计分析：使用统计方法分析数据
✅ 实际应用：解决真实的数据分析问题
✅ 性能优化：提高代码执行效率

关键要点

Series 是 Pandas 的基础：理解 Series 对学习 DataFrame 至关重要
索引的重要性：合理使用索引可以大大提高数据处理效率
向量化操作：避免循环，使用 Pandas 内置方法
数据类型优化：选择合适的数据类型可以节省内存

下一步

现在您已经掌握了 Series，接下来将学习 Pandas 的另一个核心数据结构 DataFrame。

下一章：Pandas 数据结构 DataFrame

Pandas 数据结构 Series ​

📚 Series 概述 ​

什么是 Series ​

Series 的特点 ​

🔨 创建 Series ​

从列表创建 ​

从字典创建 ​

从 NumPy 数组创建 ​

从标量值创建 ​

创建特殊 Series ​

🔍 Series 属性 ​

基本属性 ​

设置名称 ​

内存使用 ​

🎯 索引和选择 ​

位置索引 ​

标签索引 ​

布尔索引 ​

高级索引方法 ​

🔧 Series 操作 ​

数学运算 ​

Series 间运算 ​

不同索引的运算 ​

📊 统计方法 ​

描述性统计 ​

排序和排名 ​

唯一值和计数 ​

🔄 数据处理 ​

缺失值处理 ​

数据转换 ​

应用函数 ​

🔗 Series 合并和连接 ​

连接 Series ​

追加元素 ​

🎨 实际应用示例 ​

示例1：股票价格分析 ​

示例2：销售数据分析 ​

示例3：学生成绩分析 ​

🔍 Series 与其他数据结构的转换 ​

转换为其他类型 ​

📈 性能优化技巧 ​

向量化操作 ​

内存优化 ​

📝 本章小结 ​

关键要点 ​

下一步 ​

Pandas 数据结构 Series

📚 Series 概述

什么是 Series

Series 的特点

🔨 创建 Series

从列表创建

从字典创建

从 NumPy 数组创建

从标量值创建

创建特殊 Series

🔍 Series 属性

基本属性

设置名称

内存使用

🎯 索引和选择

位置索引

标签索引

布尔索引

高级索引方法

🔧 Series 操作

数学运算

Series 间运算

不同索引的运算

📊 统计方法

描述性统计

排序和排名

唯一值和计数

🔄 数据处理

缺失值处理

数据转换

应用函数

🔗 Series 合并和连接

连接 Series

追加元素

🎨 实际应用示例

示例1：股票价格分析

示例2：销售数据分析

示例3：学生成绩分析

🔍 Series 与其他数据结构的转换

转换为其他类型

📈 性能优化技巧

向量化操作

内存优化

📝 本章小结

关键要点

下一步