人工智能学习 9 - Pandas

Pandas 基础

参考: https://www.pypandas.cn/ 或 pandas docs

numpy 主要做计算, pandas 主要做数据清理和分析.

import pandas as pd

dt = pd.read_csv("./path/filename.csv", encoding="UTF8")
# 中文编码: GB2312简体中文, GBK国标, BIG5繁体中文, UTF8默认

Series

类似 ndarray 只有一列数据, 但是 series 每个数据都有一个 index 标签, 默认是 0 开始序数.

当数据全部为整数时, 每个元素的底层类型是 numpy.int64 (其他情况见下方说明)

dt = pd.Series([1,2,3])
dt[0:1] # 不包含 1 位置

dt = pd.Series([1,2,3], index=["A","B","C"])
dt["A":"B"] # 包含 "B" 位置

# 当 series 中只有 int 时, int 数字类型为 numpy.int64
# 当 series 中有 int 和 float 时, int 数字转换为 numpy.float64
# 当 series 中有 str 等其他类型时, int 数字转换为普通 python 类型 int
type(dt["A"])

# 以 index 对齐后进行计算 (index 无法匹配的位置结果为 NaN)
dt + dt
dt[1:] + dt[:-2]

# 常用统计计算
dt.mean(), dt.min(), dt.max(), ...

DataFrame

DataFrame 是多个 Series 组成的表

# 使用 array 创建
df = pd.DataFrame([[1,2],[3,4]], index=["a","b"], columns=["c","d"])

# 使用 dict 创建
df = pd.DataFrame({
    "Name": ["Alice","Bob","Charlie","David"],
    "Age": [22,31,19,21],
    "City": ["Beijing","Tianjin","Tokyo","New York"],
})
df = df.set_index("Name") # set_index 默认返回新对象, 需要重新赋值 (或使用 inplace=True)

# 添加/删除列
df["Gender"] = ["Woman", "Man", "Woman", "Man"]
del df["Age"]

# 获取列 (df.index 获取的是行索引)
df["Gender"] 或 df.index
# 数字id 获取行
df.iloc[0] 或 df.iloc[1:2]
# 索引 获取行
df.loc["ID"] 或 df.loc["ID1":"ID2"]
# 条件过滤
df[df["Gender"] == "Man"]
# 获取单元格
df.at[x, y]

# 行列转置
df.T

文件读取

# csv json excel xml html hdfs ...

# 读取
df = pd.read_*(file, delimiter = "\t", index_col="datetime")
# 写入
df.to_*(file, index=False)

查看数据基本信息

df.head(n=5)
df.tail(n=5)

# 数据条数, 索引范围
# 列名称, null 数量, 列数据类型; 数据类型统计
# 占用内存大小
df.info()
# 列数据统计信息(条数, 平均值, std, min, 百分位数, max)
df.describe()

# 行,列,列数据类型
df.index
df.columns
df.dtypes

处理缺失值

# 是否是缺失值
df.isnull()
pd.isnull(df)
# 统计缺失数量
df.isnull().sum()

# 去掉缺失值
df.dropna(inplace=True)
# 去掉某一列
df.drop("column", axis=1, inplace=True)

# 填充缺失值
vmean = df["column"].mean()
# 对选出的单列使用 inplace=True 属于链式赋值, 新版 pandas 中可能不会修改原 df, 应重新赋值
df["column"] = df["column"].fillna(vmean)

分组统计

# 统计操作
# mean, sum, count, max, min, head, tail

# 分组
df.groupby("c1")["c2"].mean()
df[["c1","c2"]].groupby("c1").mean()

# 多级分组
df.groupby(["c1","c2"]).mean()

# 自定义函数
def func(group):
  # 数据条数 group.shape[0]
  pass

df.groupby("c1").apply(func)

图形化

# 折线图
df["column"].plot()
# 散点图
df.plot.scatter(x="c1", y="c2")
# 多列一起绘制
df[["c1","c2"]].plot.area(figsize=(120,4), subplots=True)

存储图片

from PIL import Image

bg_color = (255,255,255)
img = Image.new("RGB", (width, height), bg_color)
# see: https://blog.csdn.net/zhouzhiyao960211/article/details/90384524
# 从 numpy 数组创建图片 (arr 为 ndarray; PIL Image 对象没有 astype 方法)
img = Image.fromarray(arr.astype("uint8")).convert("RGB")
# see: https://blog.csdn.net/qq_30159015/article/details/80070514

img.show()
img.save("./img.jpg")

重塑

# index 不变, c1 中的值做列, 展示 c2 数据
df.pivot(columns="c1", values="c2")

# 以 c1 为行, c2 为列, 使用 aggfunc 统计 c3 数据
df.pivot_table(index="c1", columns="c2", values="c3", aggfunc="mean")
# 相同的统计; 结果是多级索引的 Series, 加 .unstack() 可得到与 pivot_table 同样形状的表
df.groupby(["c1", "c2"])["c3"].mean()

数据处理

排序

df.sort_values(by = "c", ascending=True)

替换

df["nc"] = df["c"].replace("regexp", "", regex=True)

数据类型转换

df["nc"] = pd.to_numeric(df["c"])

拼接

# axis=0 上下拼, axis=1 左右拼
pd.concat([df1, df2], axis=0)

时间序列

# 重命名 column 字段
df.rename(columns={"old": "new"}, inplace=True)
df = df.rename(columns={"old": "new"}, inplace=False)

# 时间字符串转时间对象
df["datetime"] = pd.to_datetime(df["datetime"])

# 日期对象的计算 (tm 指转换后的时间列, 例如 tm = df["datetime"])
tm.min()
tm.max()
tm.max() - tm.min()

# 日期对象的属性
tm.dt.date # "YYYY-mm-dd"
tm.dt.year
tm.dt.month
tm.dt.day
tm.dt.weekday
tm.dt.hour
tm.dt.minute
tm.dt.second