编辑

pandas介绍


引入库

import numpy as np
import pandas as pd
import pymysql

从数据库读取数据

conn = pymysql.connect(host="14.152.59.237",port=1296,user="rtest",passwd="NJKucDPZHCMk41Xc",db="rtest",charset="utf8")
df = pd.read_sql("SELECT ID,Agency,Banner,ScoreT FROM gz1699 WHERE B1B3=1 AND ID>1000000 LIMIT 20",conn,index_col="ID")

Series 对象

初始化对象
s = pd.Series([1,0,2,1,2,3])
s = df['Agency']
显示不重复值
s.unique()
频次统计
s.count()
s.value_counts()
归属
s.isin([1,3])
缺失值 pd.NaN(Not a Number)
s.isnull()
s.notnull()

DataFrame 对象

初始化对象
df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data, colums=['Q1a','Q1b','Q2'])
选取元素
df2.Q1a     #按列索引选取
df2.ix[1]  df2.ix[[2,4]]  #按行索引选取
df2[0] df2[1:5]  #按行序号选取

df2[df2.isin([1,'DASWON'])]
df2[df2<3]
df[df.ScoreT>90]
删除列
del df2['Q1a']   
转置
df.T
统计
df.count()
df.describe() 
    count mean std min 25% 50% 75% Max
排序
df.sort_index()
df.sort_index(by='ScoreT')
df.sort_values('ScoreT', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
缺失值 pd.NaN(Not a Number)
df2.dropna()
df2.dropna(how='all')
df2[df.notnull()]
df2.fillna(0)
重复行
df[df.duplicated()]
df.drop_duplicated()

等级索引

表格与二级索引数据转换
ms = df.stack()
ms.unstack()

其他: pivot

统计

分段
bins=[0,60,80,90,95,100]
cat = pd.cut(df.ScoreT,bins)
pd.value_counts(cat)

bin_names=['0~60','61~80','81~90','91~95','96~100']
cat = pd.cut(df.ScoreT,bins,labels=bin_names)

s1 = pd.Series(cat,index=df.index).to_frame(name='S1')
df2 = pd.merge(df, s1, left_index=True,right_index=True)

小括号为开区间,中括号为闭区间

GroupBy

认真理解内部机制 SPLIT-APPLY-COMMBINE 过程,大数据MapReduce也是对这个机制的扩充

group = df['ScoreT'].groupby(df['Agency'])
group.mean()
group.sum()
group = df['ScoreT'].groupby([df['Agency'],df['Banner']])
group.count().unstack()
df.groupby([df['Agency'],df['Banner']]).count().add_prefix('count_')
df.groupby([df['Agency'],df['Banner']]).mean().add_prefix('mean_')

group.agg(mean)
def count(series):
    return series.count()
group.agg([count, 'mean','sum'])

笛卡尔积

包装好的函数
def crossjoin(frame, name1, name2):
    def split_series(n):
        if frame[n].dtype.name != 'object':
            fr = frame[n].to_frame(name=n)
            fr['ID']=fr.index
        else:
            fr = frame[n].apply(lambda x: pd.Series(re.split(',', x))).stack().to_frame(name=n)
            fr['ID']=fr.index.droplevel(level=1)
        return fr
    x1 = split_series(name1)
    x2 = split_series(name2)
    return (pd.merge(x1, x2, on='ID', how='outer'),x1[name1],x2[name2])
使用方法
df = pd.read_sql("SELECT ID,A2,A3,B1,E2,ScoreT FROM gz1714 WHERE A1=1 LIMIT 20",conn,index_col="ID")
crossjoin(df, 'A2', 'A3')
x,x1,x2 = crossjoin(df, 'E2', 'A3')

交叉表

pd.crosstab(cj['E2'],cj['A3'], margins=True)
del ct['All']
ct = pd.crosstab(cj['E2'],cj['A3'], margins=True,rownames=['题目描述'],colnames=['表头'])

数据透视表

cj.pivot_table(rows='E2',cols='A3',values='ID',aggfunc=np.mean)

Base 和 Total

base = x2.value_counts()
total = x1.value_counts()
r = ct / base

其他

API 文档

http://pandas.pydata.org/pandas-docs/stable/api.html

Pandas 数据分析笔记

https://amaozhao.gitbooks.io/pandas-notebook/content/

Pandas 描述统计函数

http://blog.csdn.net/claroja/article/details/65445063

相关词条