import numpy as np
import pandas as pd
import pymysql
conn = pymysql.connect(host="14.152.59.237",port=1296,user="rtest",passwd="NJKucDPZHCMk41Xc",db="rtest",charset="utf8")
df = pd.read_sql("SELECT ID,Agency,Banner,ScoreT FROM gz1699 WHERE B1B3=1 AND ID>1000000 LIMIT 20",conn,index_col="ID")
s = pd.Series([1,0,2,1,2,3])
s = df['Agency']
s.unique()
s.count()
s.value_counts()
s.isin([1,3])
pd.NaN
(Not a Number)s.isnull()
s.notnull()
df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data, colums=['Q1a','Q1b','Q2'])
df2.Q1a #按列索引选取
df2.ix[1] df2.ix[[2,4]] #按行索引选取
df2[0] df2[1:5] #按行序号选取
df2[df2.isin([1,'DASWON'])]
df2[df2<3]
df[df.ScoreT>90]
del df2['Q1a']
df.T
df.count()
df.describe()
count mean std min 25% 50% 75% Max
df.sort_index()
df.sort_index(by='ScoreT')
df.sort_values('ScoreT', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
pd.NaN
(Not a Number)df2.dropna()
df2.dropna(how='all')
df2[df.notnull()]
df2.fillna(0)
df[df.duplicated()]
df.drop_duplicated()
表格与二级索引数据转换
ms = df.stack()
ms.unstack()
其他: pivot
bins=[0,60,80,90,95,100]
cat = pd.cut(df.ScoreT,bins)
pd.value_counts(cat)
bin_names=['0~60','61~80','81~90','91~95','96~100']
cat = pd.cut(df.ScoreT,bins,labels=bin_names)
s1 = pd.Series(cat,index=df.index).to_frame(name='S1')
df2 = pd.merge(df, s1, left_index=True,right_index=True)
小括号为开区间,中括号为闭区间
认真理解内部机制 SPLIT-APPLY-COMMBINE 过程,大数据MapReduce也是对这个机制的扩充
group = df['ScoreT'].groupby(df['Agency'])
group.mean()
group.sum()
group = df['ScoreT'].groupby([df['Agency'],df['Banner']])
group.count().unstack()
df.groupby([df['Agency'],df['Banner']]).count().add_prefix('count_')
df.groupby([df['Agency'],df['Banner']]).mean().add_prefix('mean_')
group.agg(mean)
def count(series):
return series.count()
group.agg([count, 'mean','sum'])
def crossjoin(frame, name1, name2):
def split_series(n):
if frame[n].dtype.name != 'object':
fr = frame[n].to_frame(name=n)
fr['ID']=fr.index
else:
fr = frame[n].apply(lambda x: pd.Series(re.split(',', x))).stack().to_frame(name=n)
fr['ID']=fr.index.droplevel(level=1)
return fr
x1 = split_series(name1)
x2 = split_series(name2)
return (pd.merge(x1, x2, on='ID', how='outer'),x1[name1],x2[name2])
df = pd.read_sql("SELECT ID,A2,A3,B1,E2,ScoreT FROM gz1714 WHERE A1=1 LIMIT 20",conn,index_col="ID")
crossjoin(df, 'A2', 'A3')
x,x1,x2 = crossjoin(df, 'E2', 'A3')
pd.crosstab(cj['E2'],cj['A3'], margins=True)
del ct['All']
ct = pd.crosstab(cj['E2'],cj['A3'], margins=True,rownames=['题目描述'],colnames=['表头'])
cj.pivot_table(rows='E2',cols='A3',values='ID',aggfunc=np.mean)
base = x2.value_counts()
total = x1.value_counts()
r = ct / base
其他
http://pandas.pydata.org/pandas-docs/stable/api.html
https://amaozhao.gitbooks.io/pandas-notebook/content/