您的位置:首页 > 数据库

2015-04-08-数据聚合与分组运算(3)-透视表和交叉表+2012联邦选举委员会数据库

2015-04-08 21:59 323 查看
---------------------------------------------------------------------------------------

-----(四)、透视表和交叉表

from pandas import DataFrame,Series

import numpy as np

import pandas as pd

cd L:\czz

tips=pd.read_csv(r'ch08\tips.csv')

tips['tip_pct']=tips['tip']/tips['total_bill']

根据sex和smoker计算分组平均数

tips.pivot_table(rows=['sex','smoker'])

tips.pivot_table(['tip_pct','size'],rows=['sex','day'],cols='smoker')

添加小计

tips.pivot_table(['tip_pct','size'],rows=['sex','day'],cols='smoker',margins=True)

使用其他的聚合函数

tips.pivot_table('tip_pct',rows=['sex','smoker'],cols='day',aggfunc=len,margins=True)

填充空值

tips.pivot_table('size',rows=['time','sex','smoker'],cols='day',aggfunc='sum',fill_value=0)

tips.pivot_table('size',rows=['time','sex','smoker'],cols='day',aggfunc='sum')

-----交叉表:crosstab

data=DataFrame({'Sample':[1,2,3,4,5,6,7,8,9,10],

'Gender':['FeMale','Male','FeMale','Male','Male','Male','FeMale','FeMale','Male','FeMale'],

'Handedness':['Right-handed','Left-handed','Right-handed','Right-handed','Left-handed','Right-handed','Right-handed','Left-handed','Right-handed','Right-handed']

})

pd.crosstab(data.Gender,data.Handedness,margins=True)

pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)

---------------------------------------------------------------------------------------

-----(五)、2012联邦选举委员会数据库

**** fec=pd.read_csv(r'ch09\P00000001-ALL.csv')

fec.ix[123456]

获取全部的候选人名单

unique_cands=fec.cand_nm.unique() 

unique_cands

利用字典说明党派关系

****  parties={

'Bachmann, Michelle':'Republican',

'Cain, Herman':'Republican',

'Gingrich, Newt':'Republican',

'Huntsman, Jon':'Republican',

'Johnson, Gary Earl':'Republican',

'McCotter, Thaddeus G':'Republican',

'Obama, Barack':'Democrat',

'Paul, Ron':'Republican',

'Pawlenty, Timothy':'Republican',

'Perry, Rick':'Republican',

"Roemer, Charles E. 'Buddy' III":'Republican',

'Romney, Mitt':'Republican',

'Santorum, Rick':'Republican',

}

fec.cand_nm[123456:123461]

fec.cand_nm[123456:123461].map(parties)

将其添加为一个新列

****  fec['party']=fec.cand_nm.map(parties)

****  fec['party'].value_counts()

该数据既包括赞助也包括退款(负的出资额)

(fec.contb_receipt_amt>0).value_counts()

限定该数据集只能有正的出资额

****  fec=fec[fec.contb_receipt_amt>0]

只包含Obama, Barack和Romney, Mitt的信息

****  fec_mrbo=fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]

-----1.根据职业和雇主统计赞助信息

fec.contbr_occupation.value_counts()[:10]

****  occ_mapping={

'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',

'INFORMATION REQUESTED':'NOT PROVIDED',

'INFORMATION REQUESTED (BEST EFFORTS)':'NOT PROVIDED',

'C.E.O.':'CEO'

}

#如果没有提供相关映射,则返回x

f=lambda x:occ_mapping.get(x,x)

fec.contbr_occupation=fec.contbr_occupation.map(f)

对雇主信息也进行同样的处理

****  emp_mapping={

'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',

'INFORMATION REQUESTED':'NOT PROVIDED',

'SELF':'SELF-EMPLOYED',

'SELF EMPLOYED':'SELF-EMPLOYED'

}

f=lambda x:emp_mapping.get(x,x)

fec.contbr_employer=fec.contbr_employer.map(f)

通过pivot_table根据党派和职业对数据进行聚合,然后过滤掉总出资额不足200万美元的数据

****  by_occupation=fec.pivot_table('contb_receipt_amt',

rows='contbr_occupation',

cols='party',

aggfunc='sum'

)

over_2mm=by_occupation[by_occupation.sum(1)>2000000]

over_2mm

做成柱状图

****  over_2mm.plot(kind='barh')

-----2.对Obama和Romney总出资额最高的企业和职业

****  

def get_top_amounts(group,key,n=5):

    totals=group.groupby(key)['contb_receipt_amt'].sum()

    return totals.order(ascending=False)[:n]

grouped=fec_mrbo.groupby('cand_nm')

grouped.apply(get_top_amounts,'contbr_occupation',n=7)

grouped.apply(get_top_amounts,'contbr_employer',n=10)

-----3.对出资额进行分组

bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])

labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)

grouped=fec_mrbo.groupby(['cand_nm',labels])

grouped.size().unstack(0)

bucket_sums=grouped.contb_receipt_amt.sum().unstack(0)

bucket_sums

两位候选人收到的各种捐赠额度的总额比例

normed_sums=bucket_sums.div(bucket_sums.sum(axis=1),axis=0)

normed_sums

normed_sums[:-2].plot(kind='barh',stacked=True)

-----4.根据州统计赞助信息

grouped=fec_mrbo.groupby(['cand_nm','contbr_st'])

totals=grouped.contb_receipt_amt.sum().unstack(0).fillna(0)

totals=totals[totals.sum(1)>100000]

totals[:10]

总赞助额比例

percent=totals.div(totals.sum(1),axis=0)

percent[:10]

在地图上画出来?

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: