您的位置:首页 > 移动开发

14.pandas高级-分组计算-apply-透视表

2020-08-23 18:54 1041 查看
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from numpy import nan as NA
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

神奇的apply

# 读取文件
tips=pd.read_csv('data/tips.csv')
# 显示前五行
tips.head()
total_bill	tip	smoker	day	time	size
0	16.99	1.01	No	Sun	Dinner	2
1	10.34	1.66	No	Sun	Dinner	3
2	21.01	3.50	No	Sun	Dinner	3
3	23.68	3.31	No	Sun	Dinner	2
4	24.59	3.61	No	Sun	Dinner	4
# 读取后5行
tips.tail()
total_bill	tip	smoker	day	time	size
239	29.03	5.92	No	Sat	Dinner	3
240	27.18	2.00	Yes	Sat	Dinner	2
241	22.67	2.00	Yes	Sat	Dinner	2
242	17.82	1.75	No	Sat	Dinner	2
243	18.78	3.00	No	Thur	Dinner	2
# 新增一列数据,tip_percent 到 tip 后一列。值 tip/total_bill
newtip=tips.copy()
num=list(newtip.columns).index('tip')
num
1
newtip.insert(num+1,'tip_percent',newtip.tip/newtip.total_bill)
newtip.head()
total_bill	tip	tip_percent	smoker	day	time	size
0	16.99	1.01	0.059447	No	Sun	Dinner	2
1	10.34	1.66	0.160542	No	Sun	Dinner	3
2	21.01	3.50	0.166587	No	Sun	Dinner	3
3	23.68	3.31	0.139780	No	Sun	Dinner	2
4	24.59	3.61	0.146808	No	Sun	Dinner	4
# 将tip_percent 显示成百分比
newtip1=newtip.copy()
newtip1.tip_percent.apply(lambda x:str(round(x*100,2))+'%')
0       5.94%
1      16.05%
2      16.66%
3      13.98%
4      14.68%
...
239    20.39%
240     7.36%
241     8.82%
242     9.82%
243    15.97%
Name: tip_percent, Length: 244, dtype: object
newtip1.tip_percent.apply(lambda x:f'{round(x*100,2)}%')
0       5.94%
1      16.05%
2      16.66%
3      13.98%
4      14.68%
...
239    20.39%
240     7.36%
241     8.82%
242     9.82%
243    15.97%
Name: tip_percent, Length: 244, dtype: object
newtip
total_bill	tip	tip_percent	smoker	day	time	size
0	16.99	1.01	0.059447	No	Sun	Dinner	2
1	10.34	1.66	0.160542	No	Sun	Dinner	3
2	21.01	3.50	0.166587	No	Sun	Dinner	3
3	23.68	3.31	0.139780	No	Sun	Dinner	2
4	24.59	3.61	0.146808	No	Sun	Dinner	4
...	...	...	...	...	...	...	...
239	29.03	5.92	0.203927	No	Sat	Dinner	3
240	27.18	2.00	0.073584	Yes	Sat	Dinner	2
241	22.67	2.00	0.088222	Yes	Sat	Dinner	2
242	17.82	1.75	0.098204	No	Sat	Dinner	2
243	18.78	3.00	0.159744	No	Thur	Dinner	2
244 rows × 7 columns
# 编写函数,实现按照任意一列,默认总消费,默认排序后的5个值
def Top(df, column_name='total_bill', n=5):
    """Return the ``n`` rows of ``df`` with the largest ``column_name`` values.

    The frame is sorted in ascending order by ``column_name`` and the last
    ``n`` rows are kept, so the result is itself ascending (the largest
    value sits on the final row). The original index labels are preserved.

    Args:
        df: DataFrame to select rows from.
        column_name: Column to sort by. Defaults to ``'total_bill'``.
        n: Number of rows to keep. Defaults to 5.

    Returns:
        A DataFrame holding the last ``n`` rows of the sorted frame.
    """
    # The body must be indented under the ``def``; a flush-left ``return``
    # is an IndentationError.
    return df.sort_values(by=column_name).tail(n)
Top(newtip,'tip',8)
total_bill	tip	tip_percent	smoker	day	time	size
47	32.40	6.00	0.185185	No	Sun	Dinner	4
183	23.17	6.50	0.280535	Yes	Sun	Dinner	4
214	28.17	6.50	0.230742	Yes	Sat	Dinner	3
141	34.30	6.70	0.195335	No	Thur	Lunch	6
59	48.27	6.73	0.139424	No	Sat	Dinner	4
23	39.42	7.58	0.192288	No	Sat	Dinner	4
212	48.33	9.00	0.186220	No	Sat	Dinner	4
170	50.81	10.00	0.196812	Yes	Sat	Dinner	3
newtip.groupby('smoker').apply(Top,'tip')
total_bill	tip	tip_percent	smoker	day	time	size
smoker
No	47	32.40	6.00	0.185185	No	Sun	Dinner	4
141	34.30	6.70	0.195335	No	Thur	Lunch	6
59	48.27	6.73	0.139424	No	Sat	Dinner	4
23	39.42	7.58	0.192288	No	Sat	Dinner	4
212	48.33	9.00	0.186220	No	Sat	Dinner	4
Yes	211	25.89	5.16	0.199305	Yes	Sat	Dinner	4
181	23.33	5.65	0.242177	Yes	Sun	Dinner	2
214	28.17	6.50	0.230742	Yes	Sat	Dinner	3
183	23.17	6.50	0.280535	Yes	Sun	Dinner	4
170	50.81	10.00	0.196812	Yes	Sat	Dinner	3

当我们apply(func)时,func的参数怎么办?
如:func(obj,arg1,arg2,…)
xxx.apply(func,func的arg1,arg2,…)

s1=Series(np.random.randn(5))
s1
0    0.520809
1    1.417082
2    1.672192
3   -1.048475
4   -1.507357
dtype: float64
# 使用 apply 对s1保留3位小数
s1.apply(round,ndigits=3)
0    0.521
1    1.417
2    1.672
3   -1.048
4   -1.507
dtype: float64
DataFrame(s1)
0
0	0.520809
1	1.417082
2	1.672192
3	-1.048475
4	-1.507357
DataFrame(s1).apply(pd.cut,bins=3)
0
0	(-0.448, 0.612]
1	(0.612, 1.672]
2	(0.612, 1.672]
3	(-1.511, -0.448]
4	(-1.511, -0.448]
s2=Series(np.random.randint(5))
s2
0    3
dtype: int64
DataFrame(s2).apply(pd.cut,bins=2)
0
0	(2.997, 3.0]
# 查看小费百分比最高的八个
newtip.groupby('smoker').apply(Top,column_name='tip_percent',n=8)
total_bill	tip	tip_percent	smoker	day	time	size
smoker
No	20	17.92	4.08	0.227679	No	Sat	Dinner	2
17	16.29	3.71	0.227747	No	Sun	Dinner	3
6	8.77	2.00	0.228050	No	Sun	Dinner	2
88	24.71	5.85	0.236746	No	Thur	Lunch	2
185	20.69	5.00	0.241663	No	Sun	Dinner	5
51	10.29	2.60	0.252672	No	Sun	Dinner	2
149	7.51	2.00	0.266312	No	Thur	Lunch	2
232	11.61	3.39	0.291990	No	Sat	Dinner	2
Yes	181	23.33	5.65	0.242177	Yes	Sun	Dinner	2
221	13.42	3.48	0.259314	Yes	Fri	Lunch	2
93	16.32	4.30	0.263480	Yes	Fri	Dinner	2
109	14.31	4.00	0.279525	Yes	Sat	Dinner	2
183	23.17	6.50	0.280535	Yes	Sun	Dinner	4
67	3.07	1.00	0.325733	Yes	Sat	Dinner	1
178	9.60	4.00	0.416667	Yes	Sun	Dinner	2
172	7.25	5.15	0.710345	Yes	Sun	Dinner	2
Top(newtip,column_name='tip_percent',n=8)
total_bill	tip	tip_percent	smoker	day	time	size
93	16.32	4.30	0.263480	Yes	Fri	Dinner	2
149	7.51	2.00	0.266312	No	Thur	Lunch	2
109	14.31	4.00	0.279525	Yes	Sat	Dinner	2
183	23.17	6.50	0.280535	Yes	Sun	Dinner	4
232	11.61	3.39	0.291990	No	Sat	Dinner	2
67	3.07	1.00	0.325733	Yes	Sat	Dinner	1
178	9.60	4.00	0.416667	Yes	Sun	Dinner	2
172	7.25	5.15	0.710345	Yes	Sun	Dinner	2
newtip.groupby(['smoker','day'],group_keys=True).apply(Top,column_name='tip_percent',n=3)
total_bill	tip	tip_percent	smoker	day	time	size
smoker	day
No	Fri	94	22.75	3.25	0.142857	No	Fri	Dinner	2
91	22.49	3.50	0.155625	No	Fri	Dinner	2
223	15.98	3.00	0.187735	No	Fri	Lunch	3
Sat	110	14.00	3.00	0.214286	No	Sat	Dinner	2
20	17.92	4.08	0.227679	No	Sat	Dinner	2
232	11.61	3.39	0.291990	No	Sat	Dinner	2
Sun	6	8.77	2.00	0.228050	No	Sun	Dinner	2
185	20.69	5.00	0.241663	No	Sun	Dinner	5
51	10.29	2.60	0.252672	No	Sun	Dinner	2
Thur	87	18.28	4.00	0.218818	No	Thur	Lunch	2
88	24.71	5.85	0.236746	No	Thur	Lunch	2
149	7.51	2.00	0.266312	No	Thur	Lunch	2
Yes	Fri	222	8.58	1.92	0.223776	Yes	Fri	Lunch	1
221	13.42	3.48	0.259314	Yes	Fri	Lunch	2
93	16.32	4.30	0.263480	Yes	Fri	Dinner	2
Sat	214	28.17	6.50	0.230742	Yes	Sat	Dinner	3
109	14.31	4.00	0.279525	Yes	Sat	Dinner	2
67	3.07	1.00	0.325733	Yes	Sat	Dinner	1
Sun	183	23.17	6.50	0.280535	Yes	Sun	Dinner	4
178	9.60	4.00	0.416667	Yes	Sun	Dinner	2
172	7.25	5.15	0.710345	Yes	Sun	Dinner	2
Thur	191	19.81	4.19	0.211509	Yes	Thur	Lunch	2
200	18.71	4.00	0.213789	Yes	Thur	Lunch	3
194	16.58	4.00	0.241255	Yes	Thur	Lunch	2

加权平均

# 60 70 80 3:3:4
(60*3+70*3+80*4)/10
71.0
np.average([60,70,80],weights=[3,3,4])
71.0
# 创建一个包含分类(category)、数据(data)和权重(weights)三列的 DataFrame
df2 = DataFrame(
{
'category':list('aabbaba'),
'data':np.random.randint(60,100,7),
'weights': np.random.random(7)
}
)
df2
category	data	weights
0	a	79	0.512505
1	a	75	0.684013
2	b	97	0.107550
3	b	63	0.870675
4	a	73	0.685156
5	b	61	0.021001
6	a	94	0.889526
df2.groupby(['category']).apply(lambda x:np.average(x.data,weights=x.weights))
category
a    81.344075
b    66.617508
dtype: float64

透视表

newtip.head()
total_bill	tip	tip_percent	smoker	day	time	size
0	16.99	1.01	0.059447	No	Sun	Dinner	2
1	10.34	1.66	0.160542	No	Sun	Dinner	3
2	21.01	3.50	0.166587	No	Sun	Dinner	3
3	23.68	3.31	0.139780	No	Sun	Dinner	2
4	24.59	3.61	0.146808	No	Sun	Dinner	4
# 透视表默认求平均
newtip.pivot_table(index=['smoker','day'],aggfunc='mean')
size	tip	tip_percent	total_bill
smoker	day
No	Fri	2.250000	2.812500	0.151650	18.420000
Sat	2.555556	3.102889	0.158048	19.661778
Sun	2.929825	3.167895	0.160113	20.506667
Thur	2.488889	2.673778	0.160298	17.113111
Yes	Fri	2.066667	2.714000	0.174783	16.813333
Sat	2.476190	2.875476	0.147906	21.276667
Sun	2.578947	3.516842	0.187250	24.120000
Thur	2.352941	3.030000	0.163863	19.190588
newtip.groupby(['smoker','day']).mean()
total_bill	tip	tip_percent	size
smoker	day
No	Fri	18.420000	2.812500	0.151650	2.250000
Sat	19.661778	3.102889	0.158048	2.555556
Sun	20.506667	3.167895	0.160113	2.929825
Thur	17.113111	2.673778	0.160298	2.488889
Yes	Fri	16.813333	2.714000	0.174783	2.066667
Sat	21.276667	2.875476	0.147906	2.476190
Sun	24.120000	3.516842	0.187250	2.578947
Thur	19.190588	3.030000	0.163863	2.352941
newtip.pivot_table(index=['smoker','day'],aggfunc=['mean','max'])
mean	max
size	tip	tip_percent	total_bill	size	time	tip	tip_percent	total_bill
smoker	day
No	Fri	2.250000	2.812500	0.151650	18.420000	3	Lunch	3.50	0.187735	22.75
Sat	2.555556	3.102889	0.158048	19.661778	4	Dinner	9.00	0.291990	48.33
Sun	2.929825	3.167895	0.160113	20.506667	6	Dinner	6.00	0.252672	48.17
Thur	2.488889	2.673778	0.160298	17.113111	6	Lunch	6.70	0.266312	41.19
Yes	Fri	2.066667	2.714000	0.174783	16.813333	4	Lunch	4.73	0.263480	40.17
Sat	2.476190	2.875476	0.147906	21.276667	5	Dinner	10.00	0.325733	50.81
Sun	2.578947	3.516842	0.187250	24.120000	5	Dinner	6.50	0.710345	45.35
Thur	2.352941	3.030000	0.163863	19.190588	4	Lunch	5.00	0.241255	43.11
# margins=True 添加分项小计
newtip.pivot_table(['tip_percent','size'],
columns='smoker',
index=['day','time'],
margins=True,
aggfunc=len,
fill_value=0
)
size	tip_percent
smoker	No	Yes	All	No	Yes	All
day	time
Fri	Dinner	3	9	12	3	9	12.0
Lunch	1	6	7	1	6	7.0
Sat	Dinner	45	42	87	45	42	87.0
Sun	Dinner	57	19	76	57	19	76.0
Thur	Dinner	1	0	1	1	0	1.0
Lunch	44	17	61	44	17	61.0
All		151	93	244	151	93	244.0
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: