python高级-23.pandas-分组计算-apply-透视表
2020-08-23 19:08
274 查看
import numpy as np import pandas as pd from pandas import Series,DataFrame from numpy import nan as NA import matplotlib.pyplot as plt plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
神奇的apply
# 读取文件 tips=pd.read_csv('data/tips.csv') # 显示前五行 tips.head()
total_bill tip smoker day time size 0 16.99 1.01 No Sun Dinner 2 1 10.34 1.66 No Sun Dinner 3 2 21.01 3.50 No Sun Dinner 3 3 23.68 3.31 No Sun Dinner 2 4 24.59 3.61 No Sun Dinner 4
# 读取后5行 tips.tail()
total_bill tip smoker day time size 239 29.03 5.92 No Sat Dinner 3 240 27.18 2.00 Yes Sat Dinner 2 241 22.67 2.00 Yes Sat Dinner 2 242 17.82 1.75 No Sat Dinner 2 243 18.78 3.00 No Thur Dinner 2
# 新增一列数据,tip_percent 到 tip 后一列。值 tip/total_bill newtip=tips.copy() num=list(newtip.columns).index('tip') num
1
newtip.insert(num+1,'tip_percent',newtip.tip/newtip.total_bill) newtip.head()
total_bill tip tip_percent smoker day time size 0 16.99 1.01 0.059447 No Sun Dinner 2 1 10.34 1.66 0.160542 No Sun Dinner 3 2 21.01 3.50 0.166587 No Sun Dinner 3 3 23.68 3.31 0.139780 No Sun Dinner 2 4 24.59 3.61 0.146808 No Sun Dinner 4
# 将tip_percent 显示成百分比 newtip1=newtip.copy() newtip1.tip_percent.apply(lambda x:str(round(x*100,2))+'%')
0 5.94% 1 16.05% 2 16.66% 3 13.98% 4 14.68% ... 239 20.39% 240 7.36% 241 8.82% 242 9.82% 243 15.97% Name: tip_percent, Length: 244, dtype: object
newtip1.tip_percent.apply(lambda x:f'{round(x*100,2)}%')
0 5.94% 1 16.05% 2 16.66% 3 13.98% 4 14.68% ... 239 20.39% 240 7.36% 241 8.82% 242 9.82% 243 15.97% Name: tip_percent, Length: 244, dtype: object
newtip
total_bill tip tip_percent smoker day time size 0 16.99 1.01 0.059447 No Sun Dinner 2 1 10.34 1.66 0.160542 No Sun Dinner 3 2 21.01 3.50 0.166587 No Sun Dinner 3 3 23.68 3.31 0.139780 No Sun Dinner 2 4 24.59 3.61 0.146808 No Sun Dinner 4 ... ... ... ... ... ... ... ... 239 29.03 5.92 0.203927 No Sat Dinner 3 240 27.18 2.00 0.073584 Yes Sat Dinner 2 241 22.67 2.00 0.088222 Yes Sat Dinner 2 242 17.82 1.75 0.098204 No Sat Dinner 2 243 18.78 3.00 0.159744 No Thur Dinner 2 244 rows × 7 columns
# Helper: sort by any column (total_bill by default) and keep the last n rows
# of the sorted frame (5 by default), i.e. the rows with the largest values.
def Top(df, column_name='total_bill', n=5):
    """Return the final ``n`` rows of ``df`` after sorting it by ``column_name``.

    The sort uses ``sort_values`` with its default (ascending) order, so the
    row holding the largest value is the last row of the result.
    """
    ranked = df.sort_values(by=column_name)
    return ranked.tail(n)


Top(newtip, 'tip', 8)
total_bill tip tip_percent smoker day time size 47 32.40 6.00 0.185185 No Sun Dinner 4 183 23.17 6.50 0.280535 Yes Sun Dinner 4 214 28.17 6.50 0.230742 Yes Sat Dinner 3 141 34.30 6.70 0.195335 No Thur Lunch 6 59 48.27 6.73 0.139424 No Sat Dinner 4 23 39.42 7.58 0.192288 No Sat Dinner 4 212 48.33 9.00 0.186220 No Sat Dinner 4 170 50.81 10.00 0.196812 Yes Sat Dinner 3
newtip.groupby('smoker').apply(Top,'tip')
total_bill tip tip_percent smoker day time size smoker No 47 32.40 6.00 0.185185 No Sun Dinner 4 141 34.30 6.70 0.195335 No Thur Lunch 6 59 48.27 6.73 0.139424 No Sat Dinner 4 23 39.42 7.58 0.192288 No Sat Dinner 4 212 48.33 9.00 0.186220 No Sat Dinner 4 Yes 211 25.89 5.16 0.199305 Yes Sat Dinner 4 181 23.33 5.65 0.242177 Yes Sun Dinner 2 214 28.17 6.50 0.230742 Yes Sat Dinner 3 183 23.17 6.50 0.280535 Yes Sun Dinner 4 170 50.81 10.00 0.196812 Yes Sat Dinner 3
当我们apply(func)时,func的参数怎么办?
如:func(obj,arg1,arg2,…)
xxx.apply(func,func的arg1,arg2,…)
s1=Series(np.random.randn(5)) s1
0 0.520809 1 1.417082 2 1.672192 3 -1.048475 4 -1.507357 dtype: float64
# 使用 apply 对s1保留3位小数 s1.apply(round,ndigits=3)
0 0.521 1 1.417 2 1.672 3 -1.048 4 -1.507 dtype: float64
DataFrame(s1)
0 0 0.520809 1 1.417082 2 1.672192 3 -1.048475 4 -1.507357
DataFrame(s1).apply(pd.cut,bins=3)
0 0 (-0.448, 0.612] 1 (0.612, 1.672] 2 (0.612, 1.672] 3 (-1.511, -0.448] 4 (-1.511, -0.448]
s2=Series(np.random.randint(5)) s2
0 3 dtype: int64
DataFrame(s2).apply(pd.cut,bins=2)
0 0 (2.997, 3.0]
# 查看小费百分比最高的八个 newtip.groupby('smoker').apply(Top,column_name='tip_percent',n=8)
total_bill tip tip_percent smoker day time size smoker No 20 17.92 4.08 0.227679 No Sat Dinner 2 17 16.29 3.71 0.227747 No Sun Dinner 3 6 8.77 2.00 0.228050 No Sun Dinner 2 88 24.71 5.85 0.236746 No Thur Lunch 2 185 20.69 5.00 0.241663 No Sun Dinner 5 51 10.29 2.60 0.252672 No Sun Dinner 2 149 7.51 2.00 0.266312 No Thur Lunch 2 232 11.61 3.39 0.291990 No Sat Dinner 2 Yes 181 23.33 5.65 0.242177 Yes Sun Dinner 2 221 13.42 3.48 0.259314 Yes Fri Lunch 2 93 16.32 4.30 0.263480 Yes Fri Dinner 2 109 14.31 4.00 0.279525 Yes Sat Dinner 2 183 23.17 6.50 0.280535 Yes Sun Dinner 4 67 3.07 1.00 0.325733 Yes Sat Dinner 1 178 9.60 4.00 0.416667 Yes Sun Dinner 2 172 7.25 5.15 0.710345 Yes Sun Dinner 2
Top(newtip,column_name='tip_percent',n=8)
total_bill tip tip_percent smoker day time size 93 16.32 4.30 0.263480 Yes Fri Dinner 2 149 7.51 2.00 0.266312 No Thur Lunch 2 109 14.31 4.00 0.279525 Yes Sat Dinner 2 183 23.17 6.50 0.280535 Yes Sun Dinner 4 232 11.61 3.39 0.291990 No Sat Dinner 2 67 3.07 1.00 0.325733 Yes Sat Dinner 1 178 9.60 4.00 0.416667 Yes Sun Dinner 2 172 7.25 5.15 0.710345 Yes Sun Dinner 2
newtip.groupby(['smoker','day'],group_keys=True).apply(Top,column_name='tip_percent',n=3)
total_bill tip tip_percent smoker day time size smoker day No Fri 94 22.75 3.25 0.142857 No Fri Dinner 2 91 22.49 3.50 0.155625 No Fri Dinner 2 223 15.98 3.00 0.187735 No Fri Lunch 3 Sat 110 14.00 3.00 0.214286 No Sat Dinner 2 20 17.92 4.08 0.227679 No Sat Dinner 2 232 11.61 3.39 0.291990 No Sat Dinner 2 Sun 6 8.77 2.00 0.228050 No Sun Dinner 2 185 20.69 5.00 0.241663 No Sun Dinner 5 51 10.29 2.60 0.252672 No Sun Dinner 2 Thur 87 18.28 4.00 0.218818 No Thur Lunch 2 88 24.71 5.85 0.236746 No Thur Lunch 2 149 7.51 2.00 0.266312 No Thur Lunch 2 Yes Fri 222 8.58 1.92 0.223776 Yes Fri Lunch 1 221 13.42 3.48 0.259314 Yes Fri Lunch 2 93 16.32 4.30 0.263480 Yes Fri Dinner 2 Sat 214 28.17 6.50 0.230742 Yes Sat Dinner 3 109 14.31 4.00 0.279525 Yes Sat Dinner 2 67 3.07 1.00 0.325733 Yes Sat Dinner 1 Sun 183 23.17 6.50 0.280535 Yes Sun Dinner 4 178 9.60 4.00 0.416667 Yes Sun Dinner 2 172 7.25 5.15 0.710345 Yes Sun Dinner 2 Thur 191 19.81 4.19 0.211509 Yes Thur Lunch 2 200 18.71 4.00 0.213789 Yes Thur Lunch 3 194 16.58 4.00 0.241255 Yes Thur Lunch 2
加权平均
# 60 70 80 3:3:4 (60*3+70*3+80*4)/10
71.0
np.average([60,70,80],weights=[3,3,4])
71.0
# 创建一个 df2 = DataFrame( { 'category':list('aabbaba'), 'data':np.random.randint(60,100,7), 'weights': np.random.random(7) } ) df2
category data weights 0 a 79 0.512505 1 a 75 0.684013 2 b 97 0.107550 3 b 63 0.870675 4 a 73 0.685156 5 b 61 0.021001 6 a 94 0.889526
df2.groupby(['category']).apply(lambda x:np.average(x.data,weights=x.weights))
category a 81.344075 b 66.617508 dtype: float64
透视表
newtip.head()
total_bill tip tip_percent smoker day time size 0 16.99 1.01 0.059447 No Sun Dinner 2 1 10.34 1.66 0.160542 No Sun Dinner 3 2 21.01 3.50 0.166587 No Sun Dinner 3 3 23.68 3.31 0.139780 No Sun Dinner 2 4 24.59 3.61 0.146808 No Sun Dinner 4
# pivot_table aggregates with the mean by default.
# The numeric value columns are listed explicitly because the string column
# `time` cannot be averaged: older pandas silently dropped it (with a
# deprecation warning in 1.5), while pandas 2.0+ raises instead.
newtip.pivot_table(
    values=['size', 'tip', 'tip_percent', 'total_bill'],
    index=['smoker', 'day'],
    aggfunc='mean',
)
size tip tip_percent total_bill smoker day No Fri 2.250000 2.812500 0.151650 18.420000 Sat 2.555556 3.102889 0.158048 19.661778 Sun 2.929825 3.167895 0.160113 20.506667 Thur 2.488889 2.673778 0.160298 17.113111 Yes Fri 2.066667 2.714000 0.174783 16.813333 Sat 2.476190 2.875476 0.147906 21.276667 Sun 2.578947 3.516842 0.187250 24.120000 Thur 2.352941 3.030000 0.163863 19.190588
# Group-wise mean of the numeric columns for each (smoker, day) pair.
# numeric_only=True is required on pandas 2.0+, where the default became False
# and averaging the string column `time` raises a TypeError; older versions
# accept the argument as well.
newtip.groupby(['smoker', 'day']).mean(numeric_only=True)
total_bill tip tip_percent size smoker day No Fri 18.420000 2.812500 0.151650 2.250000 Sat 19.661778 3.102889 0.158048 2.555556 Sun 20.506667 3.167895 0.160113 2.929825 Thur 17.113111 2.673778 0.160298 2.488889 Yes Fri 16.813333 2.714000 0.174783 2.066667 Sat 21.276667 2.875476 0.147906 2.476190 Sun 24.120000 3.516842 0.187250 2.578947 Thur 19.190588 3.030000 0.163863 2.352941
newtip.pivot_table(index=['smoker','day'],aggfunc=['mean','max'])
mean max size tip tip_percent total_bill size time tip tip_percent total_bill smoker day No Fri 2.250000 2.812500 0.151650 18.420000 3 Lunch 3.50 0.187735 22.75 Sat 2.555556 3.102889 0.158048 19.661778 4 Dinner 9.00 0.291990 48.33 Sun 2.929825 3.167895 0.160113 20.506667 6 Dinner 6.00 0.252672 48.17 Thur 2.488889 2.673778 0.160298 17.113111 6 Lunch 6.70 0.266312 41.19 Yes Fri 2.066667 2.714000 0.174783 16.813333 4 Lunch 4.73 0.263480 40.17 Sat 2.476190 2.875476 0.147906 21.276667 5 Dinner 10.00 0.325733 50.81 Sun 2.578947 3.516842 0.187250 24.120000 5 Dinner 6.50 0.710345 45.35 Thur 2.352941 3.030000 0.163863 19.190588 4 Lunch 5.00 0.241255 43.11
# margins=True 添加分项小计 newtip.pivot_table(['tip_percent','size'], columns='smoker', index=['day','time'], margins=True, aggfunc=len, fill_value=0 )
size tip_percent smoker No Yes All No Yes All day time Fri Dinner 3 9 12 3 9 12.0 Lunch 1 6 7 1 6 7.0 Sat Dinner 45 42 87 45 42 87.0 Sun Dinner 57 19 76 57 19 76.0 Thur Dinner 1 0 1 1 0 1.0 Lunch 44 17 61 44 17 61.0 All 151 93 244 151 93 244.0
相关文章推荐
- python高级-22.pandas-数据聚合和分组计算
- Python高级应用实例对比:高效计算大文件中的最长行的长度
- 04. Pandas 3| 数值计算与统计、合并连接去重分组透视表文件读取
- python库学习笔记——分组计算利器:pandas中的groupby技术
- Excel数据分析高级技巧②——数据透视表(组合/切片器/计算字段/数据透视图/条件格式)
- python数据清洗工具、方法、过程整理归纳(五、数据清洗之数据统计——数据分组运算、聚合函数使用、分组对象和apply函数、透视图与交叉表)
- [置顶] 【python 分组求和功能】类似透视表的作用
- [python]按key1分组后,计算data1,data2的统计信息并附加到原始表格中
- Python高级应用实例对比:高效计算大文件中的最长行的长度
- Excel教程: 带累计数据的组合图;透视表增加计算和分组
- python|jupyter|pandas|4.4使用分组聚合进行组内计算
- excel-数据透视表0.05(分组、计算字段/项 &数据透视图)
- Python科学计算学习之高级数组
- python-pandas11-数据聚合-分组计算
- Python高级编程--多线程编程(一)
- python 计算时间差
- SUMO仿真高级进阶系列二:计算车辆的旅行时间、等待时间和变道次数
- Python 高级编程--多线程编程(二)
- 如何取数据窗口中分组区计算列的值?
- Python高级——正则表达式 re模块1.match方法