pandas聚合和分组运算之groupby
2018-03-09 17:08
567 查看
# pandas aggregation and group operations: a GroupBy walkthrough.
#
# pandas provides a flexible, efficient groupby facility for slicing, dicing
# and summarising data sets in a natural way:
#   * split a pandas object by one or more keys (functions, arrays or
#     DataFrame column names);
#   * compute per-group summary statistics such as count, mean, standard
#     deviation, or a user-defined function;
#   * apply a variety of functions to the columns of a DataFrame;
#   * apply within-group transformations or other manipulations such as
#     normalisation, linear regression, ranking or subset selection;
#   * compute pivot tables and cross-tabulations;
#   * perform quantile analysis and other group-wise analyses.
#
# The "Recorded output" comments were captured by the original author.
# Column order and table layout may differ between pandas versions.

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Nothing below draws a figure; matplotlib is only used to set a default
    # figure size, so the walkthrough still runs without it.
    plt = None

np.random.seed(12345)  # fixed seed so every run draws the same random numbers
if plt is not None:
    plt.rc('figure', figsize=(10, 6))

SEPARATOR = ' ###################### '


def peak_to_peak(arr):
    """Return the spread of *arr*, i.e. ``arr.max() - arr.min()``."""
    return arr.max() - arr.min()


# ### GroupBy mechanics

df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5),
                'data2': np.random.randn(5)})
# print(df)
# Recorded output:
#       data1     data2 key1 key2
# 0 -0.204708  1.393406    a  one
# 1  0.478943  0.092908    a  two
# 2 -0.519439  0.281746    b  one
# 3 -0.555730  0.769023    b  two
# 4  1.965781  1.246435    a  one

grouped = df['data1'].groupby(df['key1'])
print(grouped)
# <pandas.core.groupby.SeriesGroupBy object at 0x0000000019A396A0>
print(grouped.mean())
# Recorded output:
# key1
# a    0.746672
# b   -0.537585
# Name: data1, dtype: float64

print(SEPARATOR)
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
print(means)
# Recorded output:
# key1  key2
# a     one     0.880536
#       two     0.478943
# b     one    -0.519439
#       two    -0.555730
# Name: data1, dtype: float64

print(SEPARATOR)
print(means.unstack())
# Recorded output:
# key2       one       two
# key1
# a     0.880536  0.478943
# b    -0.519439 -0.555730

print(SEPARATOR)
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
print(df['data1'].groupby([states, years]).mean())
# Recorded output:
# California  2005    0.478943
#             2006   -0.519439
# Ohio        2005   -0.380219
#             2006    1.965781
# Name: data1, dtype: float64

print(SEPARATOR)
# key2 holds strings, so the mean is restricted to the numeric columns.
print(df.groupby('key1').mean(numeric_only=True))
# Recorded output:
#          data1     data2
# key1
# a     0.746672  0.910916
# b    -0.537585  0.525384

print(SEPARATOR)
print(df.groupby(['key1', 'key2']).mean())
# Recorded output:
#               data1     data2
# key1 key2
# a    one   0.880536  1.319920
#      two   0.478943  0.092908
# b    one  -0.519439  0.281746
#      two  -0.555730  0.769023

print(SEPARATOR)
print(df.groupby(['key1', 'key2']).size())
# Recorded output:
# key1  key2
# a     one     2
#       two     1
# b     one     1
#       two     1
# dtype: int64

print(SEPARATOR)

# ### Iterating over groups

for name, group in df.groupby('key1'):
    print(name)
    print(group)

print(SEPARATOR)
print(df.groupby('key1'))
# <pandas.core.groupby.DataFrameGroupBy object at 0x0000000008436780>

print(SEPARATOR)
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

pieces = {name: group for name, group in df.groupby('key1')}
print(pieces['b'])  # the rows whose key1 value is 'b'
# Recorded output:
#       data1     data2 key1 key2
# 2 -0.519439  0.281746    b  one
# 3 -0.555730  0.769023    b  two

print(df.dtypes)
# Recorded output:
# data1    float64
# data2    float64
# key1      object
# key2      object
# dtype: object

print(SEPARATOR)
# Group the columns by dtype. This replaces groupby(df.dtypes, axis=1):
# collect the column names per dtype, then slice the frame once per dtype.
columns_by_dtype = {}
for column, dtype in df.dtypes.items():
    columns_by_dtype.setdefault(dtype, []).append(column)
grouped = [
    (dtype, df[columns])
    for dtype, columns in sorted(columns_by_dtype.items(),
                                 key=lambda item: str(item[0]))
]
print(grouped)
# Recorded output:
# [(dtype('float64'),       data1     data2
# 0 -0.204708  1.393406
# 1  0.478943  0.092908
# 2 -0.519439  0.281746
# 3 -0.555730  0.769023
# 4  1.965781  1.246435), (dtype('O'),   key1 key2
# 0    a  one
# 1    a  two
# 2    b  one
# 3    b  two
# 4    a  one)]
print(dict(grouped))
# Recorded output:
# {dtype('O'):   key1 key2
# 0    a  one
# 1    a  two
# 2    b  one
# 3    b  two
# 4    a  one, dtype('float64'):       data1     data2
# 0 -0.204708  1.393406
# 1  0.478943  0.092908
# 2 -0.519439  0.281746
# 3 -0.555730  0.769023
# 4  1.965781  1.246435}

print(SEPARATOR)

# ### Selecting a column or a subset of columns
# The statements below only demonstrate the selection syntax; their results
# are intentionally discarded.

df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])
df.groupby(['key1', 'key2'])[['data2']].mean()

s_grouped = df.groupby(['key1', 'key2'])['data2']
print(s_grouped)
# <pandas.core.groupby.SeriesGroupBy object at 0x000000000D3A3A20>
print(s_grouped.mean())
# Recorded output:
# key1  key2
# a     one     1.319920
#       two     0.092908
# b     one     0.281746
#       two     0.769023
# Name: data2, dtype: float64

print(SEPARATOR)

# ### Grouping with dicts and Series

people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Add a few NA values. The row is picked by position (third row) and the
# columns by label; this replaces the removed `.ix` indexer.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
print(people)
# Recorded output:
#                a         b         c         d         e
# Joe     1.007189 -1.296221  0.274992  0.228913  1.352917
# Steve   0.886429 -2.001637 -0.371843  1.669025 -0.438570
# Wes    -0.539741       NaN       NaN -1.021228 -0.577087
# Jim     0.124121  0.302614  0.523772  0.000940  1.343810
# Travis -0.713544 -0.831154 -2.370232 -1.860761 -0.860757

mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}
# Column-wise grouping is done on the transpose (rows of people.T are the
# original columns) and transposed back, instead of groupby(..., axis=1).
by_column = people.T.groupby(mapping)
print(by_column.sum().T)
# Recorded output:
#             blue       red
# Joe     0.503905  1.063885
# Steve   1.297183 -1.553778
# Wes    -1.021228 -1.116829
# Jim     0.524712  1.770545
# Travis -4.230992 -2.405455

print(SEPARATOR)
map_series = Series(mapping)
print(map_series)
# Recorded output:
# a       red
# b       red
# c      blue
# d      blue
# e       red
# f    orange
# dtype: object
print(people.T.groupby(map_series).count().T)
# Recorded output:
#         blue  red
# Joe        2    3
# Steve      2    3
# Wes        1    2
# Jim        2    3
# Travis     2    3

# ### Grouping with functions

print(people.groupby(len).sum())
# Recorded output:
#           a         b         c         d         e
# 3  0.591569 -0.993608  0.798764 -0.791374  2.119639
# 5  0.886429 -2.001637 -0.371843  1.669025 -0.438570
# 6 -0.713544 -0.831154 -2.370232 -1.860761 -0.860757

key_list = ['one', 'one', 'one', 'two', 'two']
print(people.groupby([len, key_list]).min())
# Recorded output:
#               a         b         c         d         e
# 3 one -0.539741 -1.296221  0.274992 -1.021228 -0.577087
#   two  0.124121  0.302614  0.523772  0.000940  1.343810
# 5 one  0.886429 -2.001637 -0.371843  1.669025 -0.438570
# 6 two -0.713544 -0.831154 -2.370232 -1.860761 -0.860757

print(SEPARATOR)

# ### Grouping by index levels

columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]],
                                    names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
print(hier_df)
# Recorded output:
# cty          US                            JP
# tenor         1         3         5         1         3
# 0      0.560145 -1.265934  0.119827 -1.063512  0.332883
# 1     -2.359419 -0.199543 -1.541996 -0.970736 -1.307030
# 2      0.286350  0.377984 -0.753887  0.331286  1.349742
# 3      0.069877  0.246674 -0.011862  1.004812  1.327195

# Same transpose approach as above for grouping on a column level.
print(hier_df.T.groupby(level='cty').count().T)
# Recorded output:
# cty  JP  US
# 0     2   3
# 1     2   3
# 2     2   3
# 3     2   3

print(SEPARATOR)

# ### Data aggregation

print(df)
# Recorded output:
#       data1     data2 key1 key2
# 0 -0.204708  1.393406    a  one
# 1  0.478943  0.092908    a  two
# 2 -0.519439  0.281746    b  one
# 3 -0.555730  0.769023    b  two
# 4  1.965781  1.246435    a  one

grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)  # syntax demonstration; result is discarded

print(SEPARATOR)

# key2 holds strings, for which max() - min() is undefined, so the custom
# aggregation and the summary are applied to the numeric columns only.
numeric_grouped = grouped[['data1', 'data2']]
print(numeric_grouped.agg(peak_to_peak))
# Recorded output:
#          data1     data2
# key1
# a     2.170488  1.300498
# b     0.036292  0.487276

print(numeric_grouped.describe())
# Recorded output:
#                data1     data2
# key1
# a    count  3.000000  3.000000
#      mean   0.746672  0.910916
#      std    1.109736  0.712217
#      min   -0.204708  0.092908
#      25%    0.137118  0.669671
#      50%    0.478943  1.246435
#      75%    1.222362  1.319920
#      max    1.965781  1.393406
# b    count  2.000000  2.000000
#      mean  -0.537585  0.525384
#      std    0.025662  0.344556
#      min   -0.555730  0.281746
#      25%   -0.546657  0.403565
#      50%   -0.537585  0.525384
#      75%   -0.528512  0.647203
#      max   -0.519439  0.769023
相关文章推荐
- python/pandas数据挖掘(十四)-groupby,聚合,分组级运算
- pandas聚合和分组运算——GroupBy技术(1)
- pandas聚合和分组运算之groupby - 2
- pandas聚合和分组运算之groupby
- 数据聚合与分组运算——GroupBy技术(1)
- Python groupby,聚合,分组级运算
- python/pandas数据分析(十五)-聚合与分组运算实例
- pandas数据分组运算:groupby
- Pandas —— groupby( )聚合分组
- 数据聚合与分组运算——GroupBy
- Pandas分组运算(groupby)修炼
- 2015-04-01-数据聚合与分组运算(1)-GroupBy技术+数据聚合
- pandas聚合和分组运算之groupby
- Pandas-数据聚合与分组运算
- pandas—数据聚合与分组运算
- pandas聚合和分组运算之groupby
- Pandas GroupBy 分组(分割-应用-组合)
- 《利用python进行数据分析》读书笔记--第九章 数据聚合与分组运算(二)
- pandas数据分组和聚合操作方法
- python中数据聚合与分组运算