
pandas aggregation and grouping with groupby

2018-03-09 17:08
import numpy as np
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import pandas as pd

np.random.seed(12345)  # fix the random seed so every run produces the same random numbers
plt.rc('figure', figsize=(10, 6))

### GroupBy mechanics
'''
pandas aggregation and grouping with groupby
pandas provides a flexible, high-performance groupby facility that lets you slice, dice,
and summarize a data set in a natural way. With it you can:
- split a pandas object on one or more keys (functions, arrays, or DataFrame column names);
- compute group summary statistics such as counts, means, standard deviations, or a
  user-defined function;
- apply a wide variety of functions to the columns of a DataFrame;
- apply within-group transformations or other operations, such as normalization,
  linear regression, ranking, or subset selection;
- compute pivot tables and cross-tabulations;
- perform quantile analysis and other grouped analyses.
'''
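
# The within-group transformations mentioned above (e.g. normalization) are not shown in
# the walkthrough below, so here is a minimal, self-contained sketch. The frame _demo and
# its columns 'grp'/'val' are made up purely for illustration.
_demo = DataFrame({'grp': ['x', 'x', 'y', 'y'], 'val': [1.0, 3.0, 2.0, 6.0]})
# transform() returns a result aligned with the original rows: each value minus its group mean
print( _demo.groupby('grp')['val'].transform(lambda v: v - v.mean()) )
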
df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5),
                'data2': np.random.randn(5)})

#print( df )
'''
data1     data2 key1 key2
0 -0.204708  1.393406    a  one
1  0.478943  0.092908    a  two
2 -0.519439  0.281746    b  one
3 -0.555730  0.769023    b  two
4  1.965781  1.246435    a  one
'''

grouped = df['data1'].groupby(df['key1'])
print( grouped ) # <pandas.core.groupby.SeriesGroupBy object at 0x0000000019A396A0>

print( grouped.mean() )
'''
key1
a    0.746672
b   -0.537585
Name: data1, dtype: float64
'''
print(' ###################### ')

means = df['data1'].groupby([df['key1'], df['key2']]).mean()
print( means )
'''
key1  key2
a     one     0.880536
      two     0.478943
b     one    -0.519439
      two    -0.555730
Name: data1, dtype: float64
'''
print(' ###################### ')
print( means.unstack() )
'''
key2       one       two
key1
a     0.880536  0.478943
b    -0.519439 -0.555730
'''

print(' ###################### ')

states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
print( df['data1'].groupby([states, years]).mean() )
'''
California  2005    0.478943
            2006   -0.519439
Ohio        2005   -0.380219
            2006    1.965781
Name: data1, dtype: float64
'''
print(' ###################### ')

print( df.groupby('key1').mean() )
'''
data1     data2
key1
a     0.746672  0.910916
b    -0.537585  0.525384
'''
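
# Note: the captured outputs in this post come from an older pandas release. In pandas 2.x,
# mean() on a group that still contains the string column 'key2' raises a TypeError unless
# the aggregation is restricted to numeric columns, e.g. (assuming a pandas version that
# supports the numeric_only keyword):
# print( df.groupby('key1').mean(numeric_only=True) )
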
print(' ###################### ')

print( df.groupby(['key1', 'key2']).mean() )
'''
              data1     data2
key1 key2
a    one   0.880536  1.319920
     two   0.478943  0.092908
b    one  -0.519439  0.281746
     two  -0.555730  0.769023
'''

print(' ###################### ')

print( df.groupby(['key1', 'key2']).size() )
'''
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64
'''
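
# size() counts every row in each group (including rows with NaN), while count() reports
# the number of non-NA values per column; with this df the two agree because nothing is missing:
print( df.groupby(['key1', 'key2']).count() )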

print(' ###################### ')

# ### Iterating over groups
for name, group in df.groupby('key1'):
    print(name)
    print(group)

print(' ###################### ')

print( df.groupby('key1') ) # <pandas.core.groupby.DataFrameGroupBy object at 0x0000000008436780>

print(' ###################### ')

for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

pieces = dict(list(df.groupby('key1')))
print( pieces['b'] )  # select the group whose key1 value is 'b'
'''
data1     data2 key1 key2
2 -0.519439  0.281746    b  one
3 -0.555730  0.769023    b  two
'''
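
# An equivalent way to pull out a single group without building the whole dict:
print( df.groupby('key1').get_group('b') )
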
print( df.dtypes )
'''
data1    float64
data2    float64
key1      object
key2      object
dtype: object
'''
print(' ###################### ')

grouped = df.groupby(df.dtypes, axis=1)
print( list(grouped) )
'''
[(dtype('float64'),       data1     data2
0 -0.204708  1.393406
1  0.478943  0.092908
2 -0.519439  0.281746
3 -0.555730  0.769023
4  1.965781  1.246435), (dtype('O'),   key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one)]
'''
print( dict(list(grouped)) )
'''
{dtype('O'):   key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one, dtype('float64'):       data1     data2
0 -0.204708  1.393406
1  0.478943  0.092908
2 -0.519439  0.281746
3 -0.555730  0.769023
4  1.965781  1.246435}
'''
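
# Newer pandas versions deprecate grouping along axis=1; if the goal is just to split the
# frame by dtype, select_dtypes yields the same pieces directly:
print( df.select_dtypes(include='float64') )
print( df.select_dtypes(include='object') )
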
print(' ###################### ')

# ### Selecting a column or a subset of columns
# Indexing a GroupBy object with a column name (or a list of names) is syntactic sugar
# for grouping just that column (or those columns) by the key, as the pairs below show:
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])

df.groupby(['key1', 'key2'])[['data2']].mean()

s_grouped = df.groupby(['key1', 'key2'])['data2']

print( s_grouped ) #<pandas.core.groupby.SeriesGroupBy object at 0x000000000D3A3A20>
print( s_grouped.mean() )
'''
key1  key2
a     one     1.319920
      two     0.092908
b     one     0.281746
      two     0.769023
Name: data2, dtype: float64
'''
print(' ###################### ')

# ### Grouping with dicts and Series
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3, [1, 2]] = np.nan  # add a few NA values (columns 'b' and 'c' for 'Wes'); .ix is removed in newer pandas
print( people )
'''
a         b         c         d         e
Joe     1.007189 -1.296221  0.274992  0.228913  1.352917
Steve   0.886429 -2.001637 -0.371843  1.669025 -0.438570
Wes    -0.539741       NaN       NaN -1.021228 -0.577087
Jim     0.124121  0.302614  0.523772  0.000940  1.343810
Travis -0.713544 -0.831154 -2.370232 -1.860761 -0.860757
'''

mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}

by_column = people.groupby(mapping, axis=1)
print( by_column.sum() )
'''
blue       red
Joe     0.503905  1.063885
Steve   1.297183 -1.553778
Wes    -1.021228 -1.116829
Jim     0.524712  1.770545
Travis -4.230992 -2.405455
'''
print(' ###################### ')
map_series = Series(mapping)
print( map_series )
'''
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object
'''

print( people.groupby(map_series, axis=1).count() )
'''
blue  red
Joe        2    3
Steve      2    3
Wes        1    2
Jim        2    3
Travis     2    3
'''

# ### Grouping with functions
print( people.groupby(len).sum() )
'''
a         b         c         d         e
3  0.591569 -0.993608  0.798764 -0.791374  2.119639
5  0.886429 -2.001637 -0.371843  1.669025 -0.438570
6 -0.713544 -0.831154 -2.370232 -1.860761 -0.860757
'''

key_list = ['one', 'one', 'one', 'two', 'two']
print( people.groupby([len, key_list]).min() )
'''
              a         b         c         d         e
3 one -0.539741 -1.296221  0.274992 -1.021228 -0.577087
  two  0.124121  0.302614  0.523772  0.000940  1.343810
5 one  0.886429 -2.001637 -0.371843  1.669025 -0.438570
6 two -0.713544 -0.831154 -2.370232 -1.860761 -0.860757
'''
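
# A key function is called once per index value, so len above groups the names by their
# length. As another illustration (the grouping rule here is chosen arbitrarily), group
# the people by the first letter of their name:
print( people.groupby(lambda name: name[0]).count() )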

print(' ###################### ')

#### Grouping by index levels
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]],
                                    names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
print( hier_df )
'''
cty          US                            JP
tenor         1         3         5         1         3
0      0.560145 -1.265934  0.119827 -1.063512  0.332883
1     -2.359419 -0.199543 -1.541996 -0.970736 -1.307030
2      0.286350  0.377984 -0.753887  0.331286  1.349742
3      0.069877  0.246674 -0.011862  1.004812  1.327195
'''

print( hier_df.groupby(level='cty', axis=1).count() )
'''
cty  JP  US
0     2   3
1     2   3
2     2   3
3     2   3
'''
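
# Any level of the MultiIndex can be used; grouping on the inner 'tenor' level works the
# same way (subject to the same axis=1 caveat in newer pandas noted earlier):
print( hier_df.groupby(level='tenor', axis=1).count() )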

print(' ###################### ')
### Data aggregation
print( df )
'''
data1     data2 key1 key2
0 -0.204708  1.393406    a  one
1  0.478943  0.092908    a  two
2 -0.519439  0.281746    b  one
3 -0.555730  0.769023    b  two
4  1.965781  1.246435    a  one
'''
grouped = df.groupby('key1')
print( grouped['data1'].quantile(0.9) )  # 0.9 quantile of data1 within each group

print(' ###################### ')

def peak_to_peak(arr):
    return arr.max() - arr.min()

print( grouped.agg(peak_to_peak) )
'''
data1     data2
key1
a     2.170488  1.300498
b     0.036292  0.487276
'''
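
# agg() also accepts a list of functions (built-in names as strings or callables),
# producing one result column per function:
print( grouped['data1'].agg(['mean', 'std', peak_to_peak]) )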

print( grouped.describe() )
'''
             data1     data2
key1
a    count  3.000000  3.000000
     mean   0.746672  0.910916
     std    1.109736  0.712217
     min   -0.204708  0.092908
     25%    0.137118  0.669671
     50%    0.478943  1.246435
     75%    1.222362  1.319920
     max    1.965781  1.393406
b    count  2.000000  2.000000
     mean  -0.537585  0.525384
     std    0.025662  0.344556
     min   -0.555730  0.281746
     25%   -0.546657  0.403565
     50%   -0.537585  0.525384
     75%   -0.528512  0.647203
     max   -0.519439  0.769023
'''