您的位置:首页 > 编程语言 > Python开发

Pandas 10分钟入门(官方说明+个人小测试)

2017-10-28 21:53 369 查看

Pandas10分钟入门

代码下载地址[http://download.csdn.net/download/sirwill/10043185]In[19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

ObjectCreation

In[20]:
s = pd.Series([1, 2, 3, np.nan, 5, 6])  # series类型数组。
s
Out[20]:
01.0
12.0
23.0
3NaN
45.0
56.0
dtype:float64
In[21]:
dates = pd.date_range("20170112", periods=6)  # Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns
dates
Out[21]:
DatetimeIndex(['2017-01-12','2017-01-13','2017-01-14','2017-01-15',
'2017-01-16','2017-01-17'],
dtype='datetime64[ns]',freq='D')
In[22]:
list(dates)
dates.date
Out[22]:
array([datetime.date(2017,1,12),datetime.date(2017,1,13),
datetime.date(2017,1,14),datetime.date(2017,1,15),
datetime.date(2017,1,16),datetime.date(2017,1,17)],dtype=object)
In[23]:
list(dates.date)
Out[23]:
[datetime.date(2017,1,12),
datetime.date(2017,1,13),
datetime.date(2017,1,14),
datetime.date(2017,1,15),
datetime.date(2017,1,16),
datetime.date(2017,1,17)]
In[24]:
dates.year
Out[24]:
Int64Index([2017,2017,2017,2017,2017,2017],dtype='int64')
In[25]:
list(dates.year)
Out[25]:
[2017,2017,2017,2017,2017,2017]
In[26]:
list(dates.day)
Out[26]:
[12,13,14,15,16,17]
In[27]:
str(dates.date)
Out[27]:
'[datetime.date(2017,1,12)datetime.date(2017,1,13)\ndatetime.date(2017,1,14)datetime.date(2017,1,15)\ndatetime.date(2017,1,16)datetime.date(2017,1,17)]'
In[28]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df
Out[28]:
ABCD
2017-01-12-2.2581212.4561960.778567-2.030407
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
2017-01-15-0.151958-0.655249-2.114725-0.669839
2017-01-16-1.3233043.1436590.6389960.898683
2017-01-17-0.0249350.385811-1.577185-0.021460
In[29]:
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})  # Creating a DataFrame by passing a dict of objects that can be converted to series-like.
df2
Out[29]:
ABCDEF
01.02013-01-021.03testfoo
11.02013-01-021.03trainfoo
21.02013-01-021.03testfoo
31.02013-01-021.03trainfoo
In[30]:
df2.dtypes
Out[30]:
Afloat64
Bdatetime64[ns]
Cfloat32
Dint32
Ecategory
Fobject
dtype:object
In[31]:
df.dtypes
Out[31]:
Afloat64
Bfloat64
Cfloat64
Dfloat64
dtype:object
In[32]:
df2.<TAB>#使用jupyter时按tab键,可以看到代码提示。
File"<ipython-input-32-9c4c8dafe199>",line1
df2.<TAB>#Ifyou’reusingIPython,tabcompletionforcolumnnames(aswellaspublicattributes)isautomaticallyenabled.
^
SyntaxError:invalidsyntax

ViewingData

In[36]:
df.head()
Out[36]:
ABCD
2017-01-12-2.2581212.4561960.778567-2.030407
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
2017-01-15-0.151958-0.655249-2.114725-0.669839
2017-01-16-1.3233043.1436590.6389960.898683
In[37]:
df.index
Out[37]:
DatetimeIndex(['2017-01-12','2017-01-13','2017-01-14','2017-01-15',
'2017-01-16','2017-01-17'],
dtype='datetime64[ns]',freq='D')
In[38]:
df.columns
Out[38]:
Index(['A','B','C','D'],dtype='object')
In[39]:
df.values
Out[39]:
array([[-2.2581213,2.45619592,0.77856734,-2.030407],
[-0.65834822,0.62249451,0.38862467,0.07358728],
[0.58921899,1.39279193,0.60554535,1.23153815],
[-0.1519579,-0.65524863,-2.1147252,-0.66983949],
[-1.32330447,3.14365936,0.63899562,0.89868346],
[-0.02493461,0.3858107,-1.57718486,-0.0214603]])
In[40]:
df.describe()
Out[40]:
ABCD
count6.0000006.0000006.0000006.000000
mean-0.6379081.224284-0.213363-0.086316
std1.0210781.4019871.2820791.171045
min-2.258121-0.655249-2.114725-2.030407
25%-1.1570650.444982-1.085732-0.507745
50%-0.4051531.0076430.4970850.026063
75%-0.0566902.1903450.6306330.692409
max0.5892193.1436590.7785671.231538
In[41]:
df
Out[41]:
ABCD
2017-01-12-2.2581212.4561960.778567-2.030407
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
2017-01-15-0.151958-0.655249-2.114725-0.669839
2017-01-16-1.3233043.1436590.6389960.898683
2017-01-17-0.0249350.385811-1.577185-0.021460
In[42]:
df.T
Out[42]:
2017-01-1200:00:002017-01-1300:00:002017-01-1400:00:002017-01-1500:00:002017-01-1600:00:002017-01-1700:00:00
A-2.258121-0.6583480.589219-0.151958-1.323304-0.024935
B2.4561960.6224951.392792-0.6552493.1436590.385811
C0.7785670.3886250.605545-2.1147250.638996-1.577185
D-2.0304070.0735871.231538-0.6698390.898683-0.021460
In[43]:
df.sort_index(axis=1, ascending=False)  # Sorting by an axis 排序。
Out[43]:
DCBA
2017-01-12-2.0304070.7785672.456196-2.258121
2017-01-130.0735870.3886250.622495-0.658348
2017-01-141.2315380.6055451.3927920.589219
2017-01-15-0.669839-2.114725-0.655249-0.151958
2017-01-160.8986830.6389963.143659-1.323304
2017-01-17-0.021460-1.5771850.385811-0.024935
In[44]:
df.sort_values(by="B")#Sortingbyvalues
Out[44]:
ABCD
2017-01-15-0.151958-0.655249-2.114725-0.669839
2017-01-17-0.0249350.385811-1.577185-0.021460
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
2017-01-12-2.2581212.4561960.778567-2.030407
2017-01-16-1.3233043.1436590.6389960.898683
In[45]:
df
Out[45]:
ABCD
2017-01-12-2.2581212.4561960.778567-2.030407
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
2017-01-15-0.151958-0.655249-2.114725-0.669839
2017-01-16-1.3233043.1436590.6389960.898683
2017-01-17-0.0249350.385811-1.577185-0.021460

Selection

Getting

In[46]:
df["A"]#Selectingasinglecolumn,whichyieldsaSeries,equivalenttodf.A
Out[46]:
2017-01-12-2.258121
2017-01-13-0.658348
2017-01-140.589219
2017-01-15-0.151958
2017-01-16-1.323304
2017-01-17-0.024935
Freq:D,Name:A,dtype:float64
In[47]:
df.A
Out[47]:
2017-01-12-2.258121
2017-01-13-0.658348
2017-01-140.589219
2017-01-15-0.151958
2017-01-16-1.323304
2017-01-17-0.024935
Freq:D,Name:A,dtype:float64
In[48]:
df[0:3]#Selectingvia[],whichslicestherows.
Out[48]:
ABCD
2017-01-12-2.2581212.4561960.778567-2.030407
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
In[49]:
df["2017-01-13":"2017-01-17"]
Out[49]:
ABCD
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
2017-01-15-0.151958-0.655249-2.114725-0.669839
2017-01-16-1.3233043.1436590.6389960.898683
2017-01-17-0.0249350.385811-1.577185-0.021460

SelectionbyLabel

In[50]:
dates
Out[50]:
DatetimeIndex(['2017-01-12','2017-01-13','2017-01-14','2017-01-15',
'2017-01-16','2017-01-17'],
dtype='datetime64[ns]',freq='D')
In[51]:
df.loc[dates[0]]#Forgettingacrosssectionusingalabel
Out[51]:
A-2.258121
B2.456196
C0.778567
D-2.030407
Name:2017-01-1200:00:00,dtype:float64
In[52]:
df.loc[:,["A","B"]]
Out[52]:
AB
2017-01-12-2.2581212.456196
2017-01-13-0.6583480.622495
2017-01-140.5892191.392792
2017-01-15-0.151958-0.655249
2017-01-16-1.3233043.143659
2017-01-17-0.0249350.385811
In[53]:
df.loc['20170112':'20170116',['A','B']]#Showinglabelslicing,bothendpointsareincluded
Out[53]:
AB
2017-01-12-2.2581212.456196
2017-01-13-0.6583480.622495
2017-01-140.5892191.392792
2017-01-15-0.151958-0.655249
2017-01-16-1.3233043.143659
In[54]:
df.loc["20170115",["A","B"]]
Out[54]:
A-0.151958
B-0.655249
Name:2017-01-1500:00:00,dtype:float64
In[55]:
df.loc[dates[3],"D"]#Forgettingascalarvalue
Out[55]:
-0.6698394854437093
In[56]:
df.at[dates[3],"D"]#Forgettingfastaccesstoascalar(equivtothepriormethod)
Out[56]:
-0.6698394854437093

SelectionbyPosition

In[57]:
df.iloc[3]#Selectviathepositionofthepassedintegers
Out[57]:
A-0.151958
B-0.655249
C-2.114725
D-0.669839
Name:2017-01-1500:00:00,dtype:float64
In[58]:
df.iloc[2:5,0:2]#Byintegerslices,actingsimilartonumpy/python
Out[58]:
AB
2017-01-140.5892191.392792
2017-01-15-0.151958-0.655249
2017-01-16-1.3233043.143659
In[59]:
df.iloc[[1,3,4],[0,2]]#Bylistsofintegerpositionlocations,similartothenumpy/pythonstyle
Out[59]:
AC
2017-01-13-0.6583480.388625
2017-01-15-0.151958-2.114725
2017-01-16-1.3233040.638996
In[60]:
df.iloc[1:3,:]
Out[60]:
ABCD
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
In[61]:
df.iloc[:,1:3]
Out[61]:
BC
2017-01-122.4561960.778567
2017-01-130.6224950.388625
2017-01-141.3927920.605545
2017-01-15-0.655249-2.114725
2017-01-163.1436590.638996
2017-01-170.385811-1.577185
In[62]:
df.iloc[1,1]#Forgettingavalueexplicitly
Out[62]:
0.62249451281708756
In[63]:
df.iat[1,1]#Forgettingfastaccesstoascalar(equivtothepriormethod)
Out[63]:
0.62249451281708756

BooleanIndexing

In[64]:
df[df.A>0]#Usingasinglecolumn’svaluestoselectdata
Out[64]:
ABCD
2017-01-140.5892191.3927920.6055451.231538
In[65]:
df[df>0]#SelectingvaluesfromaDataFramewhereabooleanconditionismet
Out[65]:
ABCD
2017-01-12NaN2.4561960.778567NaN
2017-01-13NaN0.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
2017-01-15NaNNaNNaNNaN
2017-01-16NaN3.1436590.6389960.898683
2017-01-17NaN0.385811NaNNaN
In[66]:
df2
Out[66]:
ABCDEF
01.02013-01-021.03testfoo
11.02013-01-021.03trainfoo
21.02013-01-021.03testfoo
31.02013-01-021.03trainfoo
In[67]:
df
Out[67]:
ABCD
2017-01-12-2.2581212.4561960.778567-2.030407
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
2017-01-15-0.151958-0.655249-2.114725-0.669839
2017-01-16-1.3233043.1436590.6389960.898683
2017-01-17-0.0249350.385811-1.577185-0.021460
In[68]:
df2 = df.copy()
df2
Out[68]:
ABCD
2017-01-12-2.2581212.4561960.778567-2.030407
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
2017-01-15-0.151958-0.655249-2.114725-0.669839
2017-01-16-1.3233043.1436590.6389960.898683
2017-01-17-0.0249350.385811-1.577185-0.021460
In[69]:
df.equals(df2)
Out[69]:
True
In[70]:
df==df2
Out[70]:
ABCD
2017-01-12TrueTrueTrueTrue
2017-01-13TrueTrueTrueTrue
2017-01-14TrueTrueTrueTrue
2017-01-15TrueTrueTrueTrue
2017-01-16TrueTrueTrueTrue
2017-01-17TrueTrueTrueTrue
In[71]:
df is df2
Out[71]:
False
In[72]:
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2
Out[72]:
ABCDE
2017-01-12-2.2581212.4561960.778567-2.030407one
2017-01-13-0.6583480.6224950.3886250.073587one
2017-01-140.5892191.3927920.6055451.231538two
2017-01-15-0.151958-0.655249-2.114725-0.669839three
2017-01-16-1.3233043.1436590.6389960.898683four
2017-01-17-0.0249350.385811-1.577185-0.021460three
In[73]:
df2[df2.E.isin(["two","four"])]
Out[73]:
ABCDE
2017-01-140.5892191.3927920.6055451.231538two
2017-01-16-1.3233043.1436590.6389960.898683four
In[74]:
df2[df2["E"].isin(["two","four"])]
Out[74]:
ABCDE
2017-01-140.5892191.3927920.6055451.231538two
2017-01-16-1.3233043.1436590.6389960.898683four

Setting

In[75]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20171016", periods=6))  # Setting a new column automatically aligns the data by the indexes
s1
Out[75]:
2017-10-161
2017-10-172
2017-10-183
2017-10-194
2017-10-205
2017-10-216
Freq:D,dtype:int64
In[76]:
df.at[dates[0],"A"]=0#Settingvaluesbylabel
In[77]:
df
Out[77]:
ABCD
2017-01-120.0000002.4561960.778567-2.030407
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
2017-01-15-0.151958-0.655249-2.114725-0.669839
2017-01-16-1.3233043.1436590.6389960.898683
2017-01-17-0.0249350.385811-1.577185-0.021460
In[78]:
df.iat[0, 1] = 0
df
Out[78]:
ABCD
2017-01-120.0000000.0000000.778567-2.030407
2017-01-13-0.6583480.6224950.3886250.073587
2017-01-140.5892191.3927920.6055451.231538
2017-01-15-0.151958-0.655249-2.114725-0.669839
2017-01-16-1.3233043.1436590.6389960.898683
2017-01-17-0.0249350.385811-1.577185-0.021460
In[79]:
df.loc[:, "D"] = np.array([5] * len(df))  # Setting by assigning with a numpy array
df
Out[79]:
ABCD
2017-01-120.0000000.0000000.7785675
2017-01-13-0.6583480.6224950.3886255
2017-01-140.5892191.3927920.6055455
2017-01-15-0.151958-0.655249-2.1147255
2017-01-16-1.3233043.1436590.6389965
2017-01-17-0.0249350.385811-1.5771855
In[80]:
df2 = df.copy()
df2
Out[80]:
ABCD
2017-01-120.0000000.0000000.7785675
2017-01-13-0.6583480.6224950.3886255
2017-01-140.5892191.3927920.6055455
2017-01-15-0.151958-0.655249-2.1147255
2017-01-16-1.3233043.1436590.6389965
2017-01-17-0.0249350.385811-1.5771855
In[81]:
df2[df2 > 0] = -df2
df2
Out[81]:
ABCD
2017-01-120.0000000.000000-0.778567-5
2017-01-13-0.658348-0.622495-0.388625-5
2017-01-14-0.589219-1.392792-0.605545-5
2017-01-15-0.151958-0.655249-2.114725-5
2017-01-16-1.323304-3.143659-0.638996-5
2017-01-17-0.024935-0.385811-1.577185-5

MissingData

In[83]:
df
Out[83]:
ABCD
2017-01-120.0000000.0000000.7785675
2017-01-13-0.6583480.6224950.3886255
2017-01-140.5892191.3927920.6055455
2017-01-15-0.151958-0.655249-2.1147255
2017-01-16-1.3233043.1436590.6389965
2017-01-17-0.0249350.385811-1.5771855
In[84]:
df1=df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1.loc[dates[0]:dates[1],'E']=1
df1
Out[84]:
ABCDE
2017-01-120.0000000.0000000.77856751.0
2017-01-13-0.6583480.6224950.38862551.0
2017-01-140.5892191.3927920.6055455NaN
2017-01-15-0.151958-0.655249-2.1147255NaN
In[85]:
df1.dropna(how="any")#Todropanyrowsthathavemissingdata
Out[85]:
ABCDE
2017-01-120.0000000.0000000.77856751.0
2017-01-13-0.6583480.6224950.38862551.0
In[86]:
df1.fillna(value=5)#Fillingmissingdata
Out[86]:
ABCDE
2017-01-120.0000000.0000000.77856751.0
2017-01-13-0.6583480.6224950.38862551.0
2017-01-140.5892191.3927920.60554555.0
2017-01-15-0.151958-0.655249-2.11472555.0
In[87]:
df1
Out[87]:
ABCDE
2017-01-120.0000000.0000000.77856751.0
2017-01-13-0.6583480.6224950.38862551.0
2017-01-140.5892191.3927920.6055455NaN
2017-01-15-0.151958-0.655249-2.1147255NaN
In[88]:
pd.isnull(df1)
Out[88]:
ABCDE
2017-01-12FalseFalseFalseFalseFalse
2017-01-13FalseFalseFalseFalseFalse
2017-01-14FalseFalseFalseFalseTrue
2017-01-15FalseFalseFalseFalseTrue
In[89]:
df1.isnull()
Out[89]:
ABCDE
2017-01-12FalseFalseFalseFalseFalse
2017-01-13FalseFalseFalseFalseFalse
2017-01-14FalseFalseFalseFalseTrue
2017-01-15FalseFalseFalseFalseTrue
In[90]:
df1.isna()#没有这个方法~~
---------------------------------------------------------------------------AttributeErrorTraceback(mostrecentcalllast)<ipython-input-90-9dd6d031e095>in<module>()---->1df1.isna()#没有这个方法~~D:\Users\asus\Anaconda3\lib\site-packages\pandas\core\generic.pyin__getattr__(self,name)2968ifnameinself._info_axis:2969returnself[name]->2970returnobject.__getattribute__(self,name)29712972def__setattr__(self,name,value):AttributeError:'DataFrame'objecthasnoattribute'isna'

Options

Stats

Operations in general exclude missing data. Performing a descriptive statistic: In [91]:
df
Out[91]:
ABCD
2017-01-120.0000000.0000000.7785675
2017-01-13-0.6583480.6224950.3886255
2017-01-140.5892191.3927920.6055455
2017-01-15-0.151958-0.655249-2.1147255
2017-01-16-1.3233043.1436590.6389965
2017-01-17-0.0249350.385811-1.5771855
In[92]:
df.mean()
Out[92]:
A-0.261554
B0.814918
C-0.213363
D5.000000
dtype:float64
In[93]:
df.mean(1)#Sameoperationontheotheraxis
Out[93]:
2017-01-121.444642
2017-01-131.338193
2017-01-141.896889
2017-01-150.519517
2017-01-161.864838
2017-01-170.945923
Freq:D,dtype:float64
In[94]:
s=pd.Series([1,2,3,np.nan,4,5],index=dates).shift(2)
#Operatingwithobjectsthathavedifferentdimensionalityandneedalignment.Inaddition,pandasautomaticallybroadcastsalongthespecifieddimension.
s
Out[94]:
2017-01-12NaN
2017-01-13NaN
2017-01-141.0
2017-01-152.0
2017-01-163.0
2017-01-17NaN
Freq:D,dtype:float64
In[95]:
df
Out[95]:
ABCD
2017-01-120.0000000.0000000.7785675
2017-01-13-0.6583480.6224950.3886255
2017-01-140.5892191.3927920.6055455
2017-01-15-0.151958-0.655249-2.1147255
2017-01-16-1.3233043.1436590.6389965
2017-01-17-0.0249350.385811-1.5771855
In[96]:
df.sub(s,axis="index")#dataFrame与series的减法
Out[96]:
ABCD
2017-01-12NaNNaNNaNNaN
2017-01-13NaNNaNNaNNaN
2017-01-14-0.4107810.392792-0.3944554.0
2017-01-15-2.151958-2.655249-4.1147253.0
2017-01-16-4.3233040.143659-2.3610042.0
2017-01-17NaNNaNNaNNaN

Apply

In[97]:
df
Out[97]:
ABCD
2017-01-120.0000000.0000000.7785675
2017-01-13-0.6583480.6224950.3886255
2017-01-140.5892191.3927920.6055455
2017-01-15-0.151958-0.655249-2.1147255
2017-01-16-1.3233043.1436590.6389965
2017-01-17-0.0249350.385811-1.5771855
In[98]:
df.apply(np.cumsum)#行叠加。
Out[98]:
ABCD
2017-01-120.0000000.0000000.7785675
2017-01-13-0.6583480.6224951.16719210
2017-01-14-0.0691292.0152861.77273715
2017-01-15-0.2210871.360038-0.34198820
2017-01-16-1.5443924.5036970.29700825
2017-01-17-1.5693264.889508-1.28017730
In[99]:
df.apply(lambdax:x.max()-x.min())
Out[99]:
A1.912523
B3.798908
C2.893293
D0.000000
dtype:float64

Histogramming

In[100]:
s=pd.Series(np.random.randint(0,7,size=10))
s
Out[100]:
04
15
22
30
45
53
64
73
83
90
dtype:int32
In[101]:
s.value_counts()
Out[101]:
33
52
42
02
21
dtype:int64

StringMethods

Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below. Note that pattern-matching in str generally uses regular expressions by default (and in some cases always uses them). See more at Vectorized String Methods. In [102]:
s=pd.Series(['A','B','C','Aaba','Baca',np.nan,'CABA','dog','cat'])
s.str.lower()
Out[102]:
0a
1b
2c
3aaba
4baca
5NaN
6caba
7dog
8cat
dtype:object
In[103]:
s
Out[103]:
0A
1B
2C
3Aaba
4Baca
5NaN
6CABA
7dog
8cat
dtype:object

Merge合并

Concat

pandas provides various facilities for easily combining together Series, DataFrame, and Panel objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations. See the Merging section. Concatenating pandas objects together with concat(): In [104]:
df
Out[104]:
ABCD
2017-01-120.0000000.0000000.7785675
2017-01-13-0.6583480.6224950.3886255
2017-01-140.5892191.3927920.6055455
2017-01-15-0.151958-0.655249-2.1147255
2017-01-16-1.3233043.1436590.6389965
2017-01-17-0.0249350.385811-1.5771855
In[105]:
df = pd.DataFrame(np.random.randn(10, 4))
df
Out[105]:
0123
00.111766-0.5051252.1560290.419152
11.0688701.1805870.3613451.090554
20.4889970.281507-0.738345-0.242974
3-1.8467091.686173-0.202319-1.151983
40.573012-1.9791891.5447681.594595
5-0.954571-0.6967880.270959-2.296720
6-1.5119461.7961130.3994930.412664
70.089844-0.545153-0.315653-0.235828
8-0.7471401.222900-1.6508120.292432
90.6598550.5012650.3639781.722914
In[106]:
# break it into pieces
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)
Out[106]:
0123
00.111766-0.5051252.1560290.419152
11.0688701.1805870.3613451.090554
20.4889970.281507-0.738345-0.242974
3-1.8467091.686173-0.202319-1.151983
40.573012-1.9791891.5447681.594595
5-0.954571-0.6967880.270959-2.296720
6-1.5119461.7961130.3994930.412664
70.089844-0.545153-0.315653-0.235828
8-0.7471401.222900-1.6508120.292432
90.6598550.5012650.3639781.722914
In[107]:
pieces
Out[107]:
[0123
00.111766-0.5051252.1560290.419152
11.0688701.1805870.3613451.090554
20.4889970.281507-0.738345-0.242974,
0123
3-1.8467091.686173-0.202319-1.151983
40.573012-1.9791891.5447681.594595
5-0.954571-0.6967880.270959-2.296720
6-1.5119461.7961130.3994930.412664,
0123
70.089844-0.545153-0.315653-0.235828
8-0.7471401.222900-1.6508120.292432
90.6598550.5012650.3639781.722914]

Join

SQL style merges. See the Database style joining section. In [108]:
left=pd.DataFrame({"key":["foo","foo"],"lval":[1,2]})
right=pd.DataFrame({'key':['foo','foo'],'rval':[4,5]})
In[109]:
left
Out[109]:
keylval
0foo1
1foo2
In[110]:
right
Out[110]:
keyrval
0foo4
1foo5
In[111]:
pd.merge(left,right,on="key")
Out[111]:
keylvalrval
0foo14
1foo15
2foo24
3foo25
In[112]:
left=pd.DataFrame({'key':['foo','bar'],'lval':[1,2]})
right=pd.DataFrame({'key':['foo','bar'],'rval':[4,5]})
In[113]:
left
Out[113]:
keylval
0foo1
1bar2
In[114]:
right
Out[114]:
keyrval
0foo4
1bar5
In[115]:
pd.merge(left,right,on="key")
Out[115]:
keylvalrval
0foo14
1bar25

Append

In[116]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df
Out[116]:
ABCD
0-0.8524511.074357-0.5918920.950982
1-0.9775801.6563740.6936570.718832
20.303269-0.881728-1.5093211.219849
30.6557511.2356601.7290381.074948
40.658413-1.215348-1.1396230.753772
51.3451151.420212-0.124543-0.099265
61.1296230.597484-0.804759-0.568266
7-0.7705700.540917-0.261607-0.083751
In[117]:
s = df.iloc[3]
s
Out[117]:
A0.655751
B1.235660
C1.729038
D1.074948
Name:3,dtype:float64
In[118]:
df.append(s,ignore_index=True)
Out[118]:
ABCD
0-0.8524511.074357-0.5918920.950982
1-0.9775801.6563740.6936570.718832
20.303269-0.881728-1.5093211.219849
30.6557511.2356601.7290381.074948
40.658413-1.215348-1.1396230.753772
51.3451151.420212-0.124543-0.099265
61.1296230.597484-0.804759-0.568266
7-0.7705700.540917-0.261607-0.083751
80.6557511.2356601.7290381.074948

Grouping

By "group by" we are referring to a process involving one or more of the following steps: • Splitting the data into groups based on some criteria • Applying a function to each group independently • Combining the results into a data structure. In [119]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
df
Out[119]:
ABCD
0fooone-0.523738-1.363519
1barone-0.071920-2.618027
2footwo-2.712421-0.407372
3barthree-0.635898-1.942854
4footwo0.952073-0.546110
5bartwo1.474296-0.982238
6fooone-0.529788-0.213397
7foothree0.877394-0.791663
In[120]:
df.groupby("A").sum()
Out[120]:
CD
A
bar0.766479-5.543120
foo-1.936480-3.322062
In[121]:
df.groupby(["A","B"]).sum()#Groupingbymultiplecolumnsformsahierarchicalindex,whichwethenapplythefunction.
Out[121]:
CD
AB
barone-0.071920-2.618027
three-0.635898-1.942854
two1.474296-0.982238
fooone-1.053527-1.576917
three0.877394-0.791663
two-1.760347-0.953482

Reshaping

Stack

In[122]:
tuples = list(zip([['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
tuples
Out[122]:
[(['bar','bar','baz','baz','foo','foo','qux','qux'],),
(['one','two','one','two','one','two','one','two'],)]
In[123]:
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
tuples
Out[123]:
[('bar','one'),
('bar','two'),
('baz','one'),
('baz','two'),
('foo','one'),
('foo','two'),
('qux','one'),
('qux','two')]
In[124]:
index=pd.MultiIndex.from_tuples(tuples,names=["first","second"])
index
Out[124]:
MultiIndex(levels=[['bar','baz','foo','qux'],['one','two']],
labels=[[0,0,1,1,2,2,3,3],[0,1,0,1,0,1,0,1]],
names=['first','second'])
In[125]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df
Out[125]:
AB
firstsecond
barone-1.101051-1.126231
two-0.395652-0.313567
bazone1.378579-1.637869
two0.665960-0.259749
fooone-0.2561811.260131
two-0.9947200.506272
quxone-0.4226490.191402
two-0.1020850.975210
In[126]:
df2 = df[:4]
df2
Out[126]:
AB
firstsecond
barone-1.101051-1.126231
two-0.395652-0.313567
bazone1.378579-1.637869
two0.665960-0.259749
In[127]:
stacked=df2.stack()
stacked
Out[127]:
firstsecond
baroneA-1.101051
B-1.126231
twoA-0.395652
B-0.313567
bazoneA1.378579
B-1.637869
twoA0.665960
B-0.259749
dtype:float64
With a "stacked" DataFrame or Series (having a MultiIndex as the index), the inverse operation of stack() is unstack(), which by default unstacks the last level: In [128]:
stacked.unstack()
Out[128]:
AB
firstsecond
barone-1.101051-1.126231
two-0.395652-0.313567
bazone1.378579-1.637869
two0.665960-0.259749
In[129]:
stacked.unstack(1)
Out[129]:
secondonetwo
first
barA-1.101051-0.395652
B-1.126231-0.313567
bazA1.3785790.665960
B-1.637869-0.259749
In[130]:
stacked.unstack(0)
Out[130]:
firstbarbaz
second
oneA-1.1010511.378579
B-1.126231-1.637869
twoA-0.3956520.665960
B-0.313567-0.259749

PivotTables

In[131]:
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['A', 'B', 'C'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})
df
Out[131]:
ABCDE
0oneAfoo0.0392300.134261
1oneBfoo0.952890-0.499183
2twoCfoo-0.778814-0.655735
3threeAbar0.7988640.025109
4oneBbar-0.580050-1.711672
5oneCbar0.004300-0.433591
6twoAfoo0.229248-2.648814
7threeBfoo0.5064880.630373
8oneCfoo-0.3156670.031764
9oneAbar-1.5474100.743825
10twoBbar-0.4809580.365255
11threeCbar1.7429480.692884
In[4]:
pd.pivot_table(df,values="D",index=["A","B"],columns=["C"])
Out[4]:
Cbarfoo
AB
oneA0.932814-1.440079
B0.0602521.071877
C2.8797790.355274
threeA-0.328442NaN
BNaN-2.544812
C-1.879058NaN
twoANaN-1.987377
B0.220517NaN
CNaN-0.082820

TimeSeries

pandas has simple, powerful, and efficient functionality for performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not limited to, financial applications. In [132]:
rng=pd.date_range("1/2/2017",periods=100,freq="S")
rng
Out[132]:
DatetimeIndex(['2017-01-0200:00:00','2017-01-0200:00:01',
'2017-01-0200:00:02','2017-01-0200:00:03',
'2017-01-0200:00:04','2017-01-0200:00:05',
'2017-01-0200:00:06','2017-01-0200:00:07',
'2017-01-0200:00:08','2017-01-0200:00:09',
'2017-01-0200:00:10','2017-01-0200:00:11',
'2017-01-0200:00:12','2017-01-0200:00:13',
'2017-01-0200:00:14','2017-01-0200:00:15',
'2017-01-0200:00:16','2017-01-0200:00:17',
'2017-01-0200:00:18','2017-01-0200:00:19',
'2017-01-0200:00:20','2017-01-0200:00:21',
'2017-01-0200:00:22','2017-01-0200:00:23',
'2017-01-0200:00:24','2017-01-0200:00:25',
'2017-01-0200:00:26','2017-01-0200:00:27',
'2017-01-0200:00:28','2017-01-0200:00:29',
'2017-01-0200:00:30','2017-01-0200:00:31',
'2017-01-0200:00:32','2017-01-0200:00:33',
'2017-01-0200:00:34','2017-01-0200:00:35',
'2017-01-0200:00:36','2017-01-0200:00:37',
'2017-01-0200:00:38','2017-01-0200:00:39',
'2017-01-0200:00:40','2017-01-0200:00:41',
'2017-01-0200:00:42','2017-01-0200:00:43',
'2017-01-0200:00:44','2017-01-0200:00:45',
'2017-01-0200:00:46','2017-01-0200:00:47',
'2017-01-0200:00:48','2017-01-0200:00:49',
'2017-01-0200:00:50','2017-01-0200:00:51',
'2017-01-0200:00:52','2017-01-0200:00:53',
'2017-01-0200:00:54','2017-01-0200:00:55',
'2017-01-0200:00:56','2017-01-0200:00:57',
'2017-01-0200:00:58','2017-01-0200:00:59',
'2017-01-0200:01:00','2017-01-0200:01:01',
'2017-01-0200:01:02','2017-01-0200:01:03',
'2017-01-0200:01:04','2017-01-0200:01:05',
'2017-01-0200:01:06','2017-01-0200:01:07',
'2017-01-0200:01:08','2017-01-0200:01:09',
'2017-01-0200:01:10','2017-01-0200:01:11',
'2017-01-0200:01:12','2017-01-0200:01:13',
'2017-01-0200:01:14','2017-01-0200:01:15',
'2017-01-0200:01:16','2017-01-0200:01:17',
'2017-01-0200:01:18','2017-01-0200:01:19',
'2017-01-0200:01:20','2017-01-0200:01:21',
'2017-01-0200:01:22','2017-01-0200:01:23',
'2017-01-0200:01:24','2017-01-0200:01:25',
'2017-01-0200:01:26','2017-01-0200:01:27',
'2017-01-0200:01:28','2017-01-0200:01:29',
'2017-01-0200:01:30','2017-01-0200:01:31',
'2017-01-0200:01:32','2017-01-0200:01:33',
'2017-01-0200:01:34','2017-01-0200:01:35',
'2017-01-0200:01:36','2017-01-0200:01:37',
'2017-01-0200:01:38','2017-01-0200:01:39'],
dtype='datetime64[ns]',freq='S')
In[133]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts
Out[133]:
2017-01-0200:00:00251
2017-01-0200:00:0163
2017-01-0200:00:02108
2017-01-0200:00:03288
2017-01-0200:00:04491
2017-01-0200:00:05490
2017-01-0200:00:06343
2017-01-0200:00:07357
2017-01-0200:00:0872
2017-01-0200:00:09171
2017-01-0200:00:10324
2017-01-0200:00:11281
2017-01-0200:00:12176
2017-01-0200:00:1314
2017-01-0200:00:14495
2017-01-0200:00:15150
2017-01-0200:00:1669
2017-01-0200:00:17144
2017-01-0200:00:18126
2017-01-0200:00:19368
2017-01-0200:00:20129
2017-01-0200:00:21386
2017-01-0200:00:22228
2017-01-0200:00:23458
2017-01-0200:00:2498
2017-01-0200:00:25244
2017-01-0200:00:26206
2017-01-0200:00:2798
2017-01-0200:00:2892
2017-01-0200:00:29259
...
2017-01-0200:01:10127
2017-01-0200:01:11342
2017-01-0200:01:12185
2017-01-0200:01:13123
2017-01-0200:01:1473
2017-01-0200:01:15132
2017-01-0200:01:16462
2017-01-0200:01:17317
2017-01-0200:01:18180
2017-01-0200:01:19247
2017-01-0200:01:2097
2017-01-0200:01:21401
2017-01-0200:01:22342
2017-01-0200:01:23382
2017-01-0200:01:24304
2017-01-0200:01:2547
2017-01-0200:01:26193
2017-01-0200:01:27334
2017-01-0200:01:28196
2017-01-0200:01:29297
2017-01-0200:01:30195
2017-01-0200:01:31236
2017-01-0200:01:32200
2017-01-0200:01:33490
2017-01-0200:01:34196
2017-01-0200:01:35201
2017-01-0200:01:36397
2017-01-0200:01:37494
2017-01-0200:01:38482
2017-01-0200:01:39267
Freq:S,Length:100,dtype:int32
In[7]:
ts.resample("5Min").sum()
Out[7]:
2017-01-0222939
Freq:5T,dtype:int32
In[9]:
ts.resample("1Min").sum()
Out[9]:
2017-01-0200:00:0013896
2017-01-0200:01:009043
Freq:T,dtype:int32
Timezonerepresentation.零时区UTC表示。In[10]:
rng=pd.date_range("2/1/201700:00",periods=5,freq="D")
rng
Out[10]:
DatetimeIndex(['2017-02-01','2017-02-02','2017-02-03','2017-02-04',
'2017-02-05'],
dtype='datetime64[ns]',freq='D')
In[12]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
Out[12]:
2017-02-010.329594
2017-02-022.097319
2017-02-031.852023
2017-02-04-0.213452
2017-02-050.160873
Freq:D,dtype:float64
In[13]:
tsUtc=ts.tz_localize("UTC")
tsUtc
Out[13]:
2017-02-0100:00:00+00:000.329594
2017-02-0200:00:00+00:002.097319
2017-02-0300:00:00+00:001.852023
2017-02-0400:00:00+00:00-0.213452
2017-02-0500:00:00+00:000.160873
Freq:D,dtype:float64
Converttoanothertimezone.时区转换。In[14]:
tsUtc.tz_convert("US/Eastern")
Out[14]:
2017-01-3119:00:00-05:000.329594
2017-02-0119:00:00-05:002.097319
2017-02-0219:00:00-05:001.852023
2017-02-0319:00:00-05:00-0.213452
2017-02-0419:00:00-05:000.160873
Freq:D,dtype:float64
In[15]:
tsUtc
Out[15]:
2017-02-0100:00:00+00:000.329594
2017-02-0200:00:00+00:002.097319
2017-02-0300:00:00+00:001.852023
2017-02-0400:00:00+00:00-0.213452
2017-02-0500:00:00+00:000.160873
Freq:D,dtype:float64
ConvertingbetweentimespanrepresentationsIn[16]:
rng=pd.date_range("1/8/2017",periods=5,freq="M")
rng
Out[16]:
DatetimeIndex(['2017-01-31','2017-02-28','2017-03-31','2017-04-30',
'2017-05-31'],
dtype='datetime64[ns]',freq='M')
In[18]:
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
Out[18]:
2017-01-310.904523
2017-02-28-0.470144
2017-03-31-0.373244
2017-04-300.860448
2017-05-310.176226
Freq:M,dtype:float64
In[20]:
ps = ts.to_period()
ps
Out[20]:
2017-010.904523
2017-02-0.470144
2017-03-0.373244
2017-040.860448
2017-050.176226
Freq:M,dtype:float64
In[21]:
ps.to_timestamp()
Out[21]:
2017-01-010.904523
2017-02-01-0.470144
2017-03-01-0.373244
2017-04-010.860448
2017-05-010.176226
Freq:MS,dtype:float64
In[22]:
ps
Out[22]:
2017-010.904523
2017-02-0.470144
2017-03-0.373244
2017-040.860448
2017-050.176226
Freq:M,dtype:float64
Converting between period and timestamp enables some convenient arithmetic functions to be used. In the following example, we convert a quarterly frequency with year ending in November to 9am of the end of the month following the quarter end: In [23]:
prng=pd.period_range("1990Q1","2017Q4",freq="Q-NOV")
prng
Out[23]:
PeriodIndex(['1990Q1','1990Q2','1990Q3','1990Q4','1991Q1','1991Q2',
'1991Q3','1991Q4','1992Q1','1992Q2',
...
'2015Q3','2015Q4','2016Q1','2016Q2','2016Q3','2016Q4',
'2017Q1','2017Q2','2017Q3','2017Q4'],
dtype='period[Q-NOV]',length=112,freq='Q-NOV')
In[25]:
ts=pd.Series(np.random.randn(len(prng)),prng)
ts.head()
Out[25]:
1990Q11.193031
1990Q20.621627
1990Q3-0.235553
1990Q40.642938
1991Q10.247024
Freq:Q-NOV,dtype:float64
In[26]:
ts.index=(prng.asfreq("M","e")+1).asfreq("H","s")+9
ts.head()
Out[26]:
1990-03-0109:001.193031
1990-06-0109:000.621627
1990-09-0109:00-0.235553
1990-12-0109:000.642938
1991-03-0109:000.247024
Freq:H,dtype:float64

Categoricals

In[34]:
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "a", "c", "b", "b", "f"]})
df
Out[34]:
idraw_grade
01a
12a
23c
34b
45b
56f
Converttherawgradestoacategoricaldatatype.In[35]:
df["grade"] = df.raw_grade.astype("category")
df
Out[35]:
idraw_gradegrade
01aa
12aa
23cc
34bb
45bb
56ff
In[36]:
df.grade#Converttherawgradestoacategoricaldatatype
Out[36]:
0a
1a
2c
3b
4b
5f
Name:grade,dtype:category
Categories(4,object):[a,b,c,f]
In[37]:
# Rename the categories to more meaningful names (assigning to Series.cat.categories is inplace!)
df.grade.cat.categories = ["verygood", "good", "nomal", "bad"]
df
Out[37]:
idraw_gradegrade
01averygood
12averygood
23cnomal
34bgood
45bgood
56fbad
In[38]:
#Reorderthecategoriesandsimultaneouslyaddthemissingcategories(methodsunderSeries.catreturnanewSeriesperdefault).

df.grade=df.grade.cat.set_categories(["verybad","bad","medium","good","verygood"])
df.grade
Out[38]:
0verygood
1verygood
2NaN
3good
4good
5bad
Name:grade,dtype:category
Categories(5,object):[verybad,bad,medium,good,verygood]
In[39]:
df
Out[39]:
idraw_gradegrade
01averygood
12averygood
23cNaN
34bgood
45bgood
56fbad
Sortingisperorderinthecategories,notlexicalorderIn[40]:
df.sort_values(by="grade")
Out[40]:
idraw_gradegrade
23cNaN
56fbad
34bgood
45bgood
01averygood
12averygood
GroupingbyacategoricalcolumnshowsalsoemptycategoriesIn[41]:
df.groupby("grade").size()
Out[41]:
grade
verybad0
bad1
medium0
good2
verygood2
dtype:int64

Plotting

In[43]:
ts=pd.Series(np.random.randn(1000),index=pd.date_range("1/1/2017",periods=1000))
ts.head()
Out[43]:
2017-01-01-0.745067
2017-01-02-0.070895
2017-01-030.233542
2017-01-04-0.206597
2017-01-050.891064
Freq:D,dtype:float64
In[45]:
ts=ts.cumsum()
ts.head()
Out[45]:
2017-01-01-0.745067
2017-01-02-1.561029
2017-01-03-2.143449
2017-01-04-2.932466
2017-01-05-2.830418
Freq:D,dtype:float64
In[48]:
ts.plot()
Out[48]:
<matplotlib.axes._subplots.AxesSubplotat0x19bf6a6e278>
In[50]:
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=["A", "B", "C", "D"])
df.head()
Out[50]:
ABCD
2017-01-01-1.940139-0.476590-0.1540661.692812
2017-01-020.3998910.2689760.596209-0.484979
2017-01-030.814519-0.142193-0.084394-0.687342
2017-01-040.385848-1.230059-0.093327-0.096652
2017-01-050.407435-0.8493470.3791920.172933
In[51]:
df=df.cumsum()
In[53]:
plt.figure()
df.plot()
plt.legend(loc="best")
plt.show()
<matplotlib.figure.Figureat0x19bf8855da0>
<matplotlib.figure.Figureat0x19bf897dc88>

GettingDataIn/Out

CSV

In[]:
df.to_csv("foo.csv")
In[]:
pd.read_csv("foo.csv")

HDF5

In[]:
df.to_hdf("foo.h5","df")
In[]:
pd.read_hdf("foo.h5","df")

Excel

In[]:
df.to_excel('foo.xlsx',sheet_name='Sheet1')
In[]:
pd.read_excel('foo.xlsx','Sheet1',index_col=None,na_values=['NA'])
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:

                                            
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python pandas