Pandas10分钟入门
代码下载地址 [http://download.csdn.net/download/sirwill/10043185]

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
ObjectCreation
In[20]:
s = pd.Series([1, 2, 3, np.nan, 5, 6])  # Series 类型数组。
s
Out[20]:
01.0
12.0
23.0
3NaN
45.0
56.0
dtype:float64
In[21]:
dates = pd.date_range("20170112", periods=6)  # Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns
dates
Out[21]:
DatetimeIndex(['2017-01-12','2017-01-13','2017-01-14','2017-01-15',
'2017-01-16','2017-01-17'],
dtype='datetime64[ns]',freq='D')
In[22]:
list(dates)
dates.date
Out[22]:
array([datetime.date(2017,1,12),datetime.date(2017,1,13),
datetime.date(2017,1,14),datetime.date(2017,1,15),
datetime.date(2017,1,16),datetime.date(2017,1,17)],dtype=object)
In[23]:
list(dates.date)
Out[23]:
[datetime.date(2017,1,12),
datetime.date(2017,1,13),
datetime.date(2017,1,14),
datetime.date(2017,1,15),
datetime.date(2017,1,16),
datetime.date(2017,1,17)]
In[24]:
dates.year
Out[24]:
Int64Index([2017,2017,2017,2017,2017,2017],dtype='int64')
In[25]:
list(dates.year)
Out[25]:
[2017,2017,2017,2017,2017,2017]
In[26]:
list(dates.day)
Out[26]:
[12,13,14,15,16,17]
In[27]:
str(dates.date)
Out[27]:
'[datetime.date(2017,1,12)datetime.date(2017,1,13)\ndatetime.date(2017,1,14)datetime.date(2017,1,15)\ndatetime.date(2017,1,16)datetime.date(2017,1,17)]'
In[28]:
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list("ABCD"))
df
Out[28]:
| | A | B | C | D |
---|
2017-01-12 | -2.258121 | 2.456196 | 0.778567 | -2.030407 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -0.669839 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | -0.021460 |
---|
In[29]:
df2=pd.DataFrame({'A':1.,
'B':pd.Timestamp('20130102'),
'C':pd.Series(1,index=list(range(4)),dtype='float32'),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'})  # Creating a DataFrame by passing a dict of objects that can be converted to series-like.
df2
Out[29]:
| | A | B | C | D | E | F |
---|
0 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
---|
1 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
---|
2 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
---|
3 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
---|
In[30]:
df2.dtypes
Out[30]:
Afloat64
Bdatetime64[ns]
Cfloat32
Dint32
Ecategory
Fobject
dtype:object
In[31]:
df.dtypes
Out[31]:
Afloat64
Bfloat64
Cfloat64
Dfloat64
dtype:object
In[32]:
df2.<TAB>  # 使用 jupyter 时按 tab 键,可以看到代码提示。
File "<ipython-input-32-9c4c8dafe199>", line 1
df2.<TAB>  # If you're using IPython, tab completion for column names (as well as public attributes) is automatically enabled.
^
SyntaxError:invalidsyntax
ViewingData
In[36]:
df.head()
Out[36]:
| | A | B | C | D |
---|
2017-01-12 | -2.258121 | 2.456196 | 0.778567 | -2.030407 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -0.669839 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 |
---|
In[37]:
df.index
Out[37]:
DatetimeIndex(['2017-01-12','2017-01-13','2017-01-14','2017-01-15',
'2017-01-16','2017-01-17'],
dtype='datetime64[ns]',freq='D')
In[38]:
df.columns
Out[38]:
Index(['A','B','C','D'],dtype='object')
In[39]:
df.values
Out[39]:
array([[-2.2581213,2.45619592,0.77856734,-2.030407],
[-0.65834822,0.62249451,0.38862467,0.07358728],
[0.58921899,1.39279193,0.60554535,1.23153815],
[-0.1519579,-0.65524863,-2.1147252,-0.66983949],
[-1.32330447,3.14365936,0.63899562,0.89868346],
[-0.02493461,0.3858107,-1.57718486,-0.0214603]])
In[40]:
df.describe()
Out[40]:
| | A | B | C | D |
---|
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
---|
mean | -0.637908 | 1.224284 | -0.213363 | -0.086316 |
---|
std | 1.021078 | 1.401987 | 1.282079 | 1.171045 |
---|
min | -2.258121 | -0.655249 | -2.114725 | -2.030407 |
---|
25% | -1.157065 | 0.444982 | -1.085732 | -0.507745 |
---|
50% | -0.405153 | 1.007643 | 0.497085 | 0.026063 |
---|
75% | -0.056690 | 2.190345 | 0.630633 | 0.692409 |
---|
max | 0.589219 | 3.143659 | 0.778567 | 1.231538 |
---|
In[41]:
df
Out[41]:
| | A | B | C | D |
---|
2017-01-12 | -2.258121 | 2.456196 | 0.778567 | -2.030407 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -0.669839 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | -0.021460 |
---|
In[42]:
df.T
Out[42]:
| | 2017-01-1200:00:00 | 2017-01-1300:00:00 | 2017-01-1400:00:00 | 2017-01-1500:00:00 | 2017-01-1600:00:00 | 2017-01-1700:00:00 |
---|
A | -2.258121 | -0.658348 | 0.589219 | -0.151958 | -1.323304 | -0.024935 |
---|
B | 2.456196 | 0.622495 | 1.392792 | -0.655249 | 3.143659 | 0.385811 |
---|
C | 0.778567 | 0.388625 | 0.605545 | -2.114725 | 0.638996 | -1.577185 |
---|
D | -2.030407 | 0.073587 | 1.231538 | -0.669839 | 0.898683 | -0.021460 |
---|
In[43]:
df.sort_index(axis=1, ascending=False)  # Sorting by an axis 排序。
Out[43]:
| | D | C | B | A |
---|
2017-01-12 | -2.030407 | 0.778567 | 2.456196 | -2.258121 |
---|
2017-01-13 | 0.073587 | 0.388625 | 0.622495 | -0.658348 |
---|
2017-01-14 | 1.231538 | 0.605545 | 1.392792 | 0.589219 |
---|
2017-01-15 | -0.669839 | -2.114725 | -0.655249 | -0.151958 |
---|
2017-01-16 | 0.898683 | 0.638996 | 3.143659 | -1.323304 |
---|
2017-01-17 | -0.021460 | -1.577185 | 0.385811 | -0.024935 |
---|
In[44]:
df.sort_values(by="B")  # Sorting by values
Out[44]:
| | A | B | C | D |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -0.669839 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | -0.021460 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
2017-01-12 | -2.258121 | 2.456196 | 0.778567 | -2.030407 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 |
---|
In[45]:
df
Out[45]:
| | A | B | C | D |
---|
2017-01-12 | -2.258121 | 2.456196 | 0.778567 | -2.030407 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -0.669839 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | -0.021460 |
---|
Selection
Getting
In[46]:
df["A"]#Selectingasinglecolumn,whichyieldsaSeries,equivalenttodf.A
Out[46]:
2017-01-12-2.258121
2017-01-13-0.658348
2017-01-140.589219
2017-01-15-0.151958
2017-01-16-1.323304
2017-01-17-0.024935
Freq:D,Name:A,dtype:float64
In[47]:
df.A
Out[47]:
2017-01-12-2.258121
2017-01-13-0.658348
2017-01-140.589219
2017-01-15-0.151958
2017-01-16-1.323304
2017-01-17-0.024935
Freq:D,Name:A,dtype:float64
In[48]:
df[0:3]  # Selecting via [], which slices the rows.
Out[48]:
| | A | B | C | D |
---|
2017-01-12 | -2.258121 | 2.456196 | 0.778567 | -2.030407 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
In[49]:
df["2017-01-13":"2017-01-17"]
Out[49]:
| | A | B | C | D |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -0.669839 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | -0.021460 |
---|
SelectionbyLabel
In[50]:
dates
Out[50]:
DatetimeIndex(['2017-01-12','2017-01-13','2017-01-14','2017-01-15',
'2017-01-16','2017-01-17'],
dtype='datetime64[ns]',freq='D')
In[51]:
df.loc[dates[0]]  # For getting a cross section using a label
Out[51]:
A-2.258121
B2.456196
C0.778567
D-2.030407
Name:2017-01-1200:00:00,dtype:float64
In[52]:
df.loc[:,["A","B"]]
Out[52]:
| | A | B |
---|
2017-01-12 | -2.258121 | 2.456196 |
---|
2017-01-13 | -0.658348 | 0.622495 |
---|
2017-01-14 | 0.589219 | 1.392792 |
---|
2017-01-15 | -0.151958 | -0.655249 |
---|
2017-01-16 | -1.323304 | 3.143659 |
---|
2017-01-17 | -0.024935 | 0.385811 |
---|
In[53]:
df.loc['20170112':'20170116', ['A', 'B']]  # Showing label slicing, both endpoints are included
Out[53]:
| | A | B |
---|
2017-01-12 | -2.258121 | 2.456196 |
---|
2017-01-13 | -0.658348 | 0.622495 |
---|
2017-01-14 | 0.589219 | 1.392792 |
---|
2017-01-15 | -0.151958 | -0.655249 |
---|
2017-01-16 | -1.323304 | 3.143659 |
---|
In[54]:
df.loc["20170115",["A","B"]]
Out[54]:
A-0.151958
B-0.655249
Name:2017-01-1500:00:00,dtype:float64
In[55]:
df.loc[dates[3],"D"]#Forgettingascalarvalue
Out[55]:
-0.6698394854437093
In[56]:
df.at[dates[3],"D"]#Forgettingfastaccesstoascalar(equivtothepriormethod)
Out[56]:
-0.6698394854437093
SelectionbyPosition
In[57]:
df.iloc[3]  # Select via the position of the passed integers
Out[57]:
A-0.151958
B-0.655249
C-2.114725
D-0.669839
Name:2017-01-1500:00:00,dtype:float64
In[58]:
df.iloc[2:5, 0:2]  # By integer slices, acting similar to numpy/python
Out[58]:
| | A | B |
---|
2017-01-14 | 0.589219 | 1.392792 |
---|
2017-01-15 | -0.151958 | -0.655249 |
---|
2017-01-16 | -1.323304 | 3.143659 |
---|
In[59]:
df.iloc[[1, 3, 4], [0, 2]]  # By lists of integer position locations, similar to the numpy/python style
Out[59]:
| | A | C |
---|
2017-01-13 | -0.658348 | 0.388625 |
---|
2017-01-15 | -0.151958 | -2.114725 |
---|
2017-01-16 | -1.323304 | 0.638996 |
---|
In[60]:
df.iloc[1:3,:]
Out[60]:
| | A | B | C | D |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
In[61]:
df.iloc[:,1:3]
Out[61]:
| | B | C |
---|
2017-01-12 | 2.456196 | 0.778567 |
---|
2017-01-13 | 0.622495 | 0.388625 |
---|
2017-01-14 | 1.392792 | 0.605545 |
---|
2017-01-15 | -0.655249 | -2.114725 |
---|
2017-01-16 | 3.143659 | 0.638996 |
---|
2017-01-17 | 0.385811 | -1.577185 |
---|
In[62]:
df.iloc[1,1]#Forgettingavalueexplicitly
Out[62]:
0.62249451281708756
In[63]:
df.iat[1,1]#Forgettingfastaccesstoascalar(equivtothepriormethod)
Out[63]:
0.62249451281708756
BooleanIndexing
In[64]:
df[df.A > 0]  # Using a single column's values to select data
Out[64]:
| | A | B | C | D |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
In[65]:
df[df > 0]  # Selecting values from a DataFrame where a boolean condition is met
Out[65]:
| | A | B | C | D |
---|
2017-01-12 | NaN | 2.456196 | 0.778567 | NaN |
---|
2017-01-13 | NaN | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
2017-01-15 | NaN | NaN | NaN | NaN |
---|
2017-01-16 | NaN | 3.143659 | 0.638996 | 0.898683 |
---|
2017-01-17 | NaN | 0.385811 | NaN | NaN |
---|
In[66]:
df2
Out[66]:
| | A | B | C | D | E | F |
---|
0 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
---|
1 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
---|
2 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
---|
3 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
---|
In[67]:
df
Out[67]:
| | A | B | C | D |
---|
2017-01-12 | -2.258121 | 2.456196 | 0.778567 | -2.030407 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -0.669839 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | -0.021460 |
---|
In[68]:
df2 = df.copy()
df2
Out[68]:
| | A | B | C | D |
---|
2017-01-12 | -2.258121 | 2.456196 | 0.778567 | -2.030407 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -0.669839 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | -0.021460 |
---|
In[69]:
df.equals(df2)
Out[69]:
True
In[70]:
df==df2
Out[70]:
| | A | B | C | D |
---|
2017-01-12 | True | True | True | True |
---|
2017-01-13 | True | True | True | True |
---|
2017-01-14 | True | True | True | True |
---|
2017-01-15 | True | True | True | True |
---|
2017-01-16 | True | True | True | True |
---|
2017-01-17 | True | True | True | True |
---|
In[71]:
dfisdf2
Out[71]:
False
In[72]:
df2["E"]=["one","one","two","three","four","three"]df2
Out[72]:
| | A | B | C | D | E |
---|
2017-01-12 | -2.258121 | 2.456196 | 0.778567 | -2.030407 | one |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 | one |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 | two |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -0.669839 | three |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 | four |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | -0.021460 | three |
---|
In[73]:
df2[df2.E.isin(["two","four"])]
Out[73]:
| | A | B | C | D | E |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 | two |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 | four |
---|
In[74]:
df2[df2["E"].isin(["two","four"])]
Out[74]:
| | A | B | C | D | E |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 | two |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 | four |
---|
Setting
In[75]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20171016", periods=6))  # Setting a new column automatically aligns the data by the indexes
s1
Out[75]:
2017-10-161
2017-10-172
2017-10-183
2017-10-194
2017-10-205
2017-10-216
Freq:D,dtype:int64
In[76]:
df.at[dates[0],"A"]=0#Settingvaluesbylabel
In[77]:
df
Out[77]:
| | A | B | C | D |
---|
2017-01-12 | 0.000000 | 2.456196 | 0.778567 | -2.030407 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -0.669839 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | -0.021460 |
---|
In[78]:
df.iat[0, 1] = 0  # Setting values by position
df
Out[78]:
| | A | B | C | D |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | -2.030407 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 0.073587 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 1.231538 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -0.669839 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 0.898683 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | -0.021460 |
---|
In[79]:
df.loc[:,"D"]=np.array([5]*len(df))#Settingbyassigningwithanumpyarraydf
Out[79]:
| | A | B | C | D |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 5 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 5 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | 5 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 5 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | 5 |
---|
In[80]:
df2 = df.copy()
df2
Out[80]:
| | A | B | C | D |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 5 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 5 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | 5 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 5 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | 5 |
---|
In[81]:
df2[df2 > 0] = -df2
df2
Out[81]:
| | A | B | C | D |
---|
2017-01-12 | 0.000000 | 0.000000 | -0.778567 | -5 |
---|
2017-01-13 | -0.658348 | -0.622495 | -0.388625 | -5 |
---|
2017-01-14 | -0.589219 | -1.392792 | -0.605545 | -5 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | -5 |
---|
2017-01-16 | -1.323304 | -3.143659 | -0.638996 | -5 |
---|
2017-01-17 | -0.024935 | -0.385811 | -1.577185 | -5 |
---|
MissingData
In[83]:
df
Out[83]:
| | A | B | C | D |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 5 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 5 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | 5 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 5 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | 5 |
---|
In[84]:
df1=df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1.loc[dates[0]:dates[1],'E']=1
df1
Out[84]:
| | A | B | C | D | E |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 | 1.0 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 5 | 1.0 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 5 | NaN |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | 5 | NaN |
---|
In[85]:
df1.dropna(how="any")#Todropanyrowsthathavemissingdata
Out[85]:
| | A | B | C | D | E |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 | 1.0 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 5 | 1.0 |
---|
In[86]:
df1.fillna(value=5)#Fillingmissingdata
Out[86]:
| | A | B | C | D | E |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 | 1.0 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 5 | 1.0 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 5 | 5.0 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | 5 | 5.0 |
---|
In[87]:
df1
Out[87]:
| | A | B | C | D | E |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 | 1.0 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 5 | 1.0 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 5 | NaN |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | 5 | NaN |
---|
In[88]:
pd.isnull(df1)
Out[88]:
| | A | B | C | D | E |
---|
2017-01-12 | False | False | False | False | False |
---|
2017-01-13 | False | False | False | False | False |
---|
2017-01-14 | False | False | False | False | True |
---|
2017-01-15 | False | False | False | False | True |
---|
In[89]:
df1.isnull()
Out[89]:
| | A | B | C | D | E |
---|
2017-01-12 | False | False | False | False | False |
---|
2017-01-13 | False | False | False | False | False |
---|
2017-01-14 | False | False | False | False | True |
---|
2017-01-15 | False | False | False | False | True |
---|
In[90]:
df1.isna()  # 没有这个方法~~(此处使用的旧版 pandas 仅提供 isnull();较新版本才加入 isna())
---------------------------------------------------------------------------AttributeErrorTraceback(mostrecentcalllast)<ipython-input-90-9dd6d031e095>in<module>()---->1df1.isna()#没有这个方法~~D:\Users\asus\Anaconda3\lib\site-packages\pandas\core\generic.pyin__getattr__(self,name)2968ifnameinself._info_axis:2969returnself[name]->2970returnobject.__getattribute__(self,name)29712972def__setattr__(self,name,value):AttributeError:'DataFrame'objecthasnoattribute'isna'
Options
Stats
Operations in general exclude missing data. Performing a descriptive statistic:
In [91]:
df
Out[91]:
| | A | B | C | D |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 5 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 5 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | 5 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 5 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | 5 |
---|
In[92]:
df.mean()
Out[92]:
A-0.261554
B0.814918
C-0.213363
D5.000000
dtype:float64
In[93]:
df.mean(1)  # Same operation on the other axis
Out[93]:
2017-01-121.444642
2017-01-131.338193
2017-01-141.896889
2017-01-150.519517
2017-01-161.864838
2017-01-170.945923
Freq:D,dtype:float64
In[94]:
s=pd.Series([1,2,3,np.nan,4,5],index=dates).shift(2)
# Operating with objects that have different dimensionality and need alignment. In addition, pandas automatically broadcasts along the specified dimension.
s
Out[94]:
2017-01-12NaN
2017-01-13NaN
2017-01-141.0
2017-01-152.0
2017-01-163.0
2017-01-17NaN
Freq:D,dtype:float64
In[95]:
df
Out[95]:
| | A | B | C | D |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 5 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 5 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | 5 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 5 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | 5 |
---|
In[96]:
df.sub(s,axis="index")#dataFrame与series的减法
Out[96]:
| | A | B | C | D |
---|
2017-01-12 | NaN | NaN | NaN | NaN |
---|
2017-01-13 | NaN | NaN | NaN | NaN |
---|
2017-01-14 | -0.410781 | 0.392792 | -0.394455 | 4.0 |
---|
2017-01-15 | -2.151958 | -2.655249 | -4.114725 | 3.0 |
---|
2017-01-16 | -4.323304 | 0.143659 | -2.361004 | 2.0 |
---|
2017-01-17 | NaN | NaN | NaN | NaN |
---|
Apply
In[97]:
df
Out[97]:
| | A | B | C | D |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 5 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 5 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | 5 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 5 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | 5 |
---|
In[98]:
df.apply(np.cumsum)  # 行叠加。
Out[98]:
| | A | B | C | D |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 |
---|
2017-01-13 | -0.658348 | 0.622495 | 1.167192 | 10 |
---|
2017-01-14 | -0.069129 | 2.015286 | 1.772737 | 15 |
---|
2017-01-15 | -0.221087 | 1.360038 | -0.341988 | 20 |
---|
2017-01-16 | -1.544392 | 4.503697 | 0.297008 | 25 |
---|
2017-01-17 | -1.569326 | 4.889508 | -1.280177 | 30 |
---|
In[99]:
df.apply(lambda x: x.max() - x.min())
Out[99]:
A1.912523
B3.798908
C2.893293
D0.000000
dtype:float64
Histogramming
In[100]:
s=pd.Series(np.random.randint(0,7,size=10))
s
Out[100]:
04
15
22
30
45
53
64
73
83
90
dtype:int32
In[101]:
s.value_counts()
Out[101]:
33
52
42
02
21
dtype:int64
StringMethods
Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below. Note that pattern-matching in str generally uses regular expressions by default (and in some cases always uses them). See more at Vectorized String Methods.
In [102]:
s=pd.Series(['A','B','C','Aaba','Baca',np.nan,'CABA','dog','cat'])
s.str.lower()
Out[102]:
0a
1b
2c
3aaba
4baca
5NaN
6caba
7dog
8cat
dtype:object
In[103]:
s
Out[103]:
0A
1B
2C
3Aaba
4Baca
5NaN
6CABA
7dog
8cat
dtype:object
Merge合并
Concat
pandas provides various facilities for easily combining together Series, DataFrame, and Panel objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join/merge-type operations. See the Merging section. Concatenating pandas objects together with concat():
In [104]:
df
Out[104]:
| | A | B | C | D |
---|
2017-01-12 | 0.000000 | 0.000000 | 0.778567 | 5 |
---|
2017-01-13 | -0.658348 | 0.622495 | 0.388625 | 5 |
---|
2017-01-14 | 0.589219 | 1.392792 | 0.605545 | 5 |
---|
2017-01-15 | -0.151958 | -0.655249 | -2.114725 | 5 |
---|
2017-01-16 | -1.323304 | 3.143659 | 0.638996 | 5 |
---|
2017-01-17 | -0.024935 | 0.385811 | -1.577185 | 5 |
---|
In[105]:
df = pd.DataFrame(np.random.randn(10, 4))
df
Out[105]:
| | 0 | 1 | 2 | 3 |
---|
0 | 0.111766 | -0.505125 | 2.156029 | 0.419152 |
---|
1 | 1.068870 | 1.180587 | 0.361345 | 1.090554 |
---|
2 | 0.488997 | 0.281507 | -0.738345 | -0.242974 |
---|
3 | -1.846709 | 1.686173 | -0.202319 | -1.151983 |
---|
4 | 0.573012 | -1.979189 | 1.544768 | 1.594595 |
---|
5 | -0.954571 | -0.696788 | 0.270959 | -2.296720 |
---|
6 | -1.511946 | 1.796113 | 0.399493 | 0.412664 |
---|
7 | 0.089844 | -0.545153 | -0.315653 | -0.235828 |
---|
8 | -0.747140 | 1.222900 | -1.650812 | 0.292432 |
---|
9 | 0.659855 | 0.501265 | 0.363978 | 1.722914 |
---|
In[106]:
# break it into pieces
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)
Out[106]:
| | 0 | 1 | 2 | 3 |
---|
0 | 0.111766 | -0.505125 | 2.156029 | 0.419152 |
---|
1 | 1.068870 | 1.180587 | 0.361345 | 1.090554 |
---|
2 | 0.488997 | 0.281507 | -0.738345 | -0.242974 |
---|
3 | -1.846709 | 1.686173 | -0.202319 | -1.151983 |
---|
4 | 0.573012 | -1.979189 | 1.544768 | 1.594595 |
---|
5 | -0.954571 | -0.696788 | 0.270959 | -2.296720 |
---|
6 | -1.511946 | 1.796113 | 0.399493 | 0.412664 |
---|
7 | 0.089844 | -0.545153 | -0.315653 | -0.235828 |
---|
8 | -0.747140 | 1.222900 | -1.650812 | 0.292432 |
---|
9 | 0.659855 | 0.501265 | 0.363978 | 1.722914 |
---|
In[107]:
pieces
Out[107]:
[0123
00.111766-0.5051252.1560290.419152
11.0688701.1805870.3613451.090554
20.4889970.281507-0.738345-0.242974,
0123
3-1.8467091.686173-0.202319-1.151983
40.573012-1.9791891.5447681.594595
5-0.954571-0.6967880.270959-2.296720
6-1.5119461.7961130.3994930.412664,
0123
70.089844-0.545153-0.315653-0.235828
8-0.7471401.222900-1.6508120.292432
90.6598550.5012650.3639781.722914]
Join
SQL style merges. See the Database style joining section.
In [108]:
left=pd.DataFrame({"key":["foo","foo"],"lval":[1,2]})
right=pd.DataFrame({'key':['foo','foo'],'rval':[4,5]})
In[109]:
left
Out[109]:
In[110]:
right
Out[110]:
In[111]:
pd.merge(left,right,on="key")
Out[111]:
| | key | lval | rval |
---|
0 | foo | 1 | 4 |
---|
1 | foo | 1 | 5 |
---|
2 | foo | 2 | 4 |
---|
3 | foo | 2 | 5 |
---|
In[112]:
left=pd.DataFrame({'key':['foo','bar'],'lval':[1,2]})
right=pd.DataFrame({'key':['foo','bar'],'rval':[4,5]})
In[113]:
left
Out[113]:
In[114]:
right
Out[114]:
In[115]:
pd.merge(left,right,on="key")
Out[115]:
Append
In[116]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df
Out[116]:
| | A | B | C | D |
---|
0 | -0.852451 | 1.074357 | -0.591892 | 0.950982 |
---|
1 | -0.977580 | 1.656374 | 0.693657 | 0.718832 |
---|
2 | 0.303269 | -0.881728 | -1.509321 | 1.219849 |
---|
3 | 0.655751 | 1.235660 | 1.729038 | 1.074948 |
---|
4 | 0.658413 | -1.215348 | -1.139623 | 0.753772 |
---|
5 | 1.345115 | 1.420212 | -0.124543 | -0.099265 |
---|
6 | 1.129623 | 0.597484 | -0.804759 | -0.568266 |
---|
7 | -0.770570 | 0.540917 | -0.261607 | -0.083751 |
---|
In[117]:
s = df.iloc[3]
s
Out[117]:
A0.655751
B1.235660
C1.729038
D1.074948
Name:3,dtype:float64
In[118]:
df.append(s,ignore_index=True)
Out[118]:
| | A | B | C | D |
---|
0 | -0.852451 | 1.074357 | -0.591892 | 0.950982 |
---|
1 | -0.977580 | 1.656374 | 0.693657 | 0.718832 |
---|
2 | 0.303269 | -0.881728 | -1.509321 | 1.219849 |
---|
3 | 0.655751 | 1.235660 | 1.729038 | 1.074948 |
---|
4 | 0.658413 | -1.215348 | -1.139623 | 0.753772 |
---|
5 | 1.345115 | 1.420212 | -0.124543 | -0.099265 |
---|
6 | 1.129623 | 0.597484 | -0.804759 | -0.568266 |
---|
7 | -0.770570 | 0.540917 | -0.261607 | -0.083751 |
---|
8 | 0.655751 | 1.235660 | 1.729038 | 1.074948 |
---|
Grouping
By "group by" we are referring to a process involving one or more of the following steps:
• Splitting the data into groups based on some criteria
• Applying a function to each group independently
• Combining the results into a data structure
In [119]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
df
Out[119]:
| | A | B | C | D |
---|
0 | foo | one | -0.523738 | -1.363519 |
---|
1 | bar | one | -0.071920 | -2.618027 |
---|
2 | foo | two | -2.712421 | -0.407372 |
---|
3 | bar | three | -0.635898 | -1.942854 |
---|
4 | foo | two | 0.952073 | -0.546110 |
---|
5 | bar | two | 1.474296 | -0.982238 |
---|
6 | foo | one | -0.529788 | -0.213397 |
---|
7 | foo | three | 0.877394 | -0.791663 |
---|
In[120]:
df.groupby("A").sum()
Out[120]:
| | C | D |
---|
A | | |
---|
bar | 0.766479 | -5.543120 |
---|
foo | -1.936480 | -3.322062 |
---|
In[121]:
df.groupby(["A","B"]).sum()#Groupingbymultiplecolumnsformsahierarchicalindex,whichwethenapplythefunction.
Out[121]:
| | | C | D |
---|
A | B | | |
---|
bar | one | -0.071920 | -2.618027 |
---|
three | -0.635898 | -1.942854 |
---|
two | 1.474296 | -0.982238 |
---|
foo | one | -1.053527 | -1.576917 |
---|
three | 0.877394 | -0.791663 |
---|
two | -1.760347 | -0.953482 |
---|
Reshaping
Stack
In[122]:
tuples = list(zip([['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
tuples
Out[122]:
[(['bar','bar','baz','baz','foo','foo','qux','qux'],),
(['one','two','one','two','one','two','one','two'],)]
In[123]:
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
tuples
Out[123]:
[('bar','one'),
('bar','two'),
('baz','one'),
('baz','two'),
('foo','one'),
('foo','two'),
('qux','one'),
('qux','two')]
In[124]:
index=pd.MultiIndex.from_tuples(tuples,names=["first","second"])
index
Out[124]:
MultiIndex(levels=[['bar','baz','foo','qux'],['one','two']],
labels=[[0,0,1,1,2,2,3,3],[0,1,0,1,0,1,0,1]],
names=['first','second'])
In[125]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df
Out[125]:
| | | A | B |
---|
first | second | | |
---|
bar | one | -1.101051 | -1.126231 |
---|
two | -0.395652 | -0.313567 |
---|
baz | one | 1.378579 | -1.637869 |
---|
two | 0.665960 | -0.259749 |
---|
foo | one | -0.256181 | 1.260131 |
---|
two | -0.994720 | 0.506272 |
---|
qux | one | -0.422649 | 0.191402 |
---|
two | -0.102085 | 0.975210 |
---|
In[126]:
df2 = df[:4]
df2
Out[126]:
| | | A | B |
---|
first | second | | |
---|
bar | one | -1.101051 | -1.126231 |
---|
two | -0.395652 | -0.313567 |
---|
baz | one | 1.378579 | -1.637869 |
---|
two | 0.665960 | -0.259749 |
---|
In[127]:
stacked=df2.stack()
stacked
Out[127]:
firstsecond
baroneA-1.101051
B-1.126231
twoA-0.395652
B-0.313567
bazoneA1.378579
B-1.637869
twoA0.665960
B-0.259749
dtype:float64
With a "stacked" DataFrame or Series (having a MultiIndex as the index), the inverse operation of stack() is unstack(), which by default unstacks the last level:
In [128]:
stacked.unstack()
Out[128]:
| | | A | B |
---|
first | second | | |
---|
bar | one | -1.101051 | -1.126231 |
---|
two | -0.395652 | -0.313567 |
---|
baz | one | 1.378579 | -1.637869 |
---|
two | 0.665960 | -0.259749 |
---|
In[129]:
stacked.unstack(1)
Out[129]:
| | second | one | two |
---|
first | | | |
---|
bar | A | -1.101051 | -0.395652 |
---|
B | -1.126231 | -0.313567 |
---|
baz | A | 1.378579 | 0.665960 |
---|
B | -1.637869 | -0.259749 |
---|
In[130]:
stacked.unstack(0)
Out[130]:
| | first | bar | baz |
---|
second | | | |
---|
one | A | -1.101051 | 1.378579 |
---|
B | -1.126231 | -1.637869 |
---|
two | A | -0.395652 | 0.665960 |
---|
B | -0.313567 | -0.259749 |
---|
PivotTables
In[131]:
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['A', 'B', 'C'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})
df
Out[131]:
| | A | B | C | D | E |
---|
0 | one | A | foo | 0.039230 | 0.134261 |
---|
1 | one | B | foo | 0.952890 | -0.499183 |
---|
2 | two | C | foo | -0.778814 | -0.655735 |
---|
3 | three | A | bar | 0.798864 | 0.025109 |
---|
4 | one | B | bar | -0.580050 | -1.711672 |
---|
5 | one | C | bar | 0.004300 | -0.433591 |
---|
6 | two | A | foo | 0.229248 | -2.648814 |
---|
7 | three | B | foo | 0.506488 | 0.630373 |
---|
8 | one | C | foo | -0.315667 | 0.031764 |
---|
9 | one | A | bar | -1.547410 | 0.743825 |
---|
10 | two | B | bar | -0.480958 | 0.365255 |
---|
11 | three | C | bar | 1.742948 | 0.692884 |
---|
In[4]:
pd.pivot_table(df,values="D",index=["A","B"],columns=["C"])
Out[4]:
| | C | bar | foo |
---|
A | B | | |
---|
one | A | 0.932814 | -1.440079 |
---|
B | 0.060252 | 1.071877 |
---|
C | 2.879779 | 0.355274 |
---|
three | A | -0.328442 | NaN |
---|
B | NaN | -2.544812 |
---|
C | -1.879058 | NaN |
---|
two | A | NaN | -1.987377 |
---|
B | 0.220517 | NaN |
---|
C | NaN | -0.082820 |
---|
TimeSeries
pandas has simple, powerful, and efficient functionality for performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not limited to, financial applications.
In [132]:
rng=pd.date_range("1/2/2017",periods=100,freq="S")
rng
Out[132]:
DatetimeIndex(['2017-01-0200:00:00','2017-01-0200:00:01',
'2017-01-0200:00:02','2017-01-0200:00:03',
'2017-01-0200:00:04','2017-01-0200:00:05',
'2017-01-0200:00:06','2017-01-0200:00:07',
'2017-01-0200:00:08','2017-01-0200:00:09',
'2017-01-0200:00:10','2017-01-0200:00:11',
'2017-01-0200:00:12','2017-01-0200:00:13',
'2017-01-0200:00:14','2017-01-0200:00:15',
'2017-01-0200:00:16','2017-01-0200:00:17',
'2017-01-0200:00:18','2017-01-0200:00:19',
'2017-01-0200:00:20','2017-01-0200:00:21',
'2017-01-0200:00:22','2017-01-0200:00:23',
'2017-01-0200:00:24','2017-01-0200:00:25',
'2017-01-0200:00:26','2017-01-0200:00:27',
'2017-01-0200:00:28','2017-01-0200:00:29',
'2017-01-0200:00:30','2017-01-0200:00:31',
'2017-01-0200:00:32','2017-01-0200:00:33',
'2017-01-0200:00:34','2017-01-0200:00:35',
'2017-01-0200:00:36','2017-01-0200:00:37',
'2017-01-0200:00:38','2017-01-0200:00:39',
'2017-01-0200:00:40','2017-01-0200:00:41',
'2017-01-0200:00:42','2017-01-0200:00:43',
'2017-01-0200:00:44','2017-01-0200:00:45',
'2017-01-0200:00:46','2017-01-0200:00:47',
'2017-01-0200:00:48','2017-01-0200:00:49',
'2017-01-0200:00:50','2017-01-0200:00:51',
'2017-01-0200:00:52','2017-01-0200:00:53',
'2017-01-0200:00:54','2017-01-0200:00:55',
'2017-01-0200:00:56','2017-01-0200:00:57',
'2017-01-0200:00:58','2017-01-0200:00:59',
'2017-01-0200:01:00','2017-01-0200:01:01',
'2017-01-0200:01:02','2017-01-0200:01:03',
'2017-01-0200:01:04','2017-01-0200:01:05',
'2017-01-0200:01:06','2017-01-0200:01:07',
'2017-01-0200:01:08','2017-01-0200:01:09',
'2017-01-0200:01:10','2017-01-0200:01:11',
'2017-01-0200:01:12','2017-01-0200:01:13',
'2017-01-0200:01:14','2017-01-0200:01:15',
'2017-01-0200:01:16','2017-01-0200:01:17',
'2017-01-0200:01:18','2017-01-0200:01:19',
'2017-01-0200:01:20','2017-01-0200:01:21',
'2017-01-0200:01:22','2017-01-0200:01:23',
'2017-01-0200:01:24','2017-01-0200:01:25',
'2017-01-0200:01:26','2017-01-0200:01:27',
'2017-01-0200:01:28','2017-01-0200:01:29',
'2017-01-0200:01:30','2017-01-0200:01:31',
'2017-01-0200:01:32','2017-01-0200:01:33',
'2017-01-0200:01:34','2017-01-0200:01:35',
'2017-01-0200:01:36','2017-01-0200:01:37',
'2017-01-0200:01:38','2017-01-0200:01:39'],
dtype='datetime64[ns]',freq='S')
In[133]:
ts=pd.Series(np.random.randint(0,500,len(rng)),index=rng)ts
Out[133]:
2017-01-0200:00:00251
2017-01-0200:00:0163
2017-01-0200:00:02108
2017-01-0200:00:03288
2017-01-0200:00:04491
2017-01-0200:00:05490
2017-01-0200:00:06343
2017-01-0200:00:07357
2017-01-0200:00:0872
2017-01-0200:00:09171
2017-01-0200:00:10324
2017-01-0200:00:11281
2017-01-0200:00:12176
2017-01-0200:00:1314
2017-01-0200:00:14495
2017-01-0200:00:15150
2017-01-0200:00:1669
2017-01-0200:00:17144
2017-01-0200:00:18126
2017-01-0200:00:19368
2017-01-0200:00:20129
2017-01-0200:00:21386
2017-01-0200:00:22228
2017-01-0200:00:23458
2017-01-0200:00:2498
2017-01-0200:00:25244
2017-01-0200:00:26206
2017-01-0200:00:2798
2017-01-0200:00:2892
2017-01-0200:00:29259
...
2017-01-0200:01:10127
2017-01-0200:01:11342
2017-01-0200:01:12185
2017-01-0200:01:13123
2017-01-0200:01:1473
2017-01-0200:01:15132
2017-01-0200:01:16462
2017-01-0200:01:17317
2017-01-0200:01:18180
2017-01-0200:01:19247
2017-01-0200:01:2097
2017-01-0200:01:21401
2017-01-0200:01:22342
2017-01-0200:01:23382
2017-01-0200:01:24304
2017-01-0200:01:2547
2017-01-0200:01:26193
2017-01-0200:01:27334
2017-01-0200:01:28196
2017-01-0200:01:29297
2017-01-0200:01:30195
2017-01-0200:01:31236
2017-01-0200:01:32200
2017-01-0200:01:33490
2017-01-0200:01:34196
2017-01-0200:01:35201
2017-01-0200:01:36397
2017-01-0200:01:37494
2017-01-0200:01:38482
2017-01-0200:01:39267
Freq:S,Length:100,dtype:int32
In[7]:
ts.resample("5Min").sum()
Out[7]:
2017-01-0222939
Freq:5T,dtype:int32
In[9]:
ts.resample("1Min").sum()
Out[9]:
2017-01-0200:00:0013896
2017-01-0200:01:009043
Freq:T,dtype:int32
Time zone representation. 零时区 UTC 表示。In [10]:
rng=pd.date_range("2/1/201700:00",periods=5,freq="D")
rng
Out[10]:
DatetimeIndex(['2017-02-01','2017-02-02','2017-02-03','2017-02-04',
'2017-02-05'],
dtype='datetime64[ns]',freq='D')
In[12]:
ts=pd.Series(np.random.randn(len(rng)),index=rng)ts
Out[12]:
2017-02-010.329594
2017-02-022.097319
2017-02-031.852023
2017-02-04-0.213452
2017-02-050.160873
Freq:D,dtype:float64
In[13]:
tsUtc=ts.tz_localize("UTC")
tsUtc
Out[13]:
2017-02-0100:00:00+00:000.329594
2017-02-0200:00:00+00:002.097319
2017-02-0300:00:00+00:001.852023
2017-02-0400:00:00+00:00-0.213452
2017-02-0500:00:00+00:000.160873
Freq:D,dtype:float64
Convert to another time zone. 时区转换。In [14]:
tsUtc.tz_convert("US/Eastern")
Out[14]:
2017-01-3119:00:00-05:000.329594
2017-02-0119:00:00-05:002.097319
2017-02-0219:00:00-05:001.852023
2017-02-0319:00:00-05:00-0.213452
2017-02-0419:00:00-05:000.160873
Freq:D,dtype:float64
In[15]:
tsUtc
Out[15]:
2017-02-0100:00:00+00:000.329594
2017-02-0200:00:00+00:002.097319
2017-02-0300:00:00+00:001.852023
2017-02-0400:00:00+00:00-0.213452
2017-02-0500:00:00+00:000.160873
Freq:D,dtype:float64
Converting between time span representations. In [16]:
rng=pd.date_range("1/8/2017",periods=5,freq="M")
rng
Out[16]:
DatetimeIndex(['2017-01-31','2017-02-28','2017-03-31','2017-04-30',
'2017-05-31'],
dtype='datetime64[ns]',freq='M')
In[18]:
ts=pd.Series(np.random.randn(len(rng)),rng)ts
Out[18]:
2017-01-310.904523
2017-02-28-0.470144
2017-03-31-0.373244
2017-04-300.860448
2017-05-310.176226
Freq:M,dtype:float64
In[20]:
ps=ts.to_period()ps
Out[20]:
2017-010.904523
2017-02-0.470144
2017-03-0.373244
2017-040.860448
2017-050.176226
Freq:M,dtype:float64
In[21]:
ps.to_timestamp()
Out[21]:
2017-01-010.904523
2017-02-01-0.470144
2017-03-01-0.373244
2017-04-010.860448
2017-05-010.176226
Freq:MS,dtype:float64
In[22]:
ps
Out[22]:
2017-010.904523
2017-02-0.470144
2017-03-0.373244
2017-040.860448
2017-050.176226
Freq:M,dtype:float64
Converting between period and timestamp enables some convenient arithmetic functions to be used. In the following example, we convert a quarterly frequency with year ending in November to 9am of the end of the month following the quarter end: In [23]:
prng=pd.period_range("1990Q1","2017Q4",freq="Q-NOV")
prng
Out[23]:
PeriodIndex(['1990Q1','1990Q2','1990Q3','1990Q4','1991Q1','1991Q2',
'1991Q3','1991Q4','1992Q1','1992Q2',
...
'2015Q3','2015Q4','2016Q1','2016Q2','2016Q3','2016Q4',
'2017Q1','2017Q2','2017Q3','2017Q4'],
dtype='period[Q-NOV]',length=112,freq='Q-NOV')
In[25]:
ts=pd.Series(np.random.randn(len(prng)),prng)
ts.head()
Out[25]:
1990Q11.193031
1990Q20.621627
1990Q3-0.235553
1990Q40.642938
1991Q10.247024
Freq:Q-NOV,dtype:float64
In[26]:
ts.index=(prng.asfreq("M","e")+1).asfreq("H","s")+9
ts.head()
Out[26]:
1990-03-0109:001.193031
1990-06-0109:000.621627
1990-09-0109:00-0.235553
1990-12-0109:000.642938
1991-03-0109:000.247024
Freq:H,dtype:float64
Categoricals
In[34]:
df=pd.DataFrame({"id":[1,2,3,4,5,6],"raw_grade":["a","a","c","b","b","f"]})df
Out[34]:
| | id | raw_grade |
---|
0 | 1 | a |
---|
1 | 2 | a |
---|
2 | 3 | c |
---|
3 | 4 | b |
---|
4 | 5 | b |
---|
5 | 6 | f |
---|
Convert the raw grades to a categorical data type. In [35]:
df["grade"]=df.raw_grade.astype("category")df
Out[35]:
| | id | raw_grade | grade |
---|
0 | 1 | a | a |
---|
1 | 2 | a | a |
---|
2 | 3 | c | c |
---|
3 | 4 | b | b |
---|
4 | 5 | b | b |
---|
5 | 6 | f | f |
---|
In[36]:
df.grade  # Convert the raw grades to a categorical data type
Out[36]:
0a
1a
2c
3b
4b
5f
Name:grade,dtype:category
Categories(4,object):[a,b,c,f]
In[37]:
# Rename the categories to more meaningful names (assigning to Series.cat.categories is inplace!)
df.grade.cat.categories=["verygood","good","nomal","bad"]
df
Out[37]:
| | id | raw_grade | grade |
---|
0 | 1 | a | verygood |
---|
1 | 2 | a | verygood |
---|
2 | 3 | c | nomal |
---|
3 | 4 | b | good |
---|
4 | 5 | b | good |
---|
5 | 6 | f | bad |
---|
In[38]:
# Reorder the categories and simultaneously add the missing categories (methods under Series.cat return a new Series per default).
df.grade=df.grade.cat.set_categories(["verybad","bad","medium","good","verygood"])
df.grade
Out[38]:
0verygood
1verygood
2NaN
3good
4good
5bad
Name:grade,dtype:category
Categories(5,object):[verybad,bad,medium,good,verygood]
In[39]:
df
Out[39]:
| | id | raw_grade | grade |
---|
0 | 1 | a | verygood |
---|
1 | 2 | a | verygood |
---|
2 | 3 | c | NaN |
---|
3 | 4 | b | good |
---|
4 | 5 | b | good |
---|
5 | 6 | f | bad |
---|
Sorting is per order in the categories, not lexical order. In [40]:
df.sort_values(by="grade")
Out[40]:
| | id | raw_grade | grade |
---|
2 | 3 | c | NaN |
---|
5 | 6 | f | bad |
---|
3 | 4 | b | good |
---|
4 | 5 | b | good |
---|
0 | 1 | a | verygood |
---|
1 | 2 | a | verygood |
---|
Grouping by a categorical column shows also empty categories. In [41]:
df.groupby("grade").size()
Out[41]:
grade
verybad0
bad1
medium0
good2
verygood2
dtype:int64
Plotting
In[43]:
ts=pd.Series(np.random.randn(1000),index=pd.date_range("1/1/2017",periods=1000))
ts.head()
Out[43]:
2017-01-01-0.745067
2017-01-02-0.070895
2017-01-030.233542
2017-01-04-0.206597
2017-01-050.891064
Freq:D,dtype:float64
In[45]:
ts=ts.cumsum()
ts.head()
Out[45]:
2017-01-01-0.745067
2017-01-02-1.561029
2017-01-03-2.143449
2017-01-04-2.932466
2017-01-05-2.830418
Freq:D,dtype:float64
In[48]:
ts.plot()
Out[48]:
<matplotlib.axes._subplots.AxesSubplotat0x19bf6a6e278>
In[50]:
df=pd.DataFrame(np.random.randn(1000,4),index=ts.index,columns=["A","B","C","D"])df.head()
Out[50]:
| | A | B | C | D |
---|
2017-01-01 | -1.940139 | -0.476590 | -0.154066 | 1.692812 |
---|
2017-01-02 | 0.399891 | 0.268976 | 0.596209 | -0.484979 |
---|
2017-01-03 | 0.814519 | -0.142193 | -0.084394 | -0.687342 |
---|
2017-01-04 | 0.385848 | -1.230059 | -0.093327 | -0.096652 |
---|
2017-01-05 | 0.407435 | -0.849347 | 0.379192 | 0.172933 |
---|
In[51]:
df=df.cumsum()
In[53]:
plt.figure()
df.plot()
plt.legend(loc="best")
plt.show()
<matplotlib.figure.Figureat0x19bf8855da0>
<matplotlib.figure.Figureat0x19bf897dc88>
Getting Data In/Out
CSV
In[]:
df.to_csv("foo.csv")
In[]:
pd.read_csv("foo.csv")
HDF5
In[]:
df.to_hdf("foo.h5","df")
In[]:
pd.read_hdf("foo.h5","df")
Excel
In[]:
df.to_excel('foo.xlsx',sheet_name='Sheet1')
In[]:
pd.read_excel('foo.xlsx','Sheet1',index_col=None,na_values=['NA'])
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]:
In[]: