您的位置:首页 > 编程语言 > Python开发

Python处理缺失数据

2017-08-02 15:12 381 查看

滤除、填补,两种方式

import numpy as np

from numpy import nan as NA 

import pandas as pd 

from pandas import DataFrame 

# --- Missing values in a Series ---
data = pd.Series([1, 2, NA, 4])
print(data)

# Inside a plain Python list, NaN is just another float element.
print([1, 2, NA])

# Keep only the non-missing entries; the surviving rows keep their
# original index labels (nothing is renumbered).
print(data[data.notna()])

# drop() removes the entry whose index label is given (label 1 here);
# it has nothing to do with missing values.
print(data.drop(labels=1))

# Boolean mask: True where the value is missing, False otherwise.
print(data.isna())

# Inverse mask: True where the value is present, False where it is missing.
print(data.notna())

输出:

0    1.0

1    2.0

2    NaN

3    4.0

dtype: float64

[1, 2, nan]

0    1.0

1    2.0

3    4.0

dtype: float64

0    1.0

2    NaN

3    4.0

dtype: float64

0    False

1    False

2     True

3    False

dtype: bool

0     True

1     True

2    False

3     True

dtype: bool

# --- Missing values in a DataFrame ---
data1 = DataFrame({
    'a': [1, 2, 3, NA, NA],
    'b': [4, NA, 5, 6, NA],
})
print(data1)

# how='any' (the default): discard every row that contains a missing value.
print(data1.dropna(how='any'))

# how='all': discard only the rows in which every value is missing.
print(data1.dropna(axis=0, how='all'))

# Append a column, labelled 2, that consists entirely of missing values.
data1[2] = NA
print(data1)

# Discard only the columns in which every value is missing.
# axis selects what is dropped: 0 / 'index' for rows, 1 / 'columns' for columns.
print(data1.dropna(axis='columns', how='all'))

输出:

     a    b

0  1.0  4.0

1  2.0  NaN

2  3.0  5.0

3  NaN  6.0

4  NaN  NaN

     a    b

0  1.0  4.0

2  3.0  5.0

     a    b

0  1.0  4.0

1  2.0  NaN

2  3.0  5.0

3  NaN  6.0

     a    b   2

0  1.0  4.0 NaN

1  2.0  NaN NaN

2  3.0  5.0 NaN

3  NaN  6.0 NaN

4  NaN  NaN NaN

     a    b

0  1.0  4.0

1  2.0  NaN

2  3.0  5.0

3  NaN  6.0

4  NaN  NaN

#

# Build two independent 6-row x 3-column DataFrames of random values
# between 0 and 1.
df = DataFrame(np.random.rand(6, 3))
df1 = DataFrame(np.random.rand(6, 3))
print(df)

# Blank out some cells to create missing data.
# .loc slices by label and includes both endpoints:
#   1:4 -> rows 1..4 of column 1,  :2 -> rows 0..2 of column 2.
df1.loc[1:4, 1] = NA
df1.loc[:2, 2] = NA
print(df1)

# .iloc slices by position and excludes the stop value:
#   1:4 -> rows 1..3 of column 1,  :2 -> rows 0..1 of column 2.
df.iloc[1:4, 1] = NA
df.iloc[:2, 2] = NA
print(df)

# thresh=2 keeps only rows holding at least two non-missing values,
# i.e. rows with fewer than two are dropped.
print(df.dropna(thresh=2))

输出:

          0         1         2

0  0.576558  0.145882  0.809296

1  0.001554  0.069911  0.084797

2  0.217655  0.139214  0.552833

3  0.585376  0.910459  0.835183

4  0.006047  0.500448  0.102748

5  0.163494  0.296311  0.667189

          0         1         2

0  0.729923  0.558838       NaN

1  0.010230       NaN       NaN

2  0.288920       NaN       NaN

3  0.025187       NaN  0.282088

4  0.955668       NaN  0.719648

5  0.297154  0.777692  0.838243

          0         1         2

0  0.576558  0.145882       NaN

1  0.001554       NaN       NaN

2  0.217655       NaN  0.552833

3  0.585376       NaN  0.835183

4  0.006047  0.500448  0.102748

5  0.163494  0.296311  0.667189

          0         1         2

0  0.576558  0.145882       NaN

2  0.217655       NaN  0.552833

3  0.585376       NaN  0.835183

4  0.006047  0.500448  0.102748

5  0.163494  0.296311  0.667189

# Fill every missing value with 1. fillna() returns a new object, so the
# values stored in df itself are not changed.
print(df.fillna(1))

# Use a different fill value per column: the dict keys are column labels.
print(df.fillna({1: 0.5, 2: -3}))

# df still contains its NaNs, because neither call above modified it.
print(df)

# inplace=True modifies the existing object instead of returning a copy.
df.fillna(22, inplace=True)
print(df)

# Forward fill: replace each missing value with the previous value in the
# same column; leading NaNs with nothing above them stay NaN.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna is deprecated in recent pandas releases.
print(df1.ffill())

输出:

          0         1         2

0  0.922985  0.427313  1.000000

1  0.274581  1.000000  1.000000

2  0.825859  1.000000  0.427558

3  0.038837  1.000000  0.571464

4  0.425408  0.820305  0.215118

5  0.491830  0.240837  0.244575

          0         1         2

0  0.922985  0.427313 -3.000000

1  0.274581  0.500000 -3.000000

2  0.825859  0.500000  0.427558

3  0.038837  0.500000  0.571464

4  0.425408  0.820305  0.215118

5  0.491830  0.240837  0.244575

          0         1         2

0  0.922985  0.427313       NaN

1  0.274581       NaN       NaN

2  0.825859       NaN  0.427558

3  0.038837       NaN  0.571464

4  0.425408  0.820305  0.215118

5  0.491830  0.240837  0.244575

          0          1          2

0  0.922985   0.427313  22.000000

1  0.274581  22.000000  22.000000

2  0.825859  22.000000   0.427558

3  0.038837  22.000000   0.571464

4  0.425408   0.820305   0.215118

5  0.491830   0.240837   0.244575

          0         1         2

0  0.499691  0.247085       NaN

1  0.421605  0.247085       NaN

2  0.332292  0.247085       NaN

3  0.771274  0.247085  0.910545

4  0.294923  0.247085  0.555058

5  0.635277  0.030381  0.376938
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: