# Location: Home > Other
#
# Reading notes on "Hands-on Machine Learning with Scikit-Learn and TensorFlow"
#
# 2018-01-18 14:58 - 561 views
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# Load the housing data set from its local CSV file.
housing = pd.read_csv('D:\\pythondata\\housing.csv')

# Plain scatter plot of the records by longitude / latitude.
housing.plot(x="longitude", y="latitude", kind="scatter")

# Same plot with mostly transparent markers, so overlapping points
# show up as darker areas.
housing.plot(x="longitude", y="latitude", kind="scatter", alpha=0.1)

# Marker size is population / 100 and marker color is the
# median_house_value column, mapped through the "jet" colormap.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)

plt.legend()

# Derive an income category: median_income divided by 1.5, rounded up.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)

# Cap the category at 5.0: values below 5 are kept, everything else becomes 5.0.
# The result is assigned back to the column instead of calling
# where(..., inplace=True) on housing["income_cat"]; that chained in-place form
# acts on a temporary object in newer pandas and leaves the frame unchanged.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split: 20% test set, fixed seed for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Stratify on income_cat. split() yields positional row indices, so the rows
# are selected with .iloc; .loc would only be correct while the frame still
# carries its default 0..n-1 index.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

# Share of each income category in the full data set (interactive inspection;
# the value is not stored).
housing["income_cat"].value_counts() / len(housing)

# Remove income_cat from both splits so they go back to the original columns.
# The loop variable is named set_ to avoid shadowing the built-in set type.
for set_ in (strat_train_set, strat_test_set):
    set_.drop(["income_cat"], axis=1, inplace=True)

# Pairwise correlations between the numeric columns only. The frame also holds
# the text column ocean_proximity, which older pandas silently skipped in
# corr() but recent versions reject; selecting numeric dtypes first behaves the
# same on both.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

# Correlation of every numeric attribute with median_house_value, strongest
# positive first (interactive inspection; the value is not stored).
corr_matrix["median_house_value"].sort_values(ascending=False)

# Data cleaning starts here.

# Split the training set into the target column and the remaining predictors.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(["median_house_value"], axis=1)

# Three ways of dealing with missing total_bedrooms values. None of the results
# below is assigned, so housing itself is left unchanged by these statements.

# Option 1: drop the rows whose total_bedrooms value is missing.
housing.dropna(subset=["total_bedrooms"])

# Option 2: drop the total_bedrooms column altogether.
housing.drop(["total_bedrooms"], axis=1)

# Option 3: replace the missing values with the column median.
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(value=median)

#sklearn 库提供了一些可以用于处理缺失值的模块

from sklearn.preprocessing import Imputer   #导入模块

# Median imputer for the numeric attributes.
# NOTE(review): Imputer was removed from newer scikit-learn releases in favour
# of sklearn.impute.SimpleImputer - confirm which version this script targets.
imputer = Imputer(strategy="median")

# The median is only defined for numbers, so drop the text column first.
housing_num = housing.drop("ocean_proximity", axis=1)

imputer.fit(housing_num)

# Per-column medians learned by the imputer (interactive inspection) ...
imputer.statistics_

# ... which can be compared against the medians computed by pandas.
housing_num.median().values

# Fill the missing values; the result is a plain NumPy array.
X = imputer.transform(housing_num)

# Wrap the array in a DataFrame again. The original row index is passed along:
# without it the frame would get a fresh 0..n-1 index and no longer line up
# with housing / housing_labels, whose index comes from the shuffled split.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

#对于分类变量的处理,将其转换为数字形式

from sklearn.preprocessing import LabelEncoder

# Encode the text categories of ocean_proximity as integer codes.
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded

# Category labels seen by the encoder. The parenthesised form is valid on both
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
print(encoder.classes_)

from sklearn.base import BaseEstimator, TransformerMixin
# Column positions used by CombinedAttributesAdder.transform.
# Presumably the rooms / bedrooms / population / households columns of the
# housing data - verify against the CSV column order.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features, computed from fixed column positions, to an array.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a bedrooms-per-room column is appended after the two
        per-household columns.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X with the ratio columns appended on the right.

        X must support 2-D positional indexing (X[:, i]). The appended columns
        are, in order: rooms per household, population per household and,
        when enabled, bedrooms per room.
        """
        extra_columns = (
            X[:, rooms_ix] / X[:, household_ix],
            X[:, population_ix] / X[:, household_ix],
        )
        if self.add_bedrooms_per_room:
            extra_columns += (X[:, bedrooms_ix] / X[:, rooms_ix],)
        # np.c_[a, b, c] receives the tuple (a, b, c) as its key, so passing
        # the tuple explicitly is the same operation.
        return np.c_[(X,) + extra_columns]

# Add the per-household ratio columns (bedrooms-per-room switched off) to the
# raw values of the training predictors.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(X=housing.values)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Numeric preprocessing chain: fill missing values with the median, append the
# combined ratio attributes, then standardize every column.
num_pipeline = Pipeline(
    steps=[
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ]
)

# Fit the chain on the numeric attributes and transform them in one pass.
housing_num_tr = num_pipeline.fit_transform(X=housing_num)

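# The data contains both continuous and categorical variables, which have to be
# processed separately: num_pipeline handles the former and cat_pipeline the
# latter, and FeatureUnion then merges the two results.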

from sklearn.pipeline import FeatureUnion
# LabelBinarizer is used below but not imported anywhere else in this script.
from sklearn.preprocessing import LabelBinarizer


class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the given columns of a DataFrame and return them as a NumPy array.

    The pipelines below reference this transformer, but no definition of it
    appears elsewhere in this script, so it is defined here.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the selected columns of the DataFrame X as an array."""
        return X[self.attribute_names].values


class PipelineLabelBinarizer(BaseEstimator, TransformerMixin):
    """Adapter giving LabelBinarizer the fit(X, y) / transform(X) signature.

    A Pipeline calls fit_transform(X, y) on its last step, while
    LabelBinarizer.fit_transform accepts the labels only (scikit-learn 0.19 and
    later), so using it directly as a pipeline step raises a TypeError there.
    Only a single categorical column is supported: the input is flattened to
    one dimension before it is handed to LabelBinarizer.
    """

    def fit(self, X, y=None):
        """Learn the set of categories from the single input column."""
        self.binarizer_ = LabelBinarizer().fit(np.ravel(X))
        return self

    def transform(self, X):
        """Return the one-hot encoding of the single input column."""
        return self.binarizer_.transform(np.ravel(X))


# Column names handled by each branch of the union.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numeric branch: select the numeric columns, impute the median, append the
# combined ratio attributes and standardize.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the text column and one-hot encode it.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', PipelineLabelBinarizer()),
])

# Run both branches on the same input and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

# Fit and apply the combined pipeline to the training predictors.
housing_prepared = full_pipeline.fit_transform(housing)
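# Content comes from user submissions and material collected from the web; its accuracy is not guaranteed. If any content infringes your rights, contact the administrator to have it handled. (Click here to send me a message.)
# Tags:
# Recommended related articles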