《Hands-on Machine Learning with Scikit-Learn and TensorFlow》 读书笔记
2018-01-18 14:58
561 查看
# Exploratory analysis and data preparation for the housing data set:
# load the CSV, plot it, make a stratified train/test split, inspect
# correlations, and handle missing values.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

housing = pd.read_csv('D:\\pythondata\\housing.csv')

# Geographic scatter plots: plain, with transparency to reveal density, and
# finally with marker size = population and colour = median house value.
housing.plot(kind="scatter", x="longitude", y="latitude")
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population",
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
             )
plt.legend()

# Bucket median income into categories and cap the category at 5.0.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Assign the result back instead of calling where(..., inplace=True) on the
# column selection: a chained in-place call is not guaranteed to update
# `housing` itself.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

from sklearn.model_selection import StratifiedShuffleSplit

# Stratified split on income_cat so that both sets keep the same category
# proportions as the full data set.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional indices, so select rows by position.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

housing["income_cat"].value_counts() / len(housing)

# Remove income_cat from both sets so they return to the original columns.
# (The loop variable is not called `set`, to avoid shadowing the builtin.)
for subset in (strat_train_set, strat_test_set):
    subset.drop(["income_cat"], axis=1, inplace=True)

# Correlate numeric columns only: `housing` still contains the string column
# "ocean_proximity", which corr() cannot handle on current pandas releases.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

# Data cleaning: separate the predictors from the labels.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Three ways to handle the missing total_bedrooms values. Each call returns a
# new object and the result is discarded here, so `housing` is left unchanged.
housing.dropna(subset=["total_bedrooms"])    # option 1: drop the rows with missing values
housing.drop("total_bedrooms", axis=1)       # option 2: drop the whole attribute
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median)     # option 3: fill missing values with the median

# scikit-learn provides an imputer for missing values. Newer releases ship it
# as sklearn.impute.SimpleImputer; older ones as sklearn.preprocessing.Imputer.
# The name `Imputer` is kept because later code refers to it.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")
# The median can only be computed on numeric attributes, so drop the text one.
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
imputer.statistics_             # the per-column medians used for filling
housing_num.median().values     # should match imputer.statistics_
X = imputer.transform(housing_num)
# Keep the original row index so housing_tr stays aligned with housing_labels.
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)

# Categorical attribute handling: convert the text categories to numbers.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
# print(...) with a single argument works on both Python 2 and Python 3.
print(encoder.classes_)

from sklearn.base import BaseEstimator, TransformerMixin

# Column positions used by CombinedAttributesAdder.
# NOTE(review): presumably the positions of total_rooms, total_bedrooms,
# population and households in the housing data - confirm against the CSV.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns of an array.

    Adds rooms-per-household and population-per-household, and optionally
    bedrooms-per-room, as extra columns on the right of the input array.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return self unchanged; this transformer learns nothing."""
        return self  # nothing else to do

    def transform(self, X, y=None):
        """Return X with the ratio columns appended.

        X must be a 2-D array indexable as X[:, i] with at least
        household_ix + 1 columns.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric-only pipeline: impute, add the ratio features, then standardise.
num_pipeline = Pipeline([
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
housing_num_tr = num_pipeline.fit_transform(housing_num)

# The data holds both continuous and categorical attributes, which must be
# processed separately: num_pipeline and cat_pipeline handle one kind each
# and FeatureUnion concatenates their outputs.
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import LabelBinarizer


class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the given DataFrame columns and return them as an array.

    This class was used by the pipelines below without being defined or
    imported anywhere in the script.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Return self unchanged; this transformer learns nothing."""
        return self

    def transform(self, X, y=None):
        """Return the selected columns of DataFrame X as an array."""
        return X[self.attribute_names].values


class PipelineLabelBinarizer(BaseEstimator, TransformerMixin):
    """Adapter that lets LabelBinarizer be used as a Pipeline step.

    Pipeline calls fit(X, y) / transform(X) on each step, while
    LabelBinarizer only accepts the label array itself, so using it directly
    in a Pipeline raises a TypeError. Expects a single categorical column.
    """

    def fit(self, X, y=None):
        """Fit the wrapped LabelBinarizer on the flattened column."""
        self.binarizer_ = LabelBinarizer().fit(np.asarray(X).ravel())
        return self

    def transform(self, X, y=None):
        """Return the one-hot encoding of the flattened column."""
        return self.binarizer_.transform(np.asarray(X).ravel())


num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('label_binarizer', PipelineLabelBinarizer()),
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

# Run the combined pipeline on the training predictors.
housing_prepared = full_pipeline.fit_transform(housing)
相关文章推荐
- 《Hands-on Machine Learning with Scikit-Learn and TensorFlow》 读书笔记
- 集成算法(chapter 7 - Hands on machine learning with scikit learn and tensorflow)
- OReilly.Hands-On.Machine.Learning.with.Scikit-Learn.and.TensorFlow.翻译以及读书心得--p33-p40
- OReilly.Hands-On.Machine.Learning.with.Scikit-Learn.and.TensorFlow.翻译以及读书心得--p41-53
- chapter2 of OReilly.Hands-On.Machine.Learning.with.Scikit-Learn.and.TensorFlow
- Machine Learning with Scikit-Learn and Tensorflow 6.4 CART算法
- Machine Learning with Scikit-Learn and Tensorflow 7.1 Voting Classifiers
- Machine Learning with Scikit-Learn and Tensorflow 7.9 Gradient Boosting
- Machine Learning with Scikit-Learn and Tensorflow 6.3 预测类别概率
- Machine Learning with Scikit-Learn and Tensorflow 6.5 计算复杂度
- Machine Learning with Scikit-Learn and Tensorflow 7.10 Stacking
- Machine Learning with Scikit-Learn and Tensorflow 6.8 决策树回归
- Machine Learning with Scikit-Learn and Tensorflow 7.2 Bagging和Pasting
- Machine Learning with Scikit-Learn and Tensorflow 7.11 练习
- Machine Learning with Scikit-Learn and Tensorflow 7.5 随机森林
- hands on machine learning with sklearn and tensorflow 附录B 翻译与整理(1)概要
- Machine Learning with Scikit-Learn and Tensorflow 7.8 AdaBoost
- Machine Learning with Scikit-Learn and Tensorflow 6.6 基尼不纯度/熵
- Machine Learning with Scikit-Learn and Tensorflow 7.6 Extra-Trees
- Machine Learning with Scikit-Learn and Tensorflow 7 集成学习和随机森林(章节目录)