  • 导入、打印包的信息
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
# Print the name and version of every library used below, so the run can be
# reproduced with the same environment.
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
sys.version_info(major=3, minor=7, micro=6, releaselevel='final', serial=0)
matplotlib 3.1.3
numpy 1.18.1
pandas 1.0.0
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf
  • 加载、打印数据集信息
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset through scikit-learn's fetcher.
housing = fetch_california_housing()
# Description of the dataset
# NOTE(review): the statements that printed the description and the data shape
# shown in the output below appear to be missing from this extract — confirm.
.. _california_housing_dataset:

California Housing dataset

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
- MedInc        median income in block
- HouseAge      median house age in block
- AveRooms      average number of rooms
- AveBedrms     average number of bedrooms
- Population    block population
- AveOccup      average house occupancy
- Latitude      house block latitude
- Longitude     house block longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bureau publishes sample data (a block group typically has a population
of 600 to 3,000 people).

It can be downloaded/loaded using the
:func:`sklearn.datasets.fetch_california_housing` function.

.. topic:: References

- Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297

(20640, 8)
import pprint
# pprint() prints its output across multiple lines,
# so it is well suited to data that is structurally complex or long.
array([[ 8.32520000e+00,  4.10000000e+01,  6.98412698e+00,
1.02380952e+00,  3.22000000e+02,  2.55555556e+00,
3.78800000e+01, -1.22230000e+02],
[ 8.30140000e+00,  2.10000000e+01,  6.23813708e+00,
9.71880492e-01,  2.40100000e+03,  2.10984183e+00,
3.78600000e+01, -1.22220000e+02],
[ 7.25740000e+00,  5.20000000e+01,  8.28813559e+00,
1.07344633e+00,  4.96000000e+02,  2.80225989e+00,
3.78500000e+01, -1.22240000e+02],
[ 5.64310000e+00,  5.20000000e+01,  5.81735160e+00,
1.07305936e+00,  5.58000000e+02,  2.54794521e+00,
3.78500000e+01, -1.22250000e+02],
[ 3.84620000e+00,  5.20000000e+01,  6.28185328e+00,
1.08108108e+00,  5.65000000e+02,  2.18146718e+00,
3.78500000e+01, -1.22250000e+02]])
array([4.526, 3.585, 3.521, 3.413, 3.422])
  • 分割数据集为训练集、验证集、测试集
from sklearn.model_selection import train_test_split

# Two-stage split: first hold out the test set from the full dataset, then
# carve the validation set out of what remains. Fixed random_state values keep
# both splits reproducible; no test_size is given, so the library default applies.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11)
(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)
  • 标准化输入数据(StandardScaler:减去均值并除以标准差)
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, so nothing about
# the validation or test data leaks into preprocessing.
scaler = StandardScaler().fit(x_train)

# Apply the same fitted transform to all three splits.
x_train_scaled = scaler.transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)
  • 构建模型、定义训练指标与方法
# NOTE(review): the layer list passed to Sequential is cut off in this extract.
# The summary printed below reports a 30-unit Dense layer followed by a 1-unit
# Dense layer; the remaining arguments cannot be recovered from here.
model = keras.models.Sequential([
# NOTE(review): the EarlyStopping arguments are cut off as well, and no
# compile step is visible — restore both before running.
callbacks = [keras.callbacks.EarlyStopping(
Model: "sequential"
Layer (type)                 Output Shape              Param #
dense (Dense)                (None, 30)                270
dense_1 (Dense)              (None, 1)                 31
Total params: 301
Trainable params: 301
Non-trainable params: 0
  • 训练模型
# Train on the scaled training split and score the scaled validation split
# after every epoch. 100 is only the upper bound on epochs: the log below ends
# at epoch 15, with the callbacks list passed through to fit().
history = model.fit(
    x_train_scaled,
    y_train,
    validation_data=(x_valid_scaled, y_valid),
    epochs=100,
    callbacks=callbacks,
)
Train on 11610 samples, validate on 3870 samples
Epoch 1/100
11610/11610 [==============================] - 2s 146us/sample - loss: 1.0878 - val_loss: 1.1014
Epoch 2/100
11610/11610 [==============================] - 1s 64us/sample - loss: 1.0323 - val_loss: 0.5181
Epoch 3/100
11610/11610 [==============================] - 1s 63us/sample - loss: 0.4491 - val_loss: 0.4698
Epoch 4/100
11610/11610 [==============================] - 1s 63us/sample - loss: 0.4440 - val_loss: 0.5120
Epoch 5/100
11610/11610 [==============================] - 1s 67us/sample - loss: 0.4084 - val_loss: 0.4171
Epoch 6/100
11610/11610 [==============================] - 1s 70us/sample - loss: 0.3905 - val_loss: 0.4072
Epoch 7/100
11610/11610 [==============================] - 1s 66us/sample - loss: 0.3799 - val_loss: 0.3934
Epoch 8/100
11610/11610 [==============================] - 1s 66us/sample - loss: 0.3732 - val_loss: 0.3878
Epoch 9/100
11610/11610 [==============================] - 1s 79us/sample - loss: 0.3746 - val_loss: 0.3847
Epoch 10/100
11610/11610 [==============================] - 1s 74us/sample - loss: 0.3696 - val_loss: 0.3798
Epoch 11/100
11610/11610 [==============================] - 1s 66us/sample - loss: 0.3616 - val_loss: 0.3777
Epoch 12/100
11610/11610 [==============================] - 1s 73us/sample - loss: 0.3785 - val_loss: 0.3768
Epoch 13/100
11610/11610 [==============================] - 1s 70us/sample - loss: 0.3590 - val_loss: 0.3733
Epoch 14/100
11610/11610 [==============================] - 1s 66us/sample - loss: 0.3601 - val_loss: 0.3719
Epoch 15/100
11610/11610 [==============================] - 1s 67us/sample - loss: 0.3593 - val_loss: 0.3755
  • 训练结果
def plot_learning_curves(history):
    """Plot every series recorded in a training history on one figure.

    Args:
        history: Object whose ``history`` attribute can be turned into a
            DataFrame (one column per recorded series), such as the value
            returned by ``model.fit``.
    """
    # One line per recorded series, drawn on an 8x5 inch figure.
    pd.DataFrame(history.history).plot(figsize=(8, 5))
    # Fix the y-axis to [0, 1]; values above 1 fall outside the visible range.
    plt.gca().set_ylim(0, 1)

  • 用测试集通过模型做评估
# Score the trained model on the scaled test split, which was not used for
# fitting or validation above. verbose=0 presumably silences the progress
# output — confirm against the Keras docs.
model.evaluate(x_test_scaled, y_test, verbose=0)
