sklearn的模型选择和评估之交叉验证
2018-10-15 16:32
218 查看
import numpy as np
交叉验证:评估学习器的表现
划分数据为训练集和测试集
X, y = np.arange(10).reshape((5, 2)), range(5)
print(X)
print(y)
[[0 1] [2 3] [4 5] [6 7] [8 9]] range(0, 5)
from sklearn.model_selection import train_test_split X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.33,random_state=42)
print(X_train) print(y_train)
[[4 5] [0 1] [6 7]] [2, 0, 3]
print(X_test,"\n",y_test)
[[2 3] [8 9]] [1, 4]
计算交叉验证的指标
cross_val_score
from sklearn import datasets, linear_model from sklearn.model_selection import cross_val_score
diabetes = datasets.load_diabetes() X= diabetes.data[:150] y = diabetes.target[:150]
lasso = linear_model.Lasso() cross_val_score(lasso,X,y)
array([ 0.33150734, 0.08022311, 0.03531764])
cross_validate 函数和多度量评估
- 它允许指定多个指标进行评估.
- 除了测试得分之外,它还会返回一个字典,其中包含训练得分、拟合时间 (fit-times) 和评分时间 (score-times)。
通过交叉验证获取预测
from sklearn.model_selection import cross_val_predict y_pred = cross_val_predict(lasso, X, y) y_pred
array([ 174.26933996, 117.6539241 , 164.60228641, 155.65049088, 132.68647979, 128.49511245, 120.76146877, 141.069413 , 164.18904498, 182.37394949, 111.04181265, 127.94311443, 135.0869234 , 162.83066014, 135.3573514 , 157.64516523, 178.95843326, 163.3919841 , 143.85237903, 144.29748882, 133.58117218, 124.77928571, 132.90918003, 208.52927 , 153.61908967, 154.16616341, 118.95351821, 163.50467541, 145.89406196, 168.3308101 , 155.87411031, 123.45960148, 185.70459144, 133.38468582, 117.2789469 , 150.27895019, 174.1541028 , 160.03235091, 192.31389633, 161.58568256, 154.2224809 , 119.35517679, 146.15706413, 133.82056934, 179.68118754, 137.96619936, 146.07788398, 126.77579723, 123.32101099, 166.26710247, 146.41559964, 161.67261029, 147.47731459, 138.44595305, 144.85421048, 113.77990664, 185.54970402, 115.31624749, 142.23672103, 171.07792136, 132.5394716 , 177.80524864, 116.5616502 , 134.25230846, 142.88707475, 173.2830912 , 154.31273504, 149.16680759, 144.88238997, 121.97783103, 110.38457621, 180.25559631, 199.06141058, 151.1195546 , 161.14217698, 153.96960812, 150.77179755, 113.30903579, 165.15755771, 115.85735727, 174.19267171, 150.12027233, 115.47891783, 153.38967232, 115.31573467, 156.49909623, 92.62211515, 178.15649994, 131.59320715, 134.46166754, 116.97678633, 190.00790119, 166.01173292, 126.25944471, 134.29256991, 144.71971963, 190.9769591 , 182.39199466, 154.45325308, 148.30325558, 151.72036937, 124.12825466, 138.6011155 , 137.75891286, 123.0917243 , 131.74735403, 112.07367481, 124.56956904, 156.78432061, 128.63135591, 93.68260079, 130.54324394, 131.8693231 , 154.5708257 , 179.81343019, 165.78130755, 150.04779033, 162.37974736, 143.92996797, 143.15645843, 125.20161377, 145.99590279, 155.3505536 , 145.97574185, 134.66120515, 163.92450638, 101.92329396, 139.33014324, 122.71377023, 152.20573113, 153.36931089, 116.76545147, 131.96936127, 109.74817383, 132.57453994, 159.38030328, 109.31343881, 147.69926269, 156.3664255 , 161.12509958, 128.16523686, 156.78446286, 
154.04375702, 124.83705022, 143.85606595, 143.23651701, 147.76316913, 154.21572891, 129.07895017, 157.79644923])
交叉验证迭代器
交叉验证迭代器-循环遍历数据
K折
from sklearn.model_selection import KFold X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) y= np.array([1, 2, 3, 4])
kf = KFold(n_splits=2) kf.get_n_splits(X)
2
print(kf)
KFold(n_splits=2, random_state=None, shuffle=False)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    print("训练集:", X[train_index], "\n", y[train_index])
    print("测试集:", X[test_index], "\n", y[test_index])
TRAIN: [2 3] TEST: [0 1] 训练集: [[1 2] [3 4]] [3 4] 测试集: [[1 2] [3 4]] [1 2] TRAIN: [0 1] TEST: [2 3] 训练集: [[1 2] [3 4]] [1 2] 测试集: [[1 2] [3 4]] [3 4]
重复k折
from sklearn.model_selection import RepeatedKFold X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) y = np.array([0, 0, 1, 1]) rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124)
for train_index, test_index in rkf.split(X): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index]
TRAIN: [0 1] TEST: [2 3] TRAIN: [2 3] TEST: [0 1] TRAIN: [1 2] TEST: [0 3] TRAIN: [0 3] TEST: [1 2]
留一交叉验证(LOO)
from sklearn.model_selection import LeaveOneOut
X = np.array([[1, 2], [3, 4]])
y = np.array([1, 2])
loo = LeaveOneOut()
loo.get_n_splits(X)
2
loo
LeaveOneOut()
for train_index, test_index in loo.split(X): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] print(X_train, X_test, y_train, y_test)
TRAIN: [1] TEST: [0] [[3 4]] [[1 2]] [2] [1] TRAIN: [0] TEST: [1] [[1 2]] [[3 4]] [1] [2]
留P交叉验证(LPO)
from sklearn.model_selection import LeavePOut X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) y = np.array([1, 2, 3, 4]) lpo = LeavePOut(2) lpo.get_n_splits(X) print(lpo) for train_index, test_index in lpo.split(X): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index]
LeavePOut(p=2) TRAIN: [2 3] TEST: [0 1] TRAIN: [1 3] TEST: [0 2] TRAIN: [1 2] TEST: [0 3] TRAIN: [0 3] TEST: [1 2] TRAIN: [0 2] TEST: [1 3] TRAIN: [0 1] TEST: [2 3]
随机排列交叉验证
from sklearn.model_selection import ShuffleSplit X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) y = np.array([1, 2, 1, 2]) rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0) rs.get_n_splits(X)
3
for train_index, test_index in rs.split(X): print("TRAIN:", train_index, "TEST:", test_index)
TRAIN: [3 1 0] TEST: [2] TRAIN: [2 1 3] TEST: [0] TRAIN: [0 2 1] TEST: [3]
rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25, random_state=0) for train_index, test_index in rs.split(X): print("TRAIN:", train_index, "TEST:", test_index)
TRAIN: [3 1] TEST: [2] TRAIN: [2 1] TEST: [0] TRAIN: [0 2] TEST: [3]
基于类标签、具有分层的交叉验证迭代器
分层k折
每个小集合中, 各个类别的样例比例大致和完整数据集中相同。
from sklearn.model_selection import StratifiedKFold X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) y = np.array([0, 0, 1, 1]) skf = StratifiedKFold(n_splits=2) skf.get_n_splits(X, y)
2
for train_index, test_index in skf.split(X, y): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index]
TRAIN: [1 3] TEST: [0 2] TRAIN: [0 2] TEST: [1 3]
分层随机split
创建一个划分,但是划分中每个类的比例和完整数据集中的相同。
from sklearn.model_selection import StratifiedShuffleSplit X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) y = np.array([0, 0, 1, 1]) sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0) sss.get_n_splits(X, y)
3
for train_index, test_index in sss.split(X, y): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index]
TRAIN: [1 2] TEST: [3 0] TRAIN: [0 2] TEST: [1 3] TRAIN: [0 2] TEST: [3 1]
用于分组数据的交叉验证迭代器
组k-fold
from sklearn.model_selection import GroupKFold X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) y = np.array([1, 2, 3, 4]) groups = np.array([0, 0, 2, 2]) group_kfold = GroupKFold(n_splits=2) group_kfold.get_n_splits(X, y, groups)
2
for train_index, test_index in group_kfold.split(X, y, groups): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] print(X_train, X_test, y_train, y_test)
TRAIN: [0 1] TEST: [2 3] [[1 2] [3 4]] [[5 6] [7 8]] [1 2] [3 4] TRAIN: [2 3] TEST: [0 1] [[5 6] [7 8]] [[1 2] [3 4]] [3 4] [1 2]
留一组交叉验证
from sklearn.model_selection import LeaveOneGroupOut X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) y = np.array([1, 2, 1, 2]) groups = np.array([1, 1, 2, 2]) logo = LeaveOneGroupOut() logo.get_n_splits(X, y, groups) logo.get_n_splits(groups=groups)
2
for train_index, test_index in logo.split(X, y, groups): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] print(X_train, X_test, y_train, y_test)
TRAIN: [2 3] TEST: [0 1] [[5 6] [7 8]] [[1 2] [3 4]] [1 2] [1 2] TRAIN: [0 1] TEST: [2 3] [[1 2] [3 4]] [[5 6] [7 8]] [1 2] [1 2]
留P组交叉验证
from sklearn.model_selection import LeavePGroupsOut X = np.array([[1, 2], [3, 4], [5, 6]]) y = np.array([1, 2, 1]) groups = np.array([1, 2, 3]) lpgo = LeavePGroupsOut(n_groups=2) lpgo.get_n_splits(X, y, groups) lpgo.get_n_splits(groups=groups)
3
for train_index, test_index in lpgo.split(X, y, groups): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] print(X_train, X_test, y_train, y_test)
TRAIN: [2] TEST: [0 1] [[5 6]] [[1 2] [3 4]] [1] [1 2] TRAIN: [1] TEST: [0 2] [[3 4]] [[1 2] [5 6]] [2] [1 1] TRAIN: [0] TEST: [1 2] [[1 2]] [[3 4] [5 6]] [1] [2 1]
组划分(随机)
from sklearn.model_selection import GroupShuffleSplit X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001] y = ["a", "b", "b", "b", "c", "c", "c", "a"] groups = [1, 1, 2, 2, 3, 3, 4, 4] gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0) for train, test in gss.split(X, y, groups=groups): print("%s %s" % (train, test))
[0 1 2 3] [4 5 6 7] [2 3 6 7] [0 1 4 5] [2 3 4 5] [0 1 6 7] [4 5 6 7] [0 1 2 3]
预定义的折叠、验证集
交叉验证在时间序列数据中应用
时间序列分割
from sklearn.model_selection import TimeSeriesSplit X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) y = np.array([1, 2, 3, 4]) tscv = TimeSeriesSplit(n_splits=3) print(tscv)
TimeSeriesSplit(max_train_size=None, n_splits=3)
for train_index, test_index in tscv.split(X): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index]
TRAIN: [0] TEST: [1] TRAIN: [0 1] TEST: [2] TRAIN: [0 1 2] TEST: [3]
阅读更多
相关文章推荐
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- [翻译中]【Scikit-Learn 中文文档】二十八:交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 【Scikit-Learn 中文文档】28 交叉验证 - 模型选择和评估 - 用户指南 | ApacheCN
- 模型评估的方法:Holdout检验、交叉验证、自助法(Bootstrap)
- scikit-learn中交叉验证及其用于参数选择、模型选择、特征选择的例子
- Spark机器学习——模型选择与参数调优之交叉验证