您的位置:首页 > 其它

sklearn的模型选择和评估之交叉验证

2018-10-15 16:32 218 查看
import numpy as np

交叉验证:评估学习器的表现

划分数据为训练集和测试集

X, y = np.arange(10).reshape((5, 2)), range(5)
print(X)
print(y)
[[0 1]
[2 3]
[4 5]
[6 7]
[8 9]]
range(0, 5)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.33,random_state=42)
print(X_train)
print(y_train)
[[4 5]
[0 1]
[6 7]]
[2, 0, 3]
print(X_test,"\n",y_test)
[[2 3]
[8 9]]
[1, 4]

计算交叉验证的指标

cross_val_score

from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
diabetes = datasets.load_diabetes()
X= diabetes.data[:150]
y = diabetes.target[:150]
lasso = linear_model.Lasso()
cross_val_score(lasso,X,y)
array([ 0.33150734,  0.08022311,  0.03531764])

cross_validate 函数和多度量评估

  1. 它允许指定多个指标进行评估.
  2. 除了测试得分之外,它还会返回一个字典,其中包含训练得分、拟合时间 (fit-times) 和评分时间 (score-times).

通过交叉验证获取预测

from sklearn.model_selection import cross_val_predict
y_pred = cross_val_predict(lasso, X, y)
y_pred
array([ 174.26933996,  117.6539241 ,  164.60228641,  155.65049088,
132.68647979,  128.49511245,  120.76146877,  141.069413  ,
164.18904498,  182.37394949,  111.04181265,  127.94311443,
135.0869234 ,  162.83066014,  135.3573514 ,  157.64516523,
178.95843326,  163.3919841 ,  143.85237903,  144.29748882,
133.58117218,  124.77928571,  132.90918003,  208.52927   ,
153.61908967,  154.16616341,  118.95351821,  163.50467541,
145.89406196,  168.3308101 ,  155.87411031,  123.45960148,
185.70459144,  133.38468582,  117.2789469 ,  150.27895019,
174.1541028 ,  160.03235091,  192.31389633,  161.58568256,
154.2224809 ,  119.35517679,  146.15706413,  133.82056934,
179.68118754,  137.96619936,  146.07788398,  126.77579723,
123.32101099,  166.26710247,  146.41559964,  161.67261029,
147.47731459,  138.44595305,  144.85421048,  113.77990664,
185.54970402,  115.31624749,  142.23672103,  171.07792136,
132.5394716 ,  177.80524864,  116.5616502 ,  134.25230846,
142.88707475,  173.2830912 ,  154.31273504,  149.16680759,
144.88238997,  121.97783103,  110.38457621,  180.25559631,
199.06141058,  151.1195546 ,  161.14217698,  153.96960812,
150.77179755,  113.30903579,  165.15755771,  115.85735727,
174.19267171,  150.12027233,  115.47891783,  153.38967232,
115.31573467,  156.49909623,   92.62211515,  178.15649994,
131.59320715,  134.46166754,  116.97678633,  190.00790119,
166.01173292,  126.25944471,  134.29256991,  144.71971963,
190.9769591 ,  182.39199466,  154.45325308,  148.30325558,
151.72036937,  124.12825466,  138.6011155 ,  137.75891286,
123.0917243 ,  131.74735403,  112.07367481,  124.56956904,
156.78432061,  128.63135591,   93.68260079,  130.54324394,
131.8693231 ,  154.5708257 ,  179.81343019,  165.78130755,
150.04779033,  162.37974736,  143.92996797,  143.15645843,
125.20161377,  145.99590279,  155.3505536 ,  145.97574185,
134.66120515,  163.92450638,  101.92329396,  139.33014324,
122.71377023,  152.20573113,  153.36931089,  116.76545147,
131.96936127,  109.74817383,  132.57453994,  159.38030328,
109.31343881,  147.69926269,  156.3664255 ,  161.12509958,
128.16523686,  156.78446286,  154.04375702,  124.83705022,
143.85606595,  143.23651701,  147.76316913,  154.21572891,
129.07895017,  157.79644923])

交叉验证迭代器

交叉验证迭代器-循环遍历数据

K折

from sklearn.model_selection import KFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y= np.array([1, 2, 3, 4])
kf = KFold(n_splits=2)
kf.get_n_splits(X)
2
print(kf)
KFold(n_splits=2, random_state=None, shuffle=False)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    print("训练集:",X[train_index],"\n",y[train_index])
    print("测试集:",X[test_index],"\n",y[test_index])
TRAIN: [2 3] TEST: [0 1]
训练集: [[1 2]
[3 4]]
[3 4]
测试集: [[1 2]
[3 4]]
[1 2]
TRAIN: [0 1] TEST: [2 3]
训练集: [[1 2]
[3 4]]
[1 2]
测试集: [[1 2]
[3 4]]
[3 4]

重复k折

from sklearn.model_selection import RepeatedKFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124)
for train_index, test_index in rkf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
TRAIN: [0 1] TEST: [2 3]
TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]

留一交叉验证(LOO)

from sklearn.model_selection import LeaveOneOut
X = np.array([[1, 2], [3, 4]])
y = np.array([1, 2])
loo = LeaveOneOut()
loo.get_n_splits(X)
2
loo
LeaveOneOut()
for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)
TRAIN: [1] TEST: [0]
[[3 4]] [[1 2]] [2] [1]
TRAIN: [0] TEST: [1]
[[1 2]] [[3 4]] [1] [2]

留P交叉验证(LPO)

from sklearn.model_selection import LeavePOut
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
lpo = LeavePOut(2)
lpo.get_n_splits(X)
print(lpo)
for train_index, test_index in lpo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
LeavePOut(p=2)
TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 3] TEST: [0 2]
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]
TRAIN: [0 2] TEST: [1 3]
TRAIN: [0 1] TEST: [2 3]

随机排列交叉验证

from sklearn.model_selection import ShuffleSplit
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
rs.get_n_splits(X)
3
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
TRAIN: [3 1 0] TEST: [2]
TRAIN: [2 1 3] TEST: [0]
TRAIN: [0 2 1] TEST: [3]
rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25,
                  random_state=0)
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
TRAIN: [3 1] TEST: [2]
TRAIN: [2 1] TEST: [0]
TRAIN: [0 2] TEST: [3]

基于类标签、具有分层的交叉验证迭代器

分层k折

每个小集合中, 各个类别的样例比例大致和完整数据集中相同。

from sklearn.model_selection import StratifiedKFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)
2
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
TRAIN: [1 3] TEST: [0 2]
TRAIN: [0 2] TEST: [1 3]

分层随机split

创建一个划分,但是划分中每个类的比例和完整数据集中的相同。

from sklearn.model_selection import StratifiedShuffleSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
sss.get_n_splits(X, y)
3
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
TRAIN: [1 2] TEST: [3 0]
TRAIN: [0 2] TEST: [1 3]
TRAIN: [0 2] TEST: [3 1]

用于分组数据的交叉验证迭代器

组k-fold

from sklearn.model_selection import GroupKFold
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
groups = np.array([0, 0, 2, 2])
group_kfold = GroupKFold(n_splits=2)
group_kfold.get_n_splits(X, y, groups)
2
for train_index, test_index in group_kfold.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)
TRAIN: [0 1] TEST: [2 3]
[[1 2]
[3 4]] [[5 6]
[7 8]] [1 2] [3 4]
TRAIN: [2 3] TEST: [0 1]
[[5 6]
[7 8]] [[1 2]
[3 4]] [3 4] [1 2]

留一组交叉验证

from sklearn.model_selection import LeaveOneGroupOut
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
groups = np.array([1, 1, 2, 2])
logo = LeaveOneGroupOut()
logo.get_n_splits(X, y, groups)

logo.get_n_splits(groups=groups)
2
for train_index, test_index in logo.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)
TRAIN: [2 3] TEST: [0 1]
[[5 6]
[7 8]] [[1 2]
[3 4]] [1 2] [1 2]
TRAIN: [0 1] TEST: [2 3]
[[1 2]
[3 4]] [[5 6]
[7 8]] [1 2] [1 2]

留P组交叉验证

from sklearn.model_selection import LeavePGroupsOut
X = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array([1, 2, 1])
groups = np.array([1, 2, 3])
lpgo = LeavePGroupsOut(n_groups=2)
lpgo.get_n_splits(X, y, groups)

lpgo.get_n_splits(groups=groups)
3
for train_index, test_index in lpgo.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)
TRAIN: [2] TEST: [0 1]
[[5 6]] [[1 2]
[3 4]] [1] [1 2]
TRAIN: [1] TEST: [0 2]
[[3 4]] [[1 2]
[5 6]] [2] [1 1]
TRAIN: [0] TEST: [1 2]
[[1 2]] [[3 4]
[5 6]] [1] [2 1]

组划分(随机)

from sklearn.model_selection import GroupShuffleSplit

X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
for train, test in gss.split(X, y, groups=groups):
    print("%s %s" % (train, test))
[0 1 2 3] [4 5 6 7]
[2 3 6 7] [0 1 4 5]
[2 3 4 5] [0 1 6 7]
[4 5 6 7] [0 1 2 3]

预定义的折叠、验证集

交叉验证在时间序列数据中应用

时间序列分割

from sklearn.model_selection import TimeSeriesSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)
TimeSeriesSplit(max_train_size=None, n_splits=3)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
TRAIN: [0] TEST: [1]
TRAIN: [0 1] TEST: [2]
TRAIN: [0 1 2] TEST: [3]
阅读更多
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐