您的位置:首页 > 其它

scikit-learn 交叉验证绘图及原理实践 分类:机器学习Sklearn

2017-05-18 19:37 344 查看
交叉验证返回的是平均均方误差或平均判定正确率。

[python] view plain copy print?



# Cross-validated predictions for the Boston housing dataset with a linear
# regression model, scattered against the measured targets.
# NOTE(review): this copy of the snippet was scrape-garbled — typographic
# quotes (’k–’, ”Measured”) are a SyntaxError and several statements were
# crammed onto one line; reconstructed with plain ASCII quotes.
from sklearn import datasets
from sklearn.cross_validation import cross_val_predict
from sklearn import linear_model
import matplotlib.pyplot as plt

lr = linear_model.LinearRegression()
boston = datasets.load_boston()
y = boston.target

# cross_val_predict returns an array of the same size as 'y' where each entry
# is a prediction obtained by cross validation.
predicted = cross_val_predict(lr, boston.data, y, cv=10)

fig, ax = plt.subplots()
ax.scatter(y, predicted)
# 'k--' = black dashed line; lw = line width.
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel("Measured")
ax.set_ylabel("Predicted")
plt.show()


# Scatter cross-validated predictions against the measured targets for the
# Boston housing dataset, with a black dashed identity reference line.
from sklearn import datasets
from sklearn.cross_validation import cross_val_predict
from sklearn import linear_model
import matplotlib.pyplot as plt

lr = linear_model.LinearRegression()
boston = datasets.load_boston()
y = boston.target

# cross_val_predict returns an array the same size as y; each entry is the
# prediction for that sample from the fold in which it was held out.
predicted = cross_val_predict(lr, boston.data, y, cv=10)

fig, ax = plt.subplots()
ax.scatter(y, predicted)
lo, hi = y.min(), y.max()
ax.plot([lo, hi], [lo, hi], 'k--', lw=4)  # 'k--': black dashed; lw: line width
ax.set_xlabel("Measured")
ax.set_ylabel("Predicted")
plt.show()

上面ax.plot中'k--',k指线为黑色,--是线的形状(虚线)。lw指定线宽。

下面对上面的cross_val_predict进行展开:

[python] view plain copy print?



import numpy as np
from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm

iris = datasets.load_iris()
print iris.data.shape, iris.target.shape

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
iris.data, iris.target, test_size = 0.4, random_state = 0)

print X_train.shape, y_train.shape
print X_test.shape, y_test.shape

””’
statas_dict = dict()
for element in iris.target:
if statas_dict.get(element):
statas_dict[element] += 1
else:
statas_dict[element] = 1

print statas_dict
”’

#print np.bincount(iris.target)
#print np.unique(iris.target, return_counts = T
c8d5
rue)

clf = svm.SVC(kernel = ’linear’, C = 1).fit(X_train, y_train)
print clf.score(X_test, y_test)
print

clf = svm.SVC(kernel = ’linear’, C = 1)
scores = cross_validation.cross_val_score(clf, iris.data, iris.target, cv = 5)

print scores
print

def generate_split_sample(data, target, size_ratio):
return cross_validation.train_test_split(data, target, test_size = size_ratio)

#cv is the folds we need
#use the size_ratio to the abs num
def sample_split(data, target ,cv):
size_num = int(data.shape[0] / cv)
ori_data = data
ori_target = target

ListRequire = []

for i in range(cv):
if ori_data.shape[0] > size_num:
X_train, X_test, y_train, y_test = generate_split_sample(ori_data, ori_target, size_num)
ListRequire.append((X_test, y_test))
ori_data = X_train
ori_target = y_train
else:
ListRequire.append((ori_data, ori_target))

return ListRequire

#print sample_split(iris.data, iris.target, 5)

def return_score(train_data, train_target, test_data, test_target):
clf = svm.SVC(kernel = ’linear’, C = 1).fit(train_data, train_target)
return clf.score(test_data, test_target)

def return_scores(data, target, cv):
ListRequire = []
splitList = sample_split(data, target, cv)
for i in range(len(splitList)):
test_data, test_target = splitList[i]

otherIndexs = set(range(len(splitList)))
otherIndexs.remove(i)

train_data = None
train_target = None

for j in otherIndexs:
if type(train_data) == type(None):
train_data, train_target = splitList[j]
else:
train_data = np.append(train_data, splitList[j][0], axis = 0)
train_target = np.append(train_target, splitList[j][1], axis = 0)

ListRequire.append(return_score(train_data, train_target, test_data, test_target))

return ListRequire

print return_scores(iris.data, iris.target, 5)


# Manual cross-validation demo on the iris dataset (Python 2 prints).
import numpy as np
from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm

# iris: (150, 4) feature matrix and a length-150 target vector (3 classes).
iris = datasets.load_iris()
print iris.data.shape, iris.target.shape

# Hold out 40% of the samples as a test set; random_state fixes the shuffle seed.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
iris.data, iris.target, test_size = 0.4, random_state = 0)

print X_train.shape, y_train.shape
print X_test.shape, y_test.shape

# Dead code kept from the original post: hand-rolled per-class counting,
# superseded by the np.bincount / np.unique one-liners below.
'''
statas_dict = dict()
for element in iris.target:
if statas_dict.get(element):
statas_dict[element] += 1
else:
statas_dict[element] = 1

print statas_dict
'''


#print np.bincount(iris.target)

#print np.unique(iris.target, return_counts = True)

# Single train/test split: fit a linear SVM, report accuracy on the test set.
clf = svm.SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
print clf.score(X_test, y_test)
print

# Built-in 5-fold cross validation, for comparison with the manual version below.
clf = svm.SVC(kernel = 'linear', C = 1)
scores = cross_validation.cross_val_score(clf, iris.data, iris.target, cv = 5)

print scores
print

def generate_split_sample(data, target, size_ratio):
    """Wrap train_test_split; size_ratio is forwarded as test_size.

    train_test_split accepts either a float fraction or an int absolute
    sample count, so both work here.
    """
    # The scraped original had this body at column 0 (a SyntaxError);
    # indentation restored.
    return cross_validation.train_test_split(data, target, test_size = size_ratio)

#cv is the folds we need

#use the size_ratio to the abs num
def sample_split(data, target, cv):
    """Randomly partition (data, target) into cv folds.

    Each iteration peels off size_num samples as one fold via
    generate_split_sample; the final fold receives whatever samples
    remain. Returns a list of (fold_data, fold_target) tuples.
    (Body indentation restored; it was stripped by the scrape.)
    """
    size_num = int(data.shape[0] / cv)
    ori_data = data
    ori_target = target

    ListRequire = []

    for i in range(cv):
        if ori_data.shape[0] > size_num:
            X_train, X_test, y_train, y_test = generate_split_sample(ori_data, ori_target, size_num)
            ListRequire.append((X_test, y_test))
            # What was "train" in this split is the pool for the next fold.
            ori_data = X_train
            ori_target = y_train
        else:
            # Last fold (or pool exhausted): keep all remaining samples.
            ListRequire.append((ori_data, ori_target))

    return ListRequire

#print sample_split(iris.data, iris.target, 5)

def return_score(train_data, train_target, test_data, test_target):
    """Fit a linear SVM (C=1) on the training split and return its mean
    accuracy on the test split."""
    # Fixed: the scraped original used typographic quotes (‘linear’), a
    # SyntaxError, and the body had lost its indentation.
    clf = svm.SVC(kernel = 'linear', C = 1).fit(train_data, train_target)
    return clf.score(test_data, test_target)

def return_scores(data, target, cv):
    """Manual cross_val_score: one accuracy value per fold.

    For each fold i, train on the concatenation of all other folds and
    score on fold i. Returns a list of cv accuracy floats.
    (Body indentation restored; it was stripped by the scrape.)
    """
    ListRequire = []
    splitList = sample_split(data, target, cv)
    for i in range(len(splitList)):
        test_data, test_target = splitList[i]

        # Indices of every fold except the held-out one.
        otherIndexs = set(range(len(splitList)))
        otherIndexs.remove(i)

        train_data = None
        train_target = None

        for j in otherIndexs:
            if train_data is None:  # was: type(train_data) == type(None)
                train_data, train_target = splitList[j]
            else:
                # Stack the remaining folds row-wise onto the training set.
                train_data = np.append(train_data, splitList[j][0], axis = 0)
                train_target = np.append(train_target, splitList[j][1], axis = 0)

        ListRequire.append(return_score(train_data, train_target, test_data, test_target))

    return ListRequire

# Manual 5-fold CV scores; should be comparable to cross_val_score above.
print return_scores(iris.data, iris.target, 5)

这里自实现了抽样过程,并得到相同结果。

random_state:伪随机数生成初值。test_size: 决定随机生成的test集合占总样本的比例。

svm.SVC svc指支持向量机分类。参数C指定penalty parameter

clf.score(X, y):指出用来进行拟合模型的(X, y)的正确率。

有两种返回ndarray中分类数据个数统计的方法,推荐第二种方法。

np.bincount(ndarray_object): 仅仅返回排序后统计个数的ndarray

np.unique(ndarray_object, return_counts = False) :return_counts设定为True时可以返回类别及计数。

np.append(first_ndarray, second_ndarray, axis = 0) :沿纵向进行扩展

这里利用了类型判断,这同样可以对np.ndarray进行(仅需初始化一个空对象 i.e. type(np.array([0, 1])) == type(np.ndarray([0])))

利用ndarray进行初始化也是最简单的(也可能是最快的)多维数组初始化方法。

根据文档,cross_val_predict 与 cross_val_score有相同的接口,不同是前者返回的是相应样本在对应测试集上

进行拟合的结果,细节不述。

更多了解请浏览:http://blog.csdn.net/sinat_30665603
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  机器学习
相关文章推荐