树回归
2016-06-12 23:58
381 查看
输入数据与目标变量之间为非线性关系时,可用树回归,使用树对预测值分段,包括分段常数、分段直线,前者为回归树,后者为模型树。若数据过拟合,需剪枝。
测试:回归树
测试:剪枝
测试:模型树
测试:树回归与标准回归比较
R^2 越接近1越好
可以看到,树回归要优于标准回归
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Tree-based regression (CART): regression trees, model trees and pruning.

A tree is either a leaf value or a dict with the keys ``'spInd'`` (index of
the feature split on), ``'spVal'`` (threshold), ``'left'`` (subtree for rows
whose feature value is ``> spVal``) and ``'right'`` (subtree for rows whose
feature value is ``<= spVal``).

Data sets are expected to be 2-D ``numpy.matrix`` objects whose last column
is the target variable.
"""
# The star import is kept on purpose: the functions below use the bare numpy
# names, and interactive callers that star-import this module rely on names
# such as ``mat`` and ``corrcoef`` being re-exported.
from numpy import *  # noqa: F401,F403


def loadDataSet(fileName):
    """Load a tab-separated numeric text file.

    Args:
        fileName: path of the file to read.

    Returns:
        A list of rows, each row being a list of floats.

    Raises:
        ValueError: if a field cannot be parsed as a float.
    """
    datMat = []
    # ``with`` guarantees the handle is closed even if parsing fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # list(...) so the row is a real list on Python 3 as well.
            datMat.append(list(map(float, stripped.split('\t'))))
    return datMat


def binSplitDataSet(dataSet, feature, value):
    """Split a data set in two on one feature.

    Args:
        dataSet: data matrix, one sample per row.
        feature: column index of the feature to split on.
        value: threshold for the split.

    Returns:
        ``(mat0, mat1)`` where ``mat0`` holds every row with
        ``dataSet[:, feature] > value`` and ``mat1`` every row with
        ``dataSet[:, feature] <= value``. Either part may have zero rows.
    """
    # No trailing ``[0]`` here: that would keep only the first matching row
    # (and fail outright when no row matches).
    mat0 = dataSet[nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1


def regLeaf(dataSet):
    """Build a regression-tree leaf: the mean of the target column."""
    return mean(dataSet[:, -1])


def regErr(dataSet):
    """Return the total squared error of the target column (variance * rows)."""
    return var(dataSet[:, -1]) * shape(dataSet)[0]


def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Find the best binary split of ``dataSet``.

    Args:
        dataSet: data matrix whose last column is the target.
        leafType: function building a leaf from a data set.
        errType: function returning the error of a data set.
        ops: ``(tolS, tolN)`` -- minimum error reduction required for a
            split, and minimum number of rows allowed on each side.

    Returns:
        ``(featureIndex, splitValue)`` for the best split, or
        ``(None, leaf)`` when no worthwhile split exists and the caller
        should create a leaf instead.
    """
    tolS = ops[0]  # tolerated (minimum) drop in error
    tolN = ops[1]  # minimum number of samples on each side of a split
    # All targets identical: nothing left to split.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    n = shape(dataSet)[1]
    S = errType(dataSet)
    bestS = inf
    bestIndex = 0
    bestValue = 0
    for featIndex in range(n - 1):
        # Convert the column to plain floats first: matrix elements are
        # 1x1 matrices, which are unhashable and would never deduplicate.
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # Give up when the improvement is too small (also covers the case where
    # no admissible split was found and bestS is still inf).
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # Give up when the chosen split leaves one side too small.
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue


def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a CART tree.

    Args:
        dataSet: data matrix whose last column is the target.
        leafType: function building a leaf from a data set.
        errType: function returning the error of a data set.
        ops: ``(tolS, tolN)`` stopping parameters, see ``chooseBestSplit``.

    Returns:
        A leaf value when the stopping condition is met, otherwise a dict
        with the keys ``'spInd'``, ``'spVal'``, ``'left'`` and ``'right'``.
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:
        # Stopping condition reached: ``val`` is the leaf.
        return val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree


def isTree(obj):
    """Return True when ``obj`` is an internal tree node (a dict)."""
    return isinstance(obj, dict)


def getMean(tree):
    """Collapse a tree into the average of its two (collapsed) children.

    The subtrees are replaced in place by their collapsed values.
    """
    if isTree(tree['right']):
        tree['right'] = getMean(tree['right'])
    if isTree(tree['left']):
        tree['left'] = getMean(tree['left'])
    return (tree['right'] + tree['left']) / 2.0


def prune(tree, testData):
    """Post-prune a regression tree against held-out data.

    Args:
        tree: tree to prune; it is modified in place.
        testData: test matrix whose last column is the target.

    Returns:
        The pruned tree, or a single leaf value when the node was merged.
        Prints ``merging`` each time two leaves are merged.
    """
    # No test data reaches this node: collapse it.
    if shape(testData)[0] == 0:
        return getMean(tree)
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    # Prune the subtrees first.
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)
    # Merging is only possible once both children are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree
    # Merge when the merged leaf has a lower test error than the two leaves.
    errorNoMerge = (sum(power(lSet[:, -1] - tree['left'], 2)) +
                    sum(power(rSet[:, -1] - tree['right'], 2)))
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = sum(power(testData[:, -1] - treeMean, 2))
    if errorMerge < errorNoMerge:
        print("merging")
        return treeMean
    return tree


def linearSolve(dataSet):
    """Fit ordinary least squares on a data set.

    Args:
        dataSet: data matrix whose last column is the target.

    Returns:
        ``(ws, X, Y)``: the weight column vector (intercept first), the
        design matrix with a leading column of ones, and the target column.

    Raises:
        NameError: if ``X.T * X`` is singular and cannot be inverted.
    """
    m, n = shape(dataSet)
    X = asmatrix(ones((m, n)))
    X[:, 1:n] = dataSet[:, 0:n-1]  # column 0 stays all ones (intercept)
    Y = dataSet[:, -1]
    xTx = X.T * X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse, \n try increaseing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y


def modelLeaf(dataSet):
    """Build a model-tree leaf: the linear-regression weights of the data."""
    ws, X, Y = linearSolve(dataSet)
    return ws


def modelErr(dataSet):
    """Return the squared error of a linear fit on ``dataSet``."""
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    return sum(power(Y - yHat, 2))


def regTreeEval(model, inDat):
    """Evaluate a regression-tree leaf: the leaf value itself as a float."""
    return float(model)


def modelTreeEval(model, inDat):
    """Evaluate a model-tree leaf on one sample.

    Args:
        model: weight column vector of the leaf (intercept first).
        inDat: ``1 x n`` row of feature values.

    Returns:
        The predicted value as a float.
    """
    n = shape(inDat)[1]
    X = asmatrix(ones((1, n + 1)))
    X[:, 1:n+1] = inDat  # column 0 stays 1 for the intercept
    # Extract the single element explicitly rather than calling float() on
    # a 1x1 matrix.
    return float((X * model)[0, 0])


def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict the target for a single data point or row vector.

    Args:
        tree: tree (or bare leaf) produced by ``createTree``.
        inData: one sample's feature values (row matrix or sequence).
        modelEval: leaf evaluator, ``regTreeEval`` or ``modelTreeEval``.

    Returns:
        The prediction as a float.
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Flatten before indexing: indexing a 1 x n matrix with a single integer
    # selects a row, not a feature.
    featVal = asarray(inData).ravel()[tree['spInd']]
    if featVal > tree['spVal']:
        branch = tree['left']
    else:
        branch = tree['right']
    # The recursive call evaluates the leaf when ``branch`` is not a tree.
    return treeForeCast(branch, inData, modelEval)


def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict the target for every row of ``testData``.

    Args:
        tree: tree produced by ``createTree``.
        testData: feature matrix, one sample per row.
        modelEval: leaf evaluator, ``regTreeEval`` or ``modelTreeEval``.

    Returns:
        An ``m x 1`` matrix of predictions.
    """
    m = len(testData)
    yHat = asmatrix(zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, asmatrix(testData[i]), modelEval)
    return yHat
测试:回归树
>>> import regTree >>> myDatl = loadDataSet('ex0.txt') >>> myDatl = mat(myDatl) >>> createTree(myDatl) {'spInd': 1, 'spVal': matrix([[ 0.39435]]), 'right': {'spInd': 1, 'spVal': matrix([[ 0.197834]]), 'right': -0.023838155555555553, 'left': 1.0289583666666664}, 'left': {'spInd': 1, 'spVal': matrix([[ 0.582002]]), 'right': 1.9800350714285717, 'left': {'spInd': 1, 'spVal': matrix([[ 0.797583]]), 'right': 2.9836209534883724, 'left': 3.9871632000000004}}}
测试:剪枝
>>> myMat2 = loadDataSet('ex2.txt') >>> myMat2 = mat(myMat2) >>> myTree = createTree(myMat2, ops=(0,1)) >>> myDatTest = loadDataSet('ex2test.txt') >>> myMat2Test = mat(myDatTest) >>> prune(myTree, myMat2Test) merging merging ... ... 850000001}}, 'left': {'spInd': 0, 'spVal': matrix([[ 0.948822]]), 'right': 69.318648999999994, 'left': 96.41885225}}}}}}}}}}}, 'left': {'spInd': 0, 'spVal': matrix([[ 0.965969]]), 'right': {'spInd': 0, 'spVal': matrix([[ 0.956951]]), 'right': 111.2013225, 'left': {'spInd': 0, 'spVal': matrix([[ 0.958512]]), 'right': 135.83701300000001, 'left': {'spInd': 0, 'spVal': matrix([[ 0.960398]]), 'right': 123.559747, 'left': 112.386764}}}, 'left': 92.523991499999994}}}}
测试:模型树
>>> import regTree >>> myMat2 = mat(loadDataSet('exp2.txt')) >>> createTree(myMat2, modelLeaf, modelErr, (1,10)) {'spInd': 0, 'spVal': matrix([[ 0.285477]]), 'right': matrix([[ 3.46877936],[ 1.18521743]]), 'left': matrix([[ 1.69855694e-03],[1.19647739e+01]])}
测试:树回归与标准回归比较
R^2 越接近1越好
#回归树 >>> trainMat = mat(loadDataSet('bikeSpeedVsIq_train.txt')) >>> testMat = mat(loadDataSet('bikeSpeedVsIq_test.txt')) >>> myTree = createTree(trainMat, ops=(1,20)) >>> yHat = createForeCast(myTree, testMat[:,0]) >>> corrcoef(yHat, testMat[:,1], rowvar=0)[0,1] 0.96408523182221306 #模型树 >>> import regTree >>> trainMat = mat(loadDataSet('bikeSpeedVsIq_train.txt')) >>> testMat = mat(loadDataSet('bikeSpeedVsIq_test.txt')) >>> myTree = createTree(trainMat, modelLeaf, modelErr, ops=(1,20)) >>> yHat = createForeCast(myTree, testMat[:, 0], modelTreeEval) >>> corrcoef(yHat, testMat[:,1], rowvar=0)[0,1] 0.97604121913806285 #标准回归 >>> ws, X, Y = linearSolve(trainMat) >>> ws matrix([[ 37.58916794], [6.18978355]]) >>> yHat = testMat[:,0] * ws[1,0] + ws[0,0] >>> corrcoef(yHat, testMat[:,1], rowvar=0)[0,1] 0.94346842356747584
可以看到,树回归要优于标准回归
相关文章推荐
- 代理模式
- python的函数定义方式
- 【Unity3D】ShaderLab学习笔记
- 《java编程思想》学习笔记(一)
- 责任链模式(Chain of Responsibility Pattern)
- Cookie
- HeyWeGo第五周项目总结
- iOS 导航栏透明,变色动画
- 前端开发需要用到的几款线编辑器插件
- docker-engine安装
- 所以一切都是有套路的,按照套路出牌是最合适的方式。
- 关于WPF中Popup中的一些用法的总结
- android中的保存数据方法
- 字符串匹配的Boyer-Moore算法
- mysql安装
- Iwfu-CoordinatorLayout(2)自定义Behavior
- hdu5289 Assignment
- 梳理hibernate-面试题
- Android开发常见的Activity中内存泄漏及解决办法
- SQL Server 2008 clearhtml函数清除字段中的HTML标记