预测数值型数据:回归
2016-06-11 15:52
260 查看
regress.py
标准回归
测试:
局部加权线性回归
测试:
缩减法:岭回归
缩减法可将那些被缩减得很小的系数直接缩减为0
测试:
得到8个特征的系数与log(lambda)的关系图像。最左端,lambda最小时,所有系数值与线性回归一致;最右端,lambda最大时,所有系数都缩减成0;在中间的某个lambda值可以取得最好的预测效果
crossValidation(xArr, yArr, numVal = 10)
the best model from Ridge Regression is:
[[ 0.0823653 -3.91472054 16.9506189 10.7538564 8.09628256 -19.35008479 -7.85980015 9.56536757]]
with constant term: 2.94541561804
缩减法:前向逐步线性回归
测试:
标准回归
#!/usr/bin/python # -*- coding: utf-8 -*- #coding=utf-8 from numpy import * #导入函数 #前几行为x,最后一行为y def loadDataSet(fileName): numFeat = len(open(fileName).readline().split('\t')) - 1 dataMat = [] labelMat = [] fr = open(fileName) for line in fr.readlines(): lineArr = [] curLine = line.strip().split('\t') for i in range(numFeat): lineArr.append(float(curLine[i])) dataMat.append(lineArr) labelMat.append(float(curLine[-1])) return dataMat, labelMat #标准回归函数 def standRegress(xArr, yArr): xMat = mat(xArr) yMat = mat(yArr).T xTx = xMat.T * xMat if linalg.det(xTx) == 0.0: #如果行列式为0,则逆不存在 print "This matrix is singular, cannot do inverse" return ws = xTx.I * (xMat.T * yMat) #回归系数 return ws #测试标准回归函数 def testStandRetress(xArr, yArr): xArr, yArr = loadDataSet('ex0.txt') ws = standRegress(xArr, yArr) print ws xMat = mat(xArr) yMat = mat(yArr) #真实值 import matplotlib.pyplot as plt fig = plt.figure() ax = fig.add_subplot(111) #原始图像 ax.scatter(xMat[:,1].flatten().A[0], yMat.T[:,0].flatten().A[0]) xCopy = xMat.copy() xCopy.sort(0) yHat = xCopy * ws #预测值 #预测图像 ax.plot(xCopy[:,1], yHat) plt.show()
测试:
>>> import regress >>> xArr, yArr = loadDataSet('ex0.txt') >>> testStandRetress(xArr, yArr) w= [[ 3.00774324] [ 1.69532264]] corrcoef= [[ 1. 0.98647356] [ 0.98647356 1. ]] #yHat与yMat的相关系数为0.986
局部加权线性回归
# locally weighted linear regression
# k controls how much weight is given to nearby points
def lwlr(testPoint, xArr, yArr, k=1.0):
    """Locally weighted linear regression prediction for one point.

    Each training sample gets a Gaussian-kernel weight based on its
    distance to testPoint; smaller k means only very close points
    influence the fit.

    Returns:
        The predicted value testPoint * ws (1x1 matrix), or None when
        the weighted normal matrix is singular.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))  # identity: start every sample at weight 1
    for j in range(m):
        diffMat = testPoint - xMat[j, :]
        # weight decays exponentially with squared distance from testPoint
        weights[j, j] = exp(diffMat * diffMat.T / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws


# test driver for locally weighted linear regression
def lwlrTest(xArr, yArr):
    """Predict every training point with LWLR and plot the k=0.01 curve
    against the raw data."""
    print("yArr[0]", yArr[0])
    # label typo "yHrr" fixed to "yHat"
    print("yHat[0], k=0.01", lwlr(xArr[0], xArr, yArr, 0.01))
    print("yHat[0], k=0.5", lwlr(xArr[0], xArr, yArr, 0.5))
    print("yHat[0], k=1", lwlr(xArr[0], xArr, yArr, 1))
    m = shape(xArr)[0]
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = lwlr(xArr[i], xArr, yArr, k=0.01)
    xMat = mat(xArr)
    srtInd = xMat[:, 1].argsort(0)  # sort by feature so the curve is smooth
    xSort = xMat[srtInd][:, 0, :]
    import matplotlib.pyplot as plt  # local import: only needed for plotting
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(xSort[:, 1], yHat[srtInd])  # predicted curve
    # ground-truth points
    ax.scatter(xMat[:, 1].flatten().A[0], mat(yArr).T.flatten().A[0],
               s=2, c='red')
    plt.show()
测试:
>>> import regress >>> xArr, yArr = loadDataSet('ex0.txt') >>> lwlrTest(xArr, yArr) yArr[0] 3.176513 yHrr[0], k=0.01 [[ 3.20366661]] yHrr[0], k=0.5 [[ 3.12201662]] yHrr[0], k=1 [[ 3.12204471]]
缩减法:岭回归
缩减法可将那些被缩减得很小的系数直接缩减为0
# shrinkage: ridge regression
def ridgeRegres(xMat, yMat, lam=0.2):
    """Ridge regression: ws = (X^T X + lam*I)^-1 X^T y.

    Args:
        xMat, yMat: already-standardized feature matrix and target column.
        lam: the ridge penalty lambda.

    Returns:
        Column matrix of coefficients, or None when the penalized matrix
        is still singular.
    """
    xTx = xMat.T * xMat
    # BUGFIX: original used eye(shape(xMat[1])[1]); the identity's size
    # must be the feature count, shape(xMat)[1]
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    ws = denom.I * (xMat.T * yMat)
    return ws


def ridgeTest(xArr, yArr):
    """Standardize the data, fit ridge regression for 30 lambdas
    (exp(-10)..exp(19)), plot the coefficient paths, and return the
    30 x n weight matrix.

    BUGFIX: the original returned nothing, which broke
    crossValidation's `wMat = ridgeTest(...)`.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # standardization: zero-mean y; zero-mean, variance-scaled x
    yMean = mean(yMat, 0)  # mean
    xMean = mean(xMat, 0)
    yMat = yMat - yMean
    xVar = var(xMat, 0)  # variance
    xMat = (xMat - xMean) / xVar
    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):  # call ridgeRegres for 30 different lambdas
        # lambda varies exponentially so the plot's x-axis is log(lambda)
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T  # collect all coefficient vectors in one matrix
    print(wMat)
    # NOTE(review): this plot also pops up on every crossValidation fold
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(wMat)  # one curve per feature coefficient
    plt.show()
    return wMat


# cross-validated ridge regression
# numVal is the number of cross-validation rounds
def crossValidation(xArr, yArr, numVal=10):
    """Pick the best ridge lambda by numVal rounds of random 90/10
    train/test splits, then print the best model's weights mapped back
    to the raw (un-standardized) feature scale."""
    m = len(yArr)
    indexList = list(range(m))  # list() so shuffle can permute in place
    errorMat = zeros((numVal, 30))  # 30 = number of lambdas ridgeTest tries
    # build training and test sets for each round
    for i in range(numVal):
        trainX = []
        trainY = []
        testX = []
        testY = []
        random.shuffle(indexList)  # numpy.random via `from numpy import *`
        for j in range(m):  # split data into training and test sets
            if j < m * 0.9:  # 90% train, 10% test
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)  # one weight row per lambda
        for k in range(30):
            # standardize the test data with the TRAINING statistics
            matTestX = mat(testX)
            matTrainX = mat(trainX)
            meanTrainX = mean(matTrainX, 0)
            varTrainX = var(matTrainX, 0)
            matTestX = (matTestX - meanTrainX) / varTrainX
            yEst = matTestX * mat(wMat[k, :]).T + mean(trainY)
            # rssError is defined elsewhere in regress.py
            errorMat[i, k] = rssError(yEst.T.A, array(testY))
    meanErrors = mean(errorMat, 0)
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors == minMean)]
    xMat = mat(xArr)
    yMat = mat(yArr).T
    meanX = mean(xMat, 0)
    varX = var(xMat, 0)
    # undo the standardization so the weights apply to raw features
    unReg = bestWeights / varX
    print("the best model from Ridge Regression is:\n", unReg)
    print("with constant term: ", -1 * sum(multiply(meanX, unReg)) + mean(yMat))
测试:
>>> import regress >>> xArr, yArr = loadDataSet('abalone.txt') >>> ridgeTest(xArr, yArr) [[ 4.30441949e-02 -2.27416346e-02 1.32140875e-01 2.07518171e-02 2.22403745e+00 -9.98952980e-01 -1.17254237e-01 1.66229222e-01] [ 4.30441928e-02 -2.27416370e-02 1.32140878e-01 2.07518175e-02 2.22403626e+00 -9.98952746e-01 -1.17254174e-01 1.66229339e-01] [ 4.30441874e-02 -2.27416435e-02 1.32140885e-01 2.07518187e-02 2.22403305e+00 -9.98952110e-01 -1.17254003e-01 1.66229656e-01] ... ...
得到8个特征的系数与log(lambda)的关系图像。最左端,lambda最小时,所有系数值与线性回归一致;最右端,lambda最大时,所有系数都缩减成0;在中间的某个lambda值可以取得最好的预测效果
crossValidation(xArr, yArr, numVal = 10)
the best model from Ridge Regression is:
[[ 0.0823653 -3.91472054 16.9506189 10.7538564 8.09628256 -19.35008479 -7.85980015 9.56536757]]
with constant term: 2.94541561804
缩减法:前向逐步线性回归
# shrinkage: forward stagewise linear regression
def stageWise(xArr, yArr, eps=0.01, numItr=100):
    """Forward stagewise linear regression (greedy, lasso-like shrinkage).

    Args:
        eps: step size added to / subtracted from one coefficient per
            iteration.
        numItr: number of iterations.

    Returns:
        numItr x n matrix holding the coefficient vector after each
        iteration.

    Relies on regularize() and rssError() defined elsewhere in regress.py.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # standardize the data
    yMean = mean(yMat, 0)
    yMat = yMat - yMean
    xMat = regularize(xMat)
    m, n = shape(xMat)
    returnMat = zeros((numItr, n))
    ws = zeros((n, 1))
    wsTest = ws.copy()
    wsMax = ws.copy()
    for i in range(numItr):
        print(ws.T)
        lowestError = inf
        for j in range(n):  # for every feature...
            for sign in [-1, 1]:  # ...try nudging it up and down by eps
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                # keep only the single nudge with the smallest error
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat


def stageWiseTest():
    """Run stagewise regression at two step sizes on abalone.txt and
    compare the result with ordinary least squares."""
    xArr, yArr = loadDataSet('abalone.txt')
    wMat = stageWise(xArr, yArr, eps=0.01, numItr=200)
    # label typo "numTtr" fixed to match the parameter name numItr
    print("前向逐步线性回归,eps=0.01, numItr=200:", wMat)
    wMat = stageWise(xArr, yArr, eps=0.001, numItr=5000)
    print("前向逐步线性回归,eps=0.001, numItr=5000:", wMat)
    # compare with the least-squares fit on the same standardized data
    xMat = mat(xArr)
    yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    xMat = regularize(xMat)
    yMat = yMat - yMean
    wMat = standRegress(xMat, yMat.T)
    print("标准回归", wMat.T)
测试:
>>> import regress >>> stageWiseTest() 前向逐步线性回归,eps=0.01, numTtr=200: [[ 0. 0. 0. ..., 0. 0. 0. ] [ 0. 0. 0. ..., 0. 0. 0. ] [ 0. 0. 0. ..., 0. 0. 0. ] ..., [ 0.05 0. 0.09 ..., -0.64 0. 0.36] [ 0.04 0. 0.09 ..., -0.64 0. 0.36] [ 0.05 0. 0.09 ..., -0.64 0. 0.36]] #第2列和第7列都是0,说明不对目标值造成任何影响 前向逐步线性回归,eps=0.001, numTtr=5000: [[ 0. 0. 0. ..., 0. 0. 0. ] [ 0. 0. 0. ..., 0. 0. 0. ] [ 0. 0. 0. ..., 0. 0. 0. ] ..., [ 0.043 -0.011 0.12 ..., -0.963 -0.105 0.187] [ 0.044 -0.011 0.12 ..., -0.963 -0.105 0.187] 标准回归: [[ 0.0430442 -0.02274163 0.13214087 0.02075182 2.22403814 -0.99895312 -0.11725427 0.16622915]]
相关文章推荐
- 前端工程师
- va_list可变参数
- 学习进度条——第15周
- acm之搜索题目5
- LeetCode:Trapping Rain Water
- 简单工厂模式
- 基本输入输出系统BIOS
- Node.js and the new web front-end
- Setting Up a Fake SMTP Server
- Nor Flash & Nand Flash & OneNand Flash
- JSTL标签不起作用,JSTL标签直接输出表达式
- 欢迎使用CSDN-markdown编辑器
- 构建之法阅读笔记06
- C++操作符重载
- 广度优先搜索 codevs 2806 红与黑
- 第14周项目2-两个成员的类模版(1)
- 阿里大数据技术
- poj 3176 Cow Bowling
- 关于同步与异步的那些事儿
- 147. Insertion Sort List