您的位置:首页 > 编程语言 > Python开发

机器学习实战(8) ——预测数值型数据回归(python实现)

2018-03-28 17:13 816 查看
这是学习机器学习算法实战这本书时,写的代码实战。让自己对各个算法有更直观的了解,不能一直不写啊。不管简单还是不简单都亲自一行一行的敲一遍啊。
具体的源码和和数据链接:https://pan.baidu.com/s/1G2S2pb5gfBnxGNNTFgTkEA 密码:fov0
这个第八章的代码和自己做的测试regression.py。基本符合结果。
# -*- coding: utf-8 -*-
# author: Yufeng Song
from numpy import*
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Parse a tab-delimited text file into feature rows and target values.

    Each line holds N+1 tab-separated floats; the last field is the target
    and the first N fields are features.

    Returns:
        (dataMat, labelMat): list of feature rows (list[float]) and the
        list of target values.
    """
    dataMat = []
    labelMat = []
    # Original code opened the file twice and never closed either handle;
    # read everything once through a context manager instead.
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:
        return dataMat, labelMat
    numFeat = len(lines[0].split('\t')) - 1  # last column is the label
    for line in lines:
        curLine = line.strip().split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat

def standRegres(xArr, yArr):
    """Ordinary least squares via the normal equations: w = (X^T X)^-1 X^T y.

    Returns the weight vector as a column matrix, or None when X^T X is
    singular (no unique solution exists).
    """
    X = mat(xArr)
    y = mat(yArr).T
    gram = X.T * X
    # A zero determinant means the Gram matrix cannot be inverted.
    if linalg.det(gram) == 0.0:
        print("This matrix is singular,cannot do inverse")
        return
    return gram.I * (X.T * y)

def lwlr(testPoint, xArr, yArr, k=1.0):
    """Locally weighted linear regression estimate at a single query point.

    Every training sample gets a Gaussian-kernel weight based on its
    distance to testPoint; k controls the kernel bandwidth (small k = more
    local). Returns the scalar prediction as a 1x1 matrix, or None when the
    weighted Gram matrix is singular.
    """
    X = mat(xArr)
    y = mat(yArr).T
    numSamples = shape(X)[0]
    W = mat(eye(numSamples))
    for idx in range(numSamples):
        # Gaussian kernel: weight decays with squared distance to testPoint.
        delta = testPoint - X[idx, :]
        W[idx, idx] = exp(delta * delta.T / (-2.0 * k ** 2))
    gram = X.T * (W * X)
    if linalg.det(gram) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = gram.I * (X.T * (W * y))
    return testPoint * ws

def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Evaluate lwlr() at every point of testArr.

    Returns a 1-D array with one prediction per row of testArr; the
    training set (xArr, yArr) and bandwidth k are passed straight through.
    """
    numPoints = shape(testArr)[0]
    predictions = zeros(numPoints)
    for idx in range(numPoints):
        predictions[idx] = lwlr(testArr[idx], xArr, yArr, k)
    return predictions

def rssError(yArr, yHatArr):
    """Residual sum of squares between actual and predicted value arrays."""
    residuals = yArr - yHatArr
    return (residuals ** 2).sum()

def ridgeRegres(xMat, yMat, lam=0.2):
    """Ridge regression: w = (X^T X + lam*I)^-1 X^T y.

    xMat and yMat are numpy matrices; lam is the L2 penalty strength.
    Returns None when the penalized matrix is still singular (e.g. lam=0
    with collinear features).
    """
    penalized = xMat.T * xMat + eye(shape(xMat)[1]) * lam
    if linalg.det(penalized) == 0.0:
        print ("This matrix is singular, cannot do inverse")
        return
    return penalized.I * (xMat.T * yMat)

def ridgeTest(xArr, yArr):
    """Fit ridge regression over 30 exponentially spaced lambdas.

    Features are standardized (mean-centered, divided by variance) and the
    target is mean-centered, so no intercept column is needed. Returns a
    (30, n) array whose row i holds the weights for lambda = exp(i - 10).
    """
    X = mat(xArr)
    y = mat(yArr).T
    y = y - mean(y, 0)  # centering y removes the need for an X0 intercept
    # Standardize every feature column with its own mean and variance.
    X = (X - mean(X, 0)) / var(X, 0)
    numTestPts = 30
    wMat = zeros((numTestPts, shape(X)[1]))
    for i in range(numTestPts):
        # Exponential spacing shows coefficient shrinkage across magnitudes.
        wMat[i, :] = ridgeRegres(X, y, exp(i - 10)).T
    return wMat

def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Forward stagewise linear regression (greedy coordinate search).

    On each of numIt iterations, tries nudging every weight by +/- eps and
    keeps the single change giving the lowest residual sum of squares.
    Returns a (numIt, n) history of the weight vector, one row per iteration.
    """
    X = mat(xArr)
    y = mat(yArr).T
    y = y - mean(y, 0)  # can also regularize ys but will get smaller coef
    # xMat = regularize(xMat)
    m, n = shape(X)
    history = zeros((numIt, n))  # weight trajectory for plotting/inspection
    ws = zeros((n, 1))
    best = ws.copy()
    for it in range(numIt):
        lowestError = inf
        for j in range(n):
            for sign in (-1, 1):
                trial = ws.copy()
                trial[j] += eps * sign
                err = rssError(y.A, (X * trial).A)
                if err < lowestError:
                    lowestError = err
                    best = trial
        ws = best.copy()
        history[it, :] = ws.T
    return history

from time import sleep
import json
# import urllib2
import urllib.request
# response = urllib.request.urlopen('http://python.org/')
def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    """Query the (now-retired) Google Shopping API for a LEGO set and append
    matching listings to retX / retY in place.

    retX rows are [year, pieceCount, newFlag, originalPrice]; retY holds the
    observed selling price. NOTE(review): Google shut this API down, so the
    urlopen call will fail today — kept for fidelity to the book.
    """
    sleep(10)  # throttle so repeated calls don't hit the API rate limit
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum)
    # pg = urllib2.urlopen(searchURL)
    pg = urllib.request.urlopen(searchURL)
    retDict = json.loads(pg.read())
    for i in range(len(retDict['items'])):
        try:
            currItem = retDict['items'][i]
            if currItem['product']['condition'] == 'new':
                newFlag = 1
            else: newFlag = 0
            listOfInv = currItem['product']['inventories']
            for item in listOfInv:
                sellingPrice = item['price']
                # Skip suspiciously cheap listings (likely partial/incomplete sets).
                if sellingPrice > origPrc * 0.5:
                    print ("%d\t%d\t%d\t%f\t%f" % (yr,numPce,newFlag,origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except: print ('problem with item %d' % i)  # NOTE(review): bare except hides all errors for this item

def scrapePage(inFile, outFile, yr, numPce, origPrc):
    """Parse a saved eBay results HTML page and append one tab-separated row
    per sold item to outFile.

    inFile  : path to a locally saved HTML page of auction results.
    outFile : output file, opened in append mode; each row is
              year \t pieces \t newFlag \t originalPrice \t soldPrice.
    yr, numPce, origPrc : set metadata written into every row.
    Relies on the page marking each result as a <table r="i"> element.
    """
    # from beautifulsoup4 import BeautifulSoup
    from bs4 import BeautifulSoup
    fr = open(inFile); fw = open(outFile, 'a')  # a is append mode writing
    soup = BeautifulSoup(fr.read())
    i = 1
    currentRow = soup.findAll('table', r="%d" % i)
    while(len(currentRow) != 0):
        # Redundant refetch kept from the book's original code.
        currentRow = soup.findAll('table', r="%d" % i)
        title = currentRow[0].findAll('a')[1].text
        lwrTitle = title.lower()
        # "new"/"nisb" (new in sealed box) in the title marks an unopened set.
        if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1):
            newFlag = 1.0
        else:
            newFlag = 0.0
        # A missing <span> in the 4th cell means the listing did not sell.
        soldUnicde = currentRow[0].findAll('td')[3].findAll('span')
        if len(soldUnicde) == 0:
            print ("item #%d did not sell" % i)
        else:
            soldPrice = currentRow[0].findAll('td')[4]
            priceStr = soldPrice.text
            priceStr = priceStr.replace('$', '')  # strips out $
            priceStr = priceStr.replace(',', '')  # strips out ,
            if len(soldPrice) > 1:
                priceStr = priceStr.replace('Free shipping', '')  # strips out Free Shipping
            print ("%s\t%d\t%s" % (priceStr, newFlag, title))
            fw.write("%d\t%d\t%d\t%f\t%s\n" % (yr, numPce, newFlag, origPrc, priceStr))
        i += 1
        currentRow = soup.findAll('table', r="%d" % i)
    fw.close()

# def setDataCollect(retX, retY):
# searchForSet(retX, retY, 8288, 2006, 800, 49.99)
# searchForSet(retX, retY, 10030, 2002, 3096, 269.99)
# searchForSet(retX, retY, 10179, 2007, 5195, 499.99)
# searchForSet(retX, retY, 10181, 2007, 3428, 199.99)
# searchForSet(retX, retY, 10189, 2008, 5922, 299.99)
# searchForSet(retX, retY, 10196, 2009, 3263, 249.99)

def setDataCollect():
    """Scrape the six bundled LEGO result pages, appending rows to out.txt."""
    pages = (
        ('setHtml/lego8288.html', 2006, 800, 49.99),
        ('setHtml/lego10030.html', 2002, 3096, 269.99),
        ('setHtml/lego10179.html', 2007, 5195, 499.99),
        ('setHtml/lego10181.html', 2007, 3428, 199.99),
        ('setHtml/lego10189.html', 2008, 5922, 299.99),
        ('setHtml/lego10196.html', 2009, 3263, 249.99),
    )
    for pagePath, year, pieces, price in pages:
        scrapePage(pagePath, 'out.txt', year, pieces, price)

def crossValidation(xArr, yArr, numVal=10):
    """Cross-validate ridge regression and print the best un-standardized model.

    For each of numVal trials, randomly holds out 10% of the data, fits the
    30 ridge weight vectors (one per lambda) on the other 90%, and records
    the held-out RSS per lambda. The lambda with the lowest mean error is
    reported with its weights mapped back to the original feature scale.

    xArr : list of feature rows; yArr : list of target values.
    """
    m = len(yArr)
    # BUGFIX: range() is immutable in Python 3 and cannot be shuffled;
    # materialize it into a list first.
    indexList = list(range(m))
    errorMat = zeros((numVal, 30))  # numVal rows, one column per ridge lambda
    for i in range(numVal):
        trainX = []; trainY = []
        testX = []; testY = []
        random.shuffle(indexList)
        for j in range(m):  # first 90% of shuffled indices -> training set
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)  # 30 weight vectors from ridge
        for k in range(30):  # evaluate every ridge estimate on the held-out fold
            matTestX = mat(testX); matTrainX = mat(trainX)
            meanTrain = mean(matTrainX, 0)
            varTrain = var(matTrainX, 0)
            # Standardize the test fold with the TRAINING fold's statistics.
            matTestX = (matTestX - meanTrain) / varTrain
            yEst = matTestX * mat(wMat[k, :]).T + mean(trainY)
            errorMat[i, k] = rssError(yEst.T.A, array(testY))
    meanErrors = mean(errorMat, 0)  # average performance per lambda
    minMean = float(min(meanErrors))
    # NOTE(review): wMat here is from the LAST fold only (matches the book's code).
    bestWeights = wMat[nonzero(meanErrors == minMean)]
    # Un-standardize: with Xreg = (x - meanX)/var(x), the original-scale model
    # is y = x*(w/varX) - sum(meanX*(w/varX)) + meanY.
    xMat = mat(xArr); yMat = mat(yArr).T
    meanX = mean(xMat, 0); varX = var(xMat, 0)
    unReg = bestWeights / varX
    print ("the best model from Ridge Regression is:\n", unReg)
    print ("with constant term: ", -1 * sum(multiply(meanX, unReg)) + mean(yMat))

if __name__ == '__main__':
    # --- Standard linear regression demo (book ch. 8) ---
    # xArr,yArr = loadDataSet('ex0.txt')
    # print(xArr[0:2])
    # ws = standRegres(xArr,yArr)
    # # print(ws)
    # xMat = mat(xArr)
    # yMat = mat(yArr)
    # yHat = xMat*ws
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    # ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0])  # flatten collapses to a 1-D array
    # xCopy = xMat.copy()
    # # print(xCopy)
    # xCopy.sort(0)  # sort by column
    # # print(xCopy)
    # yHat = xCopy*ws
    # ax.plot(xCopy[:,1],yHat)
    # plt.show()
    # cor = corrcoef(yHat.T,yMat)
    # print(cor)

    # --- Test from page 143: locally weighted linear regression ---
    # xArr,yArr = loadDataSet('ex0.txt')
    # # print(yArr[0])
    # # print(lwlr(xArr[0],xArr,yArr,1.0))
    # yHat = lwlrTest(xArr,xArr,yArr,0.003)
    # xMat = mat(xArr)
    # print('#'*40)
    # print(xMat)
    # print('*'*40)
    # print(xMat[151])
    # print(xMat[:,1])  # take column 1
    # srtInd = xMat[:,1].argsort(0)
    #
    # print('#'*40)
    # print(srtInd)
    # xSort = xMat[srtInd][:,0,:]
    # print('#'*40)
    # print(xSort)
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    # ax.plot(xSort[:,1],yHat[srtInd])
    # ax.scatter(xMat[:,1].flatten().A[0],mat(yArr).T[:,0].flatten().A[0],s=2,c='red')  # flatten collapses to a 1-D array
    # plt.show()

    # --- Test from page 145: abalone data, comparing kernel widths ---
    # abX,abY = loadDataSet('abalone.txt')
    # # yHat01 = lwlrTest(abX[0:99],abX[0:99],abY[0:99],0.1)
    # # yHat1 = lwlrTest(abX[0:99],abX[0:99],abY[0:99],1)
    # # yHat10 = lwlrTest(abX[0:99],abX[0:99],abY[0:99],10)
    # #
    # # print(rssError(abY[0:99],yHat01.T))
    # # print(rssError(abY[0:99],yHat1.T))
    # # print(rssError(abY[0:99],yHat10.T))
    #
    # yHat01 = lwlrTest(abX[100:199],abX[0:99],abY[0:99],0.1)
    # print(rssError(abY[100:199],yHat01))
    #
    # yHat1 = lwlrTest(abX[100:199],abX[0:99],abY[0:99],1)
    # print(rssError(abY[100:199],yHat1))
    #
    # yHat10 = lwlrTest(abX[100:199],abX[0:99],abY[0:99],10)
    # print(rssError(abY[100:199],yHat10))
    #
    # ws = standRegres(abX[0:99],abY[0:99])
    # yHat = mat(abX[100:199])*ws
    # print(rssError(abY[100:199],yHat.T.A))

    # --- Test from page 147: ridge regression coefficient paths ---
    # abX,abY = loadDataSet('abalone.txt')
    # ridgeWeights = ridgeTest(abX,abY)
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    # print(ridgeWeights)
    # print(ridgeWeights[-1,:])
    # ax.plot(ridgeWeights[0,:])
    # plt.show()

    # --- Test from page 150: forward stagewise regression ---
    # xArr,yArr = loadDataSet('abalone.txt')
    # stageWiseWeights = stageWise(xArr,yArr,0.01,200)
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    # print(stageWiseWeights)
    # print(stageWiseWeights)
    # ax.plot(stageWiseWeights)
    # plt.show()

    # LEGO scraping demo (requires setHtml/*.html pages and bs4).
    lgX=[];lgY=[]
    print(setDataCollect())  # NOTE(review): setDataCollect() returns None, so this prints "None"

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: