Machine Learning in Action (4): Logistic Regression (Python implementation)
2018-03-22 21:42
This is a hands-on coding exercise from working through the book Machine Learning in Action, meant to give me a more concrete feel for each algorithm. You can't just keep reading without writing; simple or not, I typed every example out line by line myself.
Full source code and data: https://pan.baidu.com/s/1G2S2pb5gfBnxGNNTFgTkEA (password: fov0)
Below is the main program, logRegress.py, along with notes on the problems I ran into while working through it.

# -*- coding: utf-8 -*-
# author: Yufeng Song
from numpy import *
import matplotlib.pyplot as plt
def loadDataSet():
    dataMat = []
    labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # prepend x0 = 1.0 for the bias term
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))  # exp comes from numpy, not the math module
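One issue worth flagging: with from numpy import *, exp here is NumPy's, and for a large negative inX the call exp(-inX) overflows float64, so NumPy prints a RuntimeWarning (the output still saturates to 0, so results are usually fine). A minimal overflow-safe variant, just a sketch (stableSigmoid is my name, not the book's):

def stableSigmoid(inX):
    # float64 overflows near exp(709); clipping the argument keeps exp() finite
    # while leaving the sigmoid's output unchanged to machine precision
    inX = clip(inX, -500, 500)
    return 1.0 / (1 + exp(-inX))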
def gradAscent(dataMatIn, classLabels):
    dataMatrix = mat(dataMatIn)                 # m x n feature matrix
    labelMat = mat(classLabels).transpose()     # m x 1 column vector of labels
    m, n = shape(dataMatrix)
    alpha = 0.001
    maxCycles = 500
    weights = ones((n, 1))
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)       # m x 1 vector of predictions
        error = (labelMat - h)                  # m x 1 vector of residuals
        weights = weights + alpha * dataMatrix.transpose() * error  # at bottom it's all just the math
    return weights
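That update line is not a guess; it is one step of gradient ascent on the log-likelihood of the data. With labels $y_i \in \{0,1\}$ and predictions $h = \sigma(Xw)$:

$$\ell(w) = \sum_{i=1}^{m}\left[\, y_i \log h_i + (1-y_i)\log(1-h_i) \,\right], \qquad \nabla_w\, \ell(w) = X^{\top}(y - h)$$

so the ascent step $w \leftarrow w + \alpha\, X^{\top}(y - h)$ is exactly weights = weights + alpha * dataMatrix.transpose() * error.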
def plotBestFit(weights):
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]  # points on the decision boundary
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
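Why the plotted line uses that formula: the classifier predicts 1 when $\sigma(w^{\top}x) > 0.5$, i.e. when $w^{\top}x > 0$, so the decision boundary is $w_0 + w_1 x_1 + w_2 x_2 = 0$. Solving for the vertical axis gives $x_2 = (-w_0 - w_1 x_1)/w_2$, which is exactly the y computed above.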
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.0001  # step size decays over iterations but never reaches 0
            randIndex = int(random.uniform(0, len(dataIndex)))
            chosen = dataIndex[randIndex]  # the book uses dataMatrix[randIndex] directly, which can resample already-used rows
            h = sigmoid(sum(dataMatrix[chosen] * weights))
            error = classLabels[chosen] - h
            weights = weights + alpha * error * dataMatrix[chosen]
            del dataIndex[randIndex]
    return weights
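To see what the trainers produce on testSet.txt, here is a minimal driver, just a sketch of how I call them (note that gradAscent returns a NumPy matrix that needs .getA() before plotting, while the stochastic versions expect and return plain arrays):

dataArr, labelMat = loadDataSet()
batchWeights = gradAscent(dataArr, labelMat)                     # 500 full-batch passes
plotBestFit(batchWeights.getA())                                 # .getA() converts the matrix to an array
sgdWeights = stocGradAscent1(array(dataArr), labelMat, numIter=150)
plotBestFit(sgdWeights)                                          # already a 1-D array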
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    return 1.0 if prob > 0.5 else 0.0
def colicTest():
    frTrain = open('horseColicTraining.txt')
    frTest = open('horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0
    numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    # pitfall I hit: compute the error rate after the loop, not inside it
    errorRate = float(errorCount) / numTestVec
    print("the error rate of this test is: %f" % errorRate)
    return errorRate
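A note on the horse-colic files: as I recall from the book's preprocessing discussion, missing feature values were replaced with 0 and records with a missing class label were discarded before these files were written. Zero imputation happens to suit this learner, because with the update weights = weights + alpha * error * dataMatrix[i], a feature $x_j = 0$ contributes nothing to sum(inX * weights) and leaves its weight untouched: $\Delta w_j = \alpha \cdot error \cdot x_j = 0$.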
def multiTest():
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print('after %d iterations the average error rate is: %f' % (numTests, errorSum / float(numTests)))

if __name__ == '__main__':
    # to reproduce the decision-boundary plot on testSet.txt instead:
    # dataArr, labelMat = loadDataSet()
    # weights = gradAscent(dataArr, labelMat)
    # plotBestFit(weights.getA())
    multiTest()