学习笔记——《机器学习实战》KNN算法实现 约会网站测试,手写数字识别,代码,注释,错误修改
2016-12-01 14:11
896 查看
# -*- coding: cp936 -*-
'''
Created on Nov 27, 2016
KNN Code for Machine Learning in Action Ch. 2
@author: Miaotong Jiang
'''
from numpy import *
from os import listdir
import operator
# Maps the textual class label found in the last field of each data-file line
# (looked up in file2matrix) to the integer label handed to the classifier.
mydict={'largeDoses':3, 'smallDoses':2, 'didntLike':1}
def createDataSet():
    """Build the four-point toy training set used to try out classify0.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 array of sample
        coordinates and ``labels`` is the list of the matching class names.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inx, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean.

    Args:
        inx: Feature vector to classify; it is compared with every row of
            ``dataset``.
        dataset: 2-D array holding one training sample per row.
        labels: Sequence giving the class label of each row of ``dataset``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the k nearest rows.

    Raises:
        IndexError: if ``k`` exceeds the number of rows in ``dataset``, or
            if no vote was collected (``k <= 0``).
    """
    # Broadcasting subtracts inx from every row, which is what tiling inx to
    # the dataset's height achieved.
    diffs = dataset - inx
    distances = ((diffs ** 2).sum(axis=1)) ** 0.5
    sortedindices = distances.argsort()  # row indices, nearest first
    classCount = {}
    for i in range(k):
        votelabel = labels[sortedindices[i]]
        classCount[votelabel] = classCount.get(votelabel, 0) + 1
    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    # Sort the (label, votes) pairs by vote count, largest first.
    sortedclasscount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    print(classCount)  # shows the vote tally, a dict of label -> votes
    return sortedclasscount[0][0]
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line must contain at least three numeric fields, and its
    last field must be a key of the module-level ``mydict``.

    Args:
        filename: Path of the text file to read.

    Returns:
        ``(returnmat, classlabelvector)``: an N x 3 float array holding the
        first three fields of each line, and the list of the N integer
        labels looked up in ``mydict``.

    Raises:
        KeyError: if the last field of a line is not a key of ``mydict``.
        ValueError: if one of the first three fields is not a number or a
            line has fewer than three fields.
    """
    # The context manager closes the file even if parsing fails later on.
    with open(filename) as fr:
        # Drop blank lines (for instance a trailing empty line) so that they
        # neither inflate the matrix nor break the label lookup.
        arrayolines = [line.strip() for line in fr if line.strip()]
    returnmat = zeros((len(arrayolines), 3))
    classlabelvector = []
    for index, line in enumerate(arrayolines):
        listfromline = line.split('\t')
        # Convert explicitly instead of relying on implicit string casting.
        returnmat[index, :] = [float(field) for field in listfromline[0:3]]
        classlabelvector.append(int(mydict[listfromline[-1]]))
    return returnmat, classlabelvector
def autonorm(dataset):
    """Rescale every column of ``dataset`` with ``(x - min) / (max - min)``.

    Args:
        dataset: 2-D array with one sample per row and one feature per
            column.

    Returns:
        ``(normdataset, ranges, minvals)``: the rescaled array, the
        per-column ``max - min`` and the per-column minimum. ``ranges`` is
        returned as computed, so it is 0 for a constant column.
    """
    minvals = dataset.min(0)
    maxvals = dataset.max(0)
    ranges = maxvals - minvals
    # A constant column has a range of 0; divide it by 1 instead so that it
    # is mapped to 0 rather than to NaN.
    divisors = where(ranges == 0, 1.0, ranges)
    normdataset = (dataset - minvals) / divisors
    return normdataset, ranges, minvals
def datingclasstest():
    """Print the hold-out error rate of the classifier on the dating data.

    Loads 'datingtestset.txt' with file2matrix, normalises it with autonorm,
    classifies the first 5% of the rows against the remaining rows with
    classify0 (k=3), and prints every prediction followed by the overall
    error rate.
    """
    horatio = 0.05  # fraction of the rows held out as test samples
    datingdatamat, datinglabels = file2matrix('datingtestset.txt')
    normmat, ranges, minvals = autonorm(datingdatamat)
    m = normmat.shape[0]
    numtestvecs = int(m * horatio)
    errorcount = 0.0
    for i in range(numtestvecs):
        classifierresult = classify0(normmat[i, :],
                                     normmat[numtestvecs:m, :],
                                     datinglabels[numtestvecs:m], 3)
        print("the classifier came back with:%d,the real answer is: %d"
              % (classifierresult, datinglabels[i]))
        if classifierresult != datinglabels[i]:
            errorcount += 1.0
    # With fewer than 20 rows no sample is held out; report 0 instead of
    # dividing by zero.
    errorrate = errorcount / float(numtestvecs) if numtestvecs else 0.0
    print("the total error rate is: %f" % errorrate)
def classifyperson():
    """Ask for one person's three features and predict how well they fit.

    Reads three numbers from standard input, normalises them with the
    minimum and range of the data in 'datingtestset.txt', and classifies
    them with classify0 (k=3).

    Returns:
        One of 'not at all', 'in small doses' or 'in large doses'.

    Raises:
        ValueError: if an answer cannot be converted to float.
    """
    resultlist = ['not at all', 'in small doses', 'in large doses']
    # raw_input only exists on Python 2; input is its Python 3 counterpart.
    try:
        ask = raw_input
    except NameError:
        ask = input
    percenttats = float(ask("percentage of time spent playing video games?"))
    ffmiles = float(ask("frequent flier miles earned per year?"))
    icecream = float(ask("liters of ice cream consumed per year"))
    datingdatamat, datinglabels = file2matrix('datingtestset.txt')
    normmat, ranges, minvals = autonorm(datingdatamat)
    # NOTE(review): assumes the data file's columns are ordered miles,
    # game-time percentage, ice cream - confirm against the data file.
    inarr = array([ffmiles, percenttats, icecream])
    classifierresult = classify0((inarr - minvals) / ranges,
                                 normmat, datinglabels, 3)
    print("You will probably like this person,")
    # Labels run from 1 to 3, list indices from 0 to 2.
    return resultlist[classifierresult - 1]
def img2vector(filename):
    """Flatten a 32x32 text image into a 1x1024 row vector.

    Reads the first 32 lines of the file and converts the first 32
    characters of each line with ``int``.

    Args:
        filename: Path of the text image.

    Returns:
        A 1 x 1024 float array; the character in row ``i``, column ``j`` is
        stored at index ``32 * i + j``.

    Raises:
        IndexError: if a line holds fewer than 32 characters.
        ValueError: if a character is not a digit.
    """
    returnvector = zeros((1, 1024))
    # The context manager closes the file even when a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            linestr = fr.readline()
            for j in range(32):
                returnvector[0, 32 * i + j] = int(linestr[j])
    return returnvector
def handwritingclasstest():
    """Print the error rate of the digit classifier on the 'testdigits' files.

    Every file in 'trainingdigits' becomes one training row (via img2vector);
    every file in 'testdigits' is classified with classify0 (k=3) and
    compared with the digit encoded in its file name. File names are
    expected to start with the digit label followed by '_'.
    """
    hwlabels = []
    trainingfilelist = listdir('trainingdigits')
    m = len(trainingfilelist)
    trainingmat = zeros((m, 1024))  # one flattened image per row
    for i in range(m):
        filenamestr = trainingfilelist[i]
        filestr = filenamestr.split('.')[0]  # drop the extension
        classnum = int(filestr.split('_')[0])  # digit label before the '_'
        hwlabels.append(classnum)
        trainingmat[i, :] = img2vector('trainingdigits/%s' % filenamestr)
    testfilelist = listdir('testdigits')
    errorcount = 0.0
    mtest = len(testfilelist)
    for i in range(mtest):
        filenamestr = testfilelist[i]
        filestr = filenamestr.split('.')[0]
        classnum = int(filestr.split('_')[0])
        vectorundertest = img2vector('testdigits/%s' % filenamestr)
        classifierresult = classify0(vectorundertest, trainingmat, hwlabels, 3)
        print("the classifier came back with:%d, the real answer is:%d"
              % (classifierresult, classnum))
        if classifierresult != classnum:
            errorcount += 1.0
    print("the total number of errors is:%d" % errorcount)
    # An empty test directory would otherwise divide by zero.
    errorrate = errorcount / mtest if mtest else 0.0
    print("the total error rate is:%f" % errorrate)
def handwritingclasspredict():
    """Classify the text image stored in 'testSet.txt'.

    Every file in 'trainingdigits' becomes one training row (via img2vector),
    labelled with the digit that precedes the '_' in its file name; the
    image is then classified with classify0 (k=3).

    Returns:
        The label chosen by classify0 for the image.
    """
    vectorundertest = img2vector('testSet.txt')
    hwlabels = []
    trainingfilelist = listdir('trainingdigits')
    m = len(trainingfilelist)
    trainingmat = zeros((m, 1024))  # one flattened image per row
    for i in range(m):
        filenamestr = trainingfilelist[i]
        filestr = filenamestr.split('.')[0]  # drop the extension
        classnumstr = int(filestr.split('_')[0])  # digit label before the '_'
        hwlabels.append(classnumstr)
        trainingmat[i, :] = img2vector('trainingdigits/%s' % filenamestr)
    classifierresult = classify0(vectorundertest, trainingmat, hwlabels, 3)
    return classifierresult
'''
Created on Nov 27, 2016
KNN Code for Machine Learning in Action Ch. 2
@author: Miaotong Jiang
'''
from numpy import *
from os import listdir
import operator
# Maps the textual class label found in the last field of each data-file line
# (looked up in file2matrix) to the integer label handed to the classifier.
mydict={'largeDoses':3, 'smallDoses':2, 'didntLike':1}
def createDataSet():
    """Build the four-point toy training set used to try out classify0.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 array of sample
        coordinates and ``labels`` is the list of the matching class names.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inx, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean.

    Args:
        inx: Feature vector to classify; it is compared with every row of
            ``dataset``.
        dataset: 2-D array holding one training sample per row.
        labels: Sequence giving the class label of each row of ``dataset``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the k nearest rows.

    Raises:
        IndexError: if ``k`` exceeds the number of rows in ``dataset``, or
            if no vote was collected (``k <= 0``).
    """
    # Broadcasting subtracts inx from every row, which is what tiling inx to
    # the dataset's height achieved.
    diffs = dataset - inx
    distances = ((diffs ** 2).sum(axis=1)) ** 0.5
    sortedindices = distances.argsort()  # row indices, nearest first
    classCount = {}
    for i in range(k):
        votelabel = labels[sortedindices[i]]
        classCount[votelabel] = classCount.get(votelabel, 0) + 1
    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    # Sort the (label, votes) pairs by vote count, largest first.
    sortedclasscount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    print(classCount)  # shows the vote tally, a dict of label -> votes
    return sortedclasscount[0][0]
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line must contain at least three numeric fields, and its
    last field must be a key of the module-level ``mydict``.

    Args:
        filename: Path of the text file to read.

    Returns:
        ``(returnmat, classlabelvector)``: an N x 3 float array holding the
        first three fields of each line, and the list of the N integer
        labels looked up in ``mydict``.

    Raises:
        KeyError: if the last field of a line is not a key of ``mydict``.
        ValueError: if one of the first three fields is not a number or a
            line has fewer than three fields.
    """
    # The context manager closes the file even if parsing fails later on.
    with open(filename) as fr:
        # Drop blank lines (for instance a trailing empty line) so that they
        # neither inflate the matrix nor break the label lookup.
        arrayolines = [line.strip() for line in fr if line.strip()]
    returnmat = zeros((len(arrayolines), 3))
    classlabelvector = []
    for index, line in enumerate(arrayolines):
        listfromline = line.split('\t')
        # Convert explicitly instead of relying on implicit string casting.
        returnmat[index, :] = [float(field) for field in listfromline[0:3]]
        classlabelvector.append(int(mydict[listfromline[-1]]))
    return returnmat, classlabelvector
def autonorm(dataset):
    """Rescale every column of ``dataset`` with ``(x - min) / (max - min)``.

    Args:
        dataset: 2-D array with one sample per row and one feature per
            column.

    Returns:
        ``(normdataset, ranges, minvals)``: the rescaled array, the
        per-column ``max - min`` and the per-column minimum. ``ranges`` is
        returned as computed, so it is 0 for a constant column.
    """
    minvals = dataset.min(0)
    maxvals = dataset.max(0)
    ranges = maxvals - minvals
    # A constant column has a range of 0; divide it by 1 instead so that it
    # is mapped to 0 rather than to NaN.
    divisors = where(ranges == 0, 1.0, ranges)
    normdataset = (dataset - minvals) / divisors
    return normdataset, ranges, minvals
def datingclasstest():
    """Print the hold-out error rate of the classifier on the dating data.

    Loads 'datingtestset.txt' with file2matrix, normalises it with autonorm,
    classifies the first 5% of the rows against the remaining rows with
    classify0 (k=3), and prints every prediction followed by the overall
    error rate.
    """
    horatio = 0.05  # fraction of the rows held out as test samples
    datingdatamat, datinglabels = file2matrix('datingtestset.txt')
    normmat, ranges, minvals = autonorm(datingdatamat)
    m = normmat.shape[0]
    numtestvecs = int(m * horatio)
    errorcount = 0.0
    for i in range(numtestvecs):
        classifierresult = classify0(normmat[i, :],
                                     normmat[numtestvecs:m, :],
                                     datinglabels[numtestvecs:m], 3)
        print("the classifier came back with:%d,the real answer is: %d"
              % (classifierresult, datinglabels[i]))
        if classifierresult != datinglabels[i]:
            errorcount += 1.0
    # With fewer than 20 rows no sample is held out; report 0 instead of
    # dividing by zero.
    errorrate = errorcount / float(numtestvecs) if numtestvecs else 0.0
    print("the total error rate is: %f" % errorrate)
def classifyperson():
    """Ask for one person's three features and predict how well they fit.

    Reads three numbers from standard input, normalises them with the
    minimum and range of the data in 'datingtestset.txt', and classifies
    them with classify0 (k=3).

    Returns:
        One of 'not at all', 'in small doses' or 'in large doses'.

    Raises:
        ValueError: if an answer cannot be converted to float.
    """
    resultlist = ['not at all', 'in small doses', 'in large doses']
    # raw_input only exists on Python 2; input is its Python 3 counterpart.
    try:
        ask = raw_input
    except NameError:
        ask = input
    percenttats = float(ask("percentage of time spent playing video games?"))
    ffmiles = float(ask("frequent flier miles earned per year?"))
    icecream = float(ask("liters of ice cream consumed per year"))
    datingdatamat, datinglabels = file2matrix('datingtestset.txt')
    normmat, ranges, minvals = autonorm(datingdatamat)
    # NOTE(review): assumes the data file's columns are ordered miles,
    # game-time percentage, ice cream - confirm against the data file.
    inarr = array([ffmiles, percenttats, icecream])
    classifierresult = classify0((inarr - minvals) / ranges,
                                 normmat, datinglabels, 3)
    print("You will probably like this person,")
    # Labels run from 1 to 3, list indices from 0 to 2.
    return resultlist[classifierresult - 1]
def img2vector(filename):
    """Flatten a 32x32 text image into a 1x1024 row vector.

    Reads the first 32 lines of the file and converts the first 32
    characters of each line with ``int``.

    Args:
        filename: Path of the text image.

    Returns:
        A 1 x 1024 float array; the character in row ``i``, column ``j`` is
        stored at index ``32 * i + j``.

    Raises:
        IndexError: if a line holds fewer than 32 characters.
        ValueError: if a character is not a digit.
    """
    returnvector = zeros((1, 1024))
    # The context manager closes the file even when a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            linestr = fr.readline()
            for j in range(32):
                returnvector[0, 32 * i + j] = int(linestr[j])
    return returnvector
def handwritingclasstest():
    """Print the error rate of the digit classifier on the 'testdigits' files.

    Every file in 'trainingdigits' becomes one training row (via img2vector);
    every file in 'testdigits' is classified with classify0 (k=3) and
    compared with the digit encoded in its file name. File names are
    expected to start with the digit label followed by '_'.
    """
    hwlabels = []
    trainingfilelist = listdir('trainingdigits')
    m = len(trainingfilelist)
    trainingmat = zeros((m, 1024))  # one flattened image per row
    for i in range(m):
        filenamestr = trainingfilelist[i]
        filestr = filenamestr.split('.')[0]  # drop the extension
        classnum = int(filestr.split('_')[0])  # digit label before the '_'
        hwlabels.append(classnum)
        trainingmat[i, :] = img2vector('trainingdigits/%s' % filenamestr)
    testfilelist = listdir('testdigits')
    errorcount = 0.0
    mtest = len(testfilelist)
    for i in range(mtest):
        filenamestr = testfilelist[i]
        filestr = filenamestr.split('.')[0]
        classnum = int(filestr.split('_')[0])
        vectorundertest = img2vector('testdigits/%s' % filenamestr)
        classifierresult = classify0(vectorundertest, trainingmat, hwlabels, 3)
        print("the classifier came back with:%d, the real answer is:%d"
              % (classifierresult, classnum))
        if classifierresult != classnum:
            errorcount += 1.0
    print("the total number of errors is:%d" % errorcount)
    # An empty test directory would otherwise divide by zero.
    errorrate = errorcount / mtest if mtest else 0.0
    print("the total error rate is:%f" % errorrate)
def handwritingclasspredict():
    """Classify the text image stored in 'testSet.txt'.

    Every file in 'trainingdigits' becomes one training row (via img2vector),
    labelled with the digit that precedes the '_' in its file name; the
    image is then classified with classify0 (k=3).

    Returns:
        The label chosen by classify0 for the image.
    """
    vectorundertest = img2vector('testSet.txt')
    hwlabels = []
    trainingfilelist = listdir('trainingdigits')
    m = len(trainingfilelist)
    trainingmat = zeros((m, 1024))  # one flattened image per row
    for i in range(m):
        filenamestr = trainingfilelist[i]
        filestr = filenamestr.split('.')[0]  # drop the extension
        classnumstr = int(filestr.split('_')[0])  # digit label before the '_'
        hwlabels.append(classnumstr)
        trainingmat[i, :] = img2vector('trainingdigits/%s' % filenamestr)
    classifierresult = classify0(vectorundertest, trainingmat, hwlabels, 3)
    return classifierresult
相关文章推荐
- 机器学习实战之KNN算法识别手写数字_代码注释
- 使用Knn算法实现手写数字识别系统(附带jpg转txt代码)
- 《机器学习实战》学习笔记——K-近邻算法(KNN)(二)海伦约会网站匹配实战
- 各种机器学习方法(线性回归、支持向量机、决策树、朴素贝叶斯、KNN算法、逻辑回归)实现手写数字识别并用准确率、召回率、F1进行评估
- knn-2 利用knn算法实现手写数字识别
- 【KNN近邻算法】实现识别简单数字验证码(算法原理+代码笔记)
- Python实现knn算法手写数字识别
- 【python】机器学习实战KNN算法之手写数字识别
- 学习KNN(二)KNN算法手写数字识别的OpenCV实现
- KNN分类算法实现手写数字识别
- 使用KNN算法在python下识别手写数字(带注释)
- 机器学习实战(①)——KNN算法改进约会网站的配对效果和手写字识别系统
- 《机器学习实战》代码记录--knn--手写数字识别
- Python实现KNN算法手写识别数字
- KNN算法Python实现(代码来自机器学习实战)及注释
- 【机器学习】Knn算法实现手写数字识别
- kNN算法识别手写数字(代码笔记)
- 【机器学习 3】KNN算法实现梳理- Be based on“约会对象”、“手写识别”
- knn算法实现的数字手写识别
- MachineLearning— (KNN)k Nearest Neighbor实现手写数字识别(三)