机器学习基础KNN分类算法
2017-10-11 17:43
441 查看
咸鱼跟着书学机器学习ing(0.0)。数据包可以去 https://www.manning.com/books/machine-learning-in-action 下载。
# -*- coding: UTF-8 -*-
"""k-nearest-neighbours (kNN) classifier with two worked examples.

The module contains the classifier itself (``classify0``), helpers for
loading and normalising a tab-separated data file, and two evaluation
drivers: one for a three-feature data set and one for 32x32 text-encoded
digit images.
"""
import operator  # itemgetter is used to pick the vote count when ranking labels
import os
from os import listdir  # lists the file names in a directory

import numpy as np  # explicit namespace: a star import would shadow sum/min/max

try:
    import matplotlib  # plotting library, only needed by show()
    import matplotlib.pyplot as plt
except ImportError:  # plotting is optional; classification works without it
    matplotlib = None
    plt = None


def _nonZeroRanges(ranges):
    """Return *ranges* as floats with zero entries replaced by 1.

    A feature whose values are all identical has range 0; dividing by 1
    instead maps every value of that feature to 0 rather than NaN.
    """
    ranges = np.asarray(ranges, dtype=float)
    return np.where(ranges == 0, 1.0, ranges)


def _labelFromFileName(fileNameStr):
    """Parse the class digit from a file name shaped like ``<digit>_<n>.txt``."""
    return int(fileNameStr.split('.')[0].split('_')[0])


def createDataSet():
    """Return a tiny hard-coded training set.

    Returns:
        (group, labels): a 4x2 array of points and the list of their
        class labels ('A' or 'B').
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and labels.

    The first three fields of every line become one row of the matrix and
    the last field is converted to ``int`` and used as the class label.
    Blank lines are skipped.

    Args:
        filename: path of the text file to read.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float array and a list of
        n integer labels.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        stripped = (line.strip() for line in fr)
        rows = [line.split('\t') for line in stripped if line]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        # The label must be an int, otherwise it would be kept as a string.
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector


def show(DataMat, DataLabels):
    """Draw a scatter plot of column 1 (x axis) against column 0 (y axis).

    Both the marker size and the marker colour are ``15.0 * label`` so that
    the classes can be told apart visually.

    Args:
        DataMat: 2-D array with at least two columns.
        DataLabels: sequence of numeric labels, one per row of DataMat.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    if plt is None:
        raise ImportError("matplotlib is required for show()")
    fig = plt.figure()
    ax = fig.add_subplot(111)
    scaledLabels = 15.0 * np.array(DataLabels)
    ax.scatter(DataMat[:, 1], DataMat[:, 0], scaledLabels, scaledLabels)
    plt.show()


def autoNorm(dataSet):
    """Rescale every column of *dataSet* to the interval [0, 1].

    Args:
        dataSet: 2-D array-like with one sample per row.

    Returns:
        (normDataSet, ranges, minVals): the normalised array, the per-column
        value range (max - min) and the per-column minimum.  ``ranges`` is
        returned unmodified, so it contains 0 for a constant column; such a
        column is normalised to all zeros.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Broadcasting subtracts/divides row by row without building tiled copies.
    normDataSet = (dataSet - minVals) / _nonZeroRanges(ranges)
    return normDataSet, ranges, minVals


def classify0(inX, dataSet, labels, k):
    """Classify *inX* by majority vote among its k nearest training samples.

    Distance is Euclidean: d = sqrt((x1 - x2)^2 + (y1 - y2)^2 + ...).

    Args:
        inX: the sample to classify (1-D, or a single-row 2-D array).
        dataSet: 2-D array-like of training samples, one per row.
        labels: sequence with the class label of every training row.
        k: number of neighbours that vote; values larger than the number of
            training rows use every row.

    Returns:
        The label with the most votes.  On a tie the label whose first vote
        came from the closest neighbour wins.

    Raises:
        ValueError: if k is smaller than 1 or dataSet has no rows.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    if k < 1:
        raise ValueError("k must be at least 1")
    if dataSet.shape[0] == 0:
        raise ValueError("dataSet must contain at least one sample")

    diffMat = np.asarray(inX, dtype=float) - dataSet  # broadcast over rows
    distances = np.sqrt((diffMat ** 2).sum(axis=1))  # axis=1 sums each row

    # A stable sort keeps equally distant samples in training order, which
    # makes the result reproducible.
    sortedDistIndicies = distances.argsort(kind='stable')

    classCount = {}  # label -> number of votes
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # itemgetter(1) selects the vote count (the second field of each item).
    return max(classCount.items(), key=operator.itemgetter(1))[0]


def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.10, k=3):
    """Hold-out evaluation of the classifier on a three-feature data file.

    The first ``hoRatio`` fraction of the rows is used as the test set and
    the remaining rows as the training set.  The rows are taken in file
    order, so the file is assumed to be shuffled already.

    Args:
        filename: data file understood by file2matrix.
        hoRatio: fraction of rows held out for testing.
        k: number of neighbours used by classify0.

    Returns:
        The error rate as a float, or None when the test set is empty.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier came back with %d,the real answer is %d" %
              (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1
    if numTestVecs == 0:  # nothing was tested, so there is no rate to report
        return None
    errorRate = errorCount / float(numTestVecs)
    print("the total error rate is: %f" % errorRate)
    return errorRate


def classifyPerson(filename="datingTestSet2.txt", k=3):
    """Ask for three feature values on stdin and print the predicted class.

    The predicted label is used as a 1-based index into the list of result
    descriptions.

    Args:
        filename: data file understood by file2matrix.
        k: number of neighbours used by classify0.
    """
    resultList = ["not at all", "in small doses", "in large doses"]
    percentTats = float(input("Percentage of time spent playing video games?"))
    ffMiles = float(input("Frequent flier miles earned per year?"))
    iceCream = float(input("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # NOTE(review): this ordering presumably mirrors the column order of the
    # data file (miles, game time, ice cream) -- confirm against the data.
    inArr = np.array([ffMiles, percentTats, iceCream])
    # Normalise the query exactly like the training data, guarding against a
    # zero range in a constant column.
    classifierResult = classify0((inArr - minVals) / _nonZeroRanges(ranges),
                                 normMat, datingLabels, k)
    print("You will probably like this person:",
          resultList[classifierResult - 1])


def img2vector(filename):
    """Flatten a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted to
    ``int`` and stored row after row.

    Args:
        filename: path of the text image.

    Returns:
        A float array of shape (1, 1024).
    """
    returnVect = np.zeros((1, 1024))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits',
                         k=3):
    """Evaluate the classifier on directories of 32x32 text digit images.

    Every file name is expected to look like ``<digit>_<n>.txt``; the part
    before the underscore is the true class.

    Args:
        trainingDir: directory holding the training images.
        testDir: directory holding the test images.
        k: number of neighbours used by classify0.

    Returns:
        The error rate as a float, or None when the test directory is empty.
    """
    # Sorting makes the run independent of the file system's listing order.
    trainingFileList = sorted(listdir(trainingDir))
    m = len(trainingFileList)
    hwLabels = []
    trainingMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_labelFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))

    testFileList = sorted(listdir(testDir))
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _labelFromFileName(fileNameStr)
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print('the classifier came back with %d, the real answer is: %d' %
              (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print('\nthe total number of errors is: %d' % errorCount)
    if mTest == 0:  # nothing was tested, so there is no rate to report
        return None
    errorRate = errorCount / float(mTest)
    print('\nthe total error rate is: %f' % errorRate)
    return errorRate


# Example usage (the data files come from the book's source bundle):
#
#     group, labels = createDataSet()
#     print(classify0([0, 0], group, labels, 3))
#     datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
#     show(autoNorm(datingDataMat)[0], datingLabels)
#     datingClassTest()
#     classifyPerson()
#
# Author's note: after learning Python I no longer feel like typing
# semicolons when I write code 0.0
相关文章推荐
- 最近邻分类算法KNN实现--参考麦子学院彭亮机器学习基础4.2
- 机器学习 -- KNN(k-Nearest Neighbor)最邻近规则分类算法
- 用Python开始机器学习(4:KNN分类算法) sklearn做KNN算法 python
- 机器学习之kNN分类算法
- <基础原理进阶>机器学习算法python实现【1】--分类简谈&KNN算法
- 机器学习(二)k-近邻分类算法(kNN)
- 机器学习(四) 分类算法--K近邻算法 KNN (下)
- 机器学习第二个算法KNN(最邻近规则分类KNN算法)
- 机器学习实战 笔记一:kNN分类算法
- 《机器学习》实施kNN分类算法
- 机器学习——最邻近规则分类(K Nearest Neighbor)KNN算法
- 【机器学习】3.最邻近规则分类KNN算法
- 机器学习与深度学习(二) k最邻近分类算法 (K-Nearest Neighbor) KNN
- 机器学习(四) 分类算法--K近邻算法 KNN (上)
- 机器学习——分类算法1:k-近邻 (KNN) 思想和代码
- 【机器学习实战】—KNN分类算法
- python机器学习之KNN分类算法
- 机器学习——最邻近规则分类(K Nearest Neighbor)KNN算法的应用
- 用Python开始机器学习(4:KNN分类算法)