您的位置:首页 > 其它

机器学习实战(二)

2015-11-09 19:56 232 查看
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 06 16:51:05 2015

@author: hzh
"""
from numpy import*
from os import listdir
import operator
import matplotlib
import matplotlib.pyplot as plt

# Create random data with numpy
import numpy as np
def createDataSet():
    """Return a tiny hand-made training set for the kNN demo.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 4x2 NumPy array of
        sample points and ``labels`` is the list of class names, where
        ``labels[i]`` belongs to ``group[i]``.
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels

# k-nearest-neighbours classifier
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest rows.

    Distances are Euclidean, measured between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: feature vector to classify (a sequence or array with as many
            features as one row of ``dataSet``).
        dataSet: 2-D NumPy array holding one training sample per row.
        labels: sequence of class labels; ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes.

    Raises:
        IndexError: if ``k`` is smaller than 1 or larger than the number
            of rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]
    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() exists on both Python 2 and 3; iteritems() is Python-2 only.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]

# Parse a tab-separated text file into a NumPy matrix plus a label list
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and labels.

    Every non-blank line supplies one record: its first three
    tab-separated fields are the features and its last field is the
    integer class label.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple: ``returnMat`` is an
        ``n x 3`` float array and ``classLabelVector`` is a list of ``n``
        ints, where ``n`` is the number of non-blank lines.

    Raises:
        ValueError: if a feature field is not a number or the last field
            is not an integer.
    """
    # The with-block guarantees the file is closed even if parsing fails.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    # Blank lines carry no record; skipping them avoids int('') errors.
    arrayOLines = [line for line in stripped if line]

    numberOfLines = len(arrayOLines)
    returnMat = np.zeros((numberOfLines, 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        # Convert explicitly instead of relying on NumPy's string casting.
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

# Normalise feature values
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range 0..1.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of 0; its normalised
    values are returned as 0 instead of dividing by zero.

    Args:
        dataSet: 2-D array (or nested sequence) with one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` tuple: the normalised float
        array, the per-column ``max - min`` and the per-column minimum.
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide by 1 where the range is 0 so constant columns map to 0, and
    # force a float divisor so integer input is not floor-divided on
    # Python 2.
    divisors = np.where(ranges == 0, 1, ranges).astype(float)
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals

# Draw a scatter plot with Matplotlib: 100 random 2-D points whose marker
# size and colour are both driven by the same random weights.
x = 100 * random.rand(100)
c = random.rand(100, 2)
fig = plt.figure()
ax = fig.add_subplot(111)
weights = 5 * x
ax.scatter(c[:, 0], c[:, 1], s=weights, c=5 * x)
plt.show()

def img2vector(filename):
    """Read a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int()`` and stored row by row.

    Args:
        filename: path of the text image file.

    Returns:
        A NumPy float array of shape ``(1, 1024)``.

    Raises:
        IndexError: if one of the first 32 lines has fewer than 32
            characters (including a file with fewer than 32 lines).
        ValueError: if a character is not a decimal digit.
    """
    returnVect = np.zeros((1, 1024))
    # The with-block closes the file even when a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

# Test harness for the handwritten-digit recognition system
def _digitLabelFromFileName(fileNameStr):
    """Return the integer found before the first '_' of a file name."""
    fileStr = fileNameStr.split('.')[0]
    return int(fileStr.split('_')[0])


def handwritingClassTest():
    """Classify every file in ``testDigits`` and print the error rate.

    The training set is built from every file in ``trainingDigits``; the
    class of a file is the integer before the first ``_`` in its name.
    Each test file is classified with ``classify0`` using k = 3. One line
    is printed per test file, followed by the error count and error rate.
    Both directories are looked up relative to the current working
    directory.
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitLabelFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _digitLabelFromFileName(fileNameStr)
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with : %d, the real answer is : %d " % (
            classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0

    print('the total number of errors is : %d' % errorCount)
    # An empty test directory would otherwise divide by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print('the total error rate is :%f' % errorRate)
###############################################################################
# Ad-hoc demo: column minima of a random matrix, autoNorm on a small
# integer matrix, then the full handwritten-digit test run.
# Single-argument print(...) behaves the same on Python 2 and 3.
d = random.rand(5, 2)
print(d.min(0))
print(d)

xx = [[1, 2], [2, 1]]
xx = array(xx)

print(xx)
print(autoNorm(xx)[0])

handwritingClassTest()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: