您的位置:首页 > 其它

机器学习之K-最近邻规则分类(KNN)算法

2017-10-30 17:13 555 查看
本文分为两个部分:一是理论,二是代码实现。代码也可以在我的GitHub上下载,后面有链接。

一、理论知识

相信我的笔记还是比较详细的



二、代码实现KNN算法

源码也可以在我的GitHub上获取:

https://github.com/codermckee/KNN

1. 首先要生成一些数据集,以供训练和测试

我造的数据是关于通过身高等信息预测女生是什么类型的数据(纯属扯淡的,不要当真)。属性是身高、体重、腿长,分类标签是[model,common,lolita]。数据量大小控制为150。

代码如下:

# -*- coding: utf-8 -*-
import random

def generate_data(min1,max1,min2,max2,min3,max3,len):
    """Build ``len`` random samples of [height, weight, leg_length].

    Each feature is drawn independently with ``random.uniform``: height
    between ``min1`` and ``max1``, weight between ``min2`` and ``max2``, and
    leg length between ``min3`` and ``max3``.

    The ``len`` parameter shadows the builtin of the same name inside this
    function; the name is kept so existing callers are unaffected.

    Returns a list containing ``len`` three-element lists.
    """
    # Weight is probably a weak feature, because people in different height
    # bands can fall into the same narrow weight range; it is included only
    # as an experiment.
    return [
        [
            random.uniform(min1, max1),
            random.uniform(min2, max2),
            random.uniform(min3, max3),
        ]
        for _ in range(len)
    ]
def save(info,label):
    """Append labelled samples to ``data.txt`` and echo them to stdout.

    ``info`` is an iterable of samples; the first three items of each sample
    are written space-separated, followed by ``label`` and a newline. The
    three feature values (without the label) are also printed.

    The file is opened in append mode, so repeated calls (and repeated runs
    of the script) keep adding lines to the same file.
    """
    # The context manager guarantees the file is closed even if a write fails.
    with open('data.txt', 'a+') as handle:
        for sample in info:
            # '%s' formats each value with str(), matching the stored text.
            features = '%s %s %s' % (sample[0], sample[1], sample[2])
            # A single-argument print call works on both Python 2 and 3.
            print(features)
            handle.write(features + ' ' + label + '\n')

if __name__ == '__main__':
    # One entry per class: the label followed by the height, weight and
    # leg-length bounds handed to generate_data.
    samples_per_class = 50
    class_bounds = [
        ('model', (169.9, 180, 90, 110, 105, 111)),
        ('lolita', (155, 162, 85, 98, 95, 100)),
        ('common', (162, 169, 80, 115, 100, 104.5)),
    ]
    # Generate every class first, then write them out in the same order.
    generated = []
    for class_label, bounds in class_bounds:
        rows = generate_data(*(bounds + (samples_per_class,)))
        generated.append((class_label, rows))
    for class_label, rows in generated:
        save(rows, class_label)


得到的数据如下:

2. 利用KNN进行分类

# -*- coding: utf-8 -*-
import random
import math
def get_train_data():
    """Read ``data.txt`` and randomly split its rows into train and test sets.

    Every non-blank line is split on whitespace into a list of string
    fields. Each row goes to the test set when a fresh ``random.random()``
    draw is below 0.333333333333, otherwise to the training set.

    Returns a ``(train, test)`` tuple of row lists.
    """
    test = []
    train = []
    # The context manager closes the file once reading is finished.
    with open('data.txt') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Blank lines carry no sample and would break label lookups
                # (row[-1]) further down the pipeline.
                continue
            if random.random() < 0.333333333333:
                test.append(fields)
            else:
                train.append(fields)
    return train, test
def cal_distance(a,b,dimension):
    """Return the Euclidean distance between ``a`` and ``b``.

    Only the first ``dimension`` items of each sequence are used, and every
    item is converted with ``float`` first, so numeric strings are accepted.
    """
    squared_total = 0
    for axis in range(dimension):
        delta = float(a[axis]) - float(b[axis])
        squared_total += delta * delta
    return math.sqrt(squared_total)
def get_neighbor(K,test,train_data,dimension=3):
    """Return the ``K`` training samples closest to ``test``.

    The distance from ``test`` to each row of ``train_data`` is computed with
    ``cal_distance`` over the first ``dimension`` features (3 by default).
    The last item of each training row is taken as its label.

    Returns a list of ``[distance, label]`` pairs sorted in ascending order;
    equal distances are ordered by label. If fewer than ``K`` training rows
    exist, all of them are returned instead of raising ``IndexError``.
    """
    scored = sorted(
        [cal_distance(test, sample, dimension), sample[-1]]
        for sample in train_data
    )
    # Slicing tolerates K larger than the training set; max() keeps a
    # negative K from selecting rows, as an empty range would.
    return scored[:max(K, 0)]

def get_prediction(K,neighbor):
    """Predict a class label by majority vote over the first ``K`` neighbours.

    ``neighbor`` is a sequence of ``[distance, label]`` pairs. Only the
    label is read. If fewer than ``K`` pairs are supplied, the vote uses the
    ones available instead of raising ``IndexError``.

    Voting rules:
    - labels other than 'model' and 'common' are counted as 'lolita';
    - 'model' wins only with strictly more votes than each other class;
    - otherwise 'lolita' wins only with strictly more votes than each other
      class;
    - every remaining case, including ties and an empty vote, is 'common'.
    """
    model_num = 0
    common_num = 0
    lolita_num = 0
    # max() keeps a negative K from selecting entries, as an empty range would.
    for entry in neighbor[:max(K, 0)]:
        label = entry[1]
        if label == 'model':
            model_num += 1
        elif label == 'common':
            common_num += 1
        else:
            lolita_num += 1

    if model_num > common_num and model_num > lolita_num:
        return 'model'
    if lolita_num > model_num and lolita_num > common_num:
        return 'lolita'
    return 'common'
def evaluate_Accuracy(test,train,K):
    """Return the fraction of ``test`` rows that KNN classifies correctly.

    Each test row is classified with ``get_neighbor`` and ``get_prediction``
    against ``train``, and the prediction is compared with the row's last
    item, which is taken as its true label.

    Returns a float in [0, 1]. An empty ``test`` set yields 0.0 instead of
    raising ``ZeroDivisionError``.
    """
    if not test:
        # The random split can leave the test set empty for small files.
        return 0.0
    right = 0
    for sample in test:
        nearest = get_neighbor(K, sample, train)
        if get_prediction(K, nearest) == sample[-1]:
            right += 1
    return float(right) / float(len(test))

if __name__ == '__main__':
    K = 5
    # Randomly split data.txt into a training set and a test set.
    train_data, test_data = get_train_data()
    # Measure accuracy on the held-out test rows.
    precision = evaluate_Accuracy(test_data, train_data, K)
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print(precision)

    # Classify one extra hand-written sample: [height, weight, leg_length].
    test = [172, 96, 106]
    neighbor = get_neighbor(K, test, train_data)  # the K nearest neighbours
    prediction = get_prediction(K, neighbor)  # majority vote over them
    print('Girl Type: %s' % prediction)


结果如下:

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: