您的位置:首页 > 其它

机器学习之朴素贝叶斯

2016-12-18 23:28 447 查看
学习资料:

《机器学习实战》

《模式识别》张学工

维基百科:https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Sex_classification

部分数据集:

boy.txt

173 50 38
163 51 38
165 56 38
168 56 38
171 68 38
165 50 39
163 59 39
172 60 39
172 65 39
172 66.5 39
174 50 40
168 52 40
175 54 40
163 55 40
173 56 40
175 60 40
173 62 40
172 65 40
174 67 40
180 70 40
174 84 40
158 44 36
160 50 39
158 50 37
161 52 38
175 47 37
164 48 38
166 52 36
168 45 37
162 53 37
152 42 36
156 43 36
170 53 38
169 48 38
164 62 39

girl.txt

156 43 36
161 45 36
166 48 36
169 52 36
161 54 36
163 62 36
158 45 37
157 45 37
157 46 37
163 50 37
165 50 37
165 51 37
165 52 37
165 63 37
155 47 38
165 47 38
159 48 38
155 51 38
165 51 38
168 52 38
165 53 38
代码及解释:

#coding:utf-8
'''
author:dacaer
date:2016.12.28
'''

from numpy import *
from scipy import stats
import re
import matplotlib.pyplot as plt
def data_set(data_str):
    '''
    Load a whitespace-separated numeric data file.

    data_str: path of the text file to read; every non-blank line holds the
        feature values of one sample, separated by spaces and/or tabs.
    Returns the data set as a list of samples, each a list of floats.
    Raises IOError/OSError if the file cannot be opened and ValueError if a
    field cannot be converted to float.
    '''
    dataset = []
    # The with-block closes the file even when float() raises part-way.
    with open(data_str, 'r') as f:
        for line in f:
            # split() without arguments splits on any run of whitespace and
            # ignores leading/trailing blanks, so no regex clean-up is needed.
            fields = line.split()
            if not fields:
                # A blank line would otherwise add an empty row, making the
                # data set ragged for the per-column statistics.
                continue
            dataset.append([float(x) for x in fields])
    return dataset

def trainNB0(dataset0, dataset1):
    '''
    Estimate the Gaussian naive Bayes parameters from the training samples.

    dataset0: samples of class 0, one row of feature values per sample.
    dataset1: samples of class 1, same layout.
    Returns (p0mean, p1mean, p0std, p1std, p1abu): the per-feature means of
    class 0 and class 1, the per-feature standard deviations (not variances)
    of class 0 and class 1, and the prior probability of class 1.
    Raises ValueError if either class has no samples.
    '''
    n0 = len(dataset0)
    n1 = len(dataset1)
    if n0 == 0 or n1 == 0:
        # Without this check an empty class gives a ZeroDivisionError or
        # NaN statistics further down.
        raise ValueError("trainNB0 needs at least one sample in each class")
    # Prior of class 1 is its share of all training samples; float() keeps
    # the division exact under Python 2 as well.
    p1abu = n1 / float(n0 + n1)
    # axis=0 reduces over samples, leaving one value per feature column.
    p0mean = mean(dataset0, axis=0)
    p1mean = mean(dataset1, axis=0)
    p0std = std(dataset0, axis=0)
    p1std = std(dataset1, axis=0)
    return p0mean, p1mean, p0std, p1std, p1abu

def _plot_norm_curves(means, stds):
    '''Plot one normal-distribution pdf per feature over mean +/- 5 std.'''
    for mu, sigma in zip(means, stds):
        # A fixed number of points keeps the curve smooth whatever the
        # spread; a fixed step of 1 leaves narrow features with only a
        # handful of points.
        x = linspace(mu - 5 * sigma, mu + 5 * sigma, 200)
        y = stats.norm.pdf(x, mu, sigma)
        plt.plot(x, y)


def norm_shape():
    '''
    Plot the fitted normal distribution of every feature for both classes.

    Takes no arguments: it reads the module-level p0mean, p0std, p1mean and
    p1std, so those must have been assigned before this function is called.
    All curves are drawn on one figure, which is then shown.
    '''
    _plot_norm_curves(p0mean, p0std)
    _plot_norm_curves(p1mean, p1std)
    plt.show()
def _class_likelihoods(samples, means, stds):
    '''Return, for each sample row, the product of its per-feature normal pdfs.'''
    samples = asarray(samples, dtype=float)
    # pdf broadcasts the per-feature means/stds across all rows; the product
    # along axis 1 combines the features of one sample.
    return stats.norm.pdf(samples, means, stds).prod(axis=1)


def testNB(dataset0, dataset1, p0mean, p1mean, p0std, p1std, p1abu):
    '''
    Classify labelled test data over a sweep of thresholds and plot the ROC curve.

    dataset0, dataset1: test samples whose true class is 0 and 1 respectively,
        one row of feature values per sample (all rows the same length).
    p0mean, p1mean: per-feature means of class 0 and class 1.
    p0std, p1std: per-feature standard deviations of class 0 and class 1.
    p1abu: prior probability of class 1.

    A sample is assigned to class 1 when
        p(x|0) * (1 - p1abu) < p(x|1) * p1abu * t
    and to class 0 otherwise, where the threshold t takes the values
    2**-80, 2**-79.5, ..., 2**27.5.

    Class 0 is the "positive" class of the curve: tpp is the share of
    dataset0 classified as 0 (true positive rate) and fpp is the share of
    dataset1 classified as 0 (false positive rate).

    Plots tpp against fpp, shows the figure, and returns (fpp, tpp) as two
    lists with one entry per threshold.
    Raises ValueError if either test set is empty.
    '''
    n0 = len(dataset0)
    n1 = len(dataset1)
    if n0 == 0 or n1 == 0:
        # The rates divide by the class sizes.
        raise ValueError("testNB needs at least one test sample in each class")

    # The likelihoods do not depend on the threshold, so compute both sides
    # of the decision rule once instead of once per threshold.
    lhs0 = _class_likelihoods(dataset0, p0mean, p0std) * (1 - p1abu)
    rhs0 = _class_likelihoods(dataset0, p1mean, p1std) * p1abu
    lhs1 = _class_likelihoods(dataset1, p0mean, p0std) * (1 - p1abu)
    rhs1 = _class_likelihoods(dataset1, p1mean, p1std) * p1abu

    tpp = []  # true positive rate per threshold
    fpp = []  # false positive rate per threshold
    for exponent in arange(-80, 28, 0.5):
        threshold = 2.0 ** exponent
        # Samples for which the rule picks class 1.
        cnt01 = int(count_nonzero(lhs0 < rhs0 * threshold))  # class 0 -> 1: false negative
        cnt11 = int(count_nonzero(lhs1 < rhs1 * threshold))  # class 1 -> 1: true negative
        cnt00 = n0 - cnt01  # class 0 -> 0: true positive
        cnt10 = n1 - cnt11  # class 1 -> 0: false positive
        tpp.append(cnt00 / float(n0))
        fpp.append(cnt10 / float(n1))

    plt.plot(fpp, tpp)
    plt.show()
    return fpp, tpp

# Script entry: train on the small per-class files, then evaluate on the
# "all" files and show the ROC curve.
# NOTE(review): norm_shape() reads the module-level p0mean/p1mean/p0std/p1std
# that are only assigned by the trainNB0 line below, so enabling the call at
# this position would fail; presumably it belongs after that line - confirm.
#norm_shape()
# Class 0 is loaded from girl.txt and class 1 from boy.txt.
dataset0=data_set("testdata/girl.txt")
dataset1 = data_set("testdata/boy.txt")
p0mean,p1mean,p0std,p1std,p1abu = trainNB0(dataset0,dataset1)

# Test sets, passed in the same class order (class 0 first, class 1 second).
text0 = data_set("testdata/girlall.txt")
text1 = data_set("testdata/boyall.txt")
testNB(text0,text1,p0mean,p1mean,p0std,p1std,p1abu)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息