您的位置：首页 > 其它

机器学习算法——朴素贝叶斯

2016-08-17 14:55 183 查看

#%% Naive Bayes (for discrete input variables)
import numpy as np


class NaiveBayes(object):
    """Naive Bayes classifier for discrete (categorical) features.

    Probabilities are plain relative frequencies taken from the training
    data; no smoothing is applied. Both the label priors and the
    per-dimension conditional probabilities are computed eagerly in the
    constructor.

    Attributes:
        train_x: The training samples as passed in (sequence of rows).
        train_y: The training labels as a numpy array.
        dimension: Number of features, taken from the first training row.
        n_sample: Number of training labels.
        labels: Sorted unique labels found in ``train_y``.
        pre_prob: Dict mapping label -> prior probability p(y).
        condi_prob: Dict mapping ``(dimension index, value, label)`` ->
            conditional probability p(x_i = value | y = label).
    """

    def __init__(self, train_x, train_y):
        """Store the training data and estimate all probabilities.

        Args:
            train_x: Sequence of samples, each a sequence of discrete
                feature values.
            train_y: Sequence of labels, one per sample.

        Raises:
            ValueError: If ``train_x`` is empty or its length differs from
                the number of labels.
        """
        if len(train_x) == 0:
            raise ValueError('train_x must contain at least one sample')
        self.train_x = train_x
        self.train_y = np.array(train_y)
        if len(train_x) != self.train_y.size:
            raise ValueError('train_x and train_y must have the same length')
        self.dimension = len(train_x[0])
        self.n_sample = self.train_y.size
        self.labels = np.unique(self.train_y)
        # Estimate the label priors and the per-dimension conditionals.
        self.pre_prob = self.cal_pre_prob()
        self.condi_prob = self.cal_condi_prob()

    def cal_pre_prob(self):
        """Return a dict mapping each label y to its prior p(y)."""
        total = float(self.train_y.size)
        return {
            y: np.count_nonzero(self.train_y == y) / total
            for y in self.labels
        }

    def cal_condi_prob(self):
        """Return the conditional probability table.

        Keys are ``(i, value, y)`` tuples and values are p(x_i = value | y),
        i.e. the fraction of samples with label ``y`` whose i-th feature
        equals ``value``. Every value seen in a dimension gets an entry for
        every label (0.0 when the combination never occurs).
        """
        # The label masks do not depend on the feature, so build them once.
        label_masks = [(y, self.train_y == y) for y in self.labels]
        condi_prob = {}
        # zip(*rows) transposes the sample rows into per-dimension columns.
        for i, column in enumerate(zip(*self.train_x)):
            column = np.array(column)
            for value in np.unique(column):
                value_mask = column == value
                for y, y_mask in label_masks:
                    joint = np.count_nonzero(y_mask & value_mask)
                    condi_prob[(i, value, y)] = (
                        joint / float(np.count_nonzero(y_mask))
                    )
        return condi_prob

    def predict(self, x):
        """Predict the most probable label for one sample.

        Prints the unnormalized per-label scores p(y) * prod_i p(x_i | y)
        before returning.

        Args:
            x: Sequence of feature values with length ``self.dimension``.

        Returns:
            A ``(label, posterior)`` tuple, where ``posterior`` is the
            winning score divided by the sum of all scores. Returns
            ``(None, 0.0)`` when every label has a score of zero.

        Raises:
            ValueError: If ``len(x)`` differs from the training dimension.
            KeyError: If a feature value was never seen in that dimension
                during training.
        """
        if len(x) != self.dimension:
            raise ValueError('feature dimension not equal!')
        prob = {}
        for y in self.labels:
            score = self.pre_prob[y]
            for i, xi in enumerate(x):
                score *= self.condi_prob[(i, xi, y)]
            prob[y] = score
        # Diagnostic output of the raw scores, kept from the original code.
        print(prob)
        prob_sum = sum(prob.values())
        if prob_sum == 0:
            # No label is compatible with x; avoid dividing by zero.
            return None, 0.0
        # Pick the label with the largest score (first one wins on ties).
        max_label = max(prob, key=prob.get)
        return max_label, prob[max_label] / float(prob_sum)

def test_NaiveBayes():
    """Train on a small 15-sample example and print one prediction.

    Each sample has two discrete features: an integer in {1, 2, 3} and a
    size code in {'s', 'm', 'l'}; labels are -1 or 1.
    """
    x = [[1, 's'], [1, 'm'], [1, 'm'], [1, 's'], [1, 's'],
         [2, 's'], [2, 'm'], [2, 'm'], [2, 'l'], [2, 'l'],
         [3, 'l'], [3, 'm'], [3, 'm'], [3, 'l'], [3, 'l']]
    y = [-1, -1, 1, 1, -1, -1, -1] + [1] * 7 + [-1]
    model = NaiveBayes(x, y)
    new_x = [2, 's']
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(model.predict(new_x))

# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    test_NaiveBayes()

运行结果如下（第一行是 predict 中打印的各标签未归一化的联合概率，第二行是预测的标签及其归一化后的后验概率）：
{1: 0.02222222222222222, -1: 0.066666666666666666}

(-1, 0.75000000000000011)

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航