您的位置:首页 > 其它

《机器学习实战》学习笔记--朴素贝叶斯

2018-02-25 15:09 651 查看
机器学习实战,朴素贝叶斯一章代码详解:

#encoding:utf-8

from numpy import *

#词表到向量的转换函数 (word list -> vector conversion helpers)
def loadDataSet():
    """Return a toy corpus of tokenized posts and their class labels.

    Returns:
        postingList: list of 6 documents, each a list of word tokens.
        classVec: parallel list of labels — 1 = abusive, 0 = normal.
    """
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # Removed a leftover debug print of the total token count; a data
    # loader should not write to stdout.
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return postingList, classVec

def createVocabList(dataSet):
    """Build a deduplicated vocabulary list covering every document.

    Args:
        dataSet: iterable of documents, each an iterable of word tokens.
    Returns:
        list of unique words (order follows set iteration order).
    """
    vocab = set()
    for document in dataSet:
        # Union of the running vocabulary with this document's words.
        vocab.update(document)
    return list(vocab)

def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a binary word-presence vector.

    Args:
        vocabList: list of vocabulary words (defines vector positions).
        inputSet: iterable of words in the document.
    Returns:
        list of 0/1 flags, one per vocabulary word.
    """
    returnVec = [0] * len(vocabList)  # start with all-zero vector
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            # Out-of-vocabulary words are reported but otherwise ignored.
            # Fixed: was a Python 2 print statement, inconsistent with the
            # print() calls used elsewhere in this file.
            print("the word:%s is not in my Vocabulary" % word)
    return returnVec

def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a word-count (bag-of-words) vector.

    Unlike setOfWords2Vec, repeated words accumulate counts.
    """
    counts = [0] * len(vocabList)
    # Map each word to its first position, mirroring list.index semantics.
    position = {}
    for i, w in enumerate(vocabList):
        position.setdefault(w, i)
    for token in inputSet:
        idx = position.get(token)
        if idx is not None:
            counts[idx] += 1
    return counts

#Naive Bayes classifier training function
def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes classifier from a document-term matrix.

    Args:
        trainMatrix: 2-D numpy array, one row per document, one column per
            vocabulary word (presence flags or counts).
        trainCategory: 1-D array of labels, 1 = abusive, 0 = normal.
    Returns:
        (p0Vect, p1Vect, pAbusive): per-word log probabilities for the
        normal and abusive classes, and the prior P(class == 1).
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior: fraction of documents labelled abusive.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: counts start at 1 and denominators at 2 so an
    # unseen word never produces a zero probability.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]          # per-word counts in class 1
            p1Denom += sum(trainMatrix[i])   # total words seen in class 1
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Work in log space: classifyNB sums these, which is equivalent to
    # multiplying many small probabilities without underflowing.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

#Naive Bayes classification function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector: return 1 (abusive) or 0 (normal).

    Sums element-wise products of the input vector with each class's
    log-probability vector, plus the log prior, and picks the larger score.
    """
    scoreAbusive = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreNormal = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if scoreAbusive > scoreNormal else 0

def testingNB():
    """End-to-end demo: train on the toy corpus, then classify two posts."""
    docs, labels = loadDataSet()
    vocab = createVocabList(docs)
    # Binary document-term matrix, one row per document.
    trainMat = [setOfWords2Vec(vocab, doc) for doc in docs]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(labels))
    # Classify both sample posts with the trained model.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(vocab, testEntry))
        print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))

if __name__ == "__main__":
    # Run the demo only when executed as a script, not on import.
    testingNB()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  MachineLearning