《机器学习实战》学习笔记--朴素贝叶斯
2018-02-25 15:09
651 查看
机器学习实战,朴素贝叶斯一章代码详解:

#encoding:utf-8
from numpy import *
#词表到向量的转换函数
def loadDataSet():
    """Return the toy posting corpus and its parallel class labels.

    Returns:
        postingList: list of tokenized documents (each a list of words).
        classVec: label list aligned with postingList; 1 = abusive, 0 = normal.
    """
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # Fix: removed the leftover debug print of the total (duplicated) word
    # count; it wrote to stdout on every call and had no functional purpose.
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return postingList, classVec
def createVocabList(dataSet):
    """Build the deduplicated vocabulary list across all documents."""
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)  # fold this document's words into the vocabulary
    return list(vocab)
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 presence vector over vocabList.

    Fix: the original used a Python 2 `print` statement, which is a
    SyntaxError under Python 3 even though the rest of this file uses
    print() calls; converted to the function form.

    Args:
        vocabList: vocabulary as a list of words.
        inputSet: iterable of words in the document.

    Returns:
        List of ints, 1 where the vocab word appears in inputSet, else 0.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word:%s is not in my Vocabulary" % word)
    return returnVec
def bagOfWords2VecMN(vocabList, inputSet):
    """Bag-of-words model: count occurrences of each vocab word in the doc."""
    counts = [0] * len(vocabList)
    for token in inputSet:
        try:
            counts[vocabList.index(token)] += 1  # bump the word's slot
        except ValueError:
            pass  # token not in vocabulary: silently skipped, as before
    return counts
# Naive Bayes classifier training function
def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes model from a document-word count matrix.

    Fix: removed the dead commented-out zero-initialised variants and
    split the semicolon-joined statements; behavior is unchanged.

    Args:
        trainMatrix: 2-D numpy array, one row per document, one column per
            vocabulary word (presence/count values).
        trainCategory: 1-D array of labels; 1 = abusive, 0 = normal.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word log probabilities for the
        normal and abusive classes, and the prior P(abusive).
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior: labels are 0/1, so their sum counts the abusive documents.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: start counts at 1 and denominators at 2 so an
    # unseen word never yields a zero probability (and hence log(0)).
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]           # per-word counts in abusive docs
            p1Denom += sum(trainMatrix[i])    # total words in abusive docs
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Work in log space to avoid underflow when many small factors multiply.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
# Naive Bayes classification function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 (abusive) or 0 (normal) for the given word vector.

    Scores each class as the sum of elementwise log likelihoods plus the
    class log prior, then picks the larger score.
    """
    scoreAbusive = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreNormal = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if scoreAbusive > scoreNormal else 0
def testingNB():
    """End-to-end demo: train on the toy corpus, classify two test docs."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    # One presence vector per document forms the training matrix.
    trainMat = [setOfWords2Vec(vocab, doc) for doc in posts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(labels))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(vocab, testEntry))
        print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
# Script entry point: run the demo classification when executed directly.
if __name__ == "__main__":
    testingNB()
from numpy import *
#词表到向量的转换函数
def loadDataSet():
    """Return the toy posting corpus and its parallel class labels.

    Returns:
        postingList: list of tokenized documents (each a list of words).
        classVec: label list aligned with postingList; 1 = abusive, 0 = normal.
    """
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # Fix: removed the leftover debug print of the total (duplicated) word
    # count; it wrote to stdout on every call and had no functional purpose.
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return postingList, classVec
def createVocabList(dataSet):
    """Build the deduplicated vocabulary list across all documents."""
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)  # fold this document's words into the vocabulary
    return list(vocab)
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 presence vector over vocabList.

    Fix: the original used a Python 2 `print` statement, which is a
    SyntaxError under Python 3 even though the rest of this file uses
    print() calls; converted to the function form.

    Args:
        vocabList: vocabulary as a list of words.
        inputSet: iterable of words in the document.

    Returns:
        List of ints, 1 where the vocab word appears in inputSet, else 0.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word:%s is not in my Vocabulary" % word)
    return returnVec
def bagOfWords2VecMN(vocabList, inputSet):
    """Bag-of-words model: count occurrences of each vocab word in the doc."""
    counts = [0] * len(vocabList)
    for token in inputSet:
        try:
            counts[vocabList.index(token)] += 1  # bump the word's slot
        except ValueError:
            pass  # token not in vocabulary: silently skipped, as before
    return counts
# Naive Bayes classifier training function
def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes model from a document-word count matrix.

    Fix: removed the dead commented-out zero-initialised variants and
    split the semicolon-joined statements; behavior is unchanged.

    Args:
        trainMatrix: 2-D numpy array, one row per document, one column per
            vocabulary word (presence/count values).
        trainCategory: 1-D array of labels; 1 = abusive, 0 = normal.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word log probabilities for the
        normal and abusive classes, and the prior P(abusive).
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior: labels are 0/1, so their sum counts the abusive documents.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: start counts at 1 and denominators at 2 so an
    # unseen word never yields a zero probability (and hence log(0)).
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]           # per-word counts in abusive docs
            p1Denom += sum(trainMatrix[i])    # total words in abusive docs
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Work in log space to avoid underflow when many small factors multiply.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
# Naive Bayes classification function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 (abusive) or 0 (normal) for the given word vector.

    Scores each class as the sum of elementwise log likelihoods plus the
    class log prior, then picks the larger score.
    """
    scoreAbusive = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreNormal = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if scoreAbusive > scoreNormal else 0
def testingNB():
    """End-to-end demo: train on the toy corpus, classify two test docs."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    # One presence vector per document forms the training matrix.
    trainMat = [setOfWords2Vec(vocab, doc) for doc in posts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(labels))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(vocab, testEntry))
        print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
# Script entry point: run the demo classification when executed directly.
if __name__ == "__main__":
    testingNB()
相关文章推荐
- 《机器学习实战》学习笔记-[4]-基于概率的分类-朴素贝叶斯
- 机器学习-斯坦福:学习笔记6-朴素贝叶斯
- 【机器学习-斯坦福】学习笔记6 - 朴素贝叶斯
- 机器学习实战笔记(3.3)-朴素贝叶斯算法(多项式模型的朴素贝叶斯实现)
- 统计学习笔记六----朴素贝叶斯
- 机器学习实战笔记(Python实现)-03-朴素贝叶斯
- 机器学习学习笔记之三:朴素贝叶斯
- 《机器学习实战》笔记之四——基于概率论的分类方法:朴素贝叶斯
- 《机器学习实战》代码片段学习3 朴素贝叶斯
- Machine Learning in Action 学习笔记-(4)基于概率论的分类方法:朴素贝叶斯
- 【机器学习-斯坦福】学习笔记6 - 朴素贝叶斯
- ML学习笔记-朴素贝叶斯
- 机器学习学习笔记之三:朴素贝叶斯
- 【学习笔记】斯坦福大学公开课(机器学习) 之生成学习算法:朴素贝叶斯
- Spark MLlib 入门学习笔记 - 朴素贝叶斯
- 机器学习实战学习笔记(三):朴素贝叶斯
- 机器学习(八):CS229ML课程笔记(4)——生成学习,高斯判别分析,朴素贝叶斯
- 机器学习学习笔记之三:朴素贝叶斯
- 【机器学习-斯坦福】学习笔记6 - 朴素贝叶斯
- [学习笔记]机器学习实战(一)