您的位置:首页 > 其它

机器学习实战 - 第四章 基于概率论的分类方法:朴素贝叶斯 - 学习随手记

2017-07-12 18:16 756 查看
朴素贝叶斯
一个特征或者单词出现的可能性与它和其他单词相邻没有关系
每个特征同等重要

朴素贝叶斯分类器通常有两种实现方式
基于贝努利模型实现

不考虑词在文档中出现的次数,只考虑出现与否,在这个意义上相当于假设词是等权重的

基于多项式模型实现
考虑词在文档中出现的次数

Page61 在转换成矩阵的时候,可不可以在每个行向量后面再加一位,表明这个整体是1还是0? ( 是坏句子还是好句子)
>>> for postinDoc in listOPosts:

...    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))

...

>>> p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)

>>> pAb

0.5

>>> p0V

array([ 0.04166667, 0.08333333,  0.04166667,  0.04166667, 0.        ,

        0.        , 0.04166667,  0.04166667,  0.04166667, 0.125     ,

        0.04166667,  0.04166667, 0.04166667,  0.04166667,  0.04166667,

        0.        , 0.        ,  0.       ,  0.        , 0.04166667,

        0.04166667,  0.       ,  0.        , 0.04166667,  0.        ,

        0.04166667,  0.04166667, 0.04166667,  0.04166667,  0.       ,

        0.        , 0.04166667])

>>> p1V

array([ 0.        ,  0.05263158, 0.10526316,  0.        , 0.05263158,

        0.10526316,  0.05263158, 0.05263158,  0.        , 0.        ,

        0.        , 0.        ,  0.       ,  0.        , 0.        ,

        0.15789474,  0.05263158, 0.05263158,  0.05263158,  0.       ,

        0.        , 0.05263158,  0.05263158,  0.       ,  0.05263158,

        0.        , 0.        ,  0.       ,  0.        , 0.05263158,

        0.05263158,  0.       ])

>>> myVocabList

['is', 'him', 'dog', 'streak', 'food', 'worthless', 'stop', 'to','ate', 'my', 'so', 'dalmation', 'flea', 'I', 'licks', 'stupid', 'buying', 'quit', 'take', 'help', 'has','posting', 'garbage', 'please', 'not', 'problems', 'cute', 'mr', 'how', 'park','maybe',
'love']

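这个 p0V、p1V 求出的是词表中每个词在“好人”(类别0,正常文档)和“坏人”(类别1,侮辱性文档)两类文档中出现的条件概率,即 P(词|类别0) 和 P(词|类别1),而不是“某个词被划分到某一类”的概率。它们是基于我们人工标注的那个类别 list [0,1,0,1,0,1] 统计出来的。比如看 p1V 中概率最高的那个是 0.157(0.15789474),通过词表 myVocabList 找到对应索引位置(索引 15)的词语,是 stupid。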

>>> reload(bayes)

<module 'bayes' from 'G:\\ML\\bayes.py'>

>>> bayes.testingNB()

['love', 'my', 'dalmation'] classified as:  0

['stupid', 'garbage'] classified as: 1

['stupid', 'love'] classified as: 1

>>> mySent = 'This book is the best book on python or M.L. I have ever laid eyes upon'

>>> mySent.split()

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or','M.L.', 'I', 'have', 'ever', 'laid', 'eyes', 'upon']

>>> mySent = 'This book is the best book on python or M.L. I have ever laid eyes upon.'

>>> mySent.split()

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or','M.L.', 'I', 'have', 'ever', 'laid', 'eyes', 'upon.']

>>> import re

>>> regEx = re.compile('\\W*')

>>> listOfTokens = regEx.split(mySent)

__main__:1: FutureWarning: split() requires a non-empty pattern match.

>>> listOfTokens

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or','M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']

>>> [tok for tok in listOfTokens if len(tok) > 0]

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or','M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon']

>>> [tok.lower() for tok in listOfTokens if len(tok) > 0]

['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or','m', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']

>>>

 

from numpy import *

def loadDataSet():
    """Return the toy message-board corpus used throughout this module.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        tokenized posts; ``classVec`` holds the matching labels, where 1 marks
        an abusive post and 0 a normal one.
    """
    # Keep each label next to its post so the two lists cannot drift apart.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'streak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [words for _, words in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec

def createVocabList(dataSet):
    """Collect every distinct word that appears in any document.

    Args:
        dataSet: An iterable of documents, each an iterable of words.

    Returns:
        A list of the unique words. The order comes from iterating a set and
        is therefore not meaningful.
    """
    # One n-way union instead of folding the documents in one at a time.
    return list(set().union(*dataSet))

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over ``vocabList``.

    This is the set-of-words model: a word that occurs several times still
    contributes a single 1.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of the words in the document.

    Returns:
        A list of ints with the same length as ``vocabList``.

    Side effects:
        Prints a message for every word that is not in the vocabulary.
    """
    # Build the word -> position map once; the previous version scanned the
    # vocabulary list twice for every word of the document. setdefault keeps
    # the first position, matching list.index semantics.
    firstIndex = {}
    for position, vocabWord in enumerate(vocabList):
        firstIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in firstIndex:
            returnVec[firstIndex[word]] = 1
        else:
            print ("the word: %s is not in my vocabulary!" % word )
    return returnVec
#到此为止,目前还没有明确接下来要怎么利用这个词向量,是将每一个句子转换成词向量,然后跟例子中的标记为1的句子求余弦相似度?那标记为0的句子有没有用呢,需要在接下来学习过程中明确

def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a word-count vector over ``vocabList``.

    This is the bag-of-words model: each position holds how many times the
    corresponding vocabulary word occurs in the document.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of the words in the document.

    Returns:
        A list of ints with the same length as ``vocabList``. Words that are
        not in the vocabulary are ignored.
    """
    # Map each word to its first position once, matching list.index.
    firstIndex = {}
    for position, vocabWord in enumerate(vocabList):
        firstIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        # The membership test must be against the vocabulary. Testing against
        # inputSet was always true and made vocabList.index raise ValueError
        # for any out-of-vocabulary word.
        if word in firstIndex:
            returnVec[firstIndex[word]] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from vectorized documents.

    Args:
        trainMatrix: Sequence of equal-length word vectors, one per document.
        trainCategory: Sequence of labels aligned with ``trainMatrix``; 1 is
            counted as the abusive class, anything else as class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log conditional
        probabilities for class 0 and class 1, and the fraction of training
        documents labelled 1.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Start the counts at 1 and the totals at 2 so that a word never seen in
    # a class does not get probability 0 and wipe out the whole product.
    abusiveCounts = ones(vocabSize)
    normalCounts = ones(vocabSize)
    abusiveTotal = 2.0
    normalTotal = 2.0

    for i, docVector in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            abusiveCounts += docVector
            abusiveTotal += sum(docVector)
        else:
            normalCounts += docVector
            normalTotal += sum(docVector)

    # Work in log space so the classifier can add terms instead of
    # multiplying many small probabilities.
    p1Vect = log(abusiveCounts / abusiveTotal)
    p0Vect = log(normalCounts / normalTotal)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for one vectorized document.

    Args:
        vec2Classify: Word vector of the document to classify.
        p0Vec: Per-word log conditional probabilities for class 0.
        p1Vec: Per-word log conditional probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0.
    """
    # Each score is the log prior plus the sum of the log likelihoods of the
    # words present. The prior term matters whenever the classes are not
    # equally frequent.
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

def testingNB():
    """Train on the toy corpus and print the class of three sample posts."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    trainMat = [setOfWords2Vec(vocab, post) for post in posts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(labels))

    # Each case carries its own caption. The first caption has a trailing
    # space, so the printed output stays exactly as it was.
    cases = (
        (['love', 'my', 'dalmation'], 'classified as: '),
        (['stupid', 'garbage'], 'classified as:'),
        (['stupid', 'love'], 'classified as:'),
    )
    for testEntry, caption in cases:
        thisDoc = array(setOfWords2Vec(vocab, testEntry))
        print (testEntry, caption, classifyNB(thisDoc, p0V, p1V, pAb))

def textParse(bigString):
    """Split raw text into lowercase word tokens longer than two characters.

    Args:
        bigString: The text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance, keeping only
        those with more than two characters.
    """
    import re
    # Split on runs of non-word characters. Note the uppercase W: lowercase
    # \w would split on the word characters and leave only punctuation.
    # The pattern must be \W+ rather than \W*: from Python 3.7 on, re.split
    # also splits on empty matches, so \W* cuts the text into single
    # characters and the length filter below then discards every token.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    """Run one hold-out evaluation of the spam classifier and print the result.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (label 0), holds out 10
    randomly chosen messages as the test set, trains on the remaining 40 and
    prints every misclassified message followed by the error rate.

    The split is random, so the error rate varies from run to run.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam first, then ham, so documents alternate 1, 0, 1, 0, ...
        for folder, label in (('spam', 1), ('ham', 0)):
            # The context manager closes each file as soon as it is read.
            with open('email/%s/%d.txt' % (folder, i)) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)

    # Hold-out cross validation: move 10 random document indices out of the
    # training set and into the test set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        # Label each training document by its own index. Using the stale
        # randIndex left over from the split loop gave every training
        # document the same label, which is what produced the error rates
        # above 0.5 seen with the earlier version.
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        # Vectorize test documents the same way as the training documents.
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print ("classification error: ", docList[docIndex])
    print ('the error rate is: ', float(errorCount)/len(testSet))

#接下来使用朴素贝叶斯发现地域相关的用词.  此功能没有成功运行, 显示报错ValueError: 'for' is not in list,不知道是不是RSS源的问题,待解决
def calcMostFreq(vocabList, fullText):
    """Return the 30 vocabulary words that occur most often in ``fullText``.

    Args:
        vocabList: Iterable of vocabulary words to rank.
        fullText: List of every token in the corpus, repeats included.

    Returns:
        A list of at most 30 ``(word, count)`` tuples sorted by descending
        count. Words with equal counts keep their ``vocabList`` order.
    """
    import operator
    from collections import Counter
    # Count the corpus in one pass instead of calling fullText.count() once
    # per vocabulary word, which rescanned the whole text every time.
    tokenCounts = Counter(fullText)
    freqDict = {}
    for token in vocabList:
        freqDict[token] = tokenCounts[token]
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

def localWords(feed1, feed0):
    """Train and evaluate a classifier that tells two RSS feeds apart.

    Args:
        feed1: Parsed feed whose entries are labelled 1. It is indexed as
            ``feed1['entries'][i]['summary']``.
        feed0: Parsed feed whose entries are labelled 0, same layout.

    Returns:
        ``(vocabList, p0V, p1V)``: the vocabulary with the 30 most frequent
        words removed, and the per-word log probabilities for class 0 and
        class 1 as returned by ``trainNB0``.

    Raises:
        ValueError: If either feed has fewer than two entries, because two
            documents are held out for testing and none would be left to
            train on.

    Side effects:
        Prints the error rate measured on the two held-out documents.
    """
    # The feeds arrive already parsed, so this function does not need to
    # import feedparser itself.
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    if minLen < 2:
        raise ValueError('each feed needs at least 2 entries, got %d' % minLen)

    # Take the same number of entries from both feeds, alternating 1, 0.
    for i in range(minLen):
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    # calcMostFreq returns (word, count) pairs; only the word is needed.
    vocabList = createVocabList(docList)
    frequentWords = set(word for word, _ in calcMostFreq(vocabList, fullText))
    vocabList = [word for word in vocabList if word not in frequentWords]
    # The removed words are no longer in the vocabulary, so drop them from
    # the documents too; the vectorizer is then never asked to look up a
    # word it cannot find.
    docList = [[word for word in doc if word not in frequentWords]
               for doc in docList]

    # Hold out 2 random documents for testing. The book held out 20, but the
    # replacement RSS feeds used here have far fewer entries.
    trainingSet = list(range(2 * minLen))
    testSet = []
    for _ in range(2):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print ('the error rate is: ', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V

def getTopWords(ny, sf):
    """Print the most characteristic words of each of the two feeds.

    Trains via ``localWords(ny, sf)``, so ``ny`` is class 1 and ``sf`` is
    class 0. A word is listed when its log probability in that class is
    above -6.0; each list is printed in descending order of probability.

    Args:
        ny: Parsed feed passed to ``localWords`` as the class-1 feed.
        sf: Parsed feed passed to ``localWords`` as the class-0 feed.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []
    topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0:
            topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0:
            # This used to append to the undefined name topNF, which raised
            # NameError as soon as any class-1 word passed the threshold.
            topNY.append((vocabList[i], p1V[i]))

    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print ("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print (item[0])

    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print (item[0])





                                            
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐