《机器学习实战(Scala实现)》(四)——朴素贝叶斯
2017-03-29 12:55
295 查看
原理
关于算法原理可以参阅:http://blog.csdn.net/u011239443/article/details/53735609#t35
构建词向量
python
def loadDataSet():
    """Return the toy training corpus used throughout this example.

    Returns:
        A pair ``(postingList, classVec)``: ``postingList`` is a list of six
        tokenized posts (lists of words) and ``classVec`` is the parallel
        list of labels for those posts.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # 1 marks an abusive post, 0 a normal one.
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Return a list of the distinct words found in ``dataSet``.

    Args:
        dataSet: An iterable of documents, each an iterable of hashable words.

    Returns:
        A list holding every distinct word exactly once. The order is the
        iteration order of a ``set`` and is therefore not specified.
    """
    vocabSet = set()
    for document in dataSet:
        # In-place union; avoids building a new set per document.
        vocabSet.update(document)
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: The vocabulary, as a list of hashable words.
        inputSet: The words of the document to encode.

    Returns:
        A list of ints with the same length as ``vocabList``; entry ``i`` is
        1 if ``vocabList[i]`` occurs in ``inputSet`` and 0 otherwise. Words
        that are not in the vocabulary are reported on stdout and skipped.
    """
    returnVec = [0] * len(vocabList)
    # Map each word to its first position once, instead of scanning the
    # vocabulary with list.index() for every input word.
    firstIndex = {}
    for position, vocabWord in enumerate(vocabList):
        firstIndex.setdefault(vocabWord, position)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is not None:
            returnVec[position] = 1
        else:
            # Function-call form so the snippet also runs on Python 3.
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
训练与测试算法
python
训练算法
def trainNB0(trainMatrix, trainCategory):
    """Estimate the parameters of a two-class naive Bayes classifier.

    Args:
        trainMatrix: 2-D array-like of word vectors, one row per document
            and one column per vocabulary word.
        trainCategory: 1-D array-like of labels, one per row of
            ``trainMatrix``; rows labelled 1 form class 1, all others class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``: ``p0Vect`` and ``p1Vect`` are
        arrays holding the log of the smoothed per-word probability for
        class 0 and class 1, and ``pAbusive`` is ``sum(trainCategory)``
        divided by the number of documents.

    Raises:
        IndexError: If ``trainMatrix`` has no rows.
    """
    # Local import keeps the snippet self-contained instead of relying on a
    # ``from numpy import *`` somewhere else in the module.
    import numpy as np

    trainMatrix = np.asarray(trainMatrix)
    trainCategory = np.asarray(trainCategory)

    numTrainDocs = len(trainMatrix)
    # Looking at the first row fails fast (IndexError) on an empty matrix.
    len(trainMatrix[0])
    pAbusive = trainCategory.sum() / float(numTrainDocs)

    isClass1 = trainCategory == 1
    class1Docs = trainMatrix[isClass1]
    class0Docs = trainMatrix[~isClass1]

    # Counts start at 1 and denominators at 2.0 so that no probability is
    # ever 0, which would make the log below undefined.
    p1Num = 1.0 + class1Docs.sum(axis=0)
    p0Num = 1.0 + class0Docs.sum(axis=0)
    p1Denom = 2.0 + class1Docs.sum()
    p0Denom = 2.0 + class0Docs.sum()

    # Log-probabilities: multiplying many small numbers would underflow.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
这里的
pAbusive其实应该计算的是各个类别的概率。但是我们这里是类别只有 0 和 1 的二分类,所以只要返回一个类别为 1 的概率给后续程序就行了。
p0Num = ones(numWords); p1Num = ones(numWords)和
p0Denom = 2.0; p1Denom = 2.0是为了做平滑处理:避免某个词在某一类别中从未出现时概率为 0(即后续计算 log 时真数取到 0),同时也避免分母取到
0。
p1Num/p1Denom得到一个向量,其第i个分量即p(wi/c1),也就是在类别c1下词wi出现的条件概率。根据朴素贝叶斯的条件独立假设,p(w/c1)=p(w1/c1)p(w2/c1)...p(wn/c1),因此log(p(w/c1))=log(p(w1/c1))+log(p(w2/c1))+...+log(p(wn/c1))。所以我们只需要将该向量中的每个分量取对数,分类时再把文档中出现的词所对应的分量累加,就能得到log(p(w/c1))
测试算法
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): p1 = sum(vec2Classify * p1Vec) + log(pClass1) p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1) if p1 > p0: return 1 else: return 0 def testingNB(): listOPosts,listClasses = loadDataSet() myVocabList = createVocabList(listOPosts) trainMat=[] for postinDoc in listOPosts: trainMat.append(setOfWords2Vec(myVocabList, postinDoc)) p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses)) testEntry = ['love', 'my', 'dalmation'] thisDoc = array(setOfWords2Vec(myVocabList, testEntry)) print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb) testEntry = ['stupid', 'garbage'] thisDoc = array(setOfWords2Vec(myVocabList, testEntry)) print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)
由于p(w)是相同的,所以我们只需要比较p(w/c0)p(c0)和p(w/c1)p(c1)的大小,即log(p(w/c0)p(c0))和log(p(w/c1)p(c1))的大小。如:log(p(w/c0)p(c0))=log(p(w/c0))+log(p(c0))=
sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
scala
package NativeBayes

import scala.collection.mutable.ArrayBuffer

/** Two-class naive Bayes text classifier trained on a small built-in corpus. */
object NativeBayes {

  /** Returns the toy corpus: six tokenized posts and their parallel labels. */
  def loadDataSet() = {
    val postingList = Array(
      Array("my", "dog", "has", "flea", "problems", "help", "please"),
      Array("maybe", "not", "take", "him", "to", "dog", "park", "stupid"),
      Array("my", "dalmation", "is", "so", "cute", "I", "love", "him"),
      Array("stop", "posting", "stupid", "worthless", "garbage"),
      Array("mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him"),
      Array("quit", "buying", "worthless", "dog", "food", "stupid"))
    // 1 marks an abusive post, 0 a normal one.
    val classVec = Array(0, 1, 0, 1, 0, 1)
    (postingList, classVec)
  }

  /** Encodes a document as a 0/1 presence vector over `vocabList`.
    *
    * Entry `i` of the result is 1 if `vocabList(i)` occurs in `inputSet`.
    * Words missing from the vocabulary are reported on stdout and skipped.
    */
  def setOfWords2Vec(vocabList: Array[String], inputSet: Array[String]) = {
    val returnVec = new Array[Int](vocabList.length)
    for (word <- inputSet) {
      // One scan finds the first matching position (or -1 if absent).
      val position = vocabList.indexOf(word)
      if (position >= 0) returnVec(position) = 1
      else printf("the word: %s is not in my Vocabulary!\n", word)
    }
    returnVec
  }

  /** Estimates the model parameters from word vectors and their labels.
    *
    * Rows labelled 1 form class 1; every other row counts as class 0.
    *
    * @return `(p1Vect, p0Vect, pAbusive)`. Note that the class-1 vector comes
    *         FIRST. Both vectors hold the log of the smoothed per-word
    *         probability; `pAbusive` is the label sum divided by the number
    *         of documents.
    */
  def trainNB0(trainMatrix: Array[Array[Int]], trainCategory: Array[Int]) = {
    val numTrainDocs = trainMatrix.length
    val numWords = trainMatrix(0).length
    val pAbusive = trainCategory.sum / numTrainDocs.toDouble
    // Counts start at 1 and denominators at 2.0 so that no probability is
    // ever 0, which would make the log below undefined.
    val p0Num = Array.fill(numWords)(1)
    val p1Num = Array.fill(numWords)(1)
    var p0Denom = 2.0
    var p1Denom = 2.0
    for (i <- trainMatrix.indices) {
      val doc = trainMatrix(i)
      if (trainCategory(i) == 1) {
        for (j <- 0 until numWords) p1Num(j) += doc(j)
        p1Denom += doc.sum
      } else {
        for (j <- 0 until numWords) p0Num(j) += doc(j)
        p0Denom += doc.sum
      }
    }
    (p1Num.map(x => math.log(x / p1Denom)),
      p0Num.map(x => math.log(x / p0Denom)),
      pAbusive)
  }

  /** Sum of the log-probabilities of the words present, plus the log prior. */
  private def logScore(vec: Array[Int], logProbs: Array[Double], prior: Double): Double =
    vec.indices.map(i => vec(i) * logProbs(i)).sum + math.log(prior)

  /** Returns 1 if the class-1 log-score is strictly greater than the class-0
    * log-score, otherwise 0 (ties go to class 0).
    */
  def classifyNB(vec2Classify: Array[Int], p0Vec: Array[Double], p1Vec: Array[Double], pClass1: Double) = {
    val p1 = logScore(vec2Classify, p1Vec, pClass1)
    val p0 = logScore(vec2Classify, p0Vec, 1.0 - pClass1)
    if (p1 > p0) 1 else 0
  }

  /** Trains on the toy corpus and prints the class of two sample documents. */
  def main(args: Array[String]): Unit = {
    val (listOPosts, listClasses) = loadDataSet()
    // The vocabulary order does not affect the classification result.
    val myVocabList = listOPosts.flatten.distinct
    val trainMat = listOPosts.map(postinDoc => setOfWords2Vec(myVocabList, postinDoc))
    // trainNB0 returns the class-1 vector first.
    val (p1V, p0V, pAb) = trainNB0(trainMat, listClasses)
    for (testEntry <- Seq(Array("love", "my", "dalmation"), Array("stupid", "garbage"))) {
      val thisDoc = setOfWords2Vec(myVocabList, testEntry)
      println(testEntry.mkString(",") + " classified as: " + classifyNB(thisDoc, p0V, p1V, pAb))
    }
  }
}
相关文章推荐
- 《机器学习实战(Scala实现)》(二)——k-邻近算法
- 【机器学习实战之二】:C++实现基于概率论的分类方法--朴素贝叶斯分类(Naive Bayes Classifier)
- 【朴素贝叶斯】实战朴素贝叶斯_代码实现_数据和接口
- 【朴素贝叶斯】实战朴素贝叶斯_代码实现_特征选择1
- 机器学习之实战朴素贝叶斯算法
- scala学习:List的基本操作实战与基于模式匹配的List排序算法实现
- Scala深入浅出实战经典《第84讲:Scala中List和ListBuffer设计实现思考》笔记
- 机器学习实战:单变量线性回归的实现
- 机器学习Matlab实战之垃圾邮件分类————朴素贝叶斯模型
- 机器学习实战——第四章:朴素贝叶斯
- Scala深入浅出实战经典《第88讲:Scala中使用For表达式实现map、flatMap、filter》笔记
- 【朴素贝叶斯】实战朴素贝叶斯_代码实现_特征选择2
- 【机器学习实战】:C++实现K-近邻算法KNN
- Scala 深入浅出实战经典 第41讲:List继承体系实现内幕和方法操作源码揭秘
- 【朴素贝叶斯】实战朴素贝叶斯_代码实现_训练算法
- 机器学习实战:单变量线性回归的实现
- 机器学习经典算法详解及Python实现---朴素贝叶斯分类及其在文本分类、垃圾邮件检测中的应用
- Scala 深入浅出实战经典 第51讲:Scala中链式调用风格的实现代码实战及其在Spark中应用
- Scala实战-通过微信聊天窗口实现应答式点餐 0