Python贝叶斯算法进行情感分析
2015-10-01 23:03
726 查看
from __future__ import division import re from numpy import ones, array from numpy.lib.scimath import log from nltk import * def loadDataSet(): pos=open("pos_train.txt",'r') neg=open("neg_train.txt",'r') lst_all=[] classVec=[] for i in range(700): classVec.append(i%2) for i in range(350): str0=pos.readline() str1=neg.readline() regEx0=re.compile('\\W*') regEx1=re.compile('\\W*') lst_pos=regEx0.split(str0) lst_neg=regEx1.split(str1) lst_all.append([tok.lower() for tok in lst_pos if len(tok)>0]) lst_all.append([tok.lower() for tok in lst_neg if len(tok)>0]) return lst_all,classVec def loadTestSet(): pos=open("pos_test.txt",'r') neg=open("neg_test.txt",'r') lst_pos_test=[] lst_neg_test=[] for i in range(350): str0=pos.readline() regEx0=re.compile('\\W*') lst_pos=regEx0.split(str0) lst_pos_test.append([tok.lower() for tok in lst_pos if len(tok)>0]) for i in range(350): str1=neg.readline() regEx1=re.compile('\\W*') lst_neg=regEx1.split(str1) lst_neg_test.append([tok.lower() for tok in lst_neg if len(tok)>0]) # print 'loadtestset' return lst_pos_test,lst_neg_test def createVocabList(dataSet): vocabSet = set([]) #create empty set for document in dataSet: vocabSet = vocabSet | set(document) #union of the two sets # print "createVocabList" return list(vocabSet) def bagOfWords2VecMN(vocabList, inputSet): returnVec = [0]*len(vocabList) for word in inputSet: if word in vocabList: returnVec[vocabList.index(word)] += 1 # else: # print "the word: %s is not in my Vocabulary!" % word # print "bagbagbag" return returnVec def trainNB0(trainMatrix,trainCategory): numTrainDocs = len(trainMatrix) numWords = len(trainMatrix[0]) pCi = sum(trainCategory)/float(numTrainDocs) p0Num = ones(numWords); p1Num = ones(numWords) #change to ones() p0Denom = 2.0; p1Denom = 2.0 #change to 2.0 for i in range(numTrainDocs): if trainCategory[i] == 1: p1Num += trainMatrix[i] p1Denom += sum(trainMatrix[i]) else: p0Num += trainMatrix[i] p0Denom += sum(trainMatrix[i]) p1Vect = log(p1Num/p1Denom) #change to log() p0Vect = log(p0Num/p0Denom) #change to log() print "training" return p0Vect,p1Vect,pCi def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): p1 = sum(vec2Classify * p1Vec) + log(pClass1) #element-wise mult p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1) # print "classifying" if p1 > p0: return 1 else: return 0 def testingNB(lst_pos,lst_neg): listOPosts,listClasses = loadDataSet() myVocabList = createVocabList(listOPosts) trainMat=[] pos_corre=[] neg_corre=[] for postinDoc in listOPosts: trainMat.append(bagOfWords2VecMN(myVocabList, postinDoc)) p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses)) for i in range(350): testEntry=lst_pos[i] thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry)) a=classifyNB(thisDoc,p0V,p1V,pAb) pos_corre.append(a) print("the positive text classify accuracy: {} ".format(1-sum(pos_corre)/350)) print(sum(pos_corre)) for i in range(350): testEntry = lst_neg[i] thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry)) a=classifyNB(thisDoc,p0V,p1V,pAb) neg_corre.append(a) print("the negative text classify accuracy: {} ".format(sum(neg_corre)/350)) print(sum(neg_corre)) print(p0V) print(p1V) print(pAb) if __name__=='__main__': lst_pos,lst_neg=loadTestSet() testingNB(lst_pos,lst_neg)
相关文章推荐
- Python实现文件夹深度优先遍历
- Python利用遗传算法解决八皇后问题
- [Python爬虫] 中文编码问题:raw_input输入、文件读取、变量比较等str、unicode、utf-8转换问题
- Python3.x和Python2.x的区别
- 《机器学习实战》笔记之十四——利用SVD简化数据
- 【Python之旅】第四篇(一):Python装饰器
- python3.4-小爬虫
- python学习之list
- Python核心编程笔记 - 第13章 面向对象编程(二)
- Python美女[从新手到高手]--阅读"见个面问题 HashMap 储存方法"联想
- Python核心编程笔记 - 第13章 面向对象编程 (一)
- Windows7 平台下Python+NLTK环境搭建
- python 中的 filter, lambda, map, reduce 内置函数
- python的timer带参数传递
- Python连接mysql数据库
- Sublime Text 搭建 Python 开发平台
- 八大排序算法(Python实现)
- virtualenv--python虚拟沙盒的安装使用
- python可变交换性能优化
- python连接数据库设置