您的位置:首页 > 编程语言 > Python开发

Python贝叶斯算法进行情感分析

2015-10-01 23:03 726 查看
from __future__ import division
import re
from numpy import ones, array
from numpy.lib.scimath import log
from nltk import *

def loadDataSet(pos_path="pos_train.txt", neg_path="neg_train.txt", num_pairs=350):
    """Load the training corpus as interleaved positive/negative documents.

    One line is read from each file per iteration; every line is split on
    runs of non-word characters and the non-empty tokens are lower-cased.

    Args:
        pos_path: File with one positive document per line.
        neg_path: File with one negative document per line.
        num_pairs: Number of (positive, negative) line pairs to read.

    Returns:
        A tuple ``(lst_all, classVec)``. ``lst_all`` holds the token lists in
        the order pos, neg, pos, neg, ...; ``classVec`` holds the matching
        labels, 0 for a positive document and 1 for a negative one.  A file
        that runs out of lines contributes empty token lists.
    """
    # r'\W+' instead of the former '\W*': a pattern that can match the empty
    # string makes re.split() cut between every character on Python 3.7+.
    splitter = re.compile(r'\W+')
    lst_all = []
    classVec = []
    # Context managers guarantee both files are closed, even on error.
    with open(pos_path, 'r') as pos, open(neg_path, 'r') as neg:
        for _ in range(num_pairs):
            for label, handle in ((0, pos), (1, neg)):
                line = handle.readline()
                lst_all.append(
                    [tok.lower() for tok in splitter.split(line) if len(tok) > 0]
                )
                classVec.append(label)
    return lst_all, classVec

def loadTestSet(pos_path="pos_test.txt", neg_path="neg_test.txt", num_docs=350):
    """Load the positive and negative test documents as token lists.

    Each line is split on runs of non-word characters and the non-empty
    tokens are lower-cased.

    Args:
        pos_path: File with one positive document per line.
        neg_path: File with one negative document per line.
        num_docs: Number of lines to read from each file.

    Returns:
        A tuple ``(lst_pos_test, lst_neg_test)`` of two lists, each holding
        ``num_docs`` token lists.  A file that runs out of lines contributes
        empty token lists.
    """
    # r'\W+' instead of the former '\W*': a pattern that can match the empty
    # string makes re.split() cut between every character on Python 3.7+.
    splitter = re.compile(r'\W+')

    def read_docs(handle):
        # readline() returns '' at end of file, which tokenizes to [].
        return [
            [tok.lower() for tok in splitter.split(handle.readline()) if len(tok) > 0]
            for _ in range(num_docs)
        ]

    # Context managers guarantee both files are closed, even on error.
    with open(pos_path, 'r') as pos, open(neg_path, 'r') as neg:
        lst_pos_test = read_docs(pos)
        lst_neg_test = read_docs(neg)
    return lst_pos_test, lst_neg_test

def createVocabList(dataSet):
    """Return a list of the distinct tokens found across all documents.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list with every distinct token exactly once; the order is whatever
        the underlying set yields.
    """
    vocabulary = set()
    for tokens in dataSet:
        # Merge this document's unique tokens into the running vocabulary.
        vocabulary = vocabulary.union(set(tokens))
    return list(vocabulary)

def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a tokenized document into a bag-of-words count vector.

    Args:
        vocabList: List of vocabulary words; position i of the result counts
            occurrences of ``vocabList[i]``.
        inputSet: Iterable of tokens making up one document.

    Returns:
        A list of ints with the same length as ``vocabList``.  Tokens that
        are not in the vocabulary are ignored.
    """
    # Build the word -> index map once instead of scanning the vocabulary
    # list twice (``in`` + ``index``) for every token.  setdefault keeps the
    # first position of a repeated word, matching list.index().
    index_of = {}
    for position, term in enumerate(vocabList):
        index_of.setdefault(term, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = index_of.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    """Estimate naive-Bayes log-probabilities from count vectors.

    Args:
        trainMatrix: Sequence of equal-length word-count vectors, one per
            training document.
        trainCategory: Sequence of 0/1 labels, one per row of trainMatrix.

    Returns:
        A tuple ``(p0Vect, p1Vect, pCi)``: the per-word log-probability
        vectors for class 0 and class 1, and the fraction of training
        documents labelled 1.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pCi = sum(trainCategory) / float(numTrainDocs)
    # Smoothing: counts start at 1 and denominators at 2.0, so a word never
    # seen in a class still gets a non-zero probability (and a finite log).
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Logs are stored so that classifyNB can add per-word scores.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    # Function-call form works on Python 2 and 3 alike; the former
    # ``print "training"`` statement is a SyntaxError on Python 3.
    print("training")
    return p0Vect, p1Vect, pCi

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger naive-Bayes log score.

    Args:
        vec2Classify: Word-count vector of the document to classify.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Score = sum of count-weighted word log-probabilities + log prior.
    score_class1 = sum(vec2Classify * p1Vec) + log(pClass1)
    score_class0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if score_class1 > score_class0 else 0

def testingNB(lst_pos, lst_neg):
    """Train on the training corpus and report accuracy on the test lists.

    The classifier is trained from ``loadDataSet()``; every document in
    ``lst_pos`` and ``lst_neg`` is then classified.  Training labels are 0
    for positive and 1 for negative documents, so a positive document is
    correct when predicted 0 and a negative one when predicted 1.

    Prints the two accuracies, the number of class-1 predictions in each
    list, both log-probability vectors, and the class-1 prior.

    Args:
        lst_pos: List of tokenized positive test documents.
        lst_neg: List of tokenized negative test documents.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [bagOfWords2VecMN(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    def predict_all(documents):
        # One 0/1 prediction per document, in input order.
        return [
            classifyNB(array(bagOfWords2VecMN(myVocabList, doc)), p0V, p1V, pAb)
            for doc in documents
        ]

    def fraction(hits, total):
        # float() keeps true division on Python 2; an empty test list yields
        # NaN rather than a ZeroDivisionError.
        return hits / float(total) if total else float("nan")

    # Use the actual list sizes rather than a fixed 350, so shorter lists do
    # not raise IndexError and longer lists are evaluated in full.
    pos_corre = predict_all(lst_pos)
    print("the positive text classify accuracy: {} ".format(
        1 - fraction(sum(pos_corre), len(pos_corre))))
    print(sum(pos_corre))

    neg_corre = predict_all(lst_neg)
    print("the negative text classify accuracy: {} ".format(
        fraction(sum(neg_corre), len(neg_corre))))
    print(sum(neg_corre))

    print(p0V)
    print(p1V)
    print(pAb)

if __name__ == "__main__":
    # Script entry point: load the held-out test documents, then train the
    # classifier and print its accuracy on them.
    lst_pos, lst_neg = loadTestSet()
    testingNB(lst_pos, lst_neg)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: