Subjective/objective text classification with naive Bayes in Python (using a bigram text model)

from __future__ import division
import re
from numpy import ones, array
from numpy.lib.scimath import log

def loadDataSet():
    # Each of the two training files holds 1000 sentences, one per line.
    obj = open("obj_train_data.txt", 'r')
    sbj = open("sbj_train_data.txt", 'r')
    lst_all = []
    classVec = []
    # Documents are appended alternately (objective, subjective), so the
    # labels alternate as well: 0 = objective, 1 = subjective.
    for i in range(2000):
        classVec.append(i % 2)
    splitter = re.compile(r'\W+')   # split on runs of non-word characters
    for i in range(1000):
        str0 = obj.readline()
        str1 = sbj.readline()
        lst_obj = splitter.split(str0)
        lst_sbj = splitter.split(str1)
        tokens_obj = [tok.lower() for tok in lst_obj if len(tok) > 0]
        lst_all.append(bigramGenerate(tokens_obj))
        tokens_sbj = [tok.lower() for tok in lst_sbj if len(tok) > 0]
        lst_all.append(bigramGenerate(tokens_sbj))
    obj.close()
    sbj.close()
    return lst_all, classVec

def bigramGenerate(sampleLst):
    # Turn a token list into a list of adjacent word pairs (bigrams).
    parentLst = []
    for i in range(len(sampleLst) - 1):
        parentLst.append([sampleLst[i], sampleLst[i + 1]])
    return parentLst
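# A quick sanity check of the bigram representation:
#   bigramGenerate(['the', 'cat', 'sat', 'down'])
#   -> [['the', 'cat'], ['cat', 'sat'], ['sat', 'down']]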

def loadTestSet():
    # The test files mirror the training files: 1000 sentences each.
    obj = open("obj_test.txt", 'r')
    sbj = open("sbj_test.txt", 'r')
    lst_obj_test = []
    lst_sbj_test = []
    splitter = re.compile(r'\W+')
    for i in range(1000):
        str0 = obj.readline()
        lst_obj = splitter.split(str0)
        tokens_obj = [tok.lower() for tok in lst_obj if len(tok) > 0]
        lst_obj_test.append(bigramGenerate(tokens_obj))
    for i in range(1000):
        str1 = sbj.readline()
        lst_sbj = splitter.split(str1)
        tokens_sbj = [tok.lower() for tok in lst_sbj if len(tok) > 0]
        lst_sbj_test.append(bigramGenerate(tokens_sbj))
    obj.close()
    sbj.close()
    return lst_obj_test, lst_sbj_test

def createVocabList(dataSet):
    # Collect every distinct bigram in the corpus; bigrams are converted to
    # tuples so they can be deduplicated with a set, then back to lists.
    lst_whole = []
    for document in dataSet:
        for i in range(len(document)):
            lst_whole.append(tuple(document[i]))
    vocabSet = set(lst_whole)
    vocabList = [list(bigram) for bigram in vocabSet]
    return vocabList

def bagOfWords2VecMN(vocabList, inputSet):
    # inputSet is a list of bigrams, e.g. [['a', 'b'], ['c', 'd']]
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
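# For example, with vocabList = [['the', 'cat'], ['cat', 'sat']], the call
#   bagOfWords2VecMN(vocabList, [['cat', 'sat'], ['cat', 'sat']])
# returns [0, 2]: each known bigram is counted, unknown bigrams are ignored.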

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior probability of the subjective class (label 1).
    pCi = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: start counts at 1 and denominators at 2 so that
    # unseen bigrams do not produce zero probabilities.
    p0Num = ones(numWords); p1Num = ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Work in log space so classifyNB can add instead of multiply,
    # avoiding floating-point underflow on long documents.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pCi
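# Concretely, with the smoothing above each entry of p1Vect is
#   log( (count of bigram w in subjective training docs + 1)
#        / (total bigram count in subjective training docs + 2) )
# and p0Vect is the same quantity computed over the objective docs.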

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # Element-wise multiply the count vector by the log-probabilities and add
    # the log prior: log P(class) + sum_i n_i * log P(w_i | class).
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def testingNB(lst_obj, lst_sbj):
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    obj_corre = []
    sbj_corre = []
    for postinDoc in listOPosts:
        trainMat.append(bagOfWords2VecMN(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    # Classify the 1000 objective test sentences; classifyNB returns 1 for
    # "subjective", so accuracy on objective text is 1 - mean prediction.
    for i in range(1000):
        testEntry = lst_obj[i]
        thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry))
        obj_corre.append(classifyNB(thisDoc, p0V, p1V, pAb))
    print("the objective text classify accuracy: {} ".format(1 - sum(obj_corre) / 1000))
    print(sum(obj_corre))
    # Classify the 1000 subjective test sentences; here a prediction of 1 is correct.
    for i in range(1000):
        testEntry = lst_sbj[i]
        thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry))
        sbj_corre.append(classifyNB(thisDoc, p0V, p1V, pAb))
    print("the subjective text classify accuracy: {} ".format(sum(sbj_corre) / 1000))
    print(sum(sbj_corre))
    print(p0V)
    print(p1V)
    print(pAb)

if __name__ == '__main__':
    lst_obj, lst_sbj = loadTestSet()
    testingNB(lst_obj, lst_sbj)
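
To see how the pieces fit together without the four data files, here is a minimal sketch on two invented toy sentences (the sentences and the expected output are illustrative only, not taken from the training data):

toy_docs = [bigramGenerate(['the', 'cat', 'sat', 'down']),       # objective, label 0
            bigramGenerate(['i', 'really', 'love', 'this'])]     # subjective, label 1
toy_labels = [0, 1]
vocab = createVocabList(toy_docs)
trainMat = array([bagOfWords2VecMN(vocab, doc) for doc in toy_docs])
p0V, p1V, pSbj = trainNB0(trainMat, array(toy_labels))
query = bigramGenerate(['i', 'love', 'this'])
print(classifyNB(array(bagOfWords2VecMN(vocab, query)), p0V, p1V, pSbj))  # prints 1 (subjective)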