Python贝叶斯算法进行文本主客观分析(采用文本双词模型)
2015-10-01 23:05
716 查看
from __future__ import division import re from numpy import ones, array from numpy.lib.scimath import log from nltk import * def loadDataSet(): obj=open("obj_train_data.txt",'r') sbj=open("sbj_train_data.txt",'r') lst_all=[] classVec=[] for i in range(2000): classVec.append(i%2) for i in range(1000): str0=obj.readline() str1=sbj.readline() regEx0=re.compile('\\W*') regEx1=re.compile('\\W*') lst_obj=regEx0.split(str0) lst_sbj=regEx1.split(str1) each_lst_obj=[] each_lst_obj.append([tok.lower() for tok in lst_obj if len(tok)>0]) lst_all.append(bigramGenerate(each_lst_obj[0])) each_lst_sbj=[] each_lst_sbj.append([tok.lower() for tok in lst_sbj if len(tok)>0]) lst_all.append(bigramGenerate(each_lst_sbj[0])) return lst_all,classVec def bigramGenerate(sampleLst): parentLst=[] for i in range(len(sampleLst)): if(i<len(sampleLst)-1): childLst=[] childLst.append(sampleLst[i]) childLst.append(sampleLst[i+1]) parentLst.append(childLst) else: break return parentLst def loadTestSet(): obj=open("obj_test.txt",'r') sbj=open("sbj_test.txt",'r') lst_obj_test=[] lst_sbj_test=[] for i in range(1000): str0=obj.readline() regEx0=re.compile('\\W*') lst_obj=regEx0.split(str0) each_lst_obj=[] each_lst_obj.append([tok.lower() for tok in lst_obj if len(tok)>0]) lst_obj_test.append(bigramGenerate(each_lst_obj[0])) for i in range(1000): str1=sbj.readline() regEx1=re.compile('\\W*') lst_sbj=regEx1.split(str1) each_lst_sbj=[] each_lst_sbj.append([tok.lower() for tok in lst_sbj if len(tok)>0]) lst_sbj_test.append(bigramGenerate(each_lst_sbj[0])) return lst_obj_test,lst_sbj_test def createVocabList(dataSet): lst_whole=[] for document in dataSet: for i in range(len(document)): lst_whole.append(tuple(document[i])) #union of the two sets vocabSet=set(lst_whole) lst_whole=list(vocabSet) vocabList=[] for i in range(len(lst_whole)): vocabList.append(list(lst_whole[i])) return vocabList def bagOfWords2VecMN(vocabList, inputSet): #inputset just like [['a','b'],['c','d']] returnVec = [0]*len(vocabList) for word in inputSet: if word in vocabList: returnVec[vocabList.index(word)] += 1 return returnVec def trainNB0(trainMatrix,trainCategory): numTrainDocs = len(trainMatrix) numWords = len(trainMatrix[0]) pCi = sum(trainCategory)/float(numTrainDocs) p0Num = ones(numWords); p1Num = ones(numWords) #change to ones() p0Denom = 2.0; p1Denom = 2.0 #change to 2.0 for i in range(numTrainDocs): if trainCategory[i] == 1: p1Num += trainMatrix[i] p1Denom += sum(trainMatrix[i]) else: p0Num += trainMatrix[i] p0Denom += sum(trainMatrix[i]) p1Vect = log(p1Num/p1Denom) #change to log() p0Vect = log(p0Num/p0Denom) #change to log() return p0Vect,p1Vect,pCi def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1): p1 = sum(vec2Classify * p1Vec) + log(pClass1) #element-wise mult p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1) if p1 > p0: return 1 else: return 0 def testingNB(lst_obj,lst_sbj): listOPosts,listClasses = loadDataSet() myVocabList = createVocabList(listOPosts) trainMat=[] obj_corre=[] sbj_corre=[] for postinDoc in listOPosts: trainMat.append(bagOfWords2VecMN(myVocabList, postinDoc)) p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses)) for i in range(1000): testEntry=lst_obj[i] thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry)) a=classifyNB(thisDoc,p0V,p1V,pAb) obj_corre.append(a) print("the objective text classify accuracy: {} ".format(1-sum(obj_corre)/1000)) print(sum(obj_corre)) for i in range(1000): testEntry = lst_sbj[i] thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry)) a=classifyNB(thisDoc,p0V,p1V,pAb) sbj_corre.append(a) print("the subjective text classify accuracy: {} ".format(sum(sbj_corre)/1000)) print(sum(sbj_corre)) print(p0V) print(p1V) print(pAb) if __name__=='__main__': lst_obj,lst_sbj=loadTestSet() testingNB(lst_obj,lst_sbj)
相关文章推荐
- Python贝叶斯算法进行情感分析
- Python实现文件夹深度优先遍历
- Python利用遗传算法解决八皇后问题
- [Python爬虫] 中文编码问题:raw_input输入、文件读取、变量比较等str、unicode、utf-8转换问题
- Python3.x和Python2.x的区别
- 《机器学习实战》笔记之十四——利用SVD简化数据
- 【Python之旅】第四篇(一):Python装饰器
- python3.4-小爬虫
- python学习之list
- Python核心编程笔记 - 第13章 面向对象编程(二)
- Python美女[从新手到高手]--阅读"见个面问题 HashMap 储存方法"联想
- Python核心编程笔记 - 第13章 面向对象编程 (一)
- Windows7 平台下Python+NLTK环境搭建
- python 中的 filter, lambda, map, reduce 内置函数
- python的timer带参数传递
- Python连接mysql数据库
- Sublime Text 搭建 Python 开发平台
- 八大排序算法(Python实现)
- virtualenv--python虚拟沙盒的安装使用
- python可变交换性能优化