您的位置：首页 > 其它

中文文本挖掘的贝叶斯分类器&SVM

2015-05-20 11:41 113 查看

贝叶斯分类器

library(jiebaR)
library(tm)
# Load the labelled training table. Column 2 holds the class label
# ("normal", "58" or "exposure"); columns 1, 3 and 4 are renamed below.
train.dt <- read.csv("words_result.csv", header = TRUE, as.is = TRUE)
names(train.dt)[c(1, 3, 4)] <- c("", "58企业", "58同城")

# One subset per class label. `%in%` never yields NA, so rows with a missing
# label are left out of every subset.
norm.tr.dt <- train.dt[train.dt[[2]] %in% "normal", ]
wu8.tr.dt <- train.dt[train.dt[[2]] %in% "58", ]
exp.tr.dt <- train.dt[train.dt[[2]] %in% "exposure", ]
# Summarise the term columns of one class of training documents.
#
# Args:
#   file: data frame whose first column is a row identifier, whose second
#         column is the class label, and whose remaining columns are numeric
#         per-document term values (one column per term).
#
# Returns:
#   A data frame with one row per term column:
#     term       - the column name
#     freq       - column sum over all rows
#     occurrence - fraction of rows in which the value is > 0
#     density    - freq divided by the sum of all freq values
find.train <- function(file) {
  # Drop the id and label columns before converting, so a non-numeric id
  # cannot coerce the whole matrix to character. `drop = FALSE` keeps a
  # matrix even when there is a single term column.
  counts <- as.matrix(file[, -(1:2), drop = FALSE])
  if (!(is.numeric(counts) || is.logical(counts))) {
    stop("term columns (3 and onwards) must be numeric", call. = FALSE)
  }

  freq <- unname(colSums(counts))
  # Rows with a missing value count as "term absent", as with which(x > 0).
  occurrence <- unname(colSums(counts > 0, na.rm = TRUE)) / nrow(counts)

  data.frame(
    term = as.character(colnames(counts)),
    freq = as.numeric(freq),
    occurrence = as.numeric(occurrence),
    density = as.numeric(freq / sum(freq)),
    stringsAsFactors = FALSE
  )
}
# Term statistics for each class subset.
train.norm <- find.train(norm.tr.dt)
train.58 <- find.train(wu8.tr.dt)
train.exp <- find.train(exp.tr.dt)

# Preview the terms of the "normal" class, highest occurrence first.
head(train.norm[order(-train.norm$occurrence), ])

# Directories of raw documents; the trailing slash is required because file
# names are appended to these paths directly.
code58 <- "分类工作/58tc/"
codepz <- "分类工作/骗子曝光/"

# Read a text file and return its contents as a single string.
#
# Args:
#   path:     path of the file to read.
#   encoding: encoding the file is stored in; defaults to "gb2312", the value
#             this script previously hard-coded.
#
# Returns:
#   A length-one character vector with the file's lines joined by "\n".
get.msg <- function(path, encoding = "gb2312") {
  con <- file(path, open = "rt", encoding = encoding)
  # Close the connection even when readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  paste(readLines(con), collapse = "\n")
}

# Segment a text and build its term-document matrix.
#
# Args:
#   file: character string holding the raw text (despite the name, this is
#         the text itself, not a path).
#
# Returns:
#   A plain matrix with one row per term. Each token produced by the
#   segmenter is fed to tm as its own document, so columns correspond to
#   tokens and row sums give term frequencies.
get.tdm <- function(file) {
  # Custom stop-word list. It is passed to tm below; previously it was read
  # and then ignored because the control list used `stopwords = T`, which
  # selects tm's built-in list instead.
  stopwords <- unlist(
    read.table("中文 stop word.txt", stringsAsFactors = FALSE),
    use.names = FALSE
  )

  # Strip ASCII digits, ASCII letters, spaces and the symbols ^ < > ~.
  # Same character set as the original, without the repeated entries.
  all.text1 <- gsub("[0-9A-Za-z ^<>~]", "", file)

  cutter <- worker(user = "rr.utf8")
  all.text2 <- segment(all.text1, cutter)

  sour <- Corpus(VectorSource(all.text2))
  # NOTE(review): `minDocFreq` is kept from the original; confirm that the
  # installed tm version still honours it.
  control <- list(
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    minDocFreq = 2,
    stopwords = stopwords,
    wordLengths = c(1, Inf)
  )
  text.tdm <- TermDocumentMatrix(sour, control)
  rownames(text.tdm) <- gsub("\\n", "", rownames(text.tdm))
  as.matrix(text.tdm)
}

# Score one document against one class's term statistics.
#
# Args:
#   path:       path of the document to score.
#   train.file: data frame with columns `term` and `occurrence`, as returned
#               by find.train().
#   p:          prior weight for the class.
#   c:          small value used for each term when the document shares no
#               term with `train.file`.
#
# Returns:
#   A single number: p times the product of the occurrence values of the
#   terms the document shares with `train.file`, or p * c^(number of distinct
#   terms in the document) when nothing is shared.
classify <- function(path, train.file, p, c = 1e-6) {
  text <- get.msg(path)
  text.Tdm <- get.tdm(text)
  msg.freq <- rowSums(text.Tdm)
  msg.match <- intersect(names(msg.freq), train.file$term)

  if (length(msg.match) < 1) {
    # Use the number of terms in the document as the exponent. The previous
    # exponent, length(msg.match), is always 0 on this branch, so the result
    # was the full prior `p` and `c` was never applied.
    return(p * c^length(msg.freq))
  }

  match.num <- train.file$occurrence[match(msg.match, train.file$term)]
  p * prod(match.num)
}

# Classify every file in a directory as exposure vs. normal.
#
# Args:
#   path: directory containing the documents; it must end with a path
#         separator because file names are appended to it directly.
#
# Returns:
#   summary() of a logical vector that is TRUE where a document's score
#   against train.exp exceeds its score against train.norm.
f <- function(path) {
  doc.files <- dir(path)

  # Score every document against one class table, with a prior of 0.5.
  score.against <- function(train.file) {
    sapply(doc.files, function(doc) {
      classify(paste0(path, doc), train.file, p = 0.5)
    })
  }

  norm <- score.against(train.norm)
  exposure <- score.against(train.exp)
  summary(exposure > norm)
}

f(codepz) # author's run: FALSE 7, TRUE 42; misclassification rate 0.143

SVM

library(e1071)
# Recode the class label in column 2 as binary: 0 = normal, 1 = exposure or
# 58. The "58" literal now matches the one used to build wu8.tr.dt; the
# previous "58 " (trailing space) differed from it.
train.dt[train.dt[, 2] %in% "normal", 2] <- 0
train.dt[train.dt[, 2] %in% c("exposure", "58"), 2] <- 1

# Move the identifier column into the row names, then give every remaining
# column a syntactic name x1, x2, ... (x1 is the label). The count is taken
# from the data instead of being fixed at 147.
rownames(train.dt) <- train.dt[, 1]
train.dt <- train.dt[, -1]
names(train.dt) <- paste0("x", seq_len(ncol(train.dt)))
train.dt[, 1] <- as.numeric(train.dt[, 1])

# Random 70/30 split into training and test rows (no seed is set here, so
# the split differs between runs).
train.num <- sort(sample(seq_len(nrow(train.dt)), round(0.7 * nrow(train.dt))))
test.num <- setdiff(seq_len(nrow(train.dt)), train.num)
train.x <- train.dt[train.num, -1, drop = FALSE]
train.y <- train.dt[train.num, 1]
test.x <- train.dt[test.num, -1, drop = FALSE]
test.y <- train.dt[test.num, 1]

# Pass the labels as a factor so svm() fits a classifier. With a numeric
# response it fits a regression, and the previous `> 0` cut-off on those
# regression outputs did not sit between the 0 and 1 labels.
svm.1 <- svm(train.x, factor(train.y), kernel = "linear")
svm.2 <- svm(train.x, factor(train.y), kernel = "radial")

# Predicted labels converted back to numeric 0/1 for comparison with test.y.
a <- as.numeric(as.character(predict(svm.1, test.x)))
mse1 <- mean(a != test.y) # misclassification rate; original post reported 0.175

b <- as.numeric(as.character(predict(svm.2, test.x)))
mse2 <- mean(b != test.y) # misclassification rate; original post reported 0.25

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： 分类器 tm R

相关文章推荐

新的分享

章节导航