
Text Topic Modeling (LDA) with gensim

2016-01-05 20:56
I recently took part in a data-mining competition on Niuke (牛客网) that involved text analysis, including topic modeling (LDA), so I'm writing up a small example here.

I used Python with the gensim package to run topic-model analysis on nearly 80,000 short texts.
Two useful references:
http://blog.csdn.net/huagong_adu/article/details/7937616
http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/
1. Installing and using gensim

Nothing complicated: search for gensim at https://pypi.python.org/pypi, download and unpack it, then install with python setup.py install.

For usage, see the official gensim tutorial: http://radimrehurek.com/gensim/tutorial.html

2. Word segmentation

I recommend jieba (结巴分词) for segmentation; here I extract each text's keywords instead of keeping every token:

key_list = jieba.analyse.extract_tags(term[2], topK=30)  # get the top-30 keywords
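
For reference, a self-contained sketch of this keyword-extraction step (the sample sentence below is made up; jieba must be installed):

# -*- coding:utf8 -*-
import jieba.analyse

text = u"自然语言处理是人工智能的一个重要方向"  # made-up sample sentence
key_list = jieba.analyse.extract_tags(text, topK=5)  # top-5 keywords by TF-IDF weight
print " ".join(key_list)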
3. Building the topic model

Steps: build the dictionary -> sparse bag-of-words vector for each text (doc2bow) -> fit TF-IDF -> TF-IDF vector for each text -> train the LDA model -> LDA feature vector for each text

Core code:

from gensim import corpora, models

dic = corpora.Dictionary(data_list)                 # build the dictionary
corpus = [dic.doc2bow(text) for text in data_list]  # sparse bag-of-words vector for each text
tfidf = models.TfidfModel(corpus)                   # fit the TF-IDF model
corpus_tfidf = tfidf[corpus]                        # sparse TF-IDF vector for each text
lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=200)
corpus_lda = lda[corpus_tfidf]                      # sparse LDA vector for each text
The resulting corpus_lda holds each text's LDA vector: a sparse list of (topic id, weight) pairs, where each weight is the text's degree of membership in that topic.
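To sanity-check the model, you can print each topic's top words and inspect one document's topic mixture. A minimal sketch on a made-up three-document corpus (num_topics is kept tiny to match the toy data):

# -*- coding:utf8 -*-
from gensim import corpora, models

# made-up toy corpus: three already-tokenized "documents"
docs = [["topic", "model", "lda", "gensim"],
        ["kmeans", "cluster", "vector", "feature"],
        ["lda", "topic", "cluster"]]

dic = corpora.Dictionary(docs)
corpus = [dic.doc2bow(d) for d in docs]
tfidf = models.TfidfModel(corpus)
lda = models.LdaModel(tfidf[corpus], id2word=dic, num_topics=2)

print lda.show_topics()          # top words of each topic
print lda[dic.doc2bow(docs[0])]  # (topic id, weight) pairs for the first document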

4. Full project script

This project first runs LDA on the texts and then clusters the LDA vectors with k-means, assigning each text a cluster label as its feature. In my experiments on this dataset, clustering the raw TF-IDF vectors directly and clustering the LDA vectors gave very similar results.

The data file is in my uploaded resources if you need it (each line has three tab-separated fields: id, title (may be empty), content (may be empty)).
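For reference, a line of post_data.txt is expected to look like this (fields are tab-separated; both rows below are made-up placeholders):

10001	an example title	the post content to extract keywords from
10002		a post whose title field happens to be empty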

#!/usr/bin/python
# -*- coding:utf8 -*-

import time
import jieba.analyse


def post_cut(url):
    """Extract keywords from each post and write "id<TAB>keywords" lines."""
    fr = open(url + "/post_data.txt")
    fo = open(url + "/post_key.txt", "a+")
    for line in fr.readlines():
        term = line.strip().split("\t")
        if len(term) == 3 and term[2] != "":
            key_list = jieba.analyse.extract_tags(term[2], topK=30)  # top-30 keywords
            ustr = term[0] + "\t"
            for i in key_list:
                ustr += i.encode("utf-8") + " "
            fo.write(ustr + "\n")
    fr.close()
    fo.close()


def post_tfidf(url):
    """Hash the keyword strings into feature vectors and cluster them."""
    from sklearn.feature_extraction.text import HashingVectorizer
    fr = open(url + "/post_key.txt")
    id_list = []
    data_list = []
    for line in fr.readlines():
        term = line.strip().split("\t")
        if len(term) == 2:
            id_list.append(term[0])
            data_list.append(term[1])

    # the hashing trick; in newer scikit-learn use alternate_sign=False
    # instead of non_negative=True
    hv = HashingVectorizer(n_features=10000, non_negative=True)
    post_tfidf = hv.fit_transform(data_list)  # feature matrix [n_samples, n_features]
    print 'Size of fea_train:' + repr(post_tfidf.shape)
    print post_tfidf.nnz
    post_cluster(url, id_list, post_tfidf)


def post_cluster(url, ids, tfidf_vec):
    """Cluster the feature vectors with k-means and write "id<TAB>cluster" lines."""
    from sklearn.cluster import KMeans
    kmean = KMeans(n_clusters=300)
    print "kmeans"
    kmean.fit(tfidf_vec)

    pred = kmean.predict(tfidf_vec)
    fo = open(url + "/cluster.txt", "a+")
    count = 0
    for i in range(len(pred)):
        count += 1
        fo.write(ids[i] + "\t" + str(pred[i]) + "\n")
    fo.close()
    print "%d lines written" % count


def post_lda(url, cluster):
    """Train an LDA model on the keyword lists and write each post's topic vector."""
    from gensim import corpora, models, matutils
    fr = open(url + "/post_key.txt")
    fo2 = open(url + "/post_vec_lda.txt", "a+")
    id_list = []
    data_list = []

    for line in fr.readlines():
        term = line.strip().split("\t")
        if len(term) == 2:
            id_list.append(term[0])
            word = term[1].strip().split()
            data_list.append(word)

    print "lda"
    dic = corpora.Dictionary(data_list)  # build the dictionary
    corpus = [dic.doc2bow(text) for text in data_list]  # sparse BoW vector per text
    tfidf = models.TfidfModel(corpus)  # fit the TF-IDF model
    corpus_tfidf = tfidf[corpus]  # sparse TF-IDF vector per text
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=200)
    corpus_lda = lda[corpus_tfidf]  # sparse LDA vector per text: (topic id, weight) pairs

    num = 0
    for doc in corpus_lda:
        wstr = ""
        for i in range(len(doc)):
            item = doc[i]
            wstr += str(item[0]) + "," + str(item[1])[0:7] + "/"
        fo2.write(id_list[num] + "\t" + wstr[0:-1] + "\n")
        num += 1
    fr.close()
    fo2.close()
    print num

    if cluster:
        # convert the gensim sparse corpus to a scipy CSC matrix for k-means
        lda_csc_matrix = matutils.corpus2csc(corpus_lda).transpose()
        post_cluster(url, id_list, lda_csc_matrix)


if __name__ == "__main__":
    url = "path"  # directory containing post_data.txt
    start = time.time()  # do not rebind the time module itself
    post_cut(url)
    post_tfidf(url)
    lda_cluster = False
    post_lda(url, lda_cluster)
    print time.time() - start