nltk-构建和使用语料库-可用于小说的推荐-完整实例
2016-09-22 16:29
288 查看
步骤1:构建语料库:
[python] view
plain copy
#!/usr/bin/env python
#-*-coding=utf-8-*-
# Root directory of the source data (scanned two levels deep: <root>/<subdir>/<file>)
sourceDataDir='data'
# List of source file paths (placeholder; reassigned from getSourceFileLists() below)
fileLists = []
import os
from gensim import corpora, models, similarities
def getSourceFileLists(sourceDataDir):
    """Return the paths of all regular files exactly two levels below a root.

    The root is expected to contain sub-directories, each of which contains
    the actual data files (``<root>/<subdir>/<file>``).

    Args:
        sourceDataDir: path of the root directory to scan.

    Returns:
        A list of ``sourceDataDir + '/' + subdir + '/' + name`` strings, one
        per regular file found. Entries of the root that are not directories
        are skipped, as are non-file entries inside each sub-directory.

    Raises:
        OSError: if ``sourceDataDir`` itself cannot be listed.
    """
    fileLists = []
    for subDir in os.listdir(sourceDataDir):
        # '/' concatenation is kept so the returned strings stay exactly as before.
        subDirPath = sourceDataDir + '/' + subDir
        # A stray regular file directly under the root used to make
        # os.listdir() raise; only descend into real directories.
        if not os.path.isdir(subDirPath):
            continue
        for name in os.listdir(subDirPath):
            filePath = subDirPath + '/' + name
            if os.path.isfile(filePath):
                fileLists.append(filePath)
    return fileLists
# Collect every source file below the two-level data directory.
fileLists = getSourceFileLists(sourceDataDir)
if fileLists:
    import codecs
    import collections
    import jieba

    # Tokens that are discarded outright: the empty string, newline/tab and
    # ASCII punctuation. A set gives O(1) membership tests.
    punctuations = {'', '\n', '\t', ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%'}

    # Output directories for the per-file dictionaries and corpora.
    for outputDir in ('dict', 'corpus'):
        if not os.path.exists(outputDir):
            os.mkdir(outputDir)

    for fileName in fileLists:
        print(fileName)
        content = None
        try:
            # Files are decoded as GB18030; the context manager closes the
            # handle even when reading or decoding fails.
            with codecs.open(fileName, 'r', 'gb18030') as hFile:
                content = hFile.readlines()
        except (IOError, OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and move on to the next.
            print(e)
        if not content:
            continue

        # Full-mode segmentation of the whole file, minus punctuation tokens.
        tokens = [word
                  for word in jieba.cut(' '.join(content), cut_all=True)
                  if word not in punctuations]

        # Drop tokens that occur only once in this file. Counting once with a
        # Counter replaces the former list.count() call per distinct token,
        # which was quadratic in the token count.
        tokenCounts = collections.Counter(tokens)
        texts = [[word for word in tokens if tokenCounts[word] > 1]]

        # Output names are derived from the bare file name (extension kept).
        # NOTE(review): two source files with the same name in different
        # sub-directories would overwrite each other's output -- confirm the
        # data set never contains such duplicates.
        sFileName = os.path.basename(fileName)
        dictFileName = 'dict/' + sFileName + '.dict'
        corpusFileName = 'corpus/' + sFileName + '.mm'

        # One dictionary and one single-document corpus are written per file.
        dictionary = corpora.Dictionary(texts)
        dictionary.save_as_text(dictFileName)
        corpus = [dictionary.doc2bow(text) for text in texts]
        corpora.MmCorpus.serialize(corpusFileName, corpus)
    print('Build corpus done')
数据源:
数据来自 http://d1.txthj.com/newrar/txthj_264.rar 中的 83 篇小说;解压后将各小说目录放到 ./data/ 目录下。
加载时按两层目录(./data/<子目录>/<文件>)处理。
输出:
./dict 和 ./corpus
在对应目录下生成 xxx.dict 和 xxx.mm,xxx为原文件的全称(不包括路径,包括后缀)
步骤2:加载语料库,相似性分析
[python] view
plain copy
#!/usr/bin/env python
#-*-coding=utf-8-*-
import os
from gensim import corpora, models, similarities
def getFileList(dir):
    """Return the paths of all entries of a directory, in sorted order.

    Args:
        dir: directory to list, with or without a trailing path separator.

    Returns:
        A list with one path per directory entry, each built by joining
        ``dir`` and the entry name. For a ``dir`` that already ends in '/'
        the strings are identical to plain concatenation.

    Raises:
        OSError: if ``dir`` cannot be listed.
    """
    # os.path.join avoids gluing the entry name onto the directory name when
    # the caller omits the trailing slash; sorting makes the list positions
    # reproducible between runs (os.listdir order is unspecified).
    return [os.path.join(dir, name) for name in sorted(os.listdir(dir))]
# Paths of all entries under ./dict/. The position of a path in this list is
# used further down as the document index when the most similar novels are printed.
dictLists = getFileList('./dict/')
class LoadDictionary(object):
    """Iterable that loads the saved dictionary files one at a time.

    Iteration walks the module-level ``dictLists`` and, for every entry,
    yields the result of ``dictionary.load_from_text`` on the matching
    ``./dict/<name>.dict`` path.
    """

    def __init__(self, dictionary):
        # Object whose load_from_text() is used to read each dictionary file.
        self.dictionary = dictionary

    def __iter__(self):
        for listedPath in dictLists:
            # Strip the final extension, then the directory part, leaving the
            # bare name the output files were derived from.
            withoutExt = os.path.splitext(listedPath)[0]
            baseName = os.path.split(withoutExt)[1]
            yield self.dictionary.load_from_text('./dict/' + baseName + '.dict')
class LoadCorpus(object):
    """Iterable that opens the saved Matrix Market corpora one at a time.

    Iteration walks the module-level ``dictLists`` and, for every entry,
    yields a ``corpora.MmCorpus`` opened on the matching
    ``./corpus/<name>.mm`` path.
    """

    def __iter__(self):
        for listedPath in dictLists:
            # Same bare name as the dictionary file: final extension and
            # directory removed.
            baseName = os.path.basename(os.path.splitext(listedPath)[0])
            yield corpora.MmCorpus('./corpus/' + baseName + '.mm')
# Preprocessing (needs nltk: easy_install nltk)
# Simplified preprocessing for mixed Chinese + English text
def pre_process_cn(inputs, low_freq_filter = True):
    """Tokenize, de-punctuate and stem every document in ``inputs``.

    Steps applied to each document:
    1. Split it with nltk ``word_tokenize`` and run jieba ``extract_tags`` on
       every piece, passing 10 as the second argument (jieba's topK).
    2. Drop tokens that are ASCII punctuation marks.
    3. Stem each remaining token with the Lancaster stemmer.
    4. If ``low_freq_filter`` is true, drop stems that occur exactly once
       across all documents together.

    No stop-word removal is performed, although earlier documentation of this
    function listed it as a step.

    Args:
        inputs: iterable of document strings.
        low_freq_filter: whether to apply step 4.

    Returns:
        A list with one list of stems per input document.
    """
    import collections
    import jieba.analyse
    from nltk.stem.lancaster import LancasterStemmer
    from nltk.tokenize import word_tokenize

    # Punctuation to remove; a set gives O(1) membership tests.
    english_punctuations = {',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%'}
    st = LancasterStemmer()

    texts_stemmed = []
    for document in inputs:
        keywords = []
        for piece in word_tokenize(document):
            keywords += jieba.analyse.extract_tags(piece, 10)
        texts_stemmed.append([st.stem(word)
                              for word in keywords
                              if word not in english_punctuations])

    if not low_freq_filter:
        return texts_stemmed

    # Count every stem once instead of calling list.count() per distinct
    # stem, which was quadratic in the total number of stems.
    stem_counts = collections.Counter(
        stem for text in texts_stemmed for stem in text)
    return [[stem for stem in text if stem_counts[stem] > 1]
            for text in texts_stemmed]
# Load the saved dictionaries one by one; only the last one loaded is kept.
# NOTE(review): step 1 builds a separate Dictionary per novel, so token ids
# are presumably not comparable between files, yet the bags below are merged
# into one corpus and the query uses this last dictionary. A dictionary shared
# by all novels is probably what is needed -- verify.
dictionary = corpora.dictionary.Dictionary()
dictionary_memory_friendly = LoadDictionary(dictionary)
for vector in dictionary_memory_friendly:
    dictionary = vector

# Each saved corpus holds a single document; collect those documents in
# dictLists order so that corpus[i] corresponds to dictLists[i].
corpus = []
corpus_memory_friendly = LoadCorpus()
for vector in corpus_memory_friendly:
    corpus.append(vector[0])

# Everything below needs at least one document, so it all lives inside the guard.
if corpus:
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # When id2word is not given, LsiModel rebuilds the id -> word mapping
    # from the corpus itself.
    model = models.LsiModel(corpus_tfidf, id2word=None, num_topics=20, chunksize=2000000)
    # The model was trained on TF-IDF vectors, so the indexed documents are
    # projected from TF-IDF space too (previously raw counts were fed in).
    # num_features is the dimensionality of the indexed vectors, i.e. the
    # number of LSI topics, not the number of documents.
    index = similarities.Similarity('./novel_', model[corpus_tfidf], num_features=model.num_topics)

    # The query: a passage picked arbitrarily from one of the novels.
    target_courses = ['男人们的脸上沉重而冷凝,蒙着面纱的女人们则是发出断断续续的哭泣声,他们无比专注地看着前方,见证一场生与死的拉锯战。']
    target_text = pre_process_cn(target_courses, low_freq_filter=False)

    # Similarity matching for the query.
    # Pick the (only) preprocessed document as the reference.
    ml_course = target_text[0]
    # Convert it to a bag of words.
    ml_bow = dictionary.doc2bow(ml_course)
    # Project the query into the LSI space through the same TF-IDF weighting
    # as the indexed documents; the result is a list of (topic_id, topic_value).
    ml_lsi = model[tfidf[ml_bow]]
    # index[...] goes through __getitem__ and returns the similarity of the
    # query to every indexed document.
    sims = index[ml_lsi]

    # Sort by descending similarity for display.
    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])

    # Show the results.
    print(sort_sims[0:10])
    print(len(dictLists))
    # Print the files ranked 2nd to 4th; slicing avoids an IndexError when
    # fewer than four documents are indexed.
    for docId, _ in sort_sims[1:4]:
        print(dictLists[docId])
说明:
yield的使用是为了更好的内存效率。
遗留问题:
步骤2会有提示:
/usr/lib/python2.7/dist-packages/scipy/sparse/compressed.py:122: UserWarning: indices array has non-integer dtype (float64)
不影响处理过程
[python] view
plain copy
#!/usr/bin/env python
#-*-coding=utf-8-*-
# Root directory of the source data (scanned two levels deep: <root>/<subdir>/<file>)
sourceDataDir='data'
# List of source file paths (placeholder; reassigned from getSourceFileLists() below)
fileLists = []
import os
from gensim import corpora, models, similarities
def getSourceFileLists(sourceDataDir):
    """Return the paths of all regular files exactly two levels below a root.

    The root is expected to contain sub-directories, each of which contains
    the actual data files (``<root>/<subdir>/<file>``).

    Args:
        sourceDataDir: path of the root directory to scan.

    Returns:
        A list of ``sourceDataDir + '/' + subdir + '/' + name`` strings, one
        per regular file found. Entries of the root that are not directories
        are skipped, as are non-file entries inside each sub-directory.

    Raises:
        OSError: if ``sourceDataDir`` itself cannot be listed.
    """
    fileLists = []
    for subDir in os.listdir(sourceDataDir):
        # '/' concatenation is kept so the returned strings stay exactly as before.
        subDirPath = sourceDataDir + '/' + subDir
        # A stray regular file directly under the root used to make
        # os.listdir() raise; only descend into real directories.
        if not os.path.isdir(subDirPath):
            continue
        for name in os.listdir(subDirPath):
            filePath = subDirPath + '/' + name
            if os.path.isfile(filePath):
                fileLists.append(filePath)
    return fileLists
# Collect every source file below the two-level data directory.
fileLists = getSourceFileLists(sourceDataDir)
if fileLists:
    import codecs
    import collections
    import jieba

    # Tokens that are discarded outright: the empty string, newline/tab and
    # ASCII punctuation. A set gives O(1) membership tests.
    punctuations = {'', '\n', '\t', ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%'}

    # Output directories for the per-file dictionaries and corpora.
    for outputDir in ('dict', 'corpus'):
        if not os.path.exists(outputDir):
            os.mkdir(outputDir)

    for fileName in fileLists:
        print(fileName)
        content = None
        try:
            # Files are decoded as GB18030; the context manager closes the
            # handle even when reading or decoding fails.
            with codecs.open(fileName, 'r', 'gb18030') as hFile:
                content = hFile.readlines()
        except (IOError, OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and move on to the next.
            print(e)
        if not content:
            continue

        # Full-mode segmentation of the whole file, minus punctuation tokens.
        tokens = [word
                  for word in jieba.cut(' '.join(content), cut_all=True)
                  if word not in punctuations]

        # Drop tokens that occur only once in this file. Counting once with a
        # Counter replaces the former list.count() call per distinct token,
        # which was quadratic in the token count.
        tokenCounts = collections.Counter(tokens)
        texts = [[word for word in tokens if tokenCounts[word] > 1]]

        # Output names are derived from the bare file name (extension kept).
        # NOTE(review): two source files with the same name in different
        # sub-directories would overwrite each other's output -- confirm the
        # data set never contains such duplicates.
        sFileName = os.path.basename(fileName)
        dictFileName = 'dict/' + sFileName + '.dict'
        corpusFileName = 'corpus/' + sFileName + '.mm'

        # One dictionary and one single-document corpus are written per file.
        dictionary = corpora.Dictionary(texts)
        dictionary.save_as_text(dictFileName)
        corpus = [dictionary.doc2bow(text) for text in texts]
        corpora.MmCorpus.serialize(corpusFileName, corpus)
    print('Build corpus done')
数据源:
数据来自 http://d1.txthj.com/newrar/txthj_264.rar 中的 83 篇小说;解压后将各小说目录放到 ./data/ 目录下。
加载时按两层目录(./data/<子目录>/<文件>)处理。
输出:
./dict 和 ./corpus
在对应目录下生成 xxx.dict 和 xxx.mm,xxx为原文件的全称(不包括路径,包括后缀)
步骤2:加载语料库,相似性分析
[python] view
plain copy
#!/usr/bin/env python
#-*-coding=utf-8-*-
import os
from gensim import corpora, models, similarities
def getFileList(dir):
    """Return the paths of all entries of a directory, in sorted order.

    Args:
        dir: directory to list, with or without a trailing path separator.

    Returns:
        A list with one path per directory entry, each built by joining
        ``dir`` and the entry name. For a ``dir`` that already ends in '/'
        the strings are identical to plain concatenation.

    Raises:
        OSError: if ``dir`` cannot be listed.
    """
    # os.path.join avoids gluing the entry name onto the directory name when
    # the caller omits the trailing slash; sorting makes the list positions
    # reproducible between runs (os.listdir order is unspecified).
    return [os.path.join(dir, name) for name in sorted(os.listdir(dir))]
# Paths of all entries under ./dict/. The position of a path in this list is
# used further down as the document index when the most similar novels are printed.
dictLists = getFileList('./dict/')
class LoadDictionary(object):
    """Iterable that loads the saved dictionary files one at a time.

    Iteration walks the module-level ``dictLists`` and, for every entry,
    yields the result of ``dictionary.load_from_text`` on the matching
    ``./dict/<name>.dict`` path.
    """

    def __init__(self, dictionary):
        # Object whose load_from_text() is used to read each dictionary file.
        self.dictionary = dictionary

    def __iter__(self):
        for listedPath in dictLists:
            # Strip the final extension, then the directory part, leaving the
            # bare name the output files were derived from.
            withoutExt = os.path.splitext(listedPath)[0]
            baseName = os.path.split(withoutExt)[1]
            yield self.dictionary.load_from_text('./dict/' + baseName + '.dict')
class LoadCorpus(object):
    """Iterable that opens the saved Matrix Market corpora one at a time.

    Iteration walks the module-level ``dictLists`` and, for every entry,
    yields a ``corpora.MmCorpus`` opened on the matching
    ``./corpus/<name>.mm`` path.
    """

    def __iter__(self):
        for listedPath in dictLists:
            # Same bare name as the dictionary file: final extension and
            # directory removed.
            baseName = os.path.basename(os.path.splitext(listedPath)[0])
            yield corpora.MmCorpus('./corpus/' + baseName + '.mm')
# Preprocessing (needs nltk: easy_install nltk)
# Simplified preprocessing for mixed Chinese + English text
def pre_process_cn(inputs, low_freq_filter = True):
    """Tokenize, de-punctuate and stem every document in ``inputs``.

    Steps applied to each document:
    1. Split it with nltk ``word_tokenize`` and run jieba ``extract_tags`` on
       every piece, passing 10 as the second argument (jieba's topK).
    2. Drop tokens that are ASCII punctuation marks.
    3. Stem each remaining token with the Lancaster stemmer.
    4. If ``low_freq_filter`` is true, drop stems that occur exactly once
       across all documents together.

    No stop-word removal is performed, although earlier documentation of this
    function listed it as a step.

    Args:
        inputs: iterable of document strings.
        low_freq_filter: whether to apply step 4.

    Returns:
        A list with one list of stems per input document.
    """
    import collections
    import jieba.analyse
    from nltk.stem.lancaster import LancasterStemmer
    from nltk.tokenize import word_tokenize

    # Punctuation to remove; a set gives O(1) membership tests.
    english_punctuations = {',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%'}
    st = LancasterStemmer()

    texts_stemmed = []
    for document in inputs:
        keywords = []
        for piece in word_tokenize(document):
            keywords += jieba.analyse.extract_tags(piece, 10)
        texts_stemmed.append([st.stem(word)
                              for word in keywords
                              if word not in english_punctuations])

    if not low_freq_filter:
        return texts_stemmed

    # Count every stem once instead of calling list.count() per distinct
    # stem, which was quadratic in the total number of stems.
    stem_counts = collections.Counter(
        stem for text in texts_stemmed for stem in text)
    return [[stem for stem in text if stem_counts[stem] > 1]
            for text in texts_stemmed]
# Load the saved dictionaries one by one; only the last one loaded is kept.
# NOTE(review): step 1 builds a separate Dictionary per novel, so token ids
# are presumably not comparable between files, yet the bags below are merged
# into one corpus and the query uses this last dictionary. A dictionary shared
# by all novels is probably what is needed -- verify.
dictionary = corpora.dictionary.Dictionary()
dictionary_memory_friendly = LoadDictionary(dictionary)
for vector in dictionary_memory_friendly:
    dictionary = vector

# Each saved corpus holds a single document; collect those documents in
# dictLists order so that corpus[i] corresponds to dictLists[i].
corpus = []
corpus_memory_friendly = LoadCorpus()
for vector in corpus_memory_friendly:
    corpus.append(vector[0])

# Everything below needs at least one document, so it all lives inside the guard.
if corpus:
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # When id2word is not given, LsiModel rebuilds the id -> word mapping
    # from the corpus itself.
    model = models.LsiModel(corpus_tfidf, id2word=None, num_topics=20, chunksize=2000000)
    # The model was trained on TF-IDF vectors, so the indexed documents are
    # projected from TF-IDF space too (previously raw counts were fed in).
    # num_features is the dimensionality of the indexed vectors, i.e. the
    # number of LSI topics, not the number of documents.
    index = similarities.Similarity('./novel_', model[corpus_tfidf], num_features=model.num_topics)

    # The query: a passage picked arbitrarily from one of the novels.
    target_courses = ['男人们的脸上沉重而冷凝,蒙着面纱的女人们则是发出断断续续的哭泣声,他们无比专注地看着前方,见证一场生与死的拉锯战。']
    target_text = pre_process_cn(target_courses, low_freq_filter=False)

    # Similarity matching for the query.
    # Pick the (only) preprocessed document as the reference.
    ml_course = target_text[0]
    # Convert it to a bag of words.
    ml_bow = dictionary.doc2bow(ml_course)
    # Project the query into the LSI space through the same TF-IDF weighting
    # as the indexed documents; the result is a list of (topic_id, topic_value).
    ml_lsi = model[tfidf[ml_bow]]
    # index[...] goes through __getitem__ and returns the similarity of the
    # query to every indexed document.
    sims = index[ml_lsi]

    # Sort by descending similarity for display.
    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])

    # Show the results.
    print(sort_sims[0:10])
    print(len(dictLists))
    # Print the files ranked 2nd to 4th; slicing avoids an IndexError when
    # fewer than four documents are indexed.
    for docId, _ in sort_sims[1:4]:
        print(dictLists[docId])
说明:
yield的使用是为了更好的内存效率。
遗留问题:
步骤2会有提示:
/usr/lib/python2.7/dist-packages/scipy/sparse/compressed.py:122: UserWarning: indices array has non-integer dtype (float64)
不影响处理过程
相关文章推荐
- nltk-构建和使用语料库-可用于小说的推荐-完整实例
- nltk-构建和使用语料库-可用于小说的推荐-完整实例
- nltk-构建和使用语料库-可用于小说的推荐-完整实例
- nltk-构建和使用语料库-可用于小说的推荐-完整实例
- 使用“忽略授权表”参数登录多实例数据库——用于多实例数据库忘记密码时登录 推荐
- 使用maven构建,打包Java项目完整实例
- mahout+Eclipse,使用 Taste 构建推荐引擎实例 – 电影推荐引擎
- 使用maven构建web项目实例
- 使用百度地图JavaScript API构建离线地图应用(完整教程)
- C# WinForm使用Aspose.Cells.dll 导出导入Excel/Doc 完整实例教程
- 使用python构建基于hadoop的mapreduce日志分析平台 推荐
- mahout入门实例-基于 Apache Mahout 构建社会化推荐引擎-实战(参考IBM)
- PHP中AJAX的使用(完整实例【大牛可飘过】)
- 利用resteasy框架构建rest webservice----第二波:使用不同的方式让resteasy发布我们的restful webservice 服务(实例、教程)
- 使用实例讲解RSA算法(用于公钥和私钥体系)
- tomcat结合nginx使用小结[完整推荐]
- 使用SQLServer配置管理器配置SQLServer数据库引擎实例,以便侦听特定的固定1433端口。 推荐
- 使用golang的http模块构建redis读写查api 推荐
- 高级I/O复用技术:Epoll的使用及一个完整的C实例
- 使用tornado模板引擎配合yaml构建nginx配置接口 [扩展saltstack] 推荐