Python: Text Similarity Analysis with gensim and jieba Word Segmentation
2017-01-20 00:00
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Text similarity analysis with gensim and jieba (Python 2)
import sys
from pymongo import MongoClient
import jieba
import jieba.analyse
from gensim import corpora, models, similarities
from pprint import pprint  # pretty-printer

reload(sys)
sys.setdefaultencoding('utf-8')

kickpath = ""  # e.g. "/root/python/"
courses = []   # document texts pulled from MongoDB
uuids = []     # uuid of each document, in the same order


# Turn each document into a token list via jieba Chinese word segmentation
def jieba_preprocess_cn(courses, low_freq_filter=True):
    # jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
    # jieba.analyse.set_idf_path("../extra_dict/idf.txt.big")
    texts_tokenized = []
    for document in courses:
        # keep the top 500 TF-IDF keywords of each document as its tokens
        tags = jieba.analyse.extract_tags(document, topK=500)
        texts_tokenized.append(tags)
    texts_filtered_stopwords = texts_tokenized
    pprint(texts_filtered_stopwords)

    # Remove punctuation
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']',
                            '&', '!', '*', '@', '#', '$', '%']
    texts_filtered = [[word for word in document if word not in english_punctuations]
                      for document in texts_filtered_stopwords]

    # Remove very low-frequency words (those that appear only once)
    if low_freq_filter:
        from collections import defaultdict
        frequency = defaultdict(int)
        for text in texts_filtered:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1]
                 for text in texts_filtered]
    else:
        texts = texts_filtered
    pprint(texts)
    return texts


def train_by_lsi(lib_texts):
    # To see progress logs:
    # import logging
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    dictionary = corpora.Dictionary(lib_texts)
    # doc2bow(): converts a collection of words into a bag of words,
    # represented as (word_id, word_frequency) 2-tuples
    corpus = [dictionary.doc2bow(text) for text in lib_texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # num_topics=10 was an off-the-cuff choice; left at the default here
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)  # , num_topics=10
    # index is a gensim.similarities.docsim.MatrixSimilarity instance
    index = similarities.MatrixSimilarity(lsi[corpus])
    dictionary.save(kickpath + "kick.dict")
    lsi.save(kickpath + "kick.lsi")
    index.save(kickpath + "kick.index")
    return (index, dictionary, lsi)


if __name__ == '__main__':
    conn = MongoClient("xxx", 27017)
    db = conn.health
    db.authenticate("xx", "xxx")
    content = db.kickchufang.find({'doctorId': 'huanghuang'})
    count = 0
    for i in content:
        line = str(i['desc'])
        uuids.append(i['uuid'])
        courses.append(line)
        print count
        count += 1

    # Save the uuid of each record, one per line
    man_file = open(kickpath + "kick.uuids", 'w')
    for uuid in uuids:
        man_file.write(uuid + '\n')
    man_file.close()

    # Corpus building done -- this part may involve a lot of data,
    # so it can be preprocessed ahead of time and stored
    lib_texts = jieba_preprocess_cn(courses)
    (index, dictionary, lsi) = train_by_lsi(lib_texts)
```
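The script above only trains and saves the model. To put it to use, a separate step has to load the saved artifacts and query them with a new document. Below is a minimal sketch, not part of the original post: it assumes the kick.dict / kick.lsi / kick.index / kick.uuids files written by the training run exist in the current directory, and the query string (the placeholder u"...") stands in for a new record's description.

```python
# -*- coding: utf-8 -*-
# Minimal query sketch (assumption: the files saved by the training
# script above exist in the current directory).
import jieba.analyse
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load("kick.dict")
lsi = models.LsiModel.load("kick.lsi")
index = similarities.MatrixSimilarity.load("kick.index")
uuids = [line.strip() for line in open("kick.uuids")]

query = u"..."  # hypothetical new description text
# Same preprocessing as training: jieba TF-IDF keyword extraction
query_tokens = jieba.analyse.extract_tags(query, topK=500)
query_bow = dictionary.doc2bow(query_tokens)
query_lsi = lsi[query_bow]   # project the query into LSI space
sims = index[query_lsi]      # cosine similarity against all documents

# Top 5 most similar records, mapped back to their MongoDB uuids
top = sorted(enumerate(sims), key=lambda item: -item[1])[:5]
for doc_pos, score in top:
    print uuids[doc_pos], score
```

The important detail is that the query must go through the same jieba.analyse.extract_tags pipeline and the same dictionary as the training documents; otherwise the bag-of-words ids will not line up with the trained LSI space.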