TF-IDF算法抽取中文内容的主题关键词
2017-12-22 18:23
423 查看
db.ini
main.py
参考:
https://github.com/dongxiexidian/Chinese/tree/master/dict
http://www.ruanyifeng.com/blog/2013/03/tf-idf.html
http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
https://zhuanlan.zhihu.com/p/27330205
# db
[db]
db_port = 3306
db_user = user
db_host = localhost
db_pass = pwd
db_database = db
main.py
# -*-coding:utf-8-*- import MySQLdb import configparser import os import jieba.posseg as pseg from sklearn.feature_extraction.text import TfidfTransformer from sklearn.feature_extraction.text import CountVectorizer import numpy as np import sys reload(sys) sys.setdefaultencoding('utf8') PATH = lambda p: os.path.abspath(os.path.join(os.path.dirname(__file__), p)) db_file = PATH('db.ini') class IfTdf(object): def init_db(self): dbc = configparser.ConfigParser() dbc.read(db_file) self.conn = MySQLdb.connect( host=dbc.get("db", 'db_host'), user=dbc.get("db", 'db_user'), passwd=dbc.get("db", 'db_pass'), db=dbc.get("db", 'db_database'), port=int(dbc.get("db", 'db_port')), charset='utf8') self.cur = self.conn.cursor(MySQLdb.cursors.DictCursor) def __init__(self): self.conn = None self.cur = None self.init_db() def get_data(self): self.cur.execute("SELECT id, content FROM `table` WHERE 1 ORDER BY `id` DESC LIMIT 1000") return self.cur.fetchall() def get_words(self, data): stop_word = [unicode(line.rstrip()) for line in open(PATH('chinese_stopwords.txt'))] for r in data: content = r['content'].strip().replace('\n', '').replace(' ', '').replace('\t', '').replace('\r', '') seg_list = pseg.cut(content) seg_list_after = [] for seg in seg_list: if seg.word not in stop_word: seg_list_after.append(seg.word) yield ' '.join(seg_list_after) def get_ids(self, data): for r in data: yield '%s %s Topic:\n' % (r['id'], r['content']) def __del__(self): self.cur.close() self.conn.close() print 'Finished!' 
def main(self): data = self.get_data() list_words = list(self.get_words(data)) list_ids = list(self.get_ids(data)) vectorizer = CountVectorizer() transformer = TfidfTransformer() tfidf = transformer.fit_transform(vectorizer.fit_transform(list_words)) words = vectorizer.get_feature_names() weight = tfidf.toarray() n = 3 for (id, w) in zip(list_ids, weight): print u'{}:'.format(id) loc = np.argsort(-w) for i in range(n): print u'-{}: {} {}'.format(str(i + 1), words[loc[i]], w[loc[i]]) print '\n' IfTdf().main()
参考:
https://github.com/dongxiexidian/Chinese/tree/master/dict
http://www.ruanyifeng.com/blog/2013/03/tf-idf.html
http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
https://zhuanlan.zhihu.com/p/27330205
相关文章推荐
- tf-idf算法,实现文章关键字抽取
- TF-IDF算法自动提取关键词
- TF-IDF提取关键词并用余弦算法计算相似度
- 关键词权重计算算法:TF-IDF
- Python TF-IDF 算法 提取文本关键词
- 关键词权重计算算法 - TF-IDF
- 自然语言处理——TF-IDF算法提取关键词
- 结巴中文分词与Tf-IDF关键词权重(二 附Demo)
- TF-IDF提取文章关键词算法
- tf-idf关键词提取算法
- 关键词权重计算算法 - TF-IDF
- TF-IDF算法解析与Python实现方法详解
- TF-IDF及其算法
- TF-IDF与余弦相似性的应用(一):自动提取关键词
- TF-IDF及其算法
- TF-IDF与余弦相似性的应用(一):自动提取关键词
- [转] TF-IDF与余弦相似性的应用(一):自动提取关键词
- TF-IDF及其算法
- python进行中文文本聚类实例(TFIDF计算、词袋构建)
- TF-IDF与余弦相似性的应用(一):自动提取关键词