您的位置:首页 > 其它

TF-IDF算法抽取中文内容的主题关键词

2017-12-22 18:23 423 查看
db.ini

# db
[db]
db_port = 3306
db_user = user
db_host = localhost
db_pass = pwd
db_database = db


main.py

# -*-coding:utf-8-*-

import MySQLdb
import configparser
import os
import jieba.posseg as pseg

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

import sys
# NOTE(review): the Python 2 `reload(sys); sys.setdefaultencoding('utf8')`
# hack is removed. This file imports `configparser` (the Python 3 module
# name), so it targets Python 3, where str is Unicode by default and
# `reload` is not a builtin — the hack would raise NameError here.

def PATH(p):
    """Return the absolute path of *p* resolved relative to this file's directory.

    PEP 8: use ``def`` rather than binding a lambda to a name.
    """
    return os.path.abspath(os.path.join(os.path.dirname(__file__), p))


# Path to the database configuration file sitting next to this script.
db_file = PATH('db.ini')

class IfTdf(object):
    """Extract topic keywords from MySQL-stored Chinese text via TF-IDF.

    Reads connection settings from ``db.ini``, fetches up to 1000 recent
    rows, segments each ``content`` field with jieba, filters stopwords,
    and prints the top-3 TF-IDF keywords per row.
    """

    def __init__(self):
        # Initialize to None first so __del__ is safe even if init_db raises.
        self.conn = None
        self.cur = None
        self.init_db()

    def init_db(self):
        """Open the MySQL connection described in db.ini and create a dict cursor."""
        dbc = configparser.ConfigParser()
        dbc.read(db_file)
        self.conn = MySQLdb.connect(
            host=dbc.get("db", 'db_host'),
            user=dbc.get("db", 'db_user'),
            passwd=dbc.get("db", 'db_pass'),
            db=dbc.get("db", 'db_database'),
            port=int(dbc.get("db", 'db_port')),
            charset='utf8')
        # DictCursor: rows come back as {'id': ..., 'content': ...} dicts.
        self.cur = self.conn.cursor(MySQLdb.cursors.DictCursor)

    def get_data(self):
        """Return up to the 1000 most recent (id, content) rows, newest first."""
        self.cur.execute("SELECT id, content FROM `table` WHERE 1 ORDER BY `id` DESC LIMIT 1000")
        return self.cur.fetchall()

    def get_words(self, data):
        """Yield one space-joined, stopword-filtered token string per row.

        :param data: iterable of dict rows with 'content' keys.
        """
        # A set gives O(1) stopword membership tests (the original list was
        # O(n) per token); `with` closes the file promptly.  Python 3 reads
        # the file as str directly — no unicode() coercion needed.
        with open(PATH('chinese_stopwords.txt'), encoding='utf-8') as f:
            stop_words = {line.rstrip() for line in f}
        # Delete all whitespace in a single C-level pass instead of four
        # chained .replace() calls.
        ws_table = {ord(c): None for c in ' \t\r\n'}
        for r in data:
            content = r['content'].strip().translate(ws_table)
            tokens = [seg.word for seg in pseg.cut(content)
                      if seg.word not in stop_words]
            yield ' '.join(tokens)

    def get_ids(self, data):
        """Yield a printable '<id> <content> Topic:' header line per row."""
        for r in data:
            yield '%s %s Topic:\n' % (r['id'], r['content'])

    def __del__(self):
        # init_db may have failed (or __init__ may never have run) before
        # cur/conn were assigned — guard the cleanup instead of crashing.
        cur = getattr(self, 'cur', None)
        if cur is not None:
            cur.close()
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()
        print('Finished!')

    def main(self):
        """Fetch rows, build the TF-IDF matrix, and print top-3 keywords per row."""
        data = self.get_data()
        list_words = list(self.get_words(data))
        list_ids = list(self.get_ids(data))

        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        # Term counts -> TF-IDF weights; rows align with list_words order.
        tfidf = transformer.fit_transform(vectorizer.fit_transform(list_words))

        # NOTE(review): get_feature_names() is removed in scikit-learn >= 1.2
        # in favor of get_feature_names_out() — confirm the installed version.
        words = vectorizer.get_feature_names()
        weight = tfidf.toarray()

        top_n = 3  # number of keywords to report per document

        # `row_id` instead of `id` — don't shadow the builtin.
        for row_id, w in zip(list_ids, weight):
            print(u'{}:'.format(row_id))
            # argsort of the negated weights = indices in descending weight order.
            loc = np.argsort(-w)
            for i in range(top_n):
                print(u'-{}: {} {}'.format(str(i + 1), words[loc[i]], w[loc[i]]))
            print('\n')

if __name__ == '__main__':
    # Run only when executed as a script, not when imported as a module.
    IfTdf().main()


参考:

https://github.com/dongxiexidian/Chinese/tree/master/dict

http://www.ruanyifeng.com/blog/2013/03/tf-idf.html

http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

https://zhuanlan.zhihu.com/p/27330205
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  算法