您的位置:首页 > 其它

TF-IDF算法实现,稀疏矩阵的转化

2019-05-10 12:54 120 查看
[code]from math import log10
import numpy as np

# docList is the corpus: each element is one document, given as a single
# string of words separated by spaces.
def tfidf(docList):
    """Compute TF-IDF weights for every term of every document.

    For a term t in document d:
        tf(t, d) = occurrences of t in d / number of words in d
        idf(t)   = log10(number of documents / number of documents containing t)
    and the weight is tf * idf.

    Args:
        docList: sequence of documents, each a space-separated string.

    Returns:
        dict mapping the document's position in ``docList`` (0-based) to a
        dict ``{term: tf-idf weight}`` for the terms of that document.

    Side effects:
        Prints the vocabulary size and appends every distinct term to the
        module-level list ``all_word``, which must exist before the call.
    """
    from collections import Counter

    docNum = len(docList)
    # Drop the empty strings that split(' ') produces for empty documents
    # and for runs of consecutive spaces; they are not words.
    tokenized = [[word for word in doc.split(' ') if word] for doc in docList]

    # Document frequency: set(words) removes duplicates so each document
    # contributes at most once per term.
    doc_freq = Counter()
    for words in tokenized:
        doc_freq.update(set(words))

    # Inverse document frequency, base-10 logarithm.
    term_idf = {term: log10(docNum / freq) for term, freq in doc_freq.items()}
    print('all word num = ', len(term_idf))

    # term_tfidf: document index -> {term: weight}
    term_tfidf = dict()
    for doc_id, words in enumerate(tokenized):
        # Number of words in this document (the tf denominator).
        doc_len = len(words)
        term_tfidf[doc_id] = {
            term: count / doc_len * term_idf[term]
            for term, count in Counter(words).items()
        }

    # Publish the vocabulary for callers that turn the scores into a matrix.
    all_word.extend(term_idf)

    return term_tfidf

# Read the corpus: every line of demo.txt that is not a bare newline is one
# document, with the newline and surrounding '.[]()' characters stripped.
with open('demo.txt') as f:
    data = []
    for line in f:
        if line != '\n':
            line = line.strip('\n').strip('.[]()')
            data.append(line)

print('all doc num = ', len(data))
# Vocabulary list; tfidf() appends every distinct term of the corpus to it.
all_word = []

score = tfidf(data)
X = np.zeros((len(data), len(all_word)))

# Map each word to its column once instead of scanning all_word for every
# cell; setdefault keeps the first occurrence, matching list.index().
word_index = {}
for column, word in enumerate(all_word):
    word_index.setdefault(word, column)

# Fill the dense document-term matrix: one row per document, one column per
# vocabulary word. The keys of score are the row numbers.
for doc_id, term_scores in score.items():
    for term, weight in term_scores.items():
        column = word_index.get(term)
        if column is not None:
            X[doc_id][column] = float(weight)

print(X)

 

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: