您的位置:首页 > 其它

词语相似度计算:4、提取文本tf、tfidf特征

2016-03-21 10:20 417 查看
还是sklearn,不多做解释:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import sys

# Python 2 only: force the implicit str<->unicode codec to UTF-8 so that
# non-ASCII text does not raise UnicodeEncodeError. reload(sys) is needed
# because setdefaultencoding is not reachable on a normally started
# interpreter. Python 3 has neither name and already defaults to UTF-8, so
# the hack is skipped there instead of crashing with NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf8")

#get all file names in the "ParentFolder"
def GetFilesInFolder(ParentFolder):
    """Return the names of all entries directly inside ``ParentFolder``.

    Each name is echoed to stdout as it is collected. The returned values
    are bare entry names exactly as ``os.listdir`` yields them (not joined
    with ``ParentFolder``), in ``os.listdir`` order.
    """
    import os

    filenameList = []
    for filename in os.listdir(ParentFolder):
        # Single-argument call form prints the same text on Python 2 and 3.
        print(filename)
        filenameList.append(filename)
    return filenameList

ParentFolder = "wikiData"
filenameList = GetFilesInFolder(ParentFolder)

# Load each file's full text; dataList[i] holds the contents of
# filenameList[i], so row order in the feature matrices follows filenameList.
dataList = []
for fileName in filenameList:
    # `with` guarantees the handle is closed even if reading raises, and a
    # single read() yields the same string as concatenating every line.
    with open(ParentFolder + "/" + fileName, "r") as f:
        dataList.append(f.read())

print("countVectorizer operation " + "==" * 20)

# Raw term-count (tf) features. Tokens are runs of three or more ASCII
# letters, lowercased, with English stop words removed.
countVectorizer = CountVectorizer(
    encoding='utf-8',
    lowercase=True,
    stop_words='english',
    token_pattern='(?u)[A-Za-z][A-Za-z]+[A-Za-z]',
    ngram_range=(1, 1),
    analyzer='word',
    max_df=0.85,
    # Keep only terms that occur in at least two documents; the original
    # author's stated reason is that the goal is to compare two words.
    min_df=2,
    max_features=15000,
)
tfFeature = countVectorizer.fit_transform(dataList)  # sparse matrix
featureName = countVectorizer.get_feature_names()

# Prepend the file names as a first column, then dump the dense table.
tfDocIdColumn = np.array(filenameList).reshape(len(filenameList), 1)
tfResult = np.hstack((tfDocIdColumn, tfFeature.toarray()))
tfFrame = pd.DataFrame(tfResult)
tfFrame.to_csv("tfResult.csv", index=False, header=["docID"] + featureName)

print("tfidfTransformer operation " + "==" * 20)

# Re-weight the term counts into tf-idf with L2-normalised rows.
# sublinear_tf=False keeps the raw tf instead of replacing it by 1 + log(tf).
tfidfTransformer = TfidfTransformer(
    norm='l2',
    use_idf=True,
    smooth_idf=False,
    sublinear_tf=False,
)
tfidfFeature = tfidfTransformer.fit_transform(tfFeature)  # sparse matrix

# Same layout as the tf table: file name first, then one column per term.
tfidfDocIdColumn = np.array(filenameList).reshape(len(filenameList), 1)
tfidfResult = np.hstack((tfidfDocIdColumn, tfidfFeature.toarray()))
tfidfFrame = pd.DataFrame(tfidfResult)
tfidfFrame.to_csv("tfidfResult.csv", index=False, header=["docID"] + featureName)

print("data size " + "==" * 20 + " " + str(tfFeature.shape))
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息