词语相似度计算:4、提取文本tf、tfidf特征
2016-03-21 10:20
417 查看
还是sklearn,不多做解释:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import sys
reload(sys)
sys.setdefaultencoding("utf8")
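# Python 2-only workaround for UnicodeEncodeError: the reload(sys) and
# sys.setdefaultencoding("utf8") calls above make UTF-8 the default string encoding.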
#get all file names in the "ParentFolder"
def GetFilesInFolder(ParentFolder):
    """Return the names of all entries directly inside ``ParentFolder``.

    Every name is also printed to stdout as it is collected.

    Args:
        ParentFolder: path of the directory to list.

    Returns:
        A list of entry names (bare names, not joined with ``ParentFolder``),
        in the order ``os.listdir`` yields them. Empty for an empty directory.

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. the directory is missing).
    """
    # Kept as a local import, as in the original, so the function does not
    # rely on a module-level ``import os``.
    import os

    filenameList = []
    for filename in os.listdir(ParentFolder):
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print(filename)
        filenameList.append(filename)
    return filenameList
# Build term-frequency and tf-idf tables for every document in "wikiData"
# and write them to tfResult.csv / tfidfResult.csv (one row per document).
ParentFolder = "wikiData"
filenameList = GetFilesInFolder(ParentFolder)

# dataList[i] holds the full text of filenameList[i].
dataList = []
for fileName in filenameList:
    # `with` closes the handle even if reading raises; read() returns the same
    # text the original built by concatenating readlines().
    with open(ParentFolder + "/" + fileName, "r") as f:
        dataList.append(f.read())

print("countVectorizer operation " + "==" * 20)
# token_pattern only accepts runs of at least three ASCII letters.
# min_df=2 drops terms that occur in fewer than two documents (author's
# rationale: the features are later used to compare pairs of words);
# max_df=0.85 drops terms that occur in more than 85% of the documents.
countVectorizer = CountVectorizer(
    encoding='utf-8',
    lowercase=True,
    stop_words='english',
    token_pattern='(?u)[A-Za-z][A-Za-z]+[A-Za-z]',
    ngram_range=(1, 1),
    analyzer='word',
    max_df=0.85,
    min_df=2,
    max_features=15000,
)
tfFeature = countVectorizer.fit_transform(dataList)  # sparse matrix

# Prepend the file names as a first column. Stacking a string column next to
# the numeric matrix makes NumPy produce one array with a common string dtype.
tfResult = np.hstack((
    np.array(filenameList).reshape(len(filenameList), 1),
    tfFeature.toarray(),
))

# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
# list(...) keeps the '["docID"] + featureName' concatenation below valid.
if hasattr(countVectorizer, "get_feature_names_out"):
    featureName = list(countVectorizer.get_feature_names_out())
else:
    featureName = list(countVectorizer.get_feature_names())
pd.DataFrame(tfResult).to_csv(
    "tfResult.csv", index=False, header=["docID"] + featureName)

print("tfidfTransformer operation " + "==" * 20)
# sublinear_tf=False keeps raw term frequencies (sublinear scaling would
# replace tf with 1 + log(tf)); rows are L2-normalised.
tfidfTransformer = TfidfTransformer(
    norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=False)
tfidfFeature = tfidfTransformer.fit_transform(tfFeature)  # sparse matrix
tfidfResult = np.hstack((
    np.array(filenameList).reshape(len(filenameList), 1),
    tfidfFeature.toarray(),
))
pd.DataFrame(tfidfResult).to_csv(
    "tfidfResult.csv", index=False, header=["docID"] + featureName)

# Shape is (number of documents, number of retained terms).
print("data size " + "==" * 20 + " " + str(tfFeature.shape))
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import sys
reload(sys)
sys.setdefaultencoding("utf8")
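# Python 2-only workaround for UnicodeEncodeError: the reload(sys) and
# sys.setdefaultencoding("utf8") calls above make UTF-8 the default string encoding.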
#get all file names in the "ParentFolder"
def GetFilesInFolder(ParentFolder):
    """Return the names of all entries directly inside ``ParentFolder``.

    Every name is also printed to stdout as it is collected.

    Args:
        ParentFolder: path of the directory to list.

    Returns:
        A list of entry names (bare names, not joined with ``ParentFolder``),
        in the order ``os.listdir`` yields them. Empty for an empty directory.

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. the directory is missing).
    """
    # Kept as a local import, as in the original, so the function does not
    # rely on a module-level ``import os``.
    import os

    filenameList = []
    for filename in os.listdir(ParentFolder):
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print(filename)
        filenameList.append(filename)
    return filenameList
# Build term-frequency and tf-idf tables for every document in "wikiData"
# and write them to tfResult.csv / tfidfResult.csv (one row per document).
ParentFolder = "wikiData"
filenameList = GetFilesInFolder(ParentFolder)

# dataList[i] holds the full text of filenameList[i].
dataList = []
for fileName in filenameList:
    # `with` closes the handle even if reading raises; read() returns the same
    # text the original built by concatenating readlines().
    with open(ParentFolder + "/" + fileName, "r") as f:
        dataList.append(f.read())

print("countVectorizer operation " + "==" * 20)
# token_pattern only accepts runs of at least three ASCII letters.
# min_df=2 drops terms that occur in fewer than two documents (author's
# rationale: the features are later used to compare pairs of words);
# max_df=0.85 drops terms that occur in more than 85% of the documents.
countVectorizer = CountVectorizer(
    encoding='utf-8',
    lowercase=True,
    stop_words='english',
    token_pattern='(?u)[A-Za-z][A-Za-z]+[A-Za-z]',
    ngram_range=(1, 1),
    analyzer='word',
    max_df=0.85,
    min_df=2,
    max_features=15000,
)
tfFeature = countVectorizer.fit_transform(dataList)  # sparse matrix

# Prepend the file names as a first column. Stacking a string column next to
# the numeric matrix makes NumPy produce one array with a common string dtype.
tfResult = np.hstack((
    np.array(filenameList).reshape(len(filenameList), 1),
    tfFeature.toarray(),
))

# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
# list(...) keeps the '["docID"] + featureName' concatenation below valid.
if hasattr(countVectorizer, "get_feature_names_out"):
    featureName = list(countVectorizer.get_feature_names_out())
else:
    featureName = list(countVectorizer.get_feature_names())
pd.DataFrame(tfResult).to_csv(
    "tfResult.csv", index=False, header=["docID"] + featureName)

print("tfidfTransformer operation " + "==" * 20)
# sublinear_tf=False keeps raw term frequencies (sublinear scaling would
# replace tf with 1 + log(tf)); rows are L2-normalised.
tfidfTransformer = TfidfTransformer(
    norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=False)
tfidfFeature = tfidfTransformer.fit_transform(tfFeature)  # sparse matrix
tfidfResult = np.hstack((
    np.array(filenameList).reshape(len(filenameList), 1),
    tfidfFeature.toarray(),
))
pd.DataFrame(tfidfResult).to_csv(
    "tfidfResult.csv", index=False, header=["docID"] + featureName)

# Shape is (number of documents, number of retained terms).
print("data size " + "==" * 20 + " " + str(tfFeature.shape))
相关文章推荐
- 词语相似度计算:1、安装NLTK和下载WordNet语料库;WordNet的使用
- 词语相似度计算:2、使用NLTK和WordNet计算词语相似度
- 词语相似度计算:3、使用urllib爬取wiki文章,使用beautifulSoup解析html
- 词语相似度计算:6、实验报告
- 词语相似度计算:5、训练各种相似度模型(LR,RF,NMF,LDA等)【待续】
- Android定位
- FC金手指使用方法+大全
- Java Socket应用(一)--Socket通信
- android坐标系详解
- iOS个人整理30-网络请求Session与Connection
- 前端资源
- const的常见用法
- python 字符串操作
- python 混淆点一
- Html+Css CSS3_transform属性中的_scale属性值
- Zend Framework框架教程之Zend_Db_Table_Rowset用法实例分析
- java中的匿名内部类总结
- ios点击手势学习笔记
- 如何让一个受损的,并且发送一个外向洪范攻击或DDoS攻击的Droplet恢复
- MFC 中CTime 和 COleDateTime的区别