您的位置:首页 > 编程语言 > Python开发

基于Python的结巴分词(调用自定义词库并去除停用词)

2017-08-08 15:45 471 查看
# -*- coding: utf-8 -*-
import time
import jieba
import jieba.posseg as pseg#用于词性标注
#分词

#停用词过滤
def stop_word(fid1, fid2, fid3, cut=None):
    """Segment every line of ``fid1`` and write it to ``fid3`` minus stop words.

    Each input line is stripped, tokenized, filtered against the stop-word
    list, and written as one output line whose surviving tokens are joined
    by single spaces (an empty input line yields an empty output line).

    Args:
        fid1: Open, readable handle for the corpus, one text per line.
        fid2: Open, readable handle for the stop-word list, one word per line.
        fid3: Open, writable handle that receives the filtered lines.
        cut: Optional tokenizer -- a callable taking a text line and
            returning an iterable of tokens. Defaults to ``jieba.cut``.

    Lines may be UTF-8 byte strings or already-decoded text. Output is
    written in the same form the corresponding input line arrived in
    (UTF-8 bytes for byte input, text for text input).
    """
    if cut is None:
        cut = jieba.cut

    def to_text(value):
        # Decode only at the I/O boundary; text input passes through as-is.
        return value.decode("utf-8") if isinstance(value, bytes) else value

    # A set gives O(1) membership tests; the original list was scanned
    # once per token.
    stopwords = frozenset(to_text(entry.strip()) for entry in fid2)

    for raw_line in fid1:
        tokens = cut(to_text(raw_line.strip()))
        kept = [token for token in tokens if token not in stopwords]
        # strip() drops whitespace-only tokens at either end of the line,
        # matching the original "append word + space, then strip" result.
        out_line = u" ".join(kept).strip() + u"\n"
        if isinstance(raw_line, bytes):
            fid3.write(out_line.encode("utf-8"))
        else:
            fid3.write(out_line)

#主文件
def main():
    """Run the segmentation pipeline on the hard-coded files and print timing.

    Loads the custom jieba dictionary, enables jieba's parallel mode, filters
    the corpus through ``stop_word`` and prints the elapsed wall-clock time
    in seconds. All input/output paths are fixed inside this function.
    """
    # Load the custom dictionary BEFORE enabling parallel mode: jieba creates
    # its worker pool inside enable_parallel(), so entries added afterwards
    # would not be visible to those worker processes.
    jieba.load_userdict("/Users/zhuxinquan/Desktop/mykeyword.txt")
    jieba.enable_parallel()

    t1 = time.time()
    # The with-statement closes all three handles even if stop_word raises.
    with open('/Users/zhuxinquan/Desktop/合并3.txt', 'r') as fid1, \
            open('/Users/zhuxinquan/Desktop/stopword.txt', 'r') as fid2, \
            open('/Users/zhuxinquan/Desktop/文本检索语料库5.txt', 'w') as fid3:
        stop_word(fid1, fid2, fid3)  # segment + drop stop words
    t2 = time.time()

    tm_cost = t2 - t1
    # Parenthesized single-argument form prints identically on Python 2 and 3.
    print(tm_cost)

# Script entry point: runs the whole pipeline as soon as this file is
# executed (note: it also runs if the file is imported as a module).
main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: