Computing mover distance is too slow

2017-03-02 09:47
Task

I need to compute the mover distance between 1000 articles and the 9722 articles in my dataset, 9,722,000 results in total, but right now each result takes two seconds or more on average, sometimes longer. I tried multithreading: even with 10 threads it doesn't get any faster. The machine has 128 GB of RAM and an i7-6855 CPU. Any pointers would be much appreciated, thanks!
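One thing worth knowing: under CPython, threads cannot run CPU-bound Python code in parallel because of the global interpreter lock (GIL), so 10 threads calling into this computation will mostly take turns rather than use 10 cores. Below is a minimal multiprocessing sketch reusing the loop_file_thread function from the listing below; it assumes model and doc are loaded at module level (not under if __name__ == '__main__') so that each worker process gets its own copy, and note that loading the GoogleNews vectors once per process costs a few gigabytes each:

from multiprocessing import Pool

if __name__ == '__main__':
    pool = Pool(processes=10)
    # one chunk of 10 source documents per worker, mirroring the threaded version
    pool.map(loop_file_thread, [10 * i for i in range(10)])
    pool.close()
    pool.join()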

Source code

#!/usr/bin/python
# -*- encoding:utf-8 -*-

"""
@author : kelvin
@file : wmd_demo
@time : 2017/2/27 13:40
@description :

"""
from __future__ import division
import threading
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import euclidean_distances
from pyemd import emd
import logging

# Read in the 9722 documents and store them as a list
def read_file():
    doc = []
    f = open(r"G:\yang\MMSED-Text\Metadata_Text_100.txt")
    for line in f.readlines():
        f_split = line.split(':::')
        path = f_split[6]
        docpath = path.replace(r'C:\Users\zhengyang5\Workspaces\MyEclipse MMSED\EmptyWikiProcessing\Text20161007',
                               r'G:\yang\MMSED-Text\Text20161007')
        docpath = docpath.replace('\n', '')  # strip the trailing newline from the path
        doc_file = open(docpath)
        doc.append(doc_file.read())  # each document is one element of the list
        doc_file.close()
    f.close()
    return doc

def word_mover_distance(d1, d2):
    """
    Compute the mover distance between two documents, using word2vec embeddings.
    :param d1: document one
    :param d2: document two
    :return: mover distance
    """
    # d1 = "Government speaks to the media in Illinois"
    # d2 = "The president addresses the press"

    # Drop any word that is not in the word2vec vocabulary
    vocabulary = [w for w in set(d1.lower().split() + d2.lower().split()) if w in model.wv.vocab]
    vect = CountVectorizer(vocabulary=vocabulary).fit([d1, d2])
    v_1, v_2 = vect.transform([d1, d2]).toarray()

    # Pairwise Euclidean distances between the word embeddings
    W_ = np.array([model[w] for w in vect.get_feature_names()])
    D_ = euclidean_distances(W_).astype(np.double)
    D_ /= D_.max()  # just for comparison purposes

    # pyemd needs double-precision input, normalized to sum to 1
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    return float(emd(v_1, v_2, D_))

def get_text_num(num):
    text_num = []
    num_file = open(r"G:\yang\MMSED-Text\Text_Index1_10.txt")
    for line in num_file.readlines()[num:num+10]:  # take 10 lines starting at index num
        line = line.strip('\n')
        text_num.append(int(line) - 1)
    num_file.close()
    return text_num

def loop_file_thread(n):  # compare 10 documents starting from document n
    text = get_text_num(n)
    mover_thread = []
    for num in text:
        doc1 = doc[num]
        mover_doc1 = []  # mover distances between doc1 and all 9722 documents
        for doc2 in doc:
            mover = word_mover_distance(str(doc1), str(doc2))  # compute the distance for this pair
            mover_doc1.append(mover)
        mover_thread.append(mover_doc1)
        print 'Finished one 9722-document comparison'
    save_txt(mover_thread, 'mov_' + str(n) + '_' + str(n + 10) + '.txt')

def save_txt(mover_distance, fname):  # save the result matrix to a file
    mover_dis_matrix = np.array(mover_distance)
    print mover_dis_matrix.shape
    np.savetxt(fname, mover_dis_matrix, delimiter=',', fmt='%10.8f')

if __name__ == '__main__':
    # Load the GoogleNews word2vec model
    model = Word2Vec.load_word2vec_format(r"G:\yang\MMSED-Text\GoogleNews-vectors-negative300.bin", binary=True)
    # Read in all documents
    doc = read_file()
    print "Please wait. I'm calculating......"
    # Start all worker threads first, then join them;
    # joining inside the start loop would run the threads one at a time
    threads = []
    for i in range(0, 10):
        t = threading.Thread(target=loop_file_thread, args=(10 * i,))
        t.setDaemon(True)
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # wait for every worker to finish before exiting
    print 'all finished!!'
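Before optimizing further, it may help to confirm where the two-plus seconds per pair actually go. A minimal sketch using the standard-library profiler on a single pair:

import cProfile
cProfile.run('word_mover_distance(doc[0], doc[1])', sort='cumulative')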

How can I make this faster? This problem has been bothering me for two days!
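For what it's worth, recent gensim versions (roughly 0.12.3 and later) ship a built-in WMD method, wmdistance, also backed by pyemd; it works on token lists and avoids rebuilding a CountVectorizer and a full distance matrix for every pair. A sketch, assuming doc1 and doc2 are raw text strings as above:

mover = model.wmdistance(doc1.lower().split(), doc2.lower().split())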
Tags: text mining, multithreading