
A first look at word2vec in TensorFlow

2017-07-22 10:30
word2vec, also known as word embeddings (in Chinese, 词向量, "word vectors"), converts the words of natural language into dense vectors that a computer can work with. Word2Vec maps one-hot encodings to low-dimensional continuous values (dense vectors), and words with similar meanings end up at nearby positions in the vector space. The word2vec model itself is just a very simple neural network.
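For intuition, here is a minimal sketch (an editor's illustration, not code from the original post): a one-hot encoding represents each word as a sparse vocabulary-sized vector with a single 1, and an embedding matrix turns that index into a small dense vector.

import numpy as np

# toy setup: 5-word vocabulary, 2-dimensional embeddings
embedding_matrix = np.random.uniform(-1.0, 1.0, size=(5, 2))  # [vocab, dim]
cat_index = 2                              # "cat" == one-hot vector [0, 0, 1, 0, 0]
cat_vector = embedding_matrix[cat_index]   # its dense 2-D word vector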

#coding:utf-8
import tensorflow as tf
import numpy as np
import collections
import matplotlib.pyplot as plt
"""
将文本通过word2vec转为向量
"""
batch_size = 20
embedding_size = 2
# number of negative samples for the NCE loss
num_samples = 15

sentences = ["the quick brown fox jumped over the lazy dog",
"I love cats and dogs",
"we all love cats and dogs",
"cats and dogs are great",
"sung likes cats",
"she loves dogs",
"cats can be very independent",
"cats are great companions when they want to be",
"cats are playful",
"cats are natural hunters",
"It's raining cats and dogs",
"dogs and cats love sung"]

# split the sentences into words on whitespace
words = " ".join(sentences).split()
# count how often each distinct word occurs
count = collections.Counter(words).most_common()

# the distinct words, most frequent first
keywords = [i[0] for i in count]
# dictionary whose keys are the words and whose values are their indices in keywords
# (renamed from dict so it does not shadow the builtin)
word2idx = {w: i for i, w in enumerate(keywords)}
# vocabulary size
voc_size = len(word2idx)

# the index of each word in keywords
words_index = [word2idx[word] for word in words]

# CBOW-style construction of (context, target) pairs
# he is a handsome boy: [he, a] --> is, [is, handsome] --> a
"""
单词索引:[3,4,5,6,7,3,1,3,2,1......]
映射关系:[[[3,5],4],[[4,6],5],[[5,7],6]......]
"""
cbow_pairs = []
for i in range(1, len(words_index) - 1):
    cbow_pairs.append([[words_index[i - 1], words_index[i + 1]], words_index[i]])

# turn each CBOW pair into two skip-gram (center, context) pairs
skip_gram_pairs = []
for c in cbow_pairs:
    skip_gram_pairs.append([c[1], c[0][0]])
    skip_gram_pairs.append([c[1], c[0][1]])
"""
skip_gram_pairs存储
[[4,3],[4,5],[5,4],[5,6],[6,5],[6,7]......]
"""

def generate_batch(size):
    assert size < len(skip_gram_pairs)
    x_data = []
    y_data = []
    # np.random.choice(a, size, replace=False) draws size distinct elements from a
    r = np.random.choice(range(len(skip_gram_pairs)), size, replace=False)
    for i in r:
        x_data.append(skip_gram_pairs[i][0])
        y_data.append([skip_gram_pairs[i][1]])
    return x_data, y_data
"""
如果选到:[4,3],[5,4],[6,7]
x_data:[4,5,6]
y_data:[3,
4,
7]
"""

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

# build the embedding layer
# tf.random_uniform(shape, a, b): random values in [a, b) with the given shape
embeddings = tf.Variable(
    tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))

embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# weights and biases for NCE (noise-contrastive estimation)
nce_weights = tf.Variable(
    tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
nce_biases = tf.Variable(tf.zeros([voc_size]))

loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                                     labels=train_labels, inputs=embed,
                                     num_sampled=num_samples,
                                     num_classes=voc_size))
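# Note on the loss above (editor's comment): tf.nn.nce_loss draws num_samples
# "noise" words per training example and learns to tell the true context word
# apart from them with a logistic classifier, which avoids computing a full
# softmax over all voc_size words.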

train_op = tf.train.AdamOptimizer(1e-1).minimize(loss)

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    for step in range(100):
        batch_inputs, batch_labels = generate_batch(batch_size)
        _, loss_val = sess.run([train_op, loss],
                               feed_dict={train_inputs: batch_inputs,
                                          train_labels: batch_labels})
        if step % 10 == 0:
            print("Loss at ", step, loss_val)  # report the loss
    trained_embeddings = embeddings.eval()

# plot the word vectors in the 2-D plane
if trained_embeddings.shape[1] == 2:
    labels = keywords[:10]
    for i, label in enumerate(labels):
        x, y = trained_embeddings[i, :]
        plt.scatter(x, y)
        # plt.annotate() adds a label: xy is the marker position, xytext the text offset
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.savefig("word2vec.png")
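Once training is done, the learned space can also be inspected numerically. Here is a minimal sketch (an editor's addition, not part of the original post) that ranks the words closest to "cats" by cosine similarity over trained_embeddings:

# cosine similarity of every word vector against the vector for "cats"
query = trained_embeddings[word2idx["cats"]]
norms = np.linalg.norm(trained_embeddings, axis=1) * np.linalg.norm(query)
sims = trained_embeddings.dot(query) / norms
# the most similar words (the query itself ranks first)
print([keywords[i] for i in np.argsort(-sims)[:5]])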


Result:

(Figure: the trained word vectors plotted in the 2-D plane, as saved to word2vec.png.)



Syntax:

tf.nn.embedding_lookup(params, ids, partition_strategy='mod', name=None, validate_indices=True, max_norm=None)

import numpy as np
import tensorflow as tf

a = [[0.1, 0.2, 0.3], [1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3], [4.1, 4.2, 4.3]]
a = np.asarray(a)

idx1 = tf.Variable([0, 2, 3, 1], dtype=tf.int32)
idx2 = tf.Variable([[0, 2, 3, 1], [4, 0, 2, 2]], dtype=tf.int32)

out1 = tf.nn.embedding_lookup(a, idx1)
out2 = tf.nn.embedding_lookup(a, idx2)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    print(sess.run(out1))
    print(out1)
    print('==================')
    print(sess.run(out2))
    print(out2)

Output:

[[ 0.1  0.2  0.3]
 [ 2.1  2.2  2.3]
 [ 3.1  3.2  3.3]
 [ 1.1  1.2  1.3]]
Tensor("embedding_lookup:0", shape=(4, 3), dtype=float64)

==================
[[[ 0.1  0.2  0.3]
  [ 2.1  2.2  2.3]
  [ 3.1  3.2  3.3]
  [ 1.1  1.2  1.3]]

 [[ 4.1  4.2  4.3]
  [ 0.1  0.2  0.3]
  [ 2.1  2.2  2.3]
  [ 2.1  2.2  2.3]]]
Tensor("embedding_lookup_1:0", shape=(2, 4, 3), dtype=float64)
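In other words, embedding_lookup is a row gather: out[i] = params[ids[i]], and a 2-D ids tensor is looked up element-wise, giving a result of shape ids.shape + params.shape[1:]. A minimal NumPy sketch of the same behaviour (editor's addition):

import numpy as np

a = np.asarray([[0.1, 0.2, 0.3], [1.1, 1.2, 1.3], [2.1, 2.2, 2.3],
                [3.1, 3.2, 3.3], [4.1, 4.2, 4.3]])
# NumPy fancy indexing reproduces both lookups above
print(a[[0, 2, 3, 1]])                   # same values as out1, shape (4, 3)
print(a[[[0, 2, 3, 1], [4, 0, 2, 2]]])   # same values as out2, shape (2, 4, 3)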
Tags: tensorflow word2vec