tensorflow/word2vec初步认识
2017-07-22 10:30
549 查看
word2vec是一种生成word embeddings(词嵌入,中文也常称“词向量”)的方法,作用就是将自然语言中的字词转为计算机可以理解的稠密向量(Dense Vector)。Word2Vec可以将One-Hot编码得到的高维稀疏向量转化为低维度的连续值,也就是稠密向量,并且其中意思相近的词将被映射到向量空间中相近的位置。word2vec模型其实就是一个简化的浅层神经网络。下面先通过一个例子认识tf.nn.embedding_lookup的用法:
# Demo of tf.nn.embedding_lookup: gather rows of a table by integer ids.
import numpy as np
import tensorflow as tf

# Lookup table: one row per id, each row is that id's 3-dimensional vector.
a = np.asarray([
    [0.1, 0.2, 0.3],
    [1.1, 1.2, 1.3],
    [2.1, 2.2, 2.3],
    [3.1, 3.2, 3.3],
    [4.1, 4.2, 4.3],
])

# The dtype has to be passed by keyword: the second positional parameter of
# tf.Variable is `trainable`, so a positional tf.int32 would not set the dtype.
idx1 = tf.Variable([0, 2, 3, 1], dtype=tf.int32)
idx2 = tf.Variable([[0, 2, 3, 1], [4, 0, 2, 2]], dtype=tf.int32)

# Each id is replaced by the matching row of `a`, so the result has the shape
# of the ids followed by the row length: (4, 3) and (2, 4, 3) here.
out1 = tf.nn.embedding_lookup(a, idx1)
out2 = tf.nn.embedding_lookup(a, idx2)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    # idx1 / idx2 are variables and must be initialized before evaluation.
    sess.run(init)
    print(sess.run(out1))  # the looked-up values
    print(out1)            # the symbolic tensor (name, shape, dtype)
    print('==================')
    print(sess.run(out2))
    print(out2)
输出:
[[ 0.1 0.2 0.3]
[ 2.1 2.2 2.3]
[ 3.1 3.2 3.3]
[ 1.1 1.2 1.3]]
Tensor("embedding_lookup:0", shape=(4, 3), dtype=float64)
==================
[[[ 0.1 0.2 0.3]
[ 2.1 2.2 2.3]
[ 3.1 3.2 3.3]
[ 1.1 1.2 1.3]]
[[ 4.1 4.2 4.3]
[ 0.1 0.2 0.3]
[ 2.1 2.2 2.3]
[ 2.1 2.2 2.3]]]
Tensor("embedding_lookup_1:0", shape=(2, 4, 3), dtype=float64)
# coding: utf-8
"""Convert text into vectors with word2vec.

Builds skip-gram (target, context) pairs from a tiny corpus, trains
2-dimensional embeddings with an NCE loss, and saves a scatter plot of the
ten most frequent words to ``word2vec.png``.
"""
import collections

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

batch_size = 20
embedding_size = 2
# Number of negative samples used by the NCE loss.
num_samples = 15

sentences = ["the quick brown fox jumped over the lazy dog",
             "I love cats and dogs",
             "we all love cats and dogs",
             "cats and dogs are great",
             "sung likes cats",
             "she loves dogs",
             "cats can be very independent",
             "cats are great companions when they want to be",
             "cats are playful",
             "cats are natural hunters",
             "It's raining cats and dogs",
             "dogs and cats love sung"]

# Split the sentences into words on whitespace. All sentences are joined
# first, so neighbouring words may come from two different sentences.
words = " ".join(sentences).split()
# Count how often each distinct word occurs, most frequent first.
count = collections.Counter(words).most_common()
# The distinct words, in frequency order.
keywords = [pair[0] for pair in count]
# Map each word to its position in `keywords`.
# (Named word_to_index rather than `dict` to avoid shadowing the builtin.)
word_to_index = {w: i for i, w in enumerate(keywords)}
# Vocabulary size.
voc_size = len(word_to_index)
# The corpus as a sequence of word indices.
words_index = [word_to_index[word] for word in words]

# CBOW-style mapping from context words to the target word, e.g. for
# "he is a handsome boy": [he, a] --> is, [is, handsome] --> a
#
# word indices: [3, 4, 5, 6, 7, 3, 1, 3, 2, 1, ...]
# mapping:      [[[3, 5], 4], [[4, 6], 5], [[5, 7], 6], ...]
cbow_pairs = [
    [[words_index[i - 1], words_index[i + 1]], words_index[i]]
    for i in range(1, len(words_index) - 1)
]

# Skip-gram pairs: every target word is paired with each of its two
# neighbours, e.g. [[4, 3], [4, 5], [5, 4], [5, 6], [6, 5], [6, 7], ...]
skip_gram_pairs = []
for context, target in cbow_pairs:
    skip_gram_pairs.append([target, context[0]])
    skip_gram_pairs.append([target, context[1]])


def generate_batch(size):
    """Draw `size` distinct skip-gram pairs at random.

    Args:
        size: number of pairs to draw; must be smaller than the number of
            available skip-gram pairs.

    Returns:
        A tuple ``(x_data, y_data)``. ``x_data`` is a list of target word
        indices; ``y_data`` is a list of one-element lists holding the
        matching context word index. For example, drawing [4, 3], [5, 4],
        [6, 7] gives x_data = [4, 5, 6] and y_data = [[3], [4], [7]].
    """
    assert size < len(skip_gram_pairs)
    x_data = []
    y_data = []
    # replace=False: pick `size` different pair positions.
    chosen = np.random.choice(range(len(skip_gram_pairs)), size, replace=False)
    for i in chosen:
        x_data.append(skip_gram_pairs[i][0])
        # Wrapped in a list so the labels match the [batch_size, 1] placeholder.
        y_data.append([skip_gram_pairs[i][1]])
    return x_data, y_data


train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

# Embedding layer. tf.random_uniform(shape, a, b) draws values in [a, b).
embeddings = tf.Variable(
    tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# NCE (noise-contrastive estimation) parameters and loss.
nce_weights = tf.Variable(
    tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
nce_biases = tf.Variable(tf.zeros([voc_size]))

# Every argument is passed by keyword: the original call placed a positional
# argument after a keyword argument (a SyntaxError), and keywords also make the
# call independent of the labels/inputs parameter order.
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                     biases=nce_biases,
                                     labels=train_labels,
                                     inputs=embed,
                                     num_sampled=num_samples,
                                     num_classes=voc_size))
train_op = tf.train.AdamOptimizer(1e-1).minimize(loss)

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    for step in range(100):
        batch_inputs, batch_labels = generate_batch(batch_size)
        _, loss_val = sess.run([train_op, loss],
                               feed_dict={train_inputs: batch_inputs,
                                          train_labels: batch_labels})
        if step % 10 == 0:
            print("Loss at ", step, loss_val)  # Report the loss

    # eval() needs the default session, so it stays inside the `with` block.
    trained_embeddings = embeddings.eval()

# Show the word vectors in a 2-D plane (only possible for 2-D embeddings).
if trained_embeddings.shape[1] == 2:
    labels = keywords[:10]
    for i, label in enumerate(labels):
        # Row i of the embedding matrix belongs to keywords[i].
        x, y = trained_embeddings[i, :]
        plt.scatter(x, y)
        # xy is the annotated point, xytext the offset of the label text.
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.savefig("word2vec.png")
结果:
将词转为向量映射到二维平面

语法:
# Signature:
# tf.nn.embedding_lookup(params, ids, partition_strategy='mod', name=None,
#                        validate_indices=True, max_norm=None)
import numpy as np
import tensorflow as tf

# `params`: the lookup table, one 3-dimensional row per id.
a = np.asarray([
    [0.1, 0.2, 0.3],
    [1.1, 1.2, 1.3],
    [2.1, 2.2, 2.3],
    [3.1, 3.2, 3.3],
    [4.1, 4.2, 4.3],
])

# `ids`: integer indices into the rows of `a`. The dtype is given by keyword
# because the second positional parameter of tf.Variable is `trainable`.
idx1 = tf.Variable([0, 2, 3, 1], dtype=tf.int32)
idx2 = tf.Variable([[0, 2, 3, 1], [4, 0, 2, 2]], dtype=tf.int32)

# The result keeps the shape of the ids and appends the row length:
# (4, 3) for idx1 and (2, 4, 3) for idx2.
out1 = tf.nn.embedding_lookup(a, idx1)
out2 = tf.nn.embedding_lookup(a, idx2)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    # The index variables must be initialized before they can be evaluated.
    sess.run(init)
    print(sess.run(out1))  # the looked-up values
    print(out1)            # the symbolic tensor (name, shape, dtype)
    print('==================')
    print(sess.run(out2))
    print(out2)
输出:
[[ 0.1 0.2 0.3]
[ 2.1 2.2 2.3]
[ 3.1 3.2 3.3]
[ 1.1 1.2 1.3]]
Tensor("embedding_lookup:0", shape=(4, 3), dtype=float64)
==================
[[[ 0.1 0.2 0.3]
[ 2.1 2.2 2.3]
[ 3.1 3.2 3.3]
[ 1.1 1.2 1.3]]
[[ 4.1 4.2 4.3]
[ 0.1 0.2 0.3]
[ 2.1 2.2 2.3]
[ 2.1 2.2 2.3]]]
Tensor("embedding_lookup_1:0", shape=(2, 4, 3), dtype=float64)
相关文章推荐
- java面向对象(二) 初步认识封装性
- coreData初步认识
- JavaScript 中的面向对象的初步认识
- 初步认识PE格式 - 基础篇06|解密系列
- WCF初步认识
- ssh2项目中文件的上传和下载的初步认识
- asp.net控件开发基础(7) ----------初步认识复合控件
- coreData初步认识
- WebSocket详解(一):初步认识WebSocket技术
- 【C#基础】初相识——初步认识C#
- html的标签的初步认识
- CSS的初步认识
- 04-我对委托的初步认识
- 1.初步认识TypeScript
- Oracle数据的初步认识
- 初步认识Maven之生命周期
- 初步认识Node 之Express
- UIView 和 UILabel 的初步认识(2015-1-26 上海松江)
- android-Gradle初步认识
- docker初步认识 运维部落