基于负采样的skip-gram的语言模型实现-R
2017-07-12 18:54
429 查看
基本思路:
已知词w,在文章中统计其上下文u1,u2。。。在负样本集中选取负样本u3、u4。。。
词w的词向量与其对应的每个样本向量乘积,利用sigmod函数求得概率估计值。与标记值target的残差求梯度下降,优化输入词向量、权值向量、偏置向量。
问题:
初始化输入词向量、权值向量、偏置向量时如果值过大,那么wx+b就过大,导致sigmoid值趋于正负无穷,残差值出现INF值。
结果:
1、本次只利用了34句关于找工作的话题的话语来训练模型,模型最后的结果:
string1 <- "找好工作,主要是有经验"
string2 <- "工作找经验"
pro1 <- getpro(string1) #23.7
pro2 <- getpro(string2) #4.91
2、训练的词向量,降维展示在二维空间内:
“理解”、“沟通”、“长得帅” 三个词的距离很接近。。。。
原来长得帅也是找工作的一个充分条件。。。。
# Set the working directory.
# NOTE(review): relative setwd() is fragile -- assumes the script is launched
# from a sibling directory; confirm before running.
setwd("../buaa/sahomework")
# Load the raw Q&A data (one row per answer, no header in the file)
answer <- read.csv("./data/answer3new",
encoding = "utf-8",
sep = ",",
header = FALSE,
stringsAsFactors = FALSE)
# Name the columns and normalise the id columns to character
names(answer) <- c("questionid","id","username","zan","answer")
answer$questionid <- as.character(answer$questionid)
answer$id <- as.character(answer$id)
# Small-sample experiment: keep every answer to one question ("655467276313526272")
answers <- answer[which(answer$questionid == "655467276313526272"),]
# Tokenize the answers with jiebaR; wk[...] segments each answer into words
library(jiebaR)
wk <- worker()
anscorpus <- data.frame("anscorpus" = wk[answers$answer])
# Stop-word handling is deliberately skipped for now; the draft helper below
# is kept commented out for reference.
# removeStopWords = function(x,words) {
# ret = character(0)
# index <- 1
# it_max <- length(x)
# while (index <= it_max) {
# if (length(words[words==x[index]]) <1)
# ret <- c(ret,x[index])
# index <- index +1
# }
# ret
# }
# stopwords <- data.frame("stopwords" = "的")
# corpus <- lapply(as.character(anscorpus), removeStopWords,stopwords)
# corpus <- data.frame("corpus" = unlist(strsplit(as.character(anscorpus),split=",")))
corpus <- anscorpus
# Persist the corpus to disk.  write.csv() IGNORES `col.names` (it always
# writes a header and emits a warning), so write.table() is used here to
# actually omit both the header and the row names.
write.table(corpus, file = "data/words.csv", sep = ",",
            col.names = FALSE, row.names = FALSE)
# Sort the vocabulary by frequency; a word's rank becomes its integer id,
# which makes frequency-aware negative sampling convenient later.
corpusFreq <- data.frame(table(corpus))
corpusFreq <- corpusFreq[order(corpusFreq$Freq,decreasing = T),]
# Use the actual vocabulary size instead of the hard-coded 190
corpusFreq$id <- seq_len(nrow(corpusFreq))
summary(corpusFreq)
# Word-cloud visualisation of the frequencies.
# Install only when missing, instead of unconditionally on every run.
if (!requireNamespace("wordcloud", quietly = TRUE)) install.packages("wordcloud")
library(RColorBrewer)
library(wordcloud)
par(family='STXihei') # has no effect here; the family must be set inside wordcloud()
png(file = "wordcloud.png",bg = "white",width = 480,height = 480)
colors <- brewer.pal(8,"Dark2")
wordcloud(corpusFreq$corpus,corpusFreq$Freq,
          scale=c(3,0.5),
          min.freq=-Inf,
          max.words=nrow(corpusFreq),
          colors=colors,
          random.order=F,
          random.color=F,
          ordered.colors=F,
          family='STXihei')
dev.off()
# Translate the answers into one long string of word ids (34 sentences,
# ~302 tokens) so contexts can be extracted from it.
# Sentence-final "." markers are NOT appended for now.
charaIndex <- ""
unuseChara <- 0
for(i in c(1:dim(corpus)[1])){
if(corpus[i,1] %in% corpusFreq$corpus){
# print(i)
# column 3 of corpusFreq holds the word's integer id
chara <- corpusFreq[corpusFreq$corpus == corpus[i,1], 3]
charaIndex <- paste(charaIndex, chara,sep = ",")
}else{
unuseChara <- unuseChara + 1
}
}
# (earlier character-level variant, kept for reference)
# for(j in c(1:dim(answers)[1])){
# charactors <- unlist(strsplit(answers[j,5],split = ""))# why does the column name not select the column??
# len <- length(charactors)
#
# for (i in c(1:len)) {
# if(charactors[i] %in% corpusFreq$corpus){
# chara <- corpusFreq[corpusFreq$corpus == charactors[i], 3]
# charaIndex <- paste(charaIndex, chara,sep = ",")
# }else{
# unuseChara <- unuseChara + 1
# }
# }
# # charaIndex <- paste(charaIndex,".",sep = ",")
# }
# Build the contexts: corpusFreq$context records every num_skip=2 context
# (previous + next word id) observed for each word.
# NOTE(review): `$context <- NULL` removes a column rather than creating one;
# the column is really created by the first `[ ,4] <-` assignment below.
corpusFreq$context <- NULL
# num_skip <- 2
# batch_size <- 190
chara <- unlist(strsplit(charaIndex,split = ","))
chara <- chara[-1]# chara[1] is the empty prefix; 218 ids remain
for (i in c(1:length(chara))) {
# append the PREVIOUS word id to this word's context string
if(i > 1){
oldContext <- corpusFreq[which(corpusFreq$id == chara[i]),4]
corpusFreq[which(corpusFreq$id == chara[i]),4] <- paste(oldContext,chara[i-1],sep = ",")
}
# append the NEXT word id
if(i < length(chara)){
oldContext <- corpusFreq[which(corpusFreq$id == chara[i]),4]
corpusFreq[which(corpusFreq$id == chara[i]),4] <- paste(oldContext,chara[i+1],sep = ",")
}
}
names(corpusFreq)[4] <- "context"
# TODO: words that ended up with no recorded context still need patching up
# Negative-sampling matrix: 5 negative word ids per vocabulary word
valid_sample <- matrix(0,nrow = nrow(corpusFreq),ncol = 5)
for(i in c(1:dim(corpusFreq)[1])){
  quene <- c(1:dim(corpusFreq)[1])
  # Exclude word i from its own candidate pool.  The original `quene[-i]`
  # discarded its result, so a word could be drawn as its own negative sample.
  quene <- quene[-i]
  valid_sample[i,] <- sample(quene,5,replace = F)
}
# Per-word sample matrix: 2 positive contexts + 5 negatives -> n x 7
contextmatrix <- matrix(0,nrow = nrow(corpusFreq),ncol = 2)
for(i in c(1:dim(corpusFreq)[1])){
  contextlist <- unlist(strsplit(corpusFreq[i,4],split = ","))
  # the context string starts with "NA" when the first paste saw an empty cell
  if(contextlist[1] == "NA"){
    context <- contextlist[c(2:3)]
  }else{
    context <- contextlist[c(1:2)]
  }
  contextmatrix[i,] <- context
}
contextM <- data.frame(cbind(contextmatrix,valid_sample))
# contextM <- lapply(contextM[,],as.numeric)
# contextM <- data.frame(contextM)
# contextM[is.na(contextM)] <- 0
names(contextM) <- c("prefix","suffix","valid1","valid2","valid3","valid4","valid5")
# Label matrix: 1 for the 2 positive samples, 0 for the 5 negatives
target1 <- matrix(1,nrow = nrow(corpusFreq),ncol = 2)
target2 <- matrix(0,nrow = nrow(corpusFreq),ncol = 5)
target <- cbind(target1,target2)
# # Cross-entropy descent to solve for train_input (kept for reference)
# # cross-entropy: logits - logits * target + ln(1 + exp(-logits))
# cross_entropy <- logits - logits * target + log(1 + exp(-logits))
# sum_row <- data.frame(rowSums(cross_entropy))
# SGD is applied one sample at a time.
# Logistic (sigmoid) function.  The original body `1 / 1 + exp(-x)` was an
# operator-precedence bug: it evaluated as (1/1) + exp(-x) = 1 + exp(-x),
# never a probability.  Parenthesised to the intended 1 / (1 + exp(-x)).
sigmod <- function(x){
  return(1 / (1 + exp(-x)))
}
# Pre-allocate the logits matrix: one row per word, 7 samples per row.
# (The original line read `logits <- logits <- matrix(...)` -- a duplicated
# assignment typo, reduced to a single assignment here.)
logits <- matrix(0,nrow = 190,ncol = 7)
# x: 190x128 input embeddings, W: 190x128 output weights, B: 190x1 biases.
# Initial range kept small (+-0.1) so sigmod() does not saturate to 0/1.
nce_weight <- matrix(runif(24320,-0.1,0.1),nrow = 190,ncol = 128)
nce_biases <- matrix(runif(190,-0.1,0.1),nrow = 190)
train_inputs <- matrix(runif(24320,-0.1,0.1),nrow = 190,ncol = 128)
# nce_weight <- nce_weight2
# nce_biases <- nce_biases2
# train_inputs <- train_inputs2
# train_labels <- matrix(runif(190,-1,1),nrow = 190)
# Build the full logits matrix -- handy for checking after (re-)initialisation
# that no logit is large enough to drive sigmod() to +/-Inf.
genrate_logits <- function(){
  logits <- matrix(0,nrow = 190,ncol = 7)
  for(i in c(1:dim(train_inputs)[1])){
    x <- t(data.frame(train_inputs[i,]))
    w <- t(data.frame(nce_weight[as.integer(contextM[i,]),]))
    # NOTE(review): this adds the bias of word i, while the training loop
    # below adds the biases of the 7 context/negative words -- confirm which
    # is intended before relying on these logits.
    logits[i,] <- x %*% w + nce_biases[i]
  }
  return(logits)
}
logits2 <- genrate_logits()
# Stochastic gradient descent over the samples, one word per iteration
maxiter <- 190
# minerror <- 0.01
step <- 0.01
# newerror <- 1
iter <- 0 # iteration counter
len <- dim(train_inputs)[1]
i <- 1 # index of the current training word
while(iter <= maxiter){
# print("=========")
des <- matrix(0,nrow = 128,ncol = 1)
iter <- iter + 1
if(i > len){i <- i %% len} # wrap around to the start of the vocabulary
print(i)
x <- t(data.frame(train_inputs[i,]))
w <- t(data.frame(nce_weight[as.numeric(contextM[i,]),]))
# logits = wx + b for the 7 samples of word i (1x7 row); each sample's
# residual against its label drives the updates below
logits[i,] <- x %*% w + matrix(nce_biases[as.numeric(contextM[i,]),],nrow = 1,ncol = 7)
# Update the weights and biases sample by sample
for(j in c(1:length(logits[1,]))){
# -Inf/Inf and then NaN appeared here: sigmod saturates for large |wx+b|,
# which is why the random-initialisation range was narrowed to +-0.1.
# (This comment was garbled in the original source -- stray tokens sat on
# their own lines and broke the script; they are folded back in here.)
des <- (sigmod(logits[i,j]) - target[i,j]) * as.matrix(train_inputs[i,])# 128x1 gradient
# update the input embedding of word i
train_inputs[i,] <- as.matrix(train_inputs[i,]) - step * des
# print("=====update train=====")
print(des[1,1])
# update the output weight row of sample j
nce_weight[as.integer(contextM[i,j]),] <-
as.matrix(nce_weight[as.integer(contextM[i,j]),]) - step * des
# NOTE(review): the bias step uses t(des) %*% des (squared gradient norm),
# not the scalar residual -- looks suspicious; confirm before trusting it.
nce_biases[as.integer(contextM[i,j]),] <- nce_biases[as.integer(contextM[i,j]),] - step * (t(des) %*% des)
}
i <- i + 1
}
# Visualise the learned word vectors
# PCA of the 128-dim embeddings
pca <- princomp(train_inputs[,],cor = TRUE,scores = TRUE)
plot(pca, type="lines")
biplot(pca)
# Classical MDS down to 2 dimensions for the scatter plot
dis <- dist(train_inputs,diag = TRUE,upper = TRUE )
# fit <- hclust(dis,method = "ward.D")
# plot(fit)
dismatrix <- as.matrix(dis)
mds <- cmdscale(dismatrix,k = 2)
par(family = "STXihei")
# draw an empty frame, then label each point with its word
plot(mds[,1],mds[,2],type = "n",col = "red")
text(mds[,1],mds[,2],labels = corpusFreq$corpus,cex = 0.5,col = "black")
# Probability-like score of a sentence: for each interior token, multiply the
# sigmod scores against its previous and next neighbours, then sum over tokens.
getpro <- function(s){
  testcorpus <- data.frame("corpus" = wk[s])
  # map each token to its vocabulary id (column 3 of corpusFreq).
  # NOTE(review): out-of-vocabulary tokens make this lookup return an empty
  # value -- the function assumes every token was seen during training.
  for (i in c(1:dim(testcorpus)[1])) {
    testcorpus$id[i] <- corpusFreq[as.character(corpusFreq$corpus) == testcorpus[i,1],3]
  }
  pro <- 0
  len <- dim(testcorpus)[1] - 1
  # Guard: for a 2-token sentence len == 1 and the original `2:len` iterated
  # BACKWARDS (2, 1), indexing token 0 and failing.  Skip the loop instead.
  if (len >= 2) {
    for (i in c(2:len)){
      prepro <- sigmod(matrix(train_inputs[testcorpus[i,2],],nrow = 1,ncol = 128) %*% nce_weight[testcorpus[i-1,2],] + nce_biases[testcorpus[i-1,2],])
      sufpro <- sigmod(matrix(train_inputs[testcorpus[i,2],],nrow = 1,ncol = 128) %*% nce_weight[testcorpus[i+1,2],] + nce_biases[testcorpus[i+1,2],])
      proi <- prepro * sufpro
      pro <- pro + proi
    }
  }
  return(pro)
}
string1 <- "找好工作,主要是有经验"
string2 <- "工作找经验"
pro1 <- getpro(string1) #23.7
pro2 <- getpro(string2) #4.91
已知词w,在文章中统计其上下文u1,u2。。。在负样本集中选取负样本u3、u4。。。
词w的词向量与其对应的每个样本向量乘积,利用sigmod函数求得概率估计值。与标记值target的残差求梯度下降,优化输入词向量、权值向量、偏置向量。
问题:
初始化输入词向量、权值向量、偏置向量时如果值过大,那么wx+b就过大,导致sigmoid值趋于正负无穷,残差值出现INF值。
结果:
1、本次只利用了34句关于找工作的话题的话语来训练模型,模型最后的结果:
string1 <- "找好工作,主要是有经验"
string2 <- "工作找经验"
pro1 <- getpro(string1) #23.7
pro2 <- getpro(string2) #4.91
2、训练的词向量,降维展示在二维空间内:
“理解”、“沟通”、“长得帅” 三个词的距离很接近。。。。
原来长得帅也是找工作的一个充分条件。。。。
# Set the working directory.
# NOTE(review): relative setwd() is fragile -- assumes the script is launched
# from a sibling directory; confirm before running.
setwd("../buaa/sahomework")
# Load the raw Q&A data (one row per answer, no header in the file)
answer <- read.csv("./data/answer3new",
encoding = "utf-8",
sep = ",",
header = FALSE,
stringsAsFactors = FALSE)
# Name the columns and normalise the id columns to character
names(answer) <- c("questionid","id","username","zan","answer")
answer$questionid <- as.character(answer$questionid)
answer$id <- as.character(answer$id)
# Small-sample experiment: keep every answer to one question ("655467276313526272")
answers <- answer[which(answer$questionid == "655467276313526272"),]
# Tokenize the answers with jiebaR; wk[...] segments each answer into words
library(jiebaR)
wk <- worker()
anscorpus <- data.frame("anscorpus" = wk[answers$answer])
# Stop-word handling is deliberately skipped for now; the draft helper below
# is kept commented out for reference.
# removeStopWords = function(x,words) {
# ret = character(0)
# index <- 1
# it_max <- length(x)
# while (index <= it_max) {
# if (length(words[words==x[index]]) <1)
# ret <- c(ret,x[index])
# index <- index +1
# }
# ret
# }
# stopwords <- data.frame("stopwords" = "的")
# corpus <- lapply(as.character(anscorpus), removeStopWords,stopwords)
# corpus <- data.frame("corpus" = unlist(strsplit(as.character(anscorpus),split=",")))
corpus <- anscorpus
# Persist the corpus to disk.  write.csv() IGNORES `col.names` (it always
# writes a header and emits a warning), so write.table() is used here to
# actually omit both the header and the row names.
write.table(corpus, file = "data/words.csv", sep = ",",
            col.names = FALSE, row.names = FALSE)
# Sort the vocabulary by frequency; a word's rank becomes its integer id,
# which makes frequency-aware negative sampling convenient later.
corpusFreq <- data.frame(table(corpus))
corpusFreq <- corpusFreq[order(corpusFreq$Freq,decreasing = T),]
# Use the actual vocabulary size instead of the hard-coded 190
corpusFreq$id <- seq_len(nrow(corpusFreq))
summary(corpusFreq)
# Word-cloud visualisation of the frequencies.
# Install only when missing, instead of unconditionally on every run.
if (!requireNamespace("wordcloud", quietly = TRUE)) install.packages("wordcloud")
library(RColorBrewer)
library(wordcloud)
par(family='STXihei') # has no effect here; the family must be set inside wordcloud()
png(file = "wordcloud.png",bg = "white",width = 480,height = 480)
colors <- brewer.pal(8,"Dark2")
wordcloud(corpusFreq$corpus,corpusFreq$Freq,
          scale=c(3,0.5),
          min.freq=-Inf,
          max.words=nrow(corpusFreq),
          colors=colors,
          random.order=F,
          random.color=F,
          ordered.colors=F,
          family='STXihei')
dev.off()
# Translate the answers into one long string of word ids (34 sentences,
# ~302 tokens) so contexts can be extracted from it.
# Sentence-final "." markers are NOT appended for now.
charaIndex <- ""
unuseChara <- 0
for(i in c(1:dim(corpus)[1])){
if(corpus[i,1] %in% corpusFreq$corpus){
# print(i)
# column 3 of corpusFreq holds the word's integer id
chara <- corpusFreq[corpusFreq$corpus == corpus[i,1], 3]
charaIndex <- paste(charaIndex, chara,sep = ",")
}else{
unuseChara <- unuseChara + 1
}
}
# (earlier character-level variant, kept for reference)
# for(j in c(1:dim(answers)[1])){
# charactors <- unlist(strsplit(answers[j,5],split = ""))# why does the column name not select the column??
# len <- length(charactors)
#
# for (i in c(1:len)) {
# if(charactors[i] %in% corpusFreq$corpus){
# chara <- corpusFreq[corpusFreq$corpus == charactors[i], 3]
# charaIndex <- paste(charaIndex, chara,sep = ",")
# }else{
# unuseChara <- unuseChara + 1
# }
# }
# # charaIndex <- paste(charaIndex,".",sep = ",")
# }
# Build the contexts: corpusFreq$context records every num_skip=2 context
# (previous + next word id) observed for each word.
# NOTE(review): `$context <- NULL` removes a column rather than creating one;
# the column is really created by the first `[ ,4] <-` assignment below.
corpusFreq$context <- NULL
# num_skip <- 2
# batch_size <- 190
chara <- unlist(strsplit(charaIndex,split = ","))
chara <- chara[-1]# chara[1] is the empty prefix; 218 ids remain
for (i in c(1:length(chara))) {
# append the PREVIOUS word id to this word's context string
if(i > 1){
oldContext <- corpusFreq[which(corpusFreq$id == chara[i]),4]
corpusFreq[which(corpusFreq$id == chara[i]),4] <- paste(oldContext,chara[i-1],sep = ",")
}
# append the NEXT word id
if(i < length(chara)){
oldContext <- corpusFreq[which(corpusFreq$id == chara[i]),4]
corpusFreq[which(corpusFreq$id == chara[i]),4] <- paste(oldContext,chara[i+1],sep = ",")
}
}
names(corpusFreq)[4] <- "context"
# TODO: words that ended up with no recorded context still need patching up
# Negative-sampling matrix: 5 negative word ids per vocabulary word
valid_sample <- matrix(0,nrow = nrow(corpusFreq),ncol = 5)
for(i in c(1:dim(corpusFreq)[1])){
  quene <- c(1:dim(corpusFreq)[1])
  # Exclude word i from its own candidate pool.  The original `quene[-i]`
  # discarded its result, so a word could be drawn as its own negative sample.
  quene <- quene[-i]
  valid_sample[i,] <- sample(quene,5,replace = F)
}
# Per-word sample matrix: 2 positive contexts + 5 negatives -> n x 7
contextmatrix <- matrix(0,nrow = nrow(corpusFreq),ncol = 2)
for(i in c(1:dim(corpusFreq)[1])){
  contextlist <- unlist(strsplit(corpusFreq[i,4],split = ","))
  # the context string starts with "NA" when the first paste saw an empty cell
  if(contextlist[1] == "NA"){
    context <- contextlist[c(2:3)]
  }else{
    context <- contextlist[c(1:2)]
  }
  contextmatrix[i,] <- context
}
contextM <- data.frame(cbind(contextmatrix,valid_sample))
# contextM <- lapply(contextM[,],as.numeric)
# contextM <- data.frame(contextM)
# contextM[is.na(contextM)] <- 0
names(contextM) <- c("prefix","suffix","valid1","valid2","valid3","valid4","valid5")
# Label matrix: 1 for the 2 positive samples, 0 for the 5 negatives
target1 <- matrix(1,nrow = nrow(corpusFreq),ncol = 2)
target2 <- matrix(0,nrow = nrow(corpusFreq),ncol = 5)
target <- cbind(target1,target2)
# # Cross-entropy descent to solve for train_input (kept for reference)
# # cross-entropy: logits - logits * target + ln(1 + exp(-logits))
# cross_entropy <- logits - logits * target + log(1 + exp(-logits))
# sum_row <- data.frame(rowSums(cross_entropy))
# SGD is applied one sample at a time.
# Logistic (sigmoid) function.  The original body `1 / 1 + exp(-x)` was an
# operator-precedence bug: it evaluated as (1/1) + exp(-x) = 1 + exp(-x),
# never a probability.  Parenthesised to the intended 1 / (1 + exp(-x)).
sigmod <- function(x){
  return(1 / (1 + exp(-x)))
}
# Pre-allocate the logits matrix: one row per word, 7 samples per row.
# (The original line read `logits <- logits <- matrix(...)` -- a duplicated
# assignment typo, reduced to a single assignment here.)
logits <- matrix(0,nrow = 190,ncol = 7)
# x: 190x128 input embeddings, W: 190x128 output weights, B: 190x1 biases.
# Initial range kept small (+-0.1) so sigmod() does not saturate to 0/1.
nce_weight <- matrix(runif(24320,-0.1,0.1),nrow = 190,ncol = 128)
nce_biases <- matrix(runif(190,-0.1,0.1),nrow = 190)
train_inputs <- matrix(runif(24320,-0.1,0.1),nrow = 190,ncol = 128)
# nce_weight <- nce_weight2
# nce_biases <- nce_biases2
# train_inputs <- train_inputs2
# train_labels <- matrix(runif(190,-1,1),nrow = 190)
# Build the full logits matrix -- handy for checking after (re-)initialisation
# that no logit is large enough to drive sigmod() to +/-Inf.
genrate_logits <- function(){
  logits <- matrix(0,nrow = 190,ncol = 7)
  for(i in c(1:dim(train_inputs)[1])){
    x <- t(data.frame(train_inputs[i,]))
    w <- t(data.frame(nce_weight[as.integer(contextM[i,]),]))
    # NOTE(review): this adds the bias of word i, while the training loop
    # below adds the biases of the 7 context/negative words -- confirm which
    # is intended before relying on these logits.
    logits[i,] <- x %*% w + nce_biases[i]
  }
  return(logits)
}
logits2 <- genrate_logits()
# Stochastic gradient descent over the samples, one word per iteration
maxiter <- 190
# minerror <- 0.01
step <- 0.01
# newerror <- 1
iter <- 0 # iteration counter
len <- dim(train_inputs)[1]
i <- 1 # index of the current training word
while(iter <= maxiter){
# print("=========")
des <- matrix(0,nrow = 128,ncol = 1)
iter <- iter + 1
if(i > len){i <- i %% len} # wrap around to the start of the vocabulary
print(i)
x <- t(data.frame(train_inputs[i,]))
w <- t(data.frame(nce_weight[as.numeric(contextM[i,]),]))
# logits = wx + b for the 7 samples of word i (1x7 row); each sample's
# residual against its label drives the updates below
logits[i,] <- x %*% w + matrix(nce_biases[as.numeric(contextM[i,]),],nrow = 1,ncol = 7)
# Update the weights and biases sample by sample
for(j in c(1:length(logits[1,]))){
# -Inf/Inf and then NaN appeared here: sigmod saturates for large |wx+b|,
# which is why the random-initialisation range was narrowed to +-0.1.
# (This comment was garbled in the original source -- stray tokens sat on
# their own lines and broke the script; they are folded back in here.)
des <- (sigmod(logits[i,j]) - target[i,j]) * as.matrix(train_inputs[i,])# 128x1 gradient
# update the input embedding of word i
train_inputs[i,] <- as.matrix(train_inputs[i,]) - step * des
# print("=====update train=====")
print(des[1,1])
# update the output weight row of sample j
nce_weight[as.integer(contextM[i,j]),] <-
as.matrix(nce_weight[as.integer(contextM[i,j]),]) - step * des
# NOTE(review): the bias step uses t(des) %*% des (squared gradient norm),
# not the scalar residual -- looks suspicious; confirm before trusting it.
nce_biases[as.integer(contextM[i,j]),] <- nce_biases[as.integer(contextM[i,j]),] - step * (t(des) %*% des)
}
i <- i + 1
}
# Visualise the learned word vectors
# PCA of the 128-dim embeddings
pca <- princomp(train_inputs[,],cor = TRUE,scores = TRUE)
plot(pca, type="lines")
biplot(pca)
# Classical MDS down to 2 dimensions for the scatter plot
dis <- dist(train_inputs,diag = TRUE,upper = TRUE )
# fit <- hclust(dis,method = "ward.D")
# plot(fit)
dismatrix <- as.matrix(dis)
mds <- cmdscale(dismatrix,k = 2)
par(family = "STXihei")
# draw an empty frame, then label each point with its word
plot(mds[,1],mds[,2],type = "n",col = "red")
text(mds[,1],mds[,2],labels = corpusFreq$corpus,cex = 0.5,col = "black")
# Probability-like score of a sentence: for each interior token, multiply the
# sigmod scores against its previous and next neighbours, then sum over tokens.
getpro <- function(s){
  testcorpus <- data.frame("corpus" = wk[s])
  # map each token to its vocabulary id (column 3 of corpusFreq).
  # NOTE(review): out-of-vocabulary tokens make this lookup return an empty
  # value -- the function assumes every token was seen during training.
  for (i in c(1:dim(testcorpus)[1])) {
    testcorpus$id[i] <- corpusFreq[as.character(corpusFreq$corpus) == testcorpus[i,1],3]
  }
  pro <- 0
  len <- dim(testcorpus)[1] - 1
  # Guard: for a 2-token sentence len == 1 and the original `2:len` iterated
  # BACKWARDS (2, 1), indexing token 0 and failing.  Skip the loop instead.
  if (len >= 2) {
    for (i in c(2:len)){
      prepro <- sigmod(matrix(train_inputs[testcorpus[i,2],],nrow = 1,ncol = 128) %*% nce_weight[testcorpus[i-1,2],] + nce_biases[testcorpus[i-1,2],])
      sufpro <- sigmod(matrix(train_inputs[testcorpus[i,2],],nrow = 1,ncol = 128) %*% nce_weight[testcorpus[i+1,2],] + nce_biases[testcorpus[i+1,2],])
      proi <- prepro * sufpro
      pro <- pro + proi
    }
  }
  return(pro)
}
string1 <- "找好工作,主要是有经验"
string2 <- "工作找经验"
pro1 <- getpro(string1) #23.7
pro2 <- getpro(string2) #4.91
相关文章推荐
- TensorFlow实现基于LSTM的语言模型
- TensorFlow实现经典深度学习网络(6):TensorFlow实现基于LSTM的语言模型
- 基于循环神经网络实现基于字符的语言模型(char-level RNN Language Model)-tensorflow实现
- 学习笔记TF035:实现基于LSTM语言模型
- Tensorflow实例:实现基于LSTM的语言模型
- tensorflow38《TensorFlow实战》笔记-07-02 TensorFlow实现基于LSTM的语言模型 code
- 基于TensorFlow实现Skip-Gram模型
- Tensorflow实战-实现基于LSTM的语言模型
- 学习笔记TF035:实现基于LSTM语言模型
- 深度学习之六,基于RNN(GRU,LSTM)的语言模型分析与theano代码实现
- TensorFlow实战12:实现基于LSTM的语言模型
- Tensorflow实战学习(三十五)【实现基于LSTM语言模型】
- 基于Java的开源的模型驱动转换器和抽象用户接口标识语言6.1发布
- 基于Java的开源的模型驱动转换器和抽象用户接口标识语言6.1发布
- 基于Java的开源的模型驱动转换器和抽象用户接口标识语言6.1发布
- 基于Java的开源的模型驱动转换器和抽象用户接口标识语言6.1发布
- 基于Java的开源的模型驱动转换器和抽象用户接口标识语言6.1发布
- 基于Java的开源的模型驱动转换器和抽象用户接口标识语言6.1发布
- 利用 AOP 实现 .NET 上完整的基于角色的访问控制(RBAC)模型
- 基于Java的开源的模型驱动转换器和抽象用户接口标识语言6.1发布