
A skip-gram language model with negative sampling, implemented in R

2017-07-12 18:54
Basic idea:

Given a word w, collect its context words u1, u2, ... from the text, and pick negative samples u3, u4, ... from the negative-sample set.

Multiply w's word vector with the vector of each of its samples and push the product through the sigmoid function to get a probability estimate. The residual against the target label drives gradient descent, which optimizes the input word vectors, the weight vectors, and the bias vector.
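Put differently, every (word, sample) pair is a small logistic regression whose label is 1 for a real context word and 0 for a negative sample. A minimal sketch of the per-pair loss; the names x_w, u_s, b_s, t are illustrative and not variables from the script below:

# per-pair binary cross-entropy for skip-gram with negative sampling
# x_w: center word vector, u_s: sample's weight vector, b_s: its bias,
# t: 1 for a context word, 0 for a negative sample (illustrative names)
pair_loss <- function(x_w, u_s, b_s, t) {
  p <- 1 / (1 + exp(-(sum(x_w * u_s) + b_s))) # sigmoid(w . x + b)
  -(t * log(p) + (1 - t) * log(1 - p))        # binary cross-entropy
}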

Problem:

If the initial input word vectors, weight vectors, and bias vector are too large, wx + b becomes huge, the sigmoid saturates (exp() overflows toward infinity), and the residuals come out as Inf.
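The script below works around this by drawing all initial values from runif(·, -0.1, 0.1). A numerically stable sigmoid is another option; a sketch, not part of the original code:

# stable sigmoid: exp() is only ever called on non-positive values,
# so it cannot overflow to Inf no matter how large |x| gets
sigmoid_stable <- function(x) {
  ifelse(x >= 0, 1 / (1 + exp(-x)), exp(x) / (1 + exp(x)))
}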

Results:

1. Only 34 sentences from a job-hunting discussion were used to train the model; its final output:

string1 <- "找好工作,主要是有经验"
string2 <- "工作找经验"
pro1 <- getpro(string1) # 23.7
pro2 <- getpro(string2) # 4.91

2. The trained word vectors, reduced to two dimensions for display:

The words "理解" (understanding), "沟通" (communication), and "长得帅" (good-looking) sit very close together....

So apparently being good-looking is a sufficient condition for finding a job....

# set the working directory
setwd("../buaa/sahomework")

# read the data
answer <- read.csv("./data/answer3new",
                   encoding = "utf-8",
                   sep = ",",
                   header = FALSE,
                   stringsAsFactors = FALSE)

# tidy up the data frame
names(answer) <- c("questionid","id","username","zan","answer")
answer$questionid <- as.character(answer$questionid)
answer$id <- as.character(answer$id)

# start with a small sample: every answer to question "655467276313526272"
answers <- answer[which(answer$questionid == "655467276313526272"),]

# segment the answers with jiebaR
library(jiebaR)
wk <- worker()
anscorpus <- data.frame("anscorpus" = wk[answers$answer])

# leave stop words in for now
# (is it necessary to strip them?)
# removeStopWords = function(x, words) {
#   ret = character(0)
#   index <- 1
#   it_max <- length(x)
#   while (index <= it_max) {
#     if (length(words[words == x[index]]) < 1)
#       ret <- c(ret, x[index])
#     index <- index + 1
#   }
#   ret
# }
# stopwords <- data.frame("stopwords" = "的")
# corpus <- lapply(as.character(anscorpus), removeStopWords, stopwords)
# corpus <- data.frame("corpus" = unlist(strsplit(as.character(anscorpus), split = ",")))
corpus <- anscorpus

# persist the corpus to disk (write.table, because write.csv ignores col.names)
write.table(corpus, file = "data/words.csv", sep = ",",
            col.names = FALSE, row.names = FALSE)

# sort the corpus by word frequency; a word's rank becomes its index,
# which makes negative sampling easier (190 distinct words)
corpusFreq <- data.frame(table(corpus))
names(corpusFreq)[1] <- "corpus" # make sure the word column is named "corpus"
corpusFreq <- corpusFreq[order(corpusFreq$Freq, decreasing = T),]
corpusFreq$id <- seq_len(nrow(corpusFreq))
summary(corpusFreq)

# show the word frequencies as a word cloud
# install.packages("wordcloud") # run once
library(RColorBrewer)
library(wordcloud)
par(family = 'STXihei') # has no effect here; the family has to be set inside wordcloud()??
png(file = "wordcloud.png", bg = "white", width = 480, height = 480)
colors = brewer.pal(8, "Dark2")
wordcloud(corpusFreq$corpus, corpusFreq$Freq,
          scale = c(3, 0.5),
          min.freq = -Inf,
          max.words = 190,
          colors = colors,
          random.order = F,
          random.color = F,
          ordered.colors = F,
          family = 'STXihei')
dev.off()

# translate the answers into a string of word ids: 34 sentences, 302 tokens
# (a "." could be appended at sentence ends to mark boundaries -- skipped for now)
charaIndex <- ""
unuseChara <- 0
for(i in c(1:dim(corpus)[1])){
  if(corpus[i,1] %in% corpusFreq$corpus){
    # print(i)
    chara <- corpusFreq[corpusFreq$corpus == corpus[i,1], 3]
    charaIndex <- paste(charaIndex, chara, sep = ",")
  }else{
    unuseChara <- unuseChara + 1
  }
}
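The same id translation can be done without the loop; a sketch using match() over the same inputs (same output, including the leading comma):

# vectorized id lookup: NA marks words missing from corpusFreq
ids <- corpusFreq$id[match(corpus[,1], corpusFreq$corpus)]
unuseChara <- sum(is.na(ids))
charaIndex <- paste(c("", ids[!is.na(ids)]), collapse = ",") # keep the leading ","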

# for(j in c(1:dim(answers)[1])){
#   charactors <- unlist(strsplit(answers[j,5],split = "")) # can't select the column by name??
#   len <- length(charactors)
#
#   for (i in c(1:len)) {
#     if(charactors[i] %in% corpusFreq$corpus){
#       chara <- corpusFreq[corpusFreq$corpus == charactors[i], 3]
#       charaIndex <- paste(charaIndex, chara, sep = ",")
#     }else{
#       unuseChara <- unuseChara + 1
#     }
#   }
#   # charaIndex <- paste(charaIndex, ".", sep = ",")
# }

# build the contexts: corpusFreq$context records all num_skip = 2 contexts of each word
corpusFreq$context <- NA
# num_skip <- 2
# batch_size <- 190
chara <- unlist(strsplit(charaIndex, split = ","))
chara <- chara[-1] # chara[1] is the empty string; 218 in-vocabulary tokens remain
for (i in c(1:length(chara))) {
  if(i > 1){
    oldContext <- corpusFreq[which(corpusFreq$id == chara[i]),4]
    corpusFreq[which(corpusFreq$id == chara[i]),4] <- paste(oldContext, chara[i-1], sep = ",")
  }
  if(i < length(chara)){
    oldContext <- corpusFreq[which(corpusFreq$id == chara[i]),4]
    corpusFreq[which(corpusFreq$id == chara[i]),4] <- paste(oldContext, chara[i+1], sep = ",")
  }
}
names(corpusFreq)[4] <- "context"
# TODO: the contexts need fixing up -- words missing a neighbor are patched
# below, when contextM is built

# build the negative-sampling matrix (190 x 5)
valid_sample <- matrix(0, nrow = 190, ncol = 5)
for(i in c(1:dim(corpusFreq)[1])){
  quene <- c(1:dim(corpusFreq)[1])
  quene <- quene[-i] # drop the word itself so it cannot be its own negative sample
  valid_sample[i,] <- sample(quene, 5, replace = F)
}
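Uniform sampling is the simplest choice; word2vec itself draws negatives from the unigram distribution raised to the 3/4 power, so frequent words get picked more often. A sketch of that variant, reusing corpusFreq$Freq (an alternative, not what the loop above does):

# negatives ~ unigram^(3/4), as in word2vec
neg_prob <- corpusFreq$Freq^0.75
for(i in seq_len(nrow(corpusFreq))){
  pool <- seq_len(nrow(corpusFreq))[-i] # never sample the word itself
  valid_sample[i,] <- sample(pool, 5, prob = neg_prob[-i])
}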

# build the context matrix: each row holds a word's 2 positive samples + 5 negatives (190 x 7)
contextmatrix <- matrix(0, nrow = 190, ncol = 2)
for(i in c(1:dim(corpusFreq)[1])){
  contextlist <- unlist(strsplit(corpusFreq[i,4], split = ","))
  if(contextlist[1] == "NA"){ # the first paste above left a literal "NA" in front
    context <- contextlist[c(2:3)]
  }else{
    context <- contextlist[c(1:2)]
  }
  contextmatrix[i,] <- context
}

contextM <- data.frame(cbind(contextmatrix, valid_sample), stringsAsFactors = FALSE)
names(contextM) <- c("prefix","suffix","valid1","valid2","valid3","valid4","valid5")
contextM[] <- lapply(contextM, as.numeric) # the ids must be numeric to index the matrices below

# patch the missing contexts noted in the TODO above:
# fall back to the word's own id so every row has 7 usable indices
for(k in seq_len(ncol(contextM))){
  miss <- is.na(contextM[,k])
  contextM[miss, k] <- which(miss) # the row number equals the word's id
}

# label matrix: 1 for the positive samples, 0 for the negatives
target1 <- matrix(1, nrow = 190, ncol = 2)
target2 <- matrix(0, nrow = 190, ncol = 5)
target <- cbind(target1, target2)

# # cross-entropy gradient descent to solve for train_inputs
# # cross-entropy: logits - logits * target + ln(1 + exp(-logits))
# cross_entropy <- logits - logits * target + log(1 + exp(-logits))
# sum_row <- data.frame(rowSums(cross_entropy))

# round-robin stochastic gradient descent, one sample at a time
sigmoid = function(x){
  return(1 / (1 + exp(-x)))
}
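For the record, the cross-entropy commented out above also has a numerically stable rearrangement (the same one TensorFlow's sigmoid_cross_entropy_with_logits uses); a sketch, not part of the original script:

# stable binary cross-entropy from logits:
# max(x, 0) - x * t + log(1 + exp(-|x|))
stable_xent <- function(logits, target) {
  pmax(logits, 0) - logits * target + log1p(exp(-abs(logits)))
}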

logits <- matrix(0, nrow = 190, ncol = 7)

# dimensions: x 190*128, labels 190*1, W 190*128, B 190*1
nce_weight <- matrix(runif(24320, -0.1, 0.1), nrow = 190, ncol = 128)
nce_biases <- matrix(runif(190, -0.1, 0.1), nrow = 190)
train_inputs <- matrix(runif(24320, -0.1, 0.1), nrow = 190, ncol = 128)
# nce_weight <- nce_weight2
# nce_biases <- nce_biases2
# train_inputs <- train_inputs2
# train_labels <- matrix(runif(190,-1,1),nrow = 190)

# build the full logits matrix, to check that the sigmoid inputs stay away from ±Inf
generate_logits = function(){
  logits <- matrix(0, nrow = 190, ncol = 7)
  for(i in c(1:dim(train_inputs)[1])){
    x <- matrix(train_inputs[i,], nrow = 1)       # 1 x 128
    w <- t(nce_weight[unlist(contextM[i,]),])     # 128 x 7
    logits[i,] <- x %*% w + nce_biases[unlist(contextM[i,]),] # each sample gets its own bias
  }
  return(logits)
}

logits2 <- generate_logits()

# gradient descent
maxiter <- 190
# minerror <- 0.01
step <- 0.01
# newerror <- 1
iter <- 0 # iteration counter
len <- dim(train_inputs)[1]
i <- 1 # index of the current sample in train_inputs
while(iter <= maxiter){
  # print("=========")
  iter <- iter + 1
  if(i > len){i <- i %% len}
  print(i)
  x <- matrix(train_inputs[i,], nrow = 1)
  w <- t(nce_weight[unlist(contextM[i,]),])
  # wx + b for the 7 samples, a 1*7 row; each sample's residual drives the updates
  logits[i,] <- x %*% w + matrix(nce_biases[unlist(contextM[i,]),], nrow = 1, ncol = 7)

  # update the weights and biases one sample at a time
  for(j in c(1:length(logits[1,]))){
    # -Inf/Inf and then NaN showed up here: the sigmoid saturates when its
    # input is large in magnitude, so the random initialization range was
    # shrunk (see "Problem" above)
    err <- sigmoid(logits[i,j]) - target[i,j]   # scalar residual
    w_j <- nce_weight[contextM[i,j],]           # 128-vector
    des <- err * train_inputs[i,]               # gradient wrt the weight row (uses the old x)
    # update x: the gradient wrt the input vector is err * w_j
    train_inputs[i,] <- train_inputs[i,] - step * err * w_j
    # print("===== updating train =====")
    print(err)
    # update w
    nce_weight[contextM[i,j],] <- w_j - step * des
    # update b: the gradient wrt the bias is the residual itself
    nce_biases[contextM[i,j],] <- nce_biases[contextM[i,j],] - step * err
  }
  i <- i + 1
}
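To see whether the pass actually lowered the loss, the logits can be recomputed and averaged through the stable cross-entropy sketched earlier (a check added here, not in the original script):

# mean training loss after the SGD pass; compare with the value
# computed from the freshly initialized vectors
logits_after <- generate_logits()
print(mean(stable_xent(logits_after, target)))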

  

# visualize the word vectors
# PCA
pca <- princomp(train_inputs[,], cor = TRUE, scores = TRUE)
plot(pca, type = "lines")
biplot(pca)

# MDS
dis <- dist(train_inputs, diag = TRUE, upper = TRUE)
# fit <- hclust(dis, method = "ward.D")
# plot(fit)
dismatrix <- as.matrix(dis)
mds <- cmdscale(dismatrix, k = 2)
par(family = "STXihei")
plot(mds[,1], mds[,2], type = "n", col = "red")
text(mds[,1], mds[,2], labels = corpusFreq$corpus, cex = 0.5, col = "black")
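t-SNE is the more usual choice for projecting word vectors; a sketch assuming the Rtsne package is installed (not used in the original script):

# t-SNE projection of the 128-dimensional vectors into 2-D
library(Rtsne)
tsne <- Rtsne(train_inputs, dims = 2, perplexity = 30, check_duplicates = FALSE)
plot(tsne$Y, type = "n")
text(tsne$Y, labels = corpusFreq$corpus, cex = 0.5)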

# score the probability of a sentence
getpro = function(s){
  testcorpus <- data.frame("corpus" = wk[s])
  # look up each word's id (assumes every word of s is in the vocabulary)
  for (i in c(1:dim(testcorpus)[1])) {
    testcorpus$id[i] <- corpusFreq[as.character(corpusFreq$corpus) == testcorpus[i,1], 3]
  }
  pro <- 0
  len <- dim(testcorpus)[1] - 1
  for (i in c(2:len)){
    # sigmoid score of the word against its previous and next neighbors
    prepro <- sigmoid(matrix(train_inputs[testcorpus[i,2],], nrow = 1, ncol = 128) %*% nce_weight[testcorpus[i-1,2],] + nce_biases[testcorpus[i-1,2],])
    sufpro <- sigmoid(matrix(train_inputs[testcorpus[i,2],], nrow = 1, ncol = 128) %*% nce_weight[testcorpus[i+1,2],] + nce_biases[testcorpus[i+1,2],])
    proi <- prepro * sufpro
    pro <- pro + proi
  }
  return(pro)
}
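Note that getpro sums one term per interior word, so a longer sentence scores higher simply by being longer; dividing by the number of scored words makes the two test strings more comparable. A hypothetical helper, not in the original script:

# average per-word score instead of the raw sum (illustrative)
getpro_avg = function(s){
  n <- length(wk[s]) - 2 # getpro scores the words at positions 2..(len-1)
  getpro(s) / n
}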

string1 <- "找好工作,主要是有经验"
string2 <- "工作找经验"
pro1 <- getpro(string1) # 23.7
pro2 <- getpro(string2) # 4.91