word2vec: A Detailed Annotation of TrainModelThread
2016-08-03 19:44
<pre name="code" class="cpp">void *TrainModelThread(void *id) { long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; long long l1, l2, c, target, label, local_iter = iter; unsigned long long next_random = (long long)id; real f, g; clock_t now; real *neu1 = (real *)calloc(layer1_size, sizeof(real)); //只有输入层需要,隐含层是一个累加和,输出层存入huffman树中。 real *neu1e = (real *)calloc(layer1_size, sizeof(real)); FILE *fi = fopen(train_file, "rb"); fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); while (1) { /************每10000个词左右重新计算一次alpha.**********************/ if (word_count - last_word_count > 10000) { word_count_actual += word_count - last_word_count; last_word_count = word_count; if ((debug_mode > 1)) { now=clock(); printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, word_count_actual / (real)(iter * train_words + 1) * 100, word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); fflush(stdout); } alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; } /**********************读入一个句子,或者文章长于1000,则分成两句***************************************/ //将句子中每个词的vocab位置存入到sen[] //每次读入一句,但读一句后等待这句话处理完之后再读下一句。 if (sentence_length == 0) { //只有在一句执行完之后,,才会取下一句 while (1) { word = ReadWordIndex(fi); //读fi中的词,返回其在vocab中的位置。 if (feof(fi)) break; if (word == -1) continue; word_count++; if (word == 0) break; // 第0个词存的是句子结束符</s>,因此,这里一次性送入sen的就是一个句子或一篇文章。 // The subsampling randomly discards frequent words while keeping the ranking same if (sample > 0) { // real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; next_random = next_random * (unsigned long long)25214903917 + 11; if (ran < (next_random & 0xFFFF) / (real)65536) continue; //(next_random & 0xFFFF) / (real)65536 应该是个小于1的值。也就是说ran 应该大于1. } sen[sentence_length] = word; //sen存的是词在vocab中的位置。 sentence_length++; if (sentence_length >= MAX_SENTENCE_LENGTH) break; //文章超过1000个词则分成两个句子。 } sentence_position = 0; } /**************************************************处理到文件尾的话,迭代数递减,***********************************/ //所有的词(这里单个线程处理其对应的词)会被执行local_iter次。这5次神经网络的参数不是重复的,而是持续更新的,像alpha、syn0。 //单个线程处理的词是一样的,这个后续可以看看有没可优化的地方。 if (feof(fi) || (word_count > train_words / num_threads)) { //train_file被读到末尾了,或者一个线程已经完成了它的份额。 word_count_actual += word_count - last_word_count; local_iter--; //读完所有词之后进行5次迭代是个啥意思? 
也就是这些词不是过一次这个网络就行了,而是5词。 if (local_iter == 0) break; //只有这里才是跳出最外层循环的地方。 word_count = 0; last_word_count = 0; sentence_length = 0; //移动文件流读写位置,从距文件开头file_size / (long long)num_threads * (long long)id 位移量为新的读写位置 fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); //将文件读指针重新移到到此线程所处理词的开头。 continue; } /*******************************进入神经网络******************************/ word = sen[sentence_position]; //从句首开始,虽然window=5,或别的,但是,以 if (word == -1) continue; for (c = 0; c < layer1_size; c++) neu1[c] = 0; for (c = 0; c < layer1_size; c++) neu1e[c] = 0; next_random = next_random * (unsigned long long)25214903917 + 11; //这个点没有固定下来,导致窗口也是随机的,可以看看这点是否可以优化。 b = next_random % window; //b取0-4之间的随机值。 if (cbow) { //train the cbow architecture // in -> hidden cw = 0; //窗口大小随机,但有范围(3-11,窗口大小为单数,一共5种,因此,window实际可以理解为窗口变化的种数),以当前词为中心,(除最开始,和最末尾) for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { c = sentence_position - window + a; //给c赋值 if (c < 0) continue; if (c >= sentence_length) continue; last_word = sen[c]; if (last_word == -1) continue; //累加词对应的向量。双重循环下来就是窗口额定数量的词每一维对应的向量累加。 //累加后neu1的维度依然是layer1_size。 //从输入层过度到隐含层。 for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; cw++; //进入隐含层的词个数。 } if (cw) { for (c = 0; c < layer1_size; c++) neu1[c] /= cw; //归一化处理。 //遍历该叶子节点对应的路径,也就是每个父结点循环一次,这是什么原理呢? //这样一来,越是词频低的词,迭代层数越多, //每个词都要从叶子结点向根结点推一遍。 //这样的话可以通过父结点,建立叶子结点之间的联系。 if (hs) for (d = 0; d < vocab[word].codelen; d++) { f = 0; l2 = vocab[word].point[d] * layer1_size; // Propagate hidden -> output for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; //做内积 这个内积是什么原理呢? if (f <= -MAX_EXP) continue; //不在范围内的内积丢掉 else if (f >= MAX_EXP) continue; //-6<f<6 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; //sigmod函数, f=expTab[(int)((f+6)*1000/12)] // 'g' is the gradient multiplied by the learning rate g = (1 - vocab[word].code[d] - f) * alpha; //计算梯度 // Propagate errors output -> hidden for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; //计算向量误差,实际就是各父结点的向量和乘梯度。 // Learn weights hidden -> output for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; //更新父结点们的向量值,父结点的向量就是各叶子结点各次向量的累加。 //关系就是这样建立起来的,各叶子结点的向量都累加进入了每一个父结点中,因此,拥有相同父结点的词就会联系起来了。 } // NEGATIVE SAMPLING if (negative > 0) for (d = 0; d < negative + 1; d++) { //有负样本,处理负样本 if (d == 0) { target = word; label = 1; //正样本 } else { next_random = next_random * (unsigned long long)25214903917 + 11; target = table[(next_random >> 16) % table_size]; if (target == 0) target = next_random % (vocab_size - 1) + 1; if (target == word) continue; label = 0; //负样本 } l2 = target * layer1_size; f = 0; //以下和上面差不多。 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; if (f > MAX_EXP) g = (label - 1) * alpha; else if (f < -MAX_EXP) g = (label - 0) * alpha; else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; } // hidden -> in for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { c = sentence_position - window + a; if (c < 0) continue; if (c >= sentence_length) continue; last_word = sen[c]; if (last_word == -1) continue; for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; //用误差更新向量(输入层参数),直接将误差叠加到输入向量上,这样好吗? 
} } } else { //train skip-gram for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { c = sentence_position - window + a; if (c < 0) continue; if (c >= sentence_length) continue; last_word = sen[c]; if (last_word == -1) continue; l1 = last_word * layer1_size; //和cbw相比少了做输入向量累加。 for (c = 0; c < layer1_size; c++) neu1e[c] = 0; // HIERARCHICAL SOFTMAX if (hs) for (d = 0; d < vocab[word].codelen; d++) { f = 0; l2 = vocab[word].point[d] * layer1_size; // Propagate hidden -> output for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; //做内积 if (f <= -MAX_EXP) continue; else if (f >= MAX_EXP) continue; else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; // 'g' is the gradient multiplied by the learning rate g = (1 - vocab[word].code[d] - f) * alpha; // Propagate errors output -> hidden for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; // Learn weights hidden -> output for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; } // NEGATIVE SAMPLING if (negative > 0) for (d = 0; d < negative + 1; d++) { if (d == 0) { target = word; label = 1; } else { next_random = next_random * (unsigned long long)25214903917 + 11; target = table[(next_random >> 16) % table_size]; if (target == 0) target = next_random % (vocab_size - 1) + 1; if (target == word) continue; label = 0; } l2 = target * layer1_size; f = 0; for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; if (f > MAX_EXP) g = (label - 1) * alpha; else if (f < -MAX_EXP) g = (label - 0) * alpha; else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1]; } // Learn weights input -> hidden for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; } } sentence_position++; if (sentence_position >= sentence_length) { sentence_length = 0; continue; } } fclose(fi); free(neu1); free(neu1e); pthread_exit(NULL); }
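The annotated listing above glosses over a few numerical tricks, so the standalone sketches below unpack them one at a time. First, the learning-rate schedule: alpha decays linearly with training progress and is floored at 0.0001 of its starting value. A minimal sketch; starting_alpha = 0.025 and the corpus size here are assumed toy values, not taken from a real run:

<pre name="code" class="cpp">/* Toy run of the alpha schedule used in TrainModelThread.
   starting_alpha and train_words are assumed values. */
#include <stdio.h>

int main(void) {
  double starting_alpha = 0.025;
  long long iter = 5, train_words = 1000000;
  for (int p = 0; p <= 4; p++) {
    long long word_count_actual = p * iter * train_words / 4;  /* 0%..100% progress */
    double alpha = starting_alpha * (1 - word_count_actual / (double)(iter * train_words + 1));
    if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    printf("progress %3d%%  alpha %f\n", p * 25, alpha);
  }
  return 0;
}</pre>

At 100% progress the linear term goes essentially to zero, so the floor is what keeps the final updates from vanishing entirely.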
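The recurring line next_random = next_random * 25214903917 + 11 is each thread's private random generator: a linear congruential generator whose multiplier 25214903917 (0x5DEECE66D) is the same one used by drand48 and java.util.Random, with the modulus supplied for free by 64-bit unsigned overflow. A sketch of how the code consumes the bits; the seed here is arbitrary (the real code seeds with the thread id):

<pre name="code" class="cpp">/* Sketch of the in-line LCG used throughout TrainModelThread. */
#include <stdio.h>

int main(void) {
  unsigned long long next_random = 1;  /* arbitrary seed for illustration */
  for (int i = 0; i < 3; i++) {
    next_random = next_random * 25214903917ULL + 11;
    /* subsampling reads the low 16 bits as a uniform draw in [0, 1) ... */
    double u = (next_random & 0xFFFF) / (double)65536;
    /* ... while negative sampling indexes the unigram table with bits 16+ */
    printf("draw %d: u = %f, table bits = %llu\n", i, u, next_random >> 16);
  }
  return 0;
}</pre>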
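The subsampling test is easier to read as a keep probability: a word whose corpus frequency is f survives with probability min(ran, 1), where ran = (sqrt(f / sample) + 1) * sample / f is the same expression as in the code with f standing for vocab[word].cn / train_words. A sketch over assumed example frequencies:

<pre name="code" class="cpp">/* Keep probabilities implied by the subsampling formula above.
   freqs[] are assumed example frequencies, not measured values. */
#include <math.h>
#include <stdio.h>

int main(void) {
  double sample = 1e-3;  /* a typical -sample setting */
  double freqs[] = {1e-5, 1e-4, 1e-3, 1e-2, 1e-1};
  for (int i = 0; i < 5; i++) {
    double f = freqs[i];  /* stands for vocab[word].cn / train_words */
    double ran = (sqrt(f / sample) + 1) * sample / f;
    printf("freq %.0e  keep probability %.3f\n", f, ran > 1 ? 1.0 : ran);
  }
  return 0;
}</pre>

Rare words (ran >= 1) are always kept; at a frequency of 0.1 only about 11% of occurrences survive, which is exactly the "discard frequent words while keeping the ranking" behavior the comment describes.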
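The lookup f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] maps f in (-6, 6) onto a sigmoid table that word2vec.c precomputes once in main(). A self-contained sketch of both the table construction (essentially what main() does) and the lookup, compared against the exact sigmoid:

<pre name="code" class="cpp">#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6

int main(void) {
  float *expTable = (float *)malloc((EXP_TABLE_SIZE + 1) * sizeof(float));
  for (int i = 0; i < EXP_TABLE_SIZE; i++) {
    /* expTable[i] = sigmoid of an evenly spaced point in (-MAX_EXP, MAX_EXP) */
    expTable[i] = (float)exp((i / (double)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);
    expTable[i] = expTable[i] / (expTable[i] + 1);
  }
  double probes[] = {-5.5, -2.0, 0.0, 2.0, 5.5};
  for (int i = 0; i < 5; i++) {
    double f = probes[i];
    float s = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
    printf("f = %5.2f  table = %.4f  exact = %.4f\n", f, s, 1.0 / (1.0 + exp(-f)));
  }
  free(expTable);
  return 0;
}</pre>

Note that EXP_TABLE_SIZE / MAX_EXP / 2 is integer division (83 rather than 83.33), so the mapping is slightly coarse near the top of the range, but the approximation stays within a few thousandths of the exact value.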
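One hierarchical-softmax step, isolated from the loop: at inner node d the training target is label = 1 - vocab[word].code[d], since a Huffman bit of 0 counts as the positive class at that node, so g = (1 - code - f) * alpha is the standard logistic-regression gradient scaled by the learning rate. A toy sketch with made-up 4-dimensional vectors, using the exact sigmoid in place of the lookup table:

<pre name="code" class="cpp">#include <math.h>
#include <stdio.h>

#define DIM 4

int main(void) {
  float alpha = 0.025f;                            /* learning rate */
  float neu1[DIM]    = {0.1f, -0.2f, 0.3f, 0.05f}; /* averaged context vector */
  float synnode[DIM] = {0.0f, 0.1f, -0.1f, 0.2f};  /* one parent-node vector */
  float neu1e[DIM]   = {0};
  int code = 0;  /* Huffman code bit at this node; label = 1 - code */

  float f = 0;
  for (int c = 0; c < DIM; c++) f += neu1[c] * synnode[c];
  f = 1.0f / (1.0f + expf(-f));       /* exact sigmoid instead of the table */
  float g = (1 - code - f) * alpha;   /* gradient times learning rate */
  for (int c = 0; c < DIM; c++) neu1e[c] += g * synnode[c]; /* error -> hidden */
  for (int c = 0; c < DIM; c++) synnode[c] += g * neu1[c];  /* update the node */

  printf("f = %.4f  g = %.6f  neu1e[0] = %.6f\n", f, g, neu1e[0]);
  return 0;
}</pre>

In the real loop this repeats once per node on the word's Huffman path, with neu1e finally added back onto the input vectors.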
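Finally, the table[] that negative sampling draws from with table[(next_random >> 16) % table_size]. word2vec.c fills it in InitUnigramTable(): each word gets a share of slots proportional to count^0.75, which flattens the raw unigram distribution so rare words are sampled somewhat more often than their frequency alone would allow. A scaled-down sketch with made-up counts (the real table has 1e8 slots):

<pre name="code" class="cpp">/* Miniature version of word2vec's InitUnigramTable(). cn[] holds
   assumed toy word counts purely for illustration. */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int table_size = 1000;       /* the real word2vec table has 1e8 slots */
  const double power = 0.75;
  long long cn[] = {900, 90, 9, 1};  /* assumed toy word counts */
  const int vocab_size = 4;

  int *table = (int *)malloc(table_size * sizeof(int));
  double total = 0;
  for (int a = 0; a < vocab_size; a++) total += pow((double)cn[a], power);

  /* Word i owns the stretch of slots up to its cumulative share d1. */
  int i = 0;
  double d1 = pow((double)cn[i], power) / total;
  for (int a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (double)table_size > d1 && i < vocab_size - 1) {
      i++;
      d1 += pow((double)cn[i], power) / total;
    }
  }

  int slots[4] = {0};
  for (int a = 0; a < table_size; a++) slots[table[a]]++;
  for (int a = 0; a < vocab_size; a++)
    printf("word %d: count %lld -> %.1f%% of table\n", a, cn[a], slots[a] / 10.0);
  free(table);
  return 0;
}</pre>

Raw frequencies here would be 90% / 9% / 0.9% / 0.1%; the 0.75 power lifts them to roughly 82% / 15% / 2.6% / 0.4%, so the rarest words still get sampled as negatives now and then.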