您的位置:首页 > 其它

fasttext源码剖析

2017-07-13 23:53 316 查看

fasttext源码剖析

目的:记录结合多方资料以及个人理解的剖析代码;
https://heleifz.github.io/14732610572844.html http://www.cnblogs.com/peghoty/p/3857839.html
一:代码总体模块关联图:

/**
* Copyright (c) 2016-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*/

#include "fasttext.h"

#include <math.h>

#include <iostream>
#include <iomanip>
#include <thread>
#include <string>
#include <vector>
#include <algorithm>

namespace fasttext {
// Compute the vector for `word` as the mean of its character n-gram rows.
// Works for out-of-vocabulary words too, since they still have n-grams.
void FastText::getVector(Vector& vec, const std::string& word) {
  const std::vector<int32_t>& ngrams = dict_->getNgrams(word);
  vec.zero();
  // Accumulate the embedding row of every n-gram of the word.
  for (int32_t id : ngrams) {
    vec.addRow(*input_, id);
  }
  // Average so words with many n-grams stay on the same scale.
  if (!ngrams.empty()) {
    vec.mul(1.0 / ngrams.size());
  }
}
// Write the word vectors to "<output>.vec" in the standard text format:
// a "<nwords> <dim>" header, then one "<word> <v1> ... <vdim>" line per
// in-vocabulary word.
void FastText::saveVectors() {
  std::ofstream ofs(args_->output + ".vec");
  if (!ofs.is_open()) {
    // Fix: report the error on stderr, consistent with saveModel() and
    // loadModel() — the original wrote this diagnostic to stdout.
    std::cerr << "Error opening file for saving vectors." << std::endl;
    exit(EXIT_FAILURE);
  }
  ofs << dict_->nwords() << " " << args_->dim << std::endl;
  Vector vec(args_->dim);
  for (int32_t i = 0; i < dict_->nwords(); i++) {
    std::string word = dict_->getWord(i);
    getVector(vec, word);  // word vector = mean of its n-gram embeddings
    ofs << word << " " << vec << std::endl;
  }
  ofs.close();
}
// Serialize the full model (hyper-parameters, dictionary, input and
// output matrices, in that order) to the binary file "<output>.bin".
void FastText::saveModel() {
  std::ofstream out(args_->output + ".bin", std::ofstream::binary);
  if (!out.is_open()) {
    std::cerr << "Model file cannot be opened for saving!" << std::endl;
    exit(EXIT_FAILURE);
  }
  // Write order must match the read order in loadModel(std::istream&).
  args_->save(out);
  dict_->save(out);
  input_->save(out);
  output_->save(out);
  out.close();
}
// Open a binary model file by name and delegate to the stream overload.
void FastText::loadModel(const std::string& filename) {
  std::ifstream in(filename, std::ifstream::binary);
  if (!in.is_open()) {
    std::cerr << "Model file cannot be opened for loading!" << std::endl;
    exit(EXIT_FAILURE);
  }
  loadModel(in);
  in.close();
}

void FastText::loadModel(std::istream& in) {
args_ = std::make_shared<Args>();
dict_ = std::make_shared<Dictionary>(args_);
input_ = std::make_shared<Matrix>();
output_ = std::make_shared<Matrix>();
args_->load(in);
dict_->load(in);
input_->load(in);
output_->load(in);
model_ = std::make_shared<Model>(input_, output_, args_, 0);//传的是指针,改变可以带回
if (args_->model == model_name::sup) {//构建模型的过程
model_->setTargetCounts(dict_->getCounts(entry_type::label));
} else {
model_->setTargetCounts(dict_->getCounts(entry_type::word));
}
}
// Print a one-line progress report (progress %, words/sec/thread, lr,
// loss, ETA) to stdout, overwriting the previous line via '\r'.
// progress is in [0, 1]; loss is the model's current average loss.
void FastText::printInfo(real progress, real loss) {
  real t = real(clock() - start) / CLOCKS_PER_SEC;  // elapsed seconds
  // Fix: guard the divisions. Right after training starts t may be 0
  // (wst would be inf) and progress may be 0 — converting the resulting
  // infinite eta to int is undefined behaviour.
  real wst = (t > 0) ? real(tokenCount) / t : 0.0;  // words/sec/thread basis
  real lr = args_->lr * (1.0 - progress);  // linearly decayed learning rate
  int eta = (progress > 0)
      ? int(t / progress * (1 - progress) / args_->thread)  // remaining secs
      : 0;
  int etah = eta / 3600;
  int etam = (eta - etah * 3600) / 60;
  std::cout << std::fixed;
  std::cout << "\rProgress: " << std::setprecision(1) << 100 * progress << "%";
  std::cout << "  words/sec/thread: " << std::setprecision(0) << wst;
  std::cout << "  lr: " << std::setprecision(6) << lr;
  std::cout << "  loss: " << std::setprecision(6) << loss;
  std::cout << "  eta: " << etah << "h" << etam << "m ";
  std::cout << std::flush;
}

// Supervised update: sample one of the example's labels uniformly at
// random and train the model to predict it from the whole line
// (bag of word/n-gram ids).
void FastText::supervised(Model& model, real lr,
                          const std::vector<int32_t>& line,
                          const std::vector<int32_t>& labels) {
  if (labels.empty() || line.empty()) {
    return;  // nothing to learn from
  }
  std::uniform_int_distribution<> pick(0, labels.size() - 1);
  model.update(line, labels[pick(model.rng)], lr);
}
// CBOW: for every position in the line, predict the centre word from
// the n-grams of the words inside a randomly sized context window.
void FastText::cbow(Model& model, real lr,
                    const std::vector<int32_t>& line) {
  std::vector<int32_t> context;
  std::uniform_int_distribution<> windowDist(1, args_->ws);
  for (int32_t pos = 0; pos < line.size(); pos++) {
    // Sample a fresh window width per token (word2vec-style), so nearer
    // words are effectively weighted more over the whole corpus.
    int32_t window = windowDist(model.rng);
    context.clear();
    for (int32_t off = -window; off <= window; off++) {
      if (off != 0 && pos + off >= 0 && pos + off < line.size()) {
        // The context is the union of the neighbours' n-gram ids.
        const std::vector<int32_t>& ngrams = dict_->getNgrams(line[pos + off]);
        context.insert(context.end(), ngrams.cbegin(), ngrams.cend());
      }
    }
    model.update(context, line[pos], lr);  // predict centre word from context
  }
}
// Skip-gram: for every position, use the centre word's n-grams as input
// and each word inside a randomly sized window as a prediction target.
void FastText::skipgram(Model& model, real lr,
                        const std::vector<int32_t>& line) {
  std::uniform_int_distribution<> windowDist(1, args_->ws);
  for (int32_t pos = 0; pos < line.size(); pos++) {
    int32_t window = windowDist(model.rng);  // per-token random window
    const std::vector<int32_t>& ngrams = dict_->getNgrams(line[pos]);
    for (int32_t off = -window; off <= window; off++) {
      if (off != 0 && pos + off >= 0 && pos + off < line.size()) {
        // One update per (centre n-grams, neighbour) pair.
        model.update(ngrams, line[pos + off], lr);
      }
    }
  }
}
//测试模型
void FastText::test(std::istream& in, int32_t k) {
int32_t nexamples = 0, nlabels = 0;
double precision = 0.0;
std::vector<int32_t> line, labels;

while (in.peek() != EOF) {
dict_->getLine(in, line, labels, model_->rng);//获取句子
dict_->addNgrams(line, args_->wordNgrams);//对句子增加其ngram
if (labels.size() > 0 && line.size() > 0) {
std::vector<std::pair<real, int32_t>> modelPredictions;
model_->predict(line, k, modelPredictions);//预测
for (auto it = modelPredictions.cbegin(); it != modelPredictions.cend(); it++) {
if (std::find(labels.begin(), labels.end(), it->second) != labels.end()) {
precision += 1.0;//准确数
}
}
nexamples++;
nlabels += labels.size();
}
}
std::cout << std::setprecision(3);
std::cout << "P@" << k << ": " << precision / (k * nexamples) << std::endl;
std::cout << "R@" << k << ": " << precision / nlabels << std::endl;
std::cout << "Number of examples: " << nexamples << std::endl;
}
//预测
void FastText::predict(std::istream& in, int32_t k,
std::vector<std::pair<real,std::string>>& predictions) const {
std::vector<int32_t> words, labels;
dict_->getLine(in, words, labels, model_->rng);
dict_->addNgrams(words, args_->wordNgrams);
if (words.empty()) return;
Vector hidden(args_->dim);
Vector output(dict_->nlabels());
std::vector<std::pair<real,int32_t>> modelPredictions;
model_->predict(words, k, modelPredictions, hidden, output);
predictions.clear();
for (auto it = modelPredictions.cbegin(); it != modelPredictions.cend(); it++) {
predictions.push_back(std::make_pair(it->first, dict_->getLabel(it->second)));//不同标签的预测分
}
}
//预测
void FastText::predict(std::istream& in, int32_t k, bool print_prob) {
std::vector<std::pair<real,std::string>> predictions;
while (in.peek() != EOF) {
predict(in, k, predictions);
if (predictions.empty()) {
std::cout << "n/a" << std::endl;
continue;
}
for (auto it = predictions.cbegin(); it != predictions.cend(); it++) {
if (it != predictions.cbegin()) {
std::cout << ' ';
}
std::cout << it->second;
if (print_prob) {
std::cout << ' ' << exp(it->first);
}
}
std::cout << std::endl;
}
}
// Interactive mode: read words from stdin and print each word followed
// by its vector. Handles out-of-vocabulary words via their n-grams.
void FastText::wordVectors() {
  std::string query;
  Vector vec(args_->dim);
  while (std::cin >> query) {
    getVector(vec, query);
    std::cout << query << " " << vec << std::endl;
  }
}
// Read sentences from stdin and print one vector per sentence, computed
// as the mean of the embeddings of its words and word n-grams.
void FastText::textVectors() {
  std::vector<int32_t> line, labels;
  Vector vec(args_->dim);
  while (std::cin.peek() != EOF) {
    dict_->getLine(std::cin, line, labels, model_->rng);
    dict_->addNgrams(line, args_->wordNgrams);  // add word n-gram ids
    vec.zero();
    // Sum the rows of every token (word or n-gram) in the sentence.
    for (int32_t id : line) {
      vec.addRow(*input_, id);
    }
    if (!line.empty()) {
      vec.mul(1.0 / line.size());  // mean pooling
    }
    std::cout << vec << std::endl;
  }
}

// Dispatch the stdin vector-printing mode: supervised models emit one
// vector per sentence, embedding models one vector per word.
void FastText::printVectors() {
  if (args_->model != model_name::sup) {
    wordVectors();
  } else {
    textVectors();
  }
}
// Per-thread training loop (hogwild style: all threads update the
// shared input_/output_ matrices without locking). Each thread opens
// its own stream on the input file and starts reading at its own slice
// so the threads cover the corpus roughly evenly; global progress is
// coordinated through the shared tokenCount member.
void FastText::trainThread(int32_t threadId) {
  std::ifstream ifs(args_->input);
  // Seek to this thread's starting offset: slice i of `thread` slices.
  utils::seek(ifs, threadId * utils::size(ifs) / args_->thread);

  // Thread-local Model wrapper; the matrices behind it are shared.
  Model model(input_, output_, args_, threadId);
  if (args_->model == model_name::sup) {
    model.setTargetCounts(dict_->getCounts(entry_type::label));
  } else {
    model.setTargetCounts(dict_->getCounts(entry_type::word));
  }

  const int64_t ntokens = dict_->ntokens();
  int64_t localTokenCount = 0;
  std::vector<int32_t> line, labels;
  // Run until the global token count reaches epoch * corpus size.
  while (tokenCount < args_->epoch * ntokens) {
    real progress = real(tokenCount) / (args_->epoch * ntokens);
    real lr = args_->lr * (1.0 - progress);  // linear learning-rate decay
    localTokenCount += dict_->getLine(ifs, line, labels, model.rng);
    // Dispatch to the update routine for the configured model type.
    if (args_->model == model_name::sup) {
      dict_->addNgrams(line, args_->wordNgrams);  // supervised also uses word n-grams
      supervised(model, lr, line, labels);
    } else if (args_->model == model_name::cbow) {
      cbow(model, lr, line);
    } else if (args_->model == model_name::sg) {
      skipgram(model, lr, line);
    }
    // Periodically publish local progress so every thread sees the
    // decayed learning rate; thread 0 also refreshes the console line.
    if (localTokenCount > args_->lrUpdateRate) {
      tokenCount += localTokenCount;
      localTokenCount = 0;
      if (threadId == 0 && args_->verbose > 1) {
        printInfo(progress, model.getLoss());
      }
    }
  }
  if (threadId == 0 && args_->verbose > 0) {
    printInfo(1.0, model.getLoss());
    std::cout << std::endl;
  }
  ifs.close();
}
// Initialise the input matrix from a pretrained .vec file (text format:
// a "<n> <dim>" header followed by one "<word> <v1> ... <vdim>" line per
// word). Every pretrained word is added to the dictionary; rows for
// words that survive dictionary thresholding are copied into input_.
void FastText::loadVectors(std::string filename) {
  std::ifstream in(filename);
  std::vector<std::string> words;
  std::shared_ptr<Matrix> mat; // temp. matrix for pretrained vectors
  int64_t n, dim;
  if (!in.is_open()) {
    std::cerr << "Pretrained vectors file cannot be opened!" << std::endl;
    exit(EXIT_FAILURE);
  }
  in >> n >> dim;
  if (dim != args_->dim) {
    std::cerr << "Dimension of pretrained vectors does not match -dim option"
              << std::endl;
    exit(EXIT_FAILURE);
  }
  mat = std::make_shared<Matrix>(n, dim);
  words.reserve(n);  // one allocation instead of repeated growth
  // Fix: use int64_t loop indices matching n/dim — the original compared
  // size_t against int64_t, and a negative n would wrap to a huge
  // unsigned bound instead of skipping the loop.
  for (int64_t i = 0; i < n; i++) {
    std::string word;
    in >> word;
    words.push_back(word);
    dict_->add(word);
    for (int64_t j = 0; j < dim; j++) {
      in >> mat->data_[i * dim + j];
    }
  }
  in.close();

  // Keep every pretrained word regardless of corpus frequency.
  dict_->threshold(1, 0);
  input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim);
  input_->uniform(1.0 / args_->dim);  // random init for the n-gram buckets

  // Overwrite the rows of known words with their pretrained vectors.
  for (int64_t i = 0; i < n; i++) {
    int32_t idx = dict_->getId(words[i]);
    if (idx < 0 || idx >= dict_->nwords()) continue;
    for (int64_t j = 0; j < dim; j++) {
      input_->data_[idx * dim + j] = mat->data_[i * dim + j];
    }
  }
}
//训练
void FastText::train(std::shared_ptr<Args> args) {
args_ = args;
dict_ = std::make_shared<Dictionary>(args_);
if (args_->input == "-") {
// manage expectations
std::cerr << "Cannot use stdin for training!" << std::endl;
exit(EXIT_FAILURE);
}
std::ifstream ifs(args_->input);
if (!ifs.is_open()) {
std::cerr << "Input file cannot be opened!" << std::endl;
exit(EXIT_FAILURE);
}
dict_->readFromFile(ifs);
ifs.close();

if (args_->pretrainedVectors.size() != 0) {
loadVectors(args_->pretrainedVectors);
} else {
input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim);
input_->uniform(1.0 / args_->dim);
}

if (args_->model == model_name::sup) {
output_ = std::make_shared<Matrix>(dict_->nlabels(), args_->dim);
} else {
output_ = std::make_shared<Matrix>(dict_->nwords(), args_->dim);
}
output_->zero();

start = clock();
tokenCount = 0;
std::vector<std::thread> threads;
for (int32_t i = 0; i < args_->thread; i++) {
threads.push_back(std::thread([=]() { trainThread(i); }));
}
for (auto it = threads.begin(); it != threads.end(); ++it) {
it->join();
}
model_ = std::make_shared<Model>(input_, output_, args_, 0);

saveModel();
if (args_->model != model_name::sup) {
saveVectors();
}
}

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: