您的位置:首页 > 编程语言 > C语言/C++

几种文本相似度算法的C++实现

2014-05-23 18:55 375 查看
1、最小编辑距离

namespace levenshtein
{
/// Returns true when the two characters are considered identical.
bool compare_char_(char c1, char c2)
{
    return c1 == c2;
}

/// Cost of inserting a character. The cost is uniform, so the character
/// itself is not inspected (parameter left unnamed to avoid warnings).
size_t ins_(char /*c*/)
{
    return 1;
}

/// Cost of deleting a character. Uniform cost, character not inspected.
size_t del_(char /*c*/)
{
    return 1;
}

/// Cost of replacing c1 by c2: 0 when they match, otherwise 2, i.e. the
/// same price as one deletion followed by one insertion.
size_t sub_(char c1, char c2)
{
    return compare_char_(c1, c2) ? 0 : 2;
}

/// Computes the weighted edit distance needed to turn ref_s into ref_l,
/// using the costs defined by ins_, del_ and sub_.
///
/// The dynamic-programming table is evaluated row by row; only the previous
/// and the current row are kept, so memory use is proportional to
/// ref_l.length() rather than to the product of both lengths.
///
/// @param ref_s source string (rows of the table)
/// @param ref_l target string (columns of the table)
/// @return total cost of the cheapest edit script
size_t compare_(const std::string& ref_s, const std::string& ref_l)
{
    const size_t len_s = ref_s.length();
    const size_t len_l = ref_l.length();
    const size_t width = len_l + 1;

    // A single allocation holds both rows; it is released before returning.
    // Nothing between the allocation and the delete[] can throw: the cost
    // helpers are plain arithmetic and every string index is in range.
    size_t* const rows = new size_t[2 * width];
    size_t* prev = rows;
    size_t* curr = rows + width;

    // Row 0: building the first j characters of ref_l from an empty prefix.
    prev[0] = 0;
    for(size_t j = 1; j < width; ++j)
    {
        prev[j] = prev[j - 1] + ins_(ref_l[j - 1]);
    }

    for(size_t i = 1; i <= len_s; ++i)
    {
        // Column 0: deleting the first i characters of ref_s.
        curr[0] = prev[0] + del_(ref_s[i - 1]);

        for(size_t j = 1; j < width; ++j)
        {
            const size_t ins = curr[j - 1] + ins_(ref_l[j - 1]);
            const size_t del = prev[j] + del_(ref_s[i - 1]);
            const size_t sub = prev[j - 1] + sub_(ref_s[i - 1], ref_l[j - 1]);

            curr[j] = std::min(std::min(ins, del), sub);
        }

        // The row just filled becomes the "previous" row of the next pass.
        size_t* const finished = curr;
        curr = prev;
        prev = finished;
    }

    // After the final swap (or with no rows processed) the answer is in prev.
    const size_t result = prev[len_l];
    delete[] rows;
    return result;
}

/// Similarity score in [0, 1] derived from the weighted edit distance.
///
/// The distance is normalised by the length of the longer string. Because a
/// substitution costs 2, the distance can reach or exceed that length; such
/// cases are reported as 0. Two empty strings are treated as identical.
///
/// @return 1 for identical strings, 0 for strings with nothing in common
float compare(const std::string& ref1, const std::string& ref2)
{
    if(ref1.empty() && ref2.empty())
    {
        return 1.0f;
    }

    // The shorter string is always passed first, as in the original code.
    const bool first_is_shorter = ref1.length() < ref2.length();
    const std::string& shorter = first_is_shorter ? ref1 : ref2;
    const std::string& longer = first_is_shorter ? ref2 : ref1;

    const size_t distance = compare_(shorter, longer);
    const size_t len = longer.length();

    if(distance >= len)
    {
        return 0.0f;
    }
    return 1.0f - static_cast<float>(distance) / static_cast<float>(len);
}
}	//levenshtein


2、余弦相似度

namespace cosine
{
/// Token -> (occurrences in the first text, occurrences in the second text).
typedef std::map<std::string, std::pair<size_t, size_t> > token_counts_;

/// Decides whether substr forms a complete token.
/// This placeholder accepts every candidate, so each single character
/// (byte) of the input becomes its own token.
bool word_segment_(const std::string& /*substr*/)
{
    return true;
}

/// Splits text into tokens with word_segment_ and adds their occurrences to
/// counts: into the first slot of each pair when is_first is true, into the
/// second slot otherwise.
void count_tokens_(const std::string& text, bool is_first, token_counts_& counts)
{
    for(size_t i = 0, start = 0; i < text.length(); ++i)
    {
        // Grow the candidate one character at a time until it is accepted.
        const std::string token = text.substr(start, i - start + 1);
        if(word_segment_(token))
        {
            std::pair<size_t, size_t>& entry = counts[token];
            if(is_first)
            {
                ++entry.first;
            }
            else
            {
                ++entry.second;
            }
            start = i + 1;
        }
    }
}

/// Cosine similarity of the token-frequency vectors of two strings.
///
/// Two empty strings are treated as identical (1), matching
/// levenshtein::compare. If only one side yields no tokens, its vector has
/// zero magnitude and the result is 0 instead of the NaN that a division by
/// zero would produce.
///
/// @return a value in [0, 1]; 1 when both frequency vectors point the same way
float compare(const std::string& ref1, const std::string& ref2)
{
    if(ref1.empty() && ref2.empty())
    {
        return 1.0f;
    }

    token_counts_ counts;
    count_tokens_(ref1, true, counts);
    count_tokens_(ref2, false, counts);

    // Accumulate in double: avoids integer overflow of the squared counts
    // and keeps more precision than float for the final division.
    double product = 0.0;
    double norm1_sq = 0.0;
    double norm2_sq = 0.0;

    for(token_counts_::const_iterator it = counts.begin(); it != counts.end(); ++it)
    {
        const double a = static_cast<double>(it->second.first);
        const double b = static_cast<double>(it->second.second);
        product += a * b;
        norm1_sq += a * a;
        norm2_sq += b * b;
    }

    // A zero-magnitude vector has no direction; report "no similarity".
    if(norm1_sq == 0.0 || norm2_sq == 0.0)
    {
        return 0.0f;
    }

    // sqrt(n1 * n2) rather than sqrt(n1) * sqrt(n2): one rounding step fewer,
    // so identical inputs score exactly 1.
    return static_cast<float>(product / std::sqrt(norm1_sq * norm2_sq));
}
}	//cosine
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: