您的位置:首页 > 其它

基于字的文本相似度算法——Jaccard算法

2016-10-05 18:42 246 查看
一、算法原理

基于字的文本相似度Jaccard算法的原理是:

(1)计算两个文本中字的交集

(2)计算两个文本中字的并集

(3)交集内的字的个数除以并集内的字的个数即为文本相似度值,即 $J(A,B)=|A\cap B|/|A\cup B|$(下文的两个实现中,重复出现的字按出现次数分别计数)

(4)根据设置的阈值判断是否相似

二、算法的C++实现

这里引用的StringUtil.hpp文件引自:
https://github.com/yanyiwu/cppjieba/blob/master/deps/limonp/StringUtil.hpp
/*
* JaccardSimilarity.hpp
*
* Created: 2016年10月2日
* Author: tang
*/

#ifndef SRC_JACCARD_SIMILARITY_HPP_
#define SRC_JACCARD_SIMILARITY_HPP_
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <set>
#include <string>
#include <vector>
#include "StringUtil.hpp"

using namespace std;

/**
 * Character-level Jaccard similarity between two texts.
 *
 * Both inputs are decoded with utf8ToUnicode (declared in StringUtil.hpp),
 * every code that codeFilter() rejects is discarded, and the remaining codes
 * are compared as sorted sequences. Because the sequences keep duplicates,
 * a character that occurs k times in both texts contributes k to the
 * intersection (multiset semantics of std::set_intersection).
 */
class JaccardSimilarity
{
public:

    JaccardSimilarity()
    {
    }

    /**
     * Computes |intersection| / |union| over the filtered characters.
     *
     * @param str1 first text; passed to utf8ToUnicode unchanged.
     * @param str2 second text; passed to utf8ToUnicode unchanged.
     * @return 0.0 (after printing "TransCode Error" to stdout) when either
     *         decode call reports failure; 1.0 when no character of either
     *         text survives the filter; otherwise the ratio in [0, 1].
     */
    double CalculateTextSimilarity(const std::string &str1, const std::string &str2) const
    {
        std::vector<uint16_t> words_for_str1;
        std::vector<uint16_t> words_for_str2;

        if (!utf8ToUnicode< std::vector<uint16_t> >(str1, words_for_str1) ||
            !utf8ToUnicode< std::vector<uint16_t> >(str2, words_for_str2))
        {
            std::cout << "TransCode Error" << std::endl;
            return 0.;
        }

        dropRejectedCodes(words_for_str1);
        dropRejectedCodes(words_for_str2);

        // Two texts with nothing left to compare are treated as identical.
        if (words_for_str1.empty() && words_for_str2.empty())
            return 1.;

        // std::set_intersection requires sorted input ranges.
        std::sort(words_for_str1.begin(), words_for_str1.end());
        std::sort(words_for_str2.begin(), words_for_str2.end());

        std::vector<uint16_t> words_intersection;
        std::set_intersection(words_for_str1.begin(), words_for_str1.end(),
                              words_for_str2.begin(), words_for_str2.end(),
                              std::back_inserter(words_intersection));

        // For every distinct code, max(m, n) + min(m, n) == m + n, so the
        // multiset union size is |A| + |B| - |A ∩ B|; no need to build it.
        const double inter = static_cast<double>(words_intersection.size());
        const double wunion =
            static_cast<double>(words_for_str1.size() + words_for_str2.size()) - inter;

        return inter / wunion;
    }

    /**
     * Tells whether a code takes part in the comparison.
     *
     * @param code numeric character code.
     * @return true for codes in 0x4e00..0x9fa5, ASCII digits and ASCII
     *         letters; false for everything else.
     */
    static bool codeFilter(int code)
    {
        const bool in_cjk_range = (code >= 0x4e00 && code <= 0x9fa5);
        const bool is_digit = (code >= '0' && code <= '9');
        const bool is_lower = (code >= 'a' && code <= 'z');
        const bool is_upper = (code >= 'A' && code <= 'Z');

        return in_cjk_range || is_digit || is_lower || is_upper;
    }

private:

    // Negated codeFilter, usable as a plain-function predicate.
    static bool isRejectedCode(uint16_t code)
    {
        return !codeFilter(code);
    }

    // Removes, in one linear pass, every code that codeFilter() rejects.
    static void dropRejectedCodes(std::vector<uint16_t> &codes)
    {
        codes.erase(std::remove_if(codes.begin(), codes.end(),
                                   &JaccardSimilarity::isRejectedCode),
                    codes.end());
    }

};

#endif /* SRC_JACCARD_SIMILARITY_HPP_ */

三、算法的java实现
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Character-level Jaccard similarity between two strings.
 *
 * Only characters accepted by {@link #codeFilter(int)} are compared. Repeated
 * characters are counted per occurrence: the n-th occurrence of a character
 * becomes its own token, so "aab" and "ab" share two tokens out of three.
 */
public class JaccardSimilarity {

    public JaccardSimilarity() {
    }

    /**
     * Tells whether a character code takes part in the comparison.
     *
     * @param code numeric character code
     * @return true for codes in 0x4E00..0x9FA5 (19968..40869), ASCII digits
     *         and ASCII letters; false otherwise
     */
    public boolean codeFilter(int code) {
        boolean inCjkRange = code >= 0x4E00 && code <= 0x9FA5;
        boolean isDigit = code >= '0' && code <= '9';
        boolean isLower = code >= 'a' && code <= 'z';
        boolean isUpper = code >= 'A' && code <= 'Z';
        return inCjkRange || isDigit || isLower || isUpper;
    }

    /**
     * Computes |intersection| / |union| over the occurrence tokens of both texts.
     *
     * @param content        first text
     * @param compareContent second text
     * @return 0.0 when either argument is null or when neither text contains
     *         an accepted character; otherwise the ratio in [0, 1]
     */
    public double CalculateTextSim(String content, String compareContent) {
        if (content == null || compareContent == null) {
            return 0.0;
        }

        Set<String> leftTokens = occurrenceTokens(content);
        Set<String> rightTokens = occurrenceTokens(compareContent);

        Set<String> union = new HashSet<String>(leftTokens);
        union.addAll(rightTokens);
        if (union.isEmpty()) {
            return 0.0;
        }

        Set<String> intersection = new HashSet<String>(leftTokens);
        intersection.retainAll(rightTokens);

        return (double) intersection.size() / union.size();
    }

    /**
     * Turns a text into the set of "character + occurrence index" tokens,
     * e.g. "aab" yields {"a1", "a2", "b1"}.
     */
    private Set<String> occurrenceTokens(String text) {
        Map<Character, Integer> seenCounts = new HashMap<Character, Integer>();
        Set<String> tokens = new HashSet<String>();

        for (int i = 0; i < text.length(); i++) {
            // Every accepted code lies in the BMP, so filtering on the char
            // gives the same result as filtering on codePointAt(i).
            char ch = text.charAt(i);
            if (!codeFilter(ch)) {
                continue;
            }
            Integer previous = seenCounts.get(ch);
            int occurrence = (previous == null) ? 1 : previous + 1;
            seenCounts.put(ch, occurrence);
            // ch + "" forces string concatenation rather than char arithmetic.
            tokens.add(ch + "" + occurrence);
        }
        return tokens;
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  算法