您的位置:首页 > 其它

计算两个中文字符串相似度——编辑距离算法

2014-09-12 21:05 405 查看
Levenshtein 距离,又称编辑距离,指的是两个字符串之间,由一个转换成另一个所需的最少编辑操作次数。

许可的编辑操作包括将一个字符替换成另一个字符,插入一个字符,删除一个字符。

编辑距离的算法是首先由俄国科学家Levenshtein提出的,故又叫Levenshtein Distance。

1、Java

/**
 * Computes the Levenshtein (edit) distance between two strings with the
 * classic full-matrix dynamic programme and prints the distance and a
 * normalised similarity score to standard output.
 *
 * <p>Similarity is {@code 1 - distance / max(len1, len2)}; two empty strings
 * are reported as identical (similarity 1.0) instead of dividing by zero.
 *
 * @param str1 first string, must not be {@code null}
 * @param str2 second string, must not be {@code null}
 */
public static void levenshtein(String str1, String str2) {
    final int len1 = str1.length();
    final int len2 = str2.length();

    // dif[i][j] = edit distance between the first i chars of str1 and the
    // first j chars of str2; one extra row/column represents the empty prefix.
    int[][] dif = new int[len1 + 1][len2 + 1];

    // Turning a prefix into the empty string costs one deletion per character.
    for (int a = 0; a <= len1; a++) {
        dif[a][0] = a;
    }
    for (int a = 0; a <= len2; a++) {
        dif[0][a] = a;
    }

    for (int i = 1; i <= len1; i++) {
        final char c1 = str1.charAt(i - 1);
        for (int j = 1; j <= len2; j++) {
            // Substitution is free when the two characters already match.
            final int temp = (c1 == str2.charAt(j - 1)) ? 0 : 1;
            // Cheapest of: substitute (diagonal), insert (left), delete (up).
            dif[i][j] = Math.min(dif[i - 1][j - 1] + temp,
                    Math.min(dif[i][j - 1] + 1, dif[i - 1][j] + 1));
        }
    }

    System.out.println("字符串\"" + str1 + "\"与\"" + str2 + "\"的比较");
    // The bottom-right cell holds the distance between the complete strings.
    System.out.println("差异步骤:" + dif[len1][len2]);

    // Guard the division: with two empty strings max length is 0 and the
    // original expression evaluated to NaN.
    final int maxLen = Math.max(len1, len2);
    float similarity = (maxLen == 0) ? 1.0f : 1 - (float) dif[len1][len2] / maxLen;
    System.out.println("相似度:" + similarity);
}
</span></span>


2、LotusScript

Function toCompute(str1 As String ,str2 As String) As Double
	' Returns a similarity score derived from the Levenshtein distance of
	' str1 and str2: 1 - distance / (longer compared length).
	' Returns 0 when either argument is the empty string.
	' Only the first 120 characters of each string are compared, because the
	' work matrix below is fixed at 121 x 121 cells.
	' Relies on a three-argument min() helper that is defined outside this function.

	Dim rows As Integer
	Dim cols As Integer
	Dim longest As Integer
	Dim r As Long
	Dim c As Long
	Dim cost As Long
	Dim score As Double

	' Guard clause: nothing to compare.
	If str1 = "" Or str2 = "" Then
		toCompute = 0
		Exit Function
	End If

	Dim dif(0 To 120, 0 To 120) As Integer

	' Clamp both lengths to the capacity of the matrix.
	rows = Len(str1)
	If rows > 120 Then rows = 120
	cols = Len(str2)
	If cols > 120 Then cols = 120

	longest = cols
	If rows > cols Then longest = rows

	' First column / first row: distance to the empty prefix.
	For r = 0 To rows
		dif(r, 0) = r
	Next
	For c = 0 To cols
		dif(0, c) = c
	Next

	For r = 1 To rows
		For c = 1 To cols
			' Mid$(s, k, 1) is the k-th character (1-based).
			If Mid$(str1, r, 1) = Mid$(str2, c, 1) Then
				cost = 0
			Else
				cost = 1
			End If

			' Cheapest of substitute (diagonal), insert (left), delete (up).
			dif(r, c) = min(dif(r - 1, c - 1) + cost, dif(r, c - 1) + 1, dif(r - 1, c) + 1)
		Next
	Next

	score = 1 - dif(rows, cols) / longest
	toCompute = score
End Function
</span></span>


优化(先去掉两个字符串的公共前缀和公共后缀,再用一维滚动数组代替二维矩阵,只对中间不同的部分计算编辑距离)

1、Visual Basic

Module Module1

    ' Demo entry point: prints the edit distance between two sample strings.
    Sub Main()
        Dim str1 As String = "今天是星期五"
        Dim str2 As String = "明天星期四"

        Dim dis As New clsDistance(str1, str2)
        Dim result As Integer = dis.CacuDistance()

        Console.WriteLine(result)
    End Sub

    ' Computes the Levenshtein (edit) distance between two strings.
    ' The common prefix and common suffix are skipped first, then a single
    ' rolling row is used for the remaining middle sections, so memory use is
    ' proportional to the differing part of the first string only.
    Public Class clsDistance
        Private mCharA() As Char
        Private mCharB() As Char
        Private mCharALen As Integer
        Private mCharBLen As Integer

        ' StrA / StrB: the strings to compare. Nothing is treated as "".
        Public Sub New(ByVal StrA As String, ByVal StrB As String)
            If StrA Is Nothing Then StrA = ""
            If StrB Is Nothing Then StrB = ""

            mCharA = StrA.ToCharArray
            mCharB = StrB.ToCharArray
            mCharALen = mCharA.Length
            mCharBLen = mCharB.Length
        End Sub

        ' Returns the minimum number of single-character insertions,
        ' deletions and substitutions that turn string A into string B.
        ' Produces no console output.
        Public Function CacuDistance() As Integer
            ' Against an empty string the distance is the other length.
            If mCharALen = 0 Then Return mCharBLen
            If mCharBLen = 0 Then Return mCharALen

            Dim i As Integer
            Dim j As Integer = Min(mCharALen, mCharBLen) - 1
            Dim tP1 As Integer = -1   ' length of the common prefix (-1 = not found yet)
            Dim tP2 As Integer = -1   ' length of the common suffix (-1 = not found yet)

            ' Locate the first position where the strings differ.
            For i = 0 To j
                If mCharA(i) <> mCharB(i) Then
                    tP1 = i
                    Exit For
                End If
            Next

            ' One string is a prefix of the other: only the extra tail differs.
            If tP1 = -1 Then Return Math.Abs(mCharALen - mCharBLen)

            ' Walk backwards from the ends, never crossing into the prefix.
            For i = 0 To j - tP1
                If mCharA(mCharALen - i - 1) <> mCharB(mCharBLen - i - 1) Then
                    tP2 = i
                    Exit For
                End If
            Next

            ' Prefix + suffix cover the whole shorter string.
            If tP2 = -1 Then Return Math.Abs(mCharALen - mCharBLen)

            ' tA is one DP row over the middle part of A (upper bound = its length).
            Dim tA(mCharALen - tP1 - tP2) As Integer
            For i = 0 To tA.GetUpperBound(0)
                tA(i) = i
            Next

            Dim last As Integer = tA.GetUpperBound(0)
            Dim tN1 As Integer, tN2 As Integer, tN3 As Integer

            ' Both middle sections are scanned from their last character to
            ' their first; the distance of reversed strings is the same.
            For i = 0 To mCharBLen - tP1 - tP2 - 1
                Dim chB As Char = mCharB(mCharBLen - tP2 - i - 1)

                tN1 = tA(0)        ' diagonal neighbour (previous row, column j-1)
                tN2 = tN1 + 1      ' left neighbour (current row, column j-1)

                For j = 1 To last
                    If mCharA(mCharALen - tP2 - j) = chB Then
                        ' Matching characters: carry the diagonal value over.
                        tN3 = tN1
                    Else
                        tN3 = Min(tA(j), tN1, tN2) + 1
                    End If

                    ' Column j-1 of the previous row is no longer needed,
                    ' so it can be overwritten with the current row's value.
                    tA(j - 1) = tN2
                    tN2 = tN3
                    tN1 = tA(j)
                Next

                tA(last) = tN2
            Next

            Return tA(last)
        End Function

        ' Returns the smallest of the supplied integers (0 when none are given).
        Public Function Min(ByVal ParamArray Num() As Integer) As Integer
            If Num.Length = 0 Then Return Nothing

            Dim tN As Integer = Num(0)
            For i As Integer = 1 To Num.GetUpperBound(0)
                If Num(i) < tN Then tN = Num(i)
            Next

            Return tN
        End Function

    End Class

End Module
</span>


2、Java

public static int clsDistance(String str1, String str2) {

int j;
int i;

int mCharALen, mCharBLen;

mCharALen = str1.length();
mCharBLen = str2.length();

int tp1 = -1;
int tp2 = -1;

j = Math.min(mCharALen , mCharBLen) - 1;

for (i = 0; i <= j; i++) {
if (str1.charAt(i) != str2.charAt(i)) {
tp1 = i;
break;
}

}

if (tp1 == -1) {
return Math.abs(mCharBLen - mCharALen);
}

for (i = 0; i <= j - tp1; i++) {

if (str1.charAt(mCharALen - i - 1) != str2.charAt(mCharBLen - i
- 1)) {
tp2 = i;
break;
}
}

if (tp2 == -1) {
return Math.abs(mCharALen - mCharBLen);
}
int taBound = mCharALen - tp1 - tp2;

int tA[] = new int[taBound + 1];

for (i = 0; i < tA.length; i++) {
tA[i] = i ;

}
System.out.println(Arrays.toString(tA));
int tN1, tN2, tN3;

for (i = 0; i < mCharBLen - tp1 - tp2 ; i++) {
tN1 = tA[0];
tN2 = tN1 + 1;

System.out.println("\n" + i + " " + str2.charAt(mCharBLen
- tp2 - i - 1));

for (j = 1; j < tA.length  ; j++) {

System.out.print(str1.charAt(mCharALen - tp2 - j ) +"	");

if (str1.charAt(mCharALen - tp2 - j  ) == str2.charAt(mCharBLen
- tp2 - i - 1)) {

tN3 = tN1;
} else {
tN3 = Math.min(tA[j], Math.min(tN1, tN2)) + 1;

}

tA[j - 1] = tN2;
tN2 = tN3;
tN1 = tA[j];

System.out.println("\ntN1 = " + tN1);
System.out.println("tN2 = " + tN2);
System.out.println("tN3 = " + tN3);
}

tA[tA.length - 1] = tN2;
System.out.println("\n"+tA[tA.length - 1] );
}

System.out.println("\n" +Arrays.toString(tA));
return tA[tA.length - 1];

}</span>


3、LotusScript

%REM
	Function clsDistance
	Description: Returns a similarity score for str1 and str2 computed as
	1 - (Levenshtein distance / length of the longer string).
	Returns 0 when either argument is the empty string.
	The common prefix and common suffix are skipped and only the differing
	middle sections are run through a single-row dynamic programme.
	Relies on a three-argument Min() helper defined outside this function.
%END REM
Function clsDistance(str1 As String ,str2 As String) As Double
	Dim mCharALen As Integer
	Dim mCharBLen As Integer
	Dim maxlen As Integer
	Dim i As Integer
	Dim j As Integer
	' Each variable needs its own type: "Dim a, b As Integer" would leave a as Variant.
	Dim tP1 As Integer
	Dim tP2 As Integer
	Dim tN1 As Integer, tN2 As Integer, tN3 As Integer
	Dim tABound As Integer
	Dim midBLen As Integer
	Dim chB As String
	Dim simularity As Double

	' Nothing to compare: report zero similarity before doing any work.
	If str1 = "" Or str2 = "" Then
		clsDistance = 0
		Exit Function
	End If

	mCharALen = Len(str1)
	mCharBLen = Len(str2)

	' maxlen normalises the distance; j is the last 0-based index of the shorter string.
	If mCharALen > mCharBLen Then
		maxlen = mCharALen
		j = mCharBLen - 1
	Else
		maxlen = mCharBLen
		j = mCharALen - 1
	End If

	tP1 = -1
	tP2 = -1

	' Length of the common prefix = first 0-based position where the strings differ.
	For i = 0 To j
		If Mid$(str1, i + 1, 1) <> Mid$(str2, i + 1, 1) Then
			tP1 = i
			Exit For
		End If
	Next

	' One string is a prefix of the other: the distance is the length difference.
	If tP1 = -1 Then
		clsDistance = 1 - Abs(mCharALen - mCharBLen) / maxlen
		Exit Function
	End If

	' Length of the common suffix, never reaching back into the prefix.
	For i = 0 To j - tP1
		If Mid$(str1, mCharALen - i, 1) <> Mid$(str2, mCharBLen - i, 1) Then
			tP2 = i
			Exit For
		End If
	Next

	' Prefix plus suffix cover the whole shorter string.
	If tP2 = -1 Then
		clsDistance = 1 - Abs(mCharALen - mCharBLen) / maxlen
		Exit Function
	End If

	' One DP row over the middle section of str1, sized to what is needed
	' instead of a fixed 15000-element buffer.
	tABound = mCharALen - tP1 - tP2
	midBLen = mCharBLen - tP1 - tP2

	Dim tA() As Integer
	ReDim tA(0 To tABound)

	For i = 0 To tABound
		tA(i) = i
	Next

	' Both middle sections are scanned from their last character to their
	' first; the distance of the reversed strings is the same.
	For i = 0 To midBLen - 1
		chB = Mid$(str2, mCharBLen - tP2 - i, 1)

		tN1 = tA(0)       ' diagonal neighbour (previous row, column j-1)
		tN2 = tN1 + 1     ' left neighbour (current row, column j-1)

		For j = 1 To tABound
			If Mid$(str1, mCharALen - tP2 - j + 1, 1) = chB Then
				' Matching characters: carry the diagonal value over.
				tN3 = tN1
			Else
				tN3 = Min(tA(j), tN1, tN2) + 1
			End If

			' Column j-1 of the previous row is no longer needed, so it is
			' overwritten with the current row's value.
			tA(j - 1) = tN2
			tN2 = tN3
			tN1 = tA(j)
		Next

		tA(tABound) = tN2
	Next

	simularity = 1 - tA(tABound) / maxlen

	clsDistance = simularity
End Function
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: