You*_*usf 55 excel vba excel-vba levenshtein-distance
我有excel表格的数据,我想得到Levenshtein他们之间的距离.我已经尝试导出为文本,从脚本(php)读入,运行Levenshtein(计算Levenshtein距离),再次将其保存为excel.
但我正在寻找一种以编程方式计算VBA中的Levenshtein距离的方法.我该怎么做呢?
smi*_*man 57
维基百科翻译:
Option Explicit
Public Function Levenshtein(s1 As String, s2 As String)
Dim i As Integer
Dim j As Integer
Dim l1 As Integer
Dim l2 As Integer
Dim d() As Integer
Dim min1 As Integer
Dim min2 As Integer
l1 = Len(s1)
l2 = Len(s2)
ReDim d(l1, l2)
For i = 0 To l1
d(i, 0) = i
Next
For j = 0 To l2
d(0, j) = j
Next
For i = 1 To l1
For j = 1 To l2
If Mid(s1, i, 1) = Mid(s2, j, 1) Then
d(i, j) = d(i - 1, j - 1)
Else
min1 = d(i - 1, j) + 1
min2 = d(i, j - 1) + 1
If min2 < min1 Then
min1 = min2
End If
min2 = d(i - 1, j - 1) + 1
If min2 < min1 Then
min1 = min2
End If
d(i, j) = min1
End If
Next
Next
Levenshtein = d(l1, l2)
End Function
Run Code Online (Sandbox Code Playgroud)
?莱文斯坦( "星期六", "星期天")
3
aev*_*nko 29
感谢smirkingman为好的代码发布.这是一个优化版本.
1)使用Asc(Mid $(s1,i,1)代替.数值比较通常比文本更快.
2)使用Mid $而不是Mid,因为后者是变体ver.并添加$是字符串ver.
3)使用应用功能min.(仅限个人喜好)
4)使用Long而不是Integers,因为它是excel本身使用的.
Function Levenshtein(ByVal string1 As String, ByVal string2 As String) As Long
Dim i As Long, j As Long
Dim string1_length As Long
Dim string2_length As Long
Dim distance() As Long
string1_length = Len(string1)
string2_length = Len(string2)
ReDim distance(string1_length, string2_length)
For i = 0 To string1_length
distance(i, 0) = i
Next
For j = 0 To string2_length
distance(0, j) = j
Next
For i = 1 To string1_length
For j = 1 To string2_length
If Asc(Mid$(string1, i, 1)) = Asc(Mid$(string2, j, 1)) Then
distance(i, j) = distance(i - 1, j - 1)
Else
distance(i, j) = Application.WorksheetFunction.Min _
(distance(i - 1, j) + 1, _
distance(i, j - 1) + 1, _
distance(i - 1, j - 1) + 1)
End If
Next
Next
Levenshtein = distance(string1_length, string2_length)
End Function
Run Code Online (Sandbox Code Playgroud)
更新:
对于那些想要它的人:我认为可以说大多数人使用Levenshtein距离来计算模糊匹配百分比.这是一种方法,我添加了一个可以指定min的优化.匹配%返回(默认为70%+.输入百分比,如"50"或"80",或"0"运行公式,无论如何).
速度提升来自这样一个事实:函数将通过检查2个字符串的长度来检查它是否可能在你给它的百分比内.请注意,在某些方面可以优化此功能,但为了便于阅读,我保留了此功能.我将结果中的距离连接起来以证明功能,但你可以摆弄它:)
Function FuzzyMatch(ByVal string1 As String, _
ByVal string2 As String, _
Optional min_percentage As Long = 70) As String
Dim i As Long, j As Long
Dim string1_length As Long
Dim string2_length As Long
Dim distance() As Long, result As Long
string1_length = Len(string1)
string2_length = Len(string2)
' Check if not too long
If string1_length >= string2_length * (min_percentage / 100) Then
' Check if not too short
If string1_length <= string2_length * ((200 - min_percentage) / 100) Then
ReDim distance(string1_length, string2_length)
For i = 0 To string1_length: distance(i, 0) = i: Next
For j = 0 To string2_length: distance(0, j) = j: Next
For i = 1 To string1_length
For j = 1 To string2_length
If Asc(Mid$(string1, i, 1)) = Asc(Mid$(string2, j, 1)) Then
distance(i, j) = distance(i - 1, j - 1)
Else
distance(i, j) = Application.WorksheetFunction.Min _
(distance(i - 1, j) + 1, _
distance(i, j - 1) + 1, _
distance(i - 1, j - 1) + 1)
End If
Next
Next
result = distance(string1_length, string2_length) 'The distance
End If
End If
If result <> 0 Then
FuzzyMatch = (CLng((100 - ((result / string1_length) * 100)))) & _
"% (" & result & ")" 'Convert to percentage
Else
FuzzyMatch = "Not a match"
End If
End Function
Run Code Online (Sandbox Code Playgroud)
小智 23
使用字节数组获得17倍的速度增益
Option Explicit
Public Declare Function GetTickCount Lib "kernel32" () As Long
Sub test()
Dim s1 As String, s2 As String, lTime As Long, i As Long
s1 = Space(100)
s2 = String(100, "a")
lTime = GetTickCount
For i = 1 To 100
LevenshteinStrings s1, s2 ' the original fn from Wikibooks and Stackoverflow
Next
Debug.Print GetTickCount - lTime; " ms" ' 3900 ms for all diff
lTime = GetTickCount
For i = 1 To 100
Levenshtein s1, s2
Next
Debug.Print GetTickCount - lTime; " ms" ' 234 ms
End Sub
'Option Base 0 assumed
'POB: fn with byte array is 17 times faster
Function Levenshtein(ByVal string1 As String, ByVal string2 As String) As Long
Dim i As Long, j As Long, bs1() As Byte, bs2() As Byte
Dim string1_length As Long
Dim string2_length As Long
Dim distance() As Long
Dim min1 As Long, min2 As Long, min3 As Long
string1_length = Len(string1)
string2_length = Len(string2)
ReDim distance(string1_length, string2_length)
bs1 = string1
bs2 = string2
For i = 0 To string1_length
distance(i, 0) = i
Next
For j = 0 To string2_length
distance(0, j) = j
Next
For i = 1 To string1_length
For j = 1 To string2_length
'slow way: If Mid$(string1, i, 1) = Mid$(string2, j, 1) Then
If bs1((i - 1) * 2) = bs2((j - 1) * 2) Then ' *2 because Unicode every 2nd byte is 0
distance(i, j) = distance(i - 1, j - 1)
Else
'distance(i, j) = Application.WorksheetFunction.Min _
(distance(i - 1, j) + 1, _
distance(i, j - 1) + 1, _
distance(i - 1, j - 1) + 1)
' spell it out, 50 times faster than worksheetfunction.min
min1 = distance(i - 1, j) + 1
min2 = distance(i, j - 1) + 1
min3 = distance(i - 1, j - 1) + 1
If min1 <= min2 And min1 <= min3 Then
distance(i, j) = min1
ElseIf min2 <= min1 And min2 <= min3 Then
distance(i, j) = min2
Else
distance(i, j) = min3
End If
End If
Next
Next
Levenshtein = distance(string1_length, string2_length)
End Function
Run Code Online (Sandbox Code Playgroud)
Apo*_*s55 15
我认为它变得更快......除了改进以前的速度和结果代码之外没有做太多的事情,因为%
' Levenshtein3 tweaked for UTLIMATE speed and CORRECT results
' Solution based on Longs
' Intermediate arrays holding Asc()make difference
' even Fixed length Arrays have impact on speed (small indeed)
' Levenshtein version 3 will return correct percentage
'
Function Levenshtein3(ByVal string1 As String, ByVal string2 As String) As Long
Dim i As Long, j As Long, string1_length As Long, string2_length As Long
Dim distance(0 To 60, 0 To 50) As Long, smStr1(1 To 60) As Long, smStr2(1 To 50) As Long
Dim min1 As Long, min2 As Long, min3 As Long, minmin As Long, MaxL As Long
string1_length = Len(string1): string2_length = Len(string2)
distance(0, 0) = 0
For i = 1 To string1_length: distance(i, 0) = i: smStr1(i) = Asc(LCase(Mid$(string1, i, 1))): Next
For j = 1 To string2_length: distance(0, j) = j: smStr2(j) = Asc(LCase(Mid$(string2, j, 1))): Next
For i = 1 To string1_length
For j = 1 To string2_length
If smStr1(i) = smStr2(j) Then
distance(i, j) = distance(i - 1, j - 1)
Else
min1 = distance(i - 1, j) + 1
min2 = distance(i, j - 1) + 1
min3 = distance(i - 1, j - 1) + 1
If min2 < min1 Then
If min2 < min3 Then minmin = min2 Else minmin = min3
Else
If min1 < min3 Then minmin = min1 Else minmin = min3
End If
distance(i, j) = minmin
End If
Next
Next
' Levenshtein3 will properly return a percent match (100%=exact) based on similarities and Lengths etc...
MaxL = string1_length: If string2_length > MaxL Then MaxL = string2_length
Levenshtein3 = 100 - CLng((distance(string1_length, string2_length) * 100) / MaxL)
End Function
Run Code Online (Sandbox Code Playgroud)