How would the Jaro–Winkler distance string comparison algorithm be implemented in C#?
可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
回答1:
public static class JaroWinklerDistance
{
/* The Winkler modification will not be applied unless the
* percent match was at or above the mWeightThreshold percent
* without the modification.
* Winkler's paper used a default value of 0.7
*/
private static readonly double mWeightThreshold = 0.7;
/* Size of the prefix to be concidered by the Winkler modification.
* Winkler's paper used a default value of 4
*/
private static readonly int mNumChars = 4;
/// <summary>
/// Returns the Jaro-Winkler distance between the specified
/// strings. The distance is symmetric and will fall in the
/// range 0 (perfect match) to 1 (no match).
/// </summary>
/// <param name="aString1">First String</param>
/// <param name="aString2">Second String</param>
/// <returns></returns>
public static double distance(string aString1, string aString2) {
return 1.0 - proximity(aString1,aString2);
}
/// <summary>
/// Returns the Jaro-Winkler distance between the specified
/// strings. The distance is symmetric and will fall in the
/// range 0 (no match) to 1 (perfect match).
/// </summary>
/// <param name="aString1">First String</param>
/// <param name="aString2">Second String</param>
/// <returns></returns>
public static double proximity(string aString1, string aString2)
{
int lLen1 = aString1.Length;
int lLen2 = aString2.Length;
if (lLen1 == 0)
return lLen2 == 0 ? 1.0 : 0.0;
int lSearchRange = Math.Max(0,Math.Max(lLen1,lLen2)/2 - 1);
// default initialized to false
bool[] lMatched1 = new bool[lLen1];
bool[] lMatched2 = new bool[lLen2];
int lNumCommon = 0;
for (int i = 0; i < lLen1; ++i) {
int lStart = Math.Max(0,i-lSearchRange);
int lEnd = Math.Min(i+lSearchRange+1,lLen2);
for (int j = lStart; j < lEnd; ++j) {
if (lMatched2[j]) continue;
if (aString1[i] != aString2[j])
continue;
lMatched1[i] = true;
lMatched2[j] = true;
++lNumCommon;
break;
}
}
if (lNumCommon == 0) return 0.0;
int lNumHalfTransposed = 0;
int k = 0;
for (int i = 0; i < lLen1; ++i) {
if (!lMatched1[i]) continue;
while (!lMatched2[k]) ++k;
if (aString1[i] != aString2[k])
++lNumHalfTransposed;
++k;
}
// System.Diagnostics.Debug.WriteLine("numHalfTransposed=" + numHalfTransposed);
int lNumTransposed = lNumHalfTransposed/2;
// System.Diagnostics.Debug.WriteLine("numCommon=" + numCommon + " numTransposed=" + numTransposed);
double lNumCommonD = lNumCommon;
double lWeight = (lNumCommonD/lLen1
+ lNumCommonD/lLen2
+ (lNumCommon - lNumTransposed)/lNumCommonD)/3.0;
if (lWeight <= mWeightThreshold) return lWeight;
int lMax = Math.Min(mNumChars,Math.Min(aString1.Length,aString2.Length));
int lPos = 0;
while (lPos < lMax && aString1[lPos] == aString2[lPos])
++lPos;
if (lPos == 0) return lWeight;
return lWeight + 0.1 * lPos * (1.0 - lWeight);
}
}
回答2:
You can take a look on Lucene.Net ,
it implement Jaro–Winkler distance algorithm ,
and its score is different from which leebickmtu post ,
you can take it as reference
the url is below :
http://lucenenet.apache.org/docs/3.0.3/db/d12/_jaro_winkler_distance_8cs_source.html
回答3:
Use below class to use jaro winkler. i have customized both algorithm jaro and jaro-winkler.
Visit on Github for DLL.
using System;
using System.Linq;
namespace Search
{
public static class EditDistance
{
private struct JaroMetrics
{
public int Matches;
public int Transpositions;
}
private static EditDistance.JaroMetrics Matches(string s1, string s2)
{
string text;
string text2;
if (s1.Length > s2.Length)
{
text = s1;
text2 = s2;
}
else
{
text = s2;
text2 = s1;
}
int num = Math.Max(text.Length / 2 - 1, 0);
int[] array = new int[text2.Length];
int i;
for (i = 0; i < array.Length; i++)
{
array[i] = -1;
}
bool[] array2 = new bool[text.Length];
int num2 = 0;
for (int j = 0; j < text2.Length; j++)
{
char c = text2[j];
int k = Math.Max(j - num, 0);
int num3 = Math.Min(j + num + 1, text.Length);
while (k < num3)
{
if (!array2[k] && c == text[k])
{
array[j] = k;
array2[k] = true;
num2++;
break;
}
k++;
}
}
char[] array3 = new char[num2];
char[] ms2 = new char[num2];
i = 0;
int num4 = 0;
while (i < text2.Length)
{
if (array[i] != -1)
{
array3[num4] = text2[i];
num4++;
}
i++;
}
i = 0;
num4 = 0;
while (i < text.Length)
{
if (array2[i])
{
ms2[num4] = text[i];
num4++;
}
i++;
}
int num5 = array3.Where((char t, int mi) => t != ms2[mi]).Count<char>();
EditDistance.JaroMetrics result;
result.Matches = num2;
result.Transpositions = num5 / 2;
return result;
}
public static float JaroWinkler(this string s1, string s2, float prefixScale, float boostThreshold)
{
prefixScale = ((prefixScale > 0.25f) ? 0.25f : prefixScale);
prefixScale = ((prefixScale < 0f) ? 0f : prefixScale);
float num = s1.Jaro(s2);
int num2 = 0;
for (int i = 0; i < Math.Min(s1.Length, s2.Length); i++)
{
if (s1[i] != s2[i])
{
break;
}
num2++;
}
return (num < boostThreshold) ? num : (num + prefixScale * (float)num2 * (1f - num));
}
public static float JaroWinkler(this string s1, string s2, float prefixScale)
{
return s1.JaroWinkler(s2, prefixScale, 0.7f);
}
public static float JaroWinkler(this string s1, string s2)
{
return s1.JaroWinkler(s2, 0.1f, 0.7f);
}
public static float Jaro(this string s1, string s2)
{
EditDistance.JaroMetrics jaroMetrics = EditDistance.Matches(s1, s2);
float num = (float)jaroMetrics.Matches;
int transpositions = jaroMetrics.Transpositions;
float result;
if (num == 0f)
{
result = 0f;
}
else
{
float num2 = (num / (float)s1.Length + num / (float)s2.Length + (num - (float)transpositions) / num) / 3f;
result = num2;
}
return result;
}
public static int LevenshteinDistance(this string source, string target)
{
int result;
if (string.IsNullOrEmpty(source))
{
if (string.IsNullOrEmpty(target))
{
result = 0;
}
else
{
result = target.Length;
}
}
else if (string.IsNullOrEmpty(target))
{
result = source.Length;
}
else
{
if (source.Length > target.Length)
{
string text = target;
target = source;
source = text;
}
int length = target.Length;
int length2 = source.Length;
int[,] array = new int[2, length + 1];
for (int i = 1; i <= length; i++)
{
array[0, i] = i;
}
int num = 0;
for (int j = 1; j <= length2; j++)
{
num = (j & 1);
array[num, 0] = j;
int num2 = num ^ 1;
for (int i = 1; i <= length; i++)
{
int num3 = (target[i - 1] == source[j - 1]) ? 0 : 1;
array[num, i] = Math.Min(Math.Min(array[num2, i] + 1, array[num, i - 1] + 1), array[num2, i - 1] + num3);
}
}
result = array[num, length];
}
return result;
}
}
}