Jaro–Winkler distance algorithm in C#

2019-01-16 21:02发布

问题:

How would the Jaro–Winkler distance string comparison algorithm be implemented in C#?

回答1:

public static class JaroWinklerDistance
{
    /* The Winkler modification will not be applied unless the 
     * percent match was at or above the mWeightThreshold percent 
     * without the modification. 
     * Winkler's paper used a default value of 0.7
     */
    private static readonly double mWeightThreshold = 0.7;

    /* Size of the prefix to be concidered by the Winkler modification. 
     * Winkler's paper used a default value of 4
     */
    private static readonly int mNumChars = 4;


    /// <summary>
    /// Returns the Jaro-Winkler distance between the specified  
    /// strings. The distance is symmetric and will fall in the 
    /// range 0 (perfect match) to 1 (no match). 
    /// </summary>
    /// <param name="aString1">First String</param>
    /// <param name="aString2">Second String</param>
    /// <returns></returns>
    public static double distance(string aString1, string aString2) {
        return 1.0 - proximity(aString1,aString2);
    }


    /// <summary>
    /// Returns the Jaro-Winkler distance between the specified  
    /// strings. The distance is symmetric and will fall in the 
    /// range 0 (no match) to 1 (perfect match). 
    /// </summary>
    /// <param name="aString1">First String</param>
    /// <param name="aString2">Second String</param>
    /// <returns></returns>
    public static double proximity(string aString1, string aString2)
    {
        int lLen1 = aString1.Length;
        int lLen2 = aString2.Length;
        if (lLen1 == 0)
            return lLen2 == 0 ? 1.0 : 0.0;

        int  lSearchRange = Math.Max(0,Math.Max(lLen1,lLen2)/2 - 1);

        // default initialized to false
        bool[] lMatched1 = new bool[lLen1];
        bool[] lMatched2 = new bool[lLen2];

        int lNumCommon = 0;
        for (int i = 0; i < lLen1; ++i) {
            int lStart = Math.Max(0,i-lSearchRange);
            int lEnd = Math.Min(i+lSearchRange+1,lLen2);
            for (int j = lStart; j < lEnd; ++j) {
                if (lMatched2[j]) continue;
                if (aString1[i] != aString2[j])
                    continue;
                lMatched1[i] = true;
                lMatched2[j] = true;
                ++lNumCommon;
                break;
            }
        }
        if (lNumCommon == 0) return 0.0;

        int lNumHalfTransposed = 0;
        int k = 0;
        for (int i = 0; i < lLen1; ++i) {
            if (!lMatched1[i]) continue;
            while (!lMatched2[k]) ++k;
            if (aString1[i] != aString2[k])
                ++lNumHalfTransposed;
            ++k;
        }
        // System.Diagnostics.Debug.WriteLine("numHalfTransposed=" + numHalfTransposed);
        int lNumTransposed = lNumHalfTransposed/2;

        // System.Diagnostics.Debug.WriteLine("numCommon=" + numCommon + " numTransposed=" + numTransposed);
        double lNumCommonD = lNumCommon;
        double lWeight = (lNumCommonD/lLen1
                         + lNumCommonD/lLen2
                         + (lNumCommon - lNumTransposed)/lNumCommonD)/3.0;

        if (lWeight <= mWeightThreshold) return lWeight;
        int lMax = Math.Min(mNumChars,Math.Min(aString1.Length,aString2.Length));
        int lPos = 0;
        while (lPos < lMax && aString1[lPos] == aString2[lPos])
            ++lPos;
        if (lPos == 0) return lWeight;
        return lWeight + 0.1 * lPos * (1.0 - lWeight);

    }


}


回答2:

You can take a look on Lucene.Net ,

it implement Jaro–Winkler distance algorithm ,

and its score is different from which leebickmtu post ,

you can take it as reference

the url is below :

http://lucenenet.apache.org/docs/3.0.3/db/d12/_jaro_winkler_distance_8cs_source.html



回答3:

Use below class to use jaro winkler. i have customized both algorithm jaro and jaro-winkler.

Visit on Github for DLL.

using System;
using System.Linq;

namespace Search
{
    public static class EditDistance
    {
        private struct JaroMetrics
        {
            public int Matches;

            public int Transpositions;
        }

        private static EditDistance.JaroMetrics Matches(string s1, string s2)
        {
            string text;
            string text2;
            if (s1.Length > s2.Length)
            {
                text = s1;
                text2 = s2;
            }
            else
            {
                text = s2;
                text2 = s1;
            }
            int num = Math.Max(text.Length / 2 - 1, 0);
            int[] array = new int[text2.Length];
            int i;
            for (i = 0; i < array.Length; i++)
            {
                array[i] = -1;
            }
            bool[] array2 = new bool[text.Length];
            int num2 = 0;
            for (int j = 0; j < text2.Length; j++)
            {
                char c = text2[j];
                int k = Math.Max(j - num, 0);
                int num3 = Math.Min(j + num + 1, text.Length);
                while (k < num3)
                {
                    if (!array2[k] && c == text[k])
                    {
                        array[j] = k;
                        array2[k] = true;
                        num2++;
                        break;
                    }
                    k++;
                }
            }
            char[] array3 = new char[num2];
            char[] ms2 = new char[num2];
            i = 0;
            int num4 = 0;
            while (i < text2.Length)
            {
                if (array[i] != -1)
                {
                    array3[num4] = text2[i];
                    num4++;
                }
                i++;
            }
            i = 0;
            num4 = 0;
            while (i < text.Length)
            {
                if (array2[i])
                {
                    ms2[num4] = text[i];
                    num4++;
                }
                i++;
            }
            int num5 = array3.Where((char t, int mi) => t != ms2[mi]).Count<char>();
            EditDistance.JaroMetrics result;
            result.Matches = num2;
            result.Transpositions = num5 / 2;
            return result;
        }



        public static float JaroWinkler(this string s1, string s2, float prefixScale, float boostThreshold)
        {
            prefixScale = ((prefixScale > 0.25f) ? 0.25f : prefixScale);
            prefixScale = ((prefixScale < 0f) ? 0f : prefixScale);
            float num = s1.Jaro(s2);
            int num2 = 0;
            for (int i = 0; i < Math.Min(s1.Length, s2.Length); i++)
            {
                if (s1[i] != s2[i])
                {
                    break;
                }
                num2++;
            }
            return (num < boostThreshold) ? num : (num + prefixScale * (float)num2 * (1f - num));
        }

        public static float JaroWinkler(this string s1, string s2, float prefixScale)
        {
            return s1.JaroWinkler(s2, prefixScale, 0.7f);
        }

        public static float JaroWinkler(this string s1, string s2)
        {
            return s1.JaroWinkler(s2, 0.1f, 0.7f);
        }

        public static float Jaro(this string s1, string s2)
        {
            EditDistance.JaroMetrics jaroMetrics = EditDistance.Matches(s1, s2);
            float num = (float)jaroMetrics.Matches;
            int transpositions = jaroMetrics.Transpositions;
            float result;
            if (num == 0f)
            {
                result = 0f;
            }
            else
            {
                float num2 = (num / (float)s1.Length + num / (float)s2.Length + (num - (float)transpositions) / num) / 3f;
                result = num2;
            }
            return result;
        }

        public static int LevenshteinDistance(this string source, string target)
        {
            int result;
            if (string.IsNullOrEmpty(source))
            {
                if (string.IsNullOrEmpty(target))
                {
                    result = 0;
                }
                else
                {
                    result = target.Length;
                }
            }
            else if (string.IsNullOrEmpty(target))
            {
                result = source.Length;
            }
            else
            {
                if (source.Length > target.Length)
                {
                    string text = target;
                    target = source;
                    source = text;
                }
                int length = target.Length;
                int length2 = source.Length;
                int[,] array = new int[2, length + 1];
                for (int i = 1; i <= length; i++)
                {
                    array[0, i] = i;
                }
                int num = 0;
                for (int j = 1; j <= length2; j++)
                {
                    num = (j & 1);
                    array[num, 0] = j;
                    int num2 = num ^ 1;
                    for (int i = 1; i <= length; i++)
                    {
                        int num3 = (target[i - 1] == source[j - 1]) ? 0 : 1;
                        array[num, i] = Math.Min(Math.Min(array[num2, i] + 1, array[num, i - 1] + 1), array[num2, i - 1] + num3);
                    }
                }
                result = array[num, length];
            }
            return result;
        }
    }
}