001/*
002 * CDDL HEADER START
003 *
004 * The contents of this file are subject to the terms of the
005 * Common Development and Distribution License, Version 1.0 only
006 * (the "License").  You may not use this file except in compliance
007 * with the License.
008 *
009 * You can obtain a copy of the license at legal-notices/CDDLv1_0.txt
010 * or http://forgerock.org/license/CDDLv1.0.html.
011 * See the License for the specific language governing permissions
012 * and limitations under the License.
013 *
014 * When distributing Covered Code, include this CDDL HEADER in each
015 * file and include the License file at legal-notices/CDDLv1_0.txt.
016 * If applicable, add the following below this CDDL HEADER, with the
017 * fields enclosed by brackets "[]" replaced with your own identifying
018 * information:
019 *      Portions Copyright [yyyy] [name of copyright owner]
020 *
021 * CDDL HEADER END
022 *
023 *
024 *      Copyright 2008 Sun Microsystems, Inc.
025 *      Portions Copyright 2014 ForgeRock AS
026 */
027package org.opends.server.util;
028
029
030
031import static org.forgerock.util.Reject.*;
032
033
034
035/**
036 * This class provides an implementation of the Levenshtein distance algorithm,
037 * which may be used to determine the minimum number of changes required to
038 * transform one string into another.  For the purpose of this algorithm, a
039 * change is defined as replacing one character with another, inserting a new
040 * character, or removing an existing character.
041 * <BR><BR>
042 * The basic algorithm works as follows for a source string S of length X and
043 * a target string T of length Y:
044 * <OL>
045 *   <LI>Create a matrix M with dimensions of X+1, Y+1.</LI>
046 *   <LI>Fill the first row with sequentially-increasing values 0 through
047 *       X.</LI>
048 *   <LI>Fill the first column with sequentially-increasing values 0 through
049 *       Y.</LI>
050 *   <LI>Create a nested loop iterating over the characters in the strings.  In
051 *       the outer loop, iterate through characters in S from 0 to X-1 using an
052 *       iteration counter I.  In the inner loop, iterate through the characters
053 *       in T from 0 to Y-1 using an iterator counter J.  Calculate the
054 *       following three things and place the smallest value in the matrix in
055 *       row I+1 column J+1:
056 *     <UL>
057 *       <LI>One more than the value in the matrix position immediately to the
058 *           left (i.e., 1 + M[I][J+1]).</LI>
059 *       <LI>One more than the value in the matrix position immediately above
060 *           (i.e., 1 + M[I+1][J]).</LI>
061 *       <LI>Define a value V to be zero if S[I] is the same as T[J], or one if
062 *           they are different.  Add that value to the value in the matrix
063 *           position immediately above and to the left (i.e.,
064 *           V + M[I][J]).</LI>
065 *     </UL>
066 *   </LI>
067 *   <LI>The Levenshtein distance value, which is the least number of changes
068 *       needed to transform the source string into the target string, will be
069 *       the value in the bottom right corner of the matrix (i.e.,
070 *       M[X][Y]).</LI>
071 * </OL>
072 * <BR><BR>
073 * Note that this is a completely "clean room" implementation, developed from a
074 * description of the algorithm, rather than copying an existing implementation.
075 * Doing it in this way eliminates copyright and licensing concerns associated
076 * with using an existing implementation.
077 */
078@org.opends.server.types.PublicAPI(
079     stability=org.opends.server.types.StabilityLevel.UNCOMMITTED,
080     mayInstantiate=false,
081     mayExtend=false,
082     mayInvoke=true)
083public final class LevenshteinDistance
084{
085  /**
086   * Calculates the Levenshtein distance between the provided string values.
087   *
088   * @param  source  The source string to compare.  It must not be {@code null}.
089   * @param  target  The target string to compare.  It must not be {@code null}.
090   *
091   * @return  The minimum number of changes required to turn the source string
092   *          into the target string.
093   */
094  public static int calculate(String source, String target)
095  {
096    ifNull(source, target);
097
098    // sl == source length; tl == target length
099    int sl = source.length();
100    int tl = target.length();
101
102
103    // If either of the lengths is zero, then the distance is the length of the
104    // other string.
105    if (sl == 0)
106    {
107      return tl;
108    }
109    else if (tl == 0)
110    {
111      return sl;
112    }
113
114
115    // w == matrix width; h == matrix height
116    int w = sl+1;
117    int h = tl+1;
118
119
120    // m == matrix array
121    // Create the array and fill it with values 0..sl in the first dimension and
122    // 0..tl in the second dimension.
123    int[][] m = new int[w][h];
124    for (int i=0; i < w; i++)
125    {
126      m[i][0] = i;
127    }
128
129    for (int i=1; i < h; i++)
130    {
131      m[0][i] = i;
132    }
133
134    for (int i=0,x=1; i < sl; i++,x++)
135    {
136      char s = source.charAt(i);
137
138      for (int j=0,y=1; j < tl; j++,y++)
139      {
140        char t = target.charAt(j);
141
142
143        // Figure out what to put in the appropriate matrix slot.  It should be
144        // the lowest of:
145        // - One more than the value to the left
146        // - One more than the value to the top
147        // - If the characters are equal, the value to the upper left, otherwise
148        //   one more than the value to the upper left.
149        m[x][y] = Math.min(Math.min((m[i][y] + 1), (m[x][j] + 1)),
150                           (m[i][j] + ((s == t) ? 0 : 1)));
151      }
152    }
153
154    // The Levenshtein distance should now be the value in the lower right
155    // corner of the matrix.
156    return m[sl][tl];
157  }
158}
159