001/* 002 * CDDL HEADER START 003 * 004 * The contents of this file are subject to the terms of the 005 * Common Development and Distribution License, Version 1.0 only 006 * (the "License"). You may not use this file except in compliance 007 * with the License. 008 * 009 * You can obtain a copy of the license at legal-notices/CDDLv1_0.txt 010 * or http://forgerock.org/license/CDDLv1.0.html. 011 * See the License for the specific language governing permissions 012 * and limitations under the License. 013 * 014 * When distributing Covered Code, include this CDDL HEADER in each 015 * file and include the License file at legal-notices/CDDLv1_0.txt. 016 * If applicable, add the following below this CDDL HEADER, with the 017 * fields enclosed by brackets "[]" replaced with your own identifying 018 * information: 019 * Portions Copyright [yyyy] [name of copyright owner] 020 * 021 * CDDL HEADER END 022 * 023 * 024 * Copyright 2008 Sun Microsystems, Inc. 025 * Portions Copyright 2014 ForgeRock AS 026 */ 027package org.opends.server.util; 028 029 030 031import static org.forgerock.util.Reject.*; 032 033 034 035/** 036 * This class provides an implementation of the Levenshtein distance algorithm, 037 * which may be used to determine the minimum number of changes required to 038 * transform one string into another. For the purpose of this algorithm, a 039 * change is defined as replacing one character with another, inserting a new 040 * character, or removing an existing character. 041 * <BR><BR> 042 * The basic algorithm works as follows for a source string S of length X and 043 * a target string T of length Y: 044 * <OL> 045 * <LI>Create a matrix M with dimensions of X+1, Y+1.</LI> 046 * <LI>Fill the first row with sequentially-increasing values 0 through 047 * X.</LI> 048 * <LI>Fill the first column with sequentially-increasing values 0 through 049 * Y.</LI> 050 * <LI>Create a nested loop iterating over the characters in the strings. In 051 * the outer loop, iterate through characters in S from 0 to X-1 using an 052 * iteration counter I. In the inner loop, iterate through the characters 053 * in T from 0 to Y-1 using an iterator counter J. Calculate the 054 * following three things and place the smallest value in the matrix in 055 * row I+1 column J+1: 056 * <UL> 057 * <LI>One more than the value in the matrix position immediately to the 058 * left (i.e., 1 + M[I][J+1]).</LI> 059 * <LI>One more than the value in the matrix position immediately above 060 * (i.e., 1 + M[I+1][J]).</LI> 061 * <LI>Define a value V to be zero if S[I] is the same as T[J], or one if 062 * they are different. Add that value to the value in the matrix 063 * position immediately above and to the left (i.e., 064 * V + M[I][J]).</LI> 065 * </UL> 066 * </LI> 067 * <LI>The Levenshtein distance value, which is the least number of changes 068 * needed to transform the source string into the target string, will be 069 * the value in the bottom right corner of the matrix (i.e., 070 * M[X][Y]).</LI> 071 * </OL> 072 * <BR><BR> 073 * Note that this is a completely "clean room" implementation, developed from a 074 * description of the algorithm, rather than copying an existing implementation. 075 * Doing it in this way eliminates copyright and licensing concerns associated 076 * with using an existing implementation. 077 */ 078@org.opends.server.types.PublicAPI( 079 stability=org.opends.server.types.StabilityLevel.UNCOMMITTED, 080 mayInstantiate=false, 081 mayExtend=false, 082 mayInvoke=true) 083public final class LevenshteinDistance 084{ 085 /** 086 * Calculates the Levenshtein distance between the provided string values. 087 * 088 * @param source The source string to compare. It must not be {@code null}. 089 * @param target The target string to compare. It must not be {@code null}. 090 * 091 * @return The minimum number of changes required to turn the source string 092 * into the target string. 093 */ 094 public static int calculate(String source, String target) 095 { 096 ifNull(source, target); 097 098 // sl == source length; tl == target length 099 int sl = source.length(); 100 int tl = target.length(); 101 102 103 // If either of the lengths is zero, then the distance is the length of the 104 // other string. 105 if (sl == 0) 106 { 107 return tl; 108 } 109 else if (tl == 0) 110 { 111 return sl; 112 } 113 114 115 // w == matrix width; h == matrix height 116 int w = sl+1; 117 int h = tl+1; 118 119 120 // m == matrix array 121 // Create the array and fill it with values 0..sl in the first dimension and 122 // 0..tl in the second dimension. 123 int[][] m = new int[w][h]; 124 for (int i=0; i < w; i++) 125 { 126 m[i][0] = i; 127 } 128 129 for (int i=1; i < h; i++) 130 { 131 m[0][i] = i; 132 } 133 134 for (int i=0,x=1; i < sl; i++,x++) 135 { 136 char s = source.charAt(i); 137 138 for (int j=0,y=1; j < tl; j++,y++) 139 { 140 char t = target.charAt(j); 141 142 143 // Figure out what to put in the appropriate matrix slot. It should be 144 // the lowest of: 145 // - One more than the value to the left 146 // - One more than the value to the top 147 // - If the characters are equal, the value to the upper left, otherwise 148 // one more than the value to the upper left. 149 m[x][y] = Math.min(Math.min((m[i][y] + 1), (m[x][j] + 1)), 150 (m[i][j] + ((s == t) ? 0 : 1))); 151 } 152 } 153 154 // The Levenshtein distance should now be the value in the lower right 155 // corner of the matrix. 156 return m[sl][tl]; 157 } 158} 159