svn-gvsig-desktop / branches / v2_0_0_prep / libraries / libGeocoding / src / org / gvsig / geocoding / distance / impl / LevenshteinDistance.java @ 32187
History | View | Annotate | Download (5.34 KB)
1 |
/* gvSIG. Geographic Information System of the Valencian Government
|
---|---|
2 |
*
|
3 |
* Copyright (C) 2007-2008 Infrastructures and Transports Department
|
4 |
* of the Valencian Government (CIT)
|
5 |
*
|
6 |
* This program is free software; you can redistribute it and/or
|
7 |
* modify it under the terms of the GNU General Public License
|
8 |
* as published by the Free Software Foundation; either version 2
|
9 |
* of the License, or (at your option) any later version.
|
10 |
*
|
11 |
* This program is distributed in the hope that it will be useful,
|
12 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14 |
* GNU General Public License for more details.
|
15 |
*
|
16 |
* You should have received a copy of the GNU General Public License
|
17 |
* along with this program; if not, write to the Free Software
|
18 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
19 |
* MA 02110-1301, USA.
|
20 |
*
|
21 |
*/
|
22 |
|
23 |
/*
|
24 |
* AUTHORS (In addition to CIT):
|
25 |
* 2008 Prodevelop S.L vsanjaime programador
|
26 |
*/
|
27 |
|
28 |
package org.gvsig.geocoding.distance.impl; |
29 |
|
30 |
import org.gvsig.geocoding.distance.StringsDistance; |
31 |
|
32 |
/**
|
33 |
* This class calculate the levenshtein distance between two strings
|
34 |
*
|
35 |
* @author <a href="mailto:jsanz@prodevelop.es"> Jorge Gaspar Sanz Salinas</a>
|
36 |
* @author <a href="mailto:vsanjaime@prodevelop.es"> Vicent Sanjaime Calvet</a>
|
37 |
*/
|
38 |
|
39 |
public class LevenshteinDistance implements StringsDistance{ |
40 |
|
41 |
/**
|
42 |
* calculate levenshtein distance between two strings
|
43 |
*
|
44 |
* @param str1
|
45 |
* @param str2
|
46 |
* @return
|
47 |
*/
|
48 |
public static int computeLevenshteinDistance(String str1, String str2) { |
49 |
return computeLevenshteinDistance(str1.toCharArray(), str2
|
50 |
.toCharArray()); |
51 |
} |
52 |
|
53 |
/**
|
54 |
* get levenshtein distance between two strings
|
55 |
*
|
56 |
* @param s
|
57 |
* @param t
|
58 |
* @return
|
59 |
*/
|
60 |
public static int getLevenshteinDistance(String s, String t) { |
61 |
if (s == null || t == null) { |
62 |
throw new IllegalArgumentException("Strings must not be null"); |
63 |
} |
64 |
|
65 |
/*
|
66 |
* The difference between this impl. and the previous is that, rather
|
67 |
* than creating and retaining a matrix of size s.length()+1 by
|
68 |
* t.length()+1, we maintain two single-dimensional arrays of length
|
69 |
* s.length()+1. The first, d, is the 'current working' distance array
|
70 |
* that maintains the newest distance cost counts as we iterate through
|
71 |
* the characters of String s. Each time we increment the index of
|
72 |
* String t we are comparing, d is copied to p, the second int[]. Doing
|
73 |
* so allows us to retain the previous cost counts as required by the
|
74 |
* algorithm (taking the minimum of the cost count to the left, up one,
|
75 |
* and diagonally up and to the left of the current cost count being
|
76 |
* calculated). (Note that the arrays aren't really copied anymore, just
|
77 |
* switched...this is clearly much better than cloning an array or doing
|
78 |
* a System.arraycopy() each time through the outer loop.)
|
79 |
*
|
80 |
* Effectively, the difference between the two implementations is this
|
81 |
* one does not cause an out of memory condition when calculating the LD
|
82 |
* over two very large strings.
|
83 |
*/
|
84 |
|
85 |
int n = s.length(); // length of s |
86 |
int m = t.length(); // length of t |
87 |
|
88 |
if (n == 0) { |
89 |
return m;
|
90 |
} else if (m == 0) { |
91 |
return n;
|
92 |
} |
93 |
|
94 |
int p[] = new int[n + 1]; // 'previous' cost array, horizontally |
95 |
int d[] = new int[n + 1]; // cost array, horizontally |
96 |
int _d[]; // placeholder to assist in swapping p and d |
97 |
|
98 |
// indexes into strings s and t
|
99 |
int i; // iterates through s |
100 |
int j; // iterates through t |
101 |
|
102 |
char t_j; // jth character of t |
103 |
|
104 |
int cost; // cost |
105 |
|
106 |
for (i = 0; i <= n; i++) { |
107 |
p[i] = i; |
108 |
} |
109 |
|
110 |
for (j = 1; j <= m; j++) { |
111 |
t_j = t.charAt(j - 1);
|
112 |
d[0] = j;
|
113 |
|
114 |
for (i = 1; i <= n; i++) { |
115 |
cost = s.charAt(i - 1) == t_j ? 0 : 1; |
116 |
// minimum of cell to the left+1, to the top+1, diagonally left
|
117 |
// and up +cost
|
118 |
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] |
119 |
+ cost); |
120 |
} |
121 |
|
122 |
// copy current distance counts to 'previous row' distance counts
|
123 |
_d = p; |
124 |
p = d; |
125 |
d = _d; |
126 |
} |
127 |
|
128 |
// our last action in the above loop was to switch d and p, so p now
|
129 |
// actually has the most recent cost counts
|
130 |
return p[n];
|
131 |
} |
132 |
|
133 |
/**
|
134 |
*
|
135 |
* @param a
|
136 |
* @param b
|
137 |
* @param c
|
138 |
* @return
|
139 |
*/
|
140 |
private static int minimum(int a, int b, int c) { |
141 |
if (a <= b && a <= c)
|
142 |
return a;
|
143 |
if (b <= a && b <= c)
|
144 |
return b;
|
145 |
return c;
|
146 |
} |
147 |
|
148 |
/**
|
149 |
* calculate levenshtein distance between two chars
|
150 |
*
|
151 |
* @param str1
|
152 |
* @param str2
|
153 |
* @return
|
154 |
*/
|
155 |
private static int computeLevenshteinDistance(char[] str1, char[] str2) { |
156 |
int[][] distance = new int[str1.length + 1][]; |
157 |
|
158 |
for (int i = 0; i <= str1.length; i++) { |
159 |
distance[i] = new int[str2.length + 1]; |
160 |
distance[i][0] = i;
|
161 |
} |
162 |
for (int j = 0; j < str2.length + 1; j++) |
163 |
distance[0][j] = j;
|
164 |
|
165 |
for (int i = 1; i <= str1.length; i++) |
166 |
for (int j = 1; j <= str2.length; j++) |
167 |
distance[i][j] = minimum(distance[i - 1][j] + 1, |
168 |
distance[i][j - 1] + 1, distance[i - 1][j - 1] |
169 |
+ ((str1[i - 1] == str2[j - 1]) ? 0 : 1)); |
170 |
|
171 |
return distance[str1.length][str2.length];
|
172 |
} |
173 |
|
174 |
/**
|
175 |
* get levenshtein distance between two strings
|
176 |
* @param s first string
|
177 |
* @param t second string
|
178 |
*/
|
179 |
public int getStringsDistance(String s, String t) { |
180 |
|
181 |
return computeLevenshteinDistance(s, t);
|
182 |
} |
183 |
} |