Statistics
| Revision:

svn-gvsig-desktop / branches / v2_0_0_prep / libraries / libGeocoding / src / org / gvsig / geocoding / distance / impl / LevenshteinDistance.java @ 32187

History | View | Annotate | Download (5.34 KB)

1
/* gvSIG. Geographic Information System of the Valencian Government
2
 *
3
 * Copyright (C) 2007-2008 Infrastructures and Transports Department
4
 * of the Valencian Government (CIT)
5
 * 
6
 * This program is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU General Public License
8
 * as published by the Free Software Foundation; either version 2
9
 * of the License, or (at your option) any later version.
10
 * 
11
 * This program is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
15
 * 
16
 * You should have received a copy of the GNU General Public License
17
 * along with this program; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 
19
 * MA  02110-1301, USA.
20
 * 
21
 */
22

    
23
/*
24
 * AUTHORS (In addition to CIT):
25
 * 2008 Prodevelop S.L  vsanjaime   programador
26
 */
27

    
28
package org.gvsig.geocoding.distance.impl;
29

    
30
import org.gvsig.geocoding.distance.StringsDistance;
31

    
32
/**
33
 * This class calculate the levenshtein distance between two strings
34
 * 
35
 * @author <a href="mailto:jsanz@prodevelop.es"> Jorge Gaspar Sanz Salinas</a>
36
 * @author <a href="mailto:vsanjaime@prodevelop.es"> Vicent Sanjaime Calvet</a>
37
 */
38

    
39
public class LevenshteinDistance implements StringsDistance{
40

    
41
        /**
42
         * calculate levenshtein distance between two strings
43
         * 
44
         * @param str1
45
         * @param str2
46
         * @return
47
         */
48
        public static int computeLevenshteinDistance(String str1, String str2) {
49
                return computeLevenshteinDistance(str1.toCharArray(), str2
50
                                .toCharArray());
51
        }
52

    
53
        /**
54
         * get levenshtein distance between two strings
55
         * 
56
         * @param s
57
         * @param t
58
         * @return
59
         */
60
        public static int getLevenshteinDistance(String s, String t) {
61
                if (s == null || t == null) {
62
                        throw new IllegalArgumentException("Strings must not be null");
63
                }
64

    
65
                /*
66
                 * The difference between this impl. and the previous is that, rather
67
                 * than creating and retaining a matrix of size s.length()+1 by
68
                 * t.length()+1, we maintain two single-dimensional arrays of length
69
                 * s.length()+1. The first, d, is the 'current working' distance array
70
                 * that maintains the newest distance cost counts as we iterate through
71
                 * the characters of String s. Each time we increment the index of
72
                 * String t we are comparing, d is copied to p, the second int[]. Doing
73
                 * so allows us to retain the previous cost counts as required by the
74
                 * algorithm (taking the minimum of the cost count to the left, up one,
75
                 * and diagonally up and to the left of the current cost count being
76
                 * calculated). (Note that the arrays aren't really copied anymore, just
77
                 * switched...this is clearly much better than cloning an array or doing
78
                 * a System.arraycopy() each time through the outer loop.)
79
                 * 
80
                 * Effectively, the difference between the two implementations is this
81
                 * one does not cause an out of memory condition when calculating the LD
82
                 * over two very large strings.
83
                 */
84

    
85
                int n = s.length(); // length of s
86
                int m = t.length(); // length of t
87

    
88
                if (n == 0) {
89
                        return m;
90
                } else if (m == 0) {
91
                        return n;
92
                }
93

    
94
                int p[] = new int[n + 1]; // 'previous' cost array, horizontally
95
                int d[] = new int[n + 1]; // cost array, horizontally
96
                int _d[]; // placeholder to assist in swapping p and d
97

    
98
                // indexes into strings s and t
99
                int i; // iterates through s
100
                int j; // iterates through t
101

    
102
                char t_j; // jth character of t
103

    
104
                int cost; // cost
105

    
106
                for (i = 0; i <= n; i++) {
107
                        p[i] = i;
108
                }
109

    
110
                for (j = 1; j <= m; j++) {
111
                        t_j = t.charAt(j - 1);
112
                        d[0] = j;
113

    
114
                        for (i = 1; i <= n; i++) {
115
                                cost = s.charAt(i - 1) == t_j ? 0 : 1;
116
                                // minimum of cell to the left+1, to the top+1, diagonally left
117
                                // and up +cost
118
                                d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1]
119
                                                + cost);
120
                        }
121

    
122
                        // copy current distance counts to 'previous row' distance counts
123
                        _d = p;
124
                        p = d;
125
                        d = _d;
126
                }
127

    
128
                // our last action in the above loop was to switch d and p, so p now
129
                // actually has the most recent cost counts
130
                return p[n];
131
        }
132

    
133
        /**
134
         * 
135
         * @param a
136
         * @param b
137
         * @param c
138
         * @return
139
         */
140
        private static int minimum(int a, int b, int c) {
141
                if (a <= b && a <= c)
142
                        return a;
143
                if (b <= a && b <= c)
144
                        return b;
145
                return c;
146
        }
147

    
148
        /**
149
         * calculate levenshtein distance between two chars
150
         * 
151
         * @param str1
152
         * @param str2
153
         * @return
154
         */
155
        private static int computeLevenshteinDistance(char[] str1, char[] str2) {
156
                int[][] distance = new int[str1.length + 1][];
157

    
158
                for (int i = 0; i <= str1.length; i++) {
159
                        distance[i] = new int[str2.length + 1];
160
                        distance[i][0] = i;
161
                }
162
                for (int j = 0; j < str2.length + 1; j++)
163
                        distance[0][j] = j;
164

    
165
                for (int i = 1; i <= str1.length; i++)
166
                        for (int j = 1; j <= str2.length; j++)
167
                                distance[i][j] = minimum(distance[i - 1][j] + 1,
168
                                                distance[i][j - 1] + 1, distance[i - 1][j - 1]
169
                                                                + ((str1[i - 1] == str2[j - 1]) ? 0 : 1));
170

    
171
                return distance[str1.length][str2.length];
172
        }
173

    
174
        /**
175
         * get levenshtein distance between two strings
176
         * @param s first string
177
         * @param t second string
178
         */
179
        public int getStringsDistance(String s, String t) {
180
                
181
                return computeLevenshteinDistance(s, t);
182
        }
183
}