root / trunk / libraries / libGeocoding / src-filter / org / gvsig / data / vectorial / filter / PropertyDistanceImpl.java @ 23019
History | View | Annotate | Download (5.97 KB)
1 |
/* gvSIG. Geographic Information System of the Valencian Government
|
---|---|
2 |
*
|
3 |
* Copyright (C) 2007-2008 Infrastructures and Transports Department
|
4 |
* of the Valencian Government (CIT)
|
5 |
*
|
6 |
* This program is free software; you can redistribute it and/or
|
7 |
* modify it under the terms of the GNU General Public License
|
8 |
* as published by the Free Software Foundation; either version 2
|
9 |
* of the License, or (at your option) any later version.
|
10 |
*
|
11 |
* This program is distributed in the hope that it will be useful,
|
12 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14 |
* GNU General Public License for more details.
|
15 |
*
|
16 |
* You should have received a copy of the GNU General Public License
|
17 |
* along with this program; if not, write to the Free Software
|
18 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
19 |
* MA 02110-1301, USA.
|
20 |
*
|
21 |
*/
|
22 |
|
23 |
/*
|
24 |
* AUTHORS (In addition to CIT):
|
25 |
* 2008 Prodevelop S.L main development
|
26 |
*/
|
27 |
|
28 |
package org.gvsig.data.vectorial.filter; |
29 |
|
30 |
import java.util.ArrayList; |
31 |
import java.util.List; |
32 |
|
33 |
import org.opengis.filter.Filter; |
34 |
import org.opengis.filter.FilterVisitor; |
35 |
import org.opengis.filter.expression.Expression; |
36 |
|
37 |
/**
|
38 |
* class
|
39 |
*
|
40 |
*
|
41 |
* @author <a href="mailto:jsanz@prodevelop.es"> Jorge Gaspar Sanz Salinas</a>
|
42 |
* @author <a href="mailto:vsanjaime@prodevelop.es"> Vicente Sanjaime Calvet</a>
|
43 |
*
|
44 |
*/
|
45 |
|
46 |
public class PropertyDistanceImpl implements PropertyDistance { |
47 |
|
48 |
private static final String REGEX_TOKENIZER = "\\s"; |
49 |
private Expression attribute; |
50 |
private String pattern; |
51 |
private float threshold; |
52 |
|
53 |
public PropertyDistanceImpl(Expression arg0, Expression arg1, |
54 |
float threshold) {
|
55 |
this.attribute = arg0;
|
56 |
this.pattern = arg1.toString();
|
57 |
this.threshold = threshold;
|
58 |
} |
59 |
|
60 |
public PropertyDistanceImpl(Expression arg0, String arg1, float threshold) { |
61 |
this.attribute = arg0;
|
62 |
this.pattern = arg1;
|
63 |
this.threshold = threshold;
|
64 |
} |
65 |
|
66 |
public Object accept(FilterVisitor visitor, Object extraData) { |
67 |
return visitor.visitNullFilter(extraData);
|
68 |
} |
69 |
|
70 |
/**
|
71 |
* Evaluate the text distance between a feature attribute value and
|
72 |
*/
|
73 |
@SuppressWarnings("unchecked") |
74 |
public boolean evaluate(Object feature) { |
75 |
|
76 |
if (attribute == null) { |
77 |
return false; |
78 |
} |
79 |
|
80 |
Object value = attribute.evaluate(feature);
|
81 |
|
82 |
if (null == value) { |
83 |
return false; |
84 |
} |
85 |
|
86 |
String[] valueChains = value.toString().split(REGEX_TOKENIZER); |
87 |
String[] pattChains = this.pattern.split(REGEX_TOKENIZER); |
88 |
|
89 |
DistanceFilterFactory2Impl fact = new DistanceFilterFactory2Impl();
|
90 |
|
91 |
/**
|
92 |
* Really evaluate the filter when we have just one string per input
|
93 |
*/
|
94 |
if (valueChains.length == 1 && pattChains.length == 1) { |
95 |
int dist = getLevenshteinDistance(valueChains[0], pattChains[0]); |
96 |
double distNorm = 1.0 - dist / (pattern.length() * 1.0); |
97 |
return distNorm >= threshold;
|
98 |
} else {
|
99 |
/**
|
100 |
* Otherwise we will iterate to assure all patters appear in one of
|
101 |
* the value chains
|
102 |
*/
|
103 |
Filter filtAND;
|
104 |
List listAND = new ArrayList(); |
105 |
for (int i = 0; i < pattChains.length; i++) { |
106 |
String patt = pattChains[i];
|
107 |
Filter filtOR;
|
108 |
List listOR = new ArrayList(); |
109 |
for (int j = 0; j < valueChains.length; j++) { |
110 |
Expression exp2 = fact.literal(valueChains[j]);
|
111 |
PropertyDistance dist = fact.distance(exp2, patt, |
112 |
this.threshold);
|
113 |
listOR.add(dist); |
114 |
} |
115 |
filtOR = fact.or(listOR); |
116 |
listAND.add(filtOR); |
117 |
} |
118 |
filtAND = fact.and(listAND); |
119 |
return filtAND.evaluate(null); |
120 |
} |
121 |
} |
122 |
|
123 |
private int getLevenshteinDistance(String s, String t) { |
124 |
if (s == null || t == null) { |
125 |
throw new IllegalArgumentException("Strings must not be null"); |
126 |
} |
127 |
|
128 |
/*
|
129 |
* The difference between this impl. and the previous is that, rather
|
130 |
* than creating and retaining a matrix of size s.length()+1 by
|
131 |
* t.length()+1, we maintain two single-dimensional arrays of length
|
132 |
* s.length()+1. The first, d, is the 'current working' distance array
|
133 |
* that maintains the newest distance cost counts as we iterate through
|
134 |
* the characters of String s. Each time we increment the index of
|
135 |
* String t we are comparing, d is copied to p, the second int[]. Doing
|
136 |
* so allows us to retain the previous cost counts as required by the
|
137 |
* algorithm (taking the minimum of the cost count to the left, up one,
|
138 |
* and diagonally up and to the left of the current cost count being
|
139 |
* calculated). (Note that the arrays aren't really copied anymore, just
|
140 |
* switched...this is clearly much better than cloning an array or doing
|
141 |
* a System.arraycopy() each time through the outer loop.)
|
142 |
*
|
143 |
* Effectively, the difference between the two implementations is this
|
144 |
* one does not cause an out of memory condition when calculating the LD
|
145 |
* over two very large strings.
|
146 |
*/
|
147 |
|
148 |
int n = s.length(); // length of s |
149 |
int m = t.length(); // length of t |
150 |
|
151 |
if (n == 0) { |
152 |
return m;
|
153 |
} else if (m == 0) { |
154 |
return n;
|
155 |
} |
156 |
|
157 |
int p[] = new int[n + 1]; // 'previous' cost array, horizontally |
158 |
int d[] = new int[n + 1]; // cost array, horizontally |
159 |
int _d[]; // placeholder to assist in swapping p and d |
160 |
|
161 |
// indexes into strings s and t
|
162 |
int i; // iterates through s |
163 |
int j; // iterates through t |
164 |
|
165 |
char t_j; // jth character of t |
166 |
|
167 |
int cost; // cost |
168 |
|
169 |
for (i = 0; i <= n; i++) { |
170 |
p[i] = i; |
171 |
} |
172 |
|
173 |
for (j = 1; j <= m; j++) { |
174 |
t_j = t.charAt(j - 1);
|
175 |
d[0] = j;
|
176 |
|
177 |
for (i = 1; i <= n; i++) { |
178 |
cost = s.charAt(i - 1) == t_j ? 0 : 1; |
179 |
// minimum of cell to the left+1, to the top+1, diagonally left
|
180 |
// and up +cost
|
181 |
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] |
182 |
+ cost); |
183 |
} |
184 |
|
185 |
// copy current distance counts to 'previous row' distance counts
|
186 |
_d = p; |
187 |
p = d; |
188 |
d = _d; |
189 |
} |
190 |
|
191 |
// our last action in the above loop was to switch d and p, so p now
|
192 |
// actually has the most recent cost counts
|
193 |
return p[n];
|
194 |
} |
195 |
|
196 |
} |