"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

__all__ = ["urlparse", "urlunparse", "urljoin"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    ]
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    global _parse_cache
    _parse_cache = {}

def urlparse(url, scheme = '', allow_fragments = 1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = path = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                i = url.find('/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments:
                i = url.rfind('#')
                if i >= 0:
                    fragment = url[i+1:]
                    url = url[:i]
            i = url.find('?')
            if i >= 0:
                query = url[i+1:]
                url = url[:i]
            i = url.find(';')
            if i >= 0:
                params = url[i+1:]
                url = url[:i]
            tuple = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = tuple
            return tuple
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    tuple = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = tuple
    return tuple
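
# A quick sketch of the 6-tuple for a URL with every component present
# (illustrative values, not part of the module's test data):
#
#   >>> urlparse('http://www.example.com/pa/th;param?query=arg#frag')
#   ('http', 'www.example.com', '/pa/th', 'param', 'query=arg', 'frag')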

def urlunparse((scheme, netloc, url, params, query, fragment)):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
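
# Round-tripping the tuple from the urlparse() sketch above (again
# illustrative; a URL parsed with redundant delimiters comes back in a
# slightly different but equivalent form, as the docstring notes):
#
#   >>> urlunparse(('http', 'www.example.com', '/pa/th',
#   ...             'param', 'query=arg', 'frag'))
#   'http://www.example.com/pa/th;param?query=arg#frag'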

def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        if not params:
            params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
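
# Relative resolution at a glance (these two lines mirror cases from the
# RFC 1808 table in test_input below):
#
#   >>> urljoin('http://a/b/c/d', '../g')
#   'http://a/b/g'
#   >>> urljoin('http://a/b/c/d', 'g?y')
#   'http://a/b/c/g?y'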

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
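
# For instance (illustrative only):
#
#   >>> urldefrag('http://www.example.com/doc#section2')
#   ('http://www.example.com/doc', 'section2')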

test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?

def test():
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line: break
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'

if __name__ == '__main__':
    test()