# urlparse.py -- Jython Lib copy, from repository path
# root/tags/v1_0_2_Build_912/extensions/extScripting/scripts/jython/Lib/urlparse.py
# (revision 11422)
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

__all__ = ["urlparse", "urlunparse", "urljoin"]

# A classification of schemes ('' means apply by default).
# Each list names the schemes for which a given URL component is
# meaningful; urlparse() consults these to decide what to split off.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    ]
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Bounded memo cache for urlparse() results, keyed on
# (url, default scheme, allow_fragments).
MAX_CACHE_SIZE = 20
_parse_cache = {}
def clear_cache():
    """Clear the parse cache.

    Rebinds the module-level ``_parse_cache`` to a fresh empty dict;
    called by urlparse() when the cache exceeds MAX_CACHE_SIZE.
    """
    global _parse_cache
    _parse_cache = {}
def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    'scheme' is the default scheme used when the URL carries none;
    'allow_fragments', when false, suppresses splitting off a
    trailing #fragment.  Results are memoized in _parse_cache.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = path = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                # netloc runs from after '//' to the next '/' (or end)
                i = url.find('/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments:
                i = url.rfind('#')
                if i >= 0:
                    fragment = url[i+1:]
                    url = url[:i]
            i = url.find('?')
            if i >= 0:
                query = url[i+1:]
                url = url[:i]
            i = url.find(';')
            if i >= 0:
                params = url[i+1:]
                url = url[:i]
            result = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = result
            return result
        # General case: treat the prefix as a scheme only when every
        # character is legal in a scheme name.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    # NOTE: renamed from 'tuple' -- the original shadowed the builtin.
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
def urlunparse(parts):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    'parts' is a 6-sequence (scheme, netloc, url, params, query,
    fragment) as returned by urlparse().  The original signature used
    tuple parameter unpacking, removed in Python 3 (PEP 3113); callers
    still pass a single 6-tuple, so this change is call-compatible.
    """
    scheme, netloc, url, params, query, fragment = parts
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments=1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty 'base' or 'url' returns the other unchanged.  When the
    schemes differ, or the scheme is not in uses_relative, 'url' is
    returned as-is; otherwise missing components are inherited from
    'base' and relative path segments ('.' and '..') are resolved.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: ignore the base path entirely.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # No path at all: keep the base path, inheriting params/query
        # when the relative URL did not supply them.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Merge the base directory with the relative path, then resolve
    # '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the leftmost 'segment/..' pair that is not
    # anchored at the root or preceded by another '..'.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    # Reassemble everything except the fragment.
    defragged = urlunparse((scheme, netloc, path, params, query, ''))
    return defragged, fragment
# Self-test data: each non-blank line is "<relative> = <expected>".
# The first URL (http://a/b/c/d) becomes the join base; the expected
# values follow the RFC 1808 examples.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?
def test():
    """Run the module self-test.

    Reads test lines from the file named in sys.argv[1] ('-' means
    stdin), or from the built-in test_input data when no argument is
    given.  Each URL is parsed and joined against the first URL seen;
    when a line carries an '= <expected>' part, a mismatch is flagged.
    """
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        try:
            import StringIO
        except ImportError:
            # Python 3: the StringIO module moved into io.
            import io as StringIO
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line:
            break
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        # Single parenthesized argument: identical output on
        # Python 2 (print statement) and Python 3 (print function).
        print('%-10s : %s' % (url, parts))
        # 'absurl' renamed from 'abs', which shadowed the builtin.
        absurl = urljoin(base, url)
        if not base:
            base = absurl
        wrapped = '<URL:%s>' % absurl
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED %s !!!!!!!!!!' % words[2])

if __name__ == '__main__':
    test()