# urlparse.py -- Jython Lib copy, from repository path
# root/tags/v1_0_2_Build_912/extensions/extScripting/scripts/jython/Lib/urlparse.py
# (revision 11422)
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

__all__ = ["urlparse", "urlunparse", "urljoin"]

# A classification of schemes ('' means apply by default).
# Each list names the schemes for which a given URL component is
# meaningful; urlparse() consults these to decide what to split off.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    ]
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Bounded memo cache for urlparse() results, keyed on
# (url, default scheme, allow_fragments).
MAX_CACHE_SIZE = 20
_parse_cache = {}
def clear_cache():
    """Clear the parse cache.

    Rebinds the module-level ``_parse_cache`` to a fresh empty dict;
    called by urlparse() when the cache exceeds MAX_CACHE_SIZE.
    """
    global _parse_cache
    _parse_cache = {}
def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    'scheme' is the default scheme used when the URL carries none;
    'allow_fragments', when false, suppresses splitting off a
    trailing #fragment.  Results are memoized in _parse_cache.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = path = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                # netloc runs from after '//' to the next '/' (or end)
                i = url.find('/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments:
                i = url.rfind('#')
                if i >= 0:
                    fragment = url[i+1:]
                    url = url[:i]
            i = url.find('?')
            if i >= 0:
                query = url[i+1:]
                url = url[:i]
            i = url.find(';')
            if i >= 0:
                params = url[i+1:]
                url = url[:i]
            result = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = result
            return result
        # General case: treat the prefix as a scheme only when every
        # character is legal in a scheme name.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    # NOTE: renamed from 'tuple' -- the original shadowed the builtin.
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
def urlunparse(parts):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    'parts' is a 6-sequence (scheme, netloc, url, params, query,
    fragment) as returned by urlparse().  The original signature used
    tuple parameter unpacking, removed in Python 3 (PEP 3113); callers
    still pass a single 6-tuple, so this change is call-compatible.
    """
    scheme, netloc, url, params, query, fragment = parts
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments=1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty 'base' or 'url' returns the other unchanged.  When the
    schemes differ, or the scheme is not in uses_relative, 'url' is
    returned as-is; otherwise missing components are inherited from
    'base' and relative path segments ('.' and '..') are resolved.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: ignore the base path entirely.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # No path at all: keep the base path, inheriting params/query
        # when the relative URL did not supply them.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Merge the base directory with the relative path, then resolve
    # '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the leftmost 'segment/..' pair that is not
    # anchored at the root or preceded by another '..'.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    # Reassemble everything except the fragment.
    defragged = urlunparse((scheme, netloc, path, params, query, ''))
    return defragged, fragment
# Self-test data: each non-blank line is "<relative> = <expected>".
# The first URL (http://a/b/c/d) becomes the join base; the expected
# values follow the RFC 1808 examples.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?
def test():
    """Run the module self-test.

    Reads test lines from the file named in sys.argv[1] ('-' means
    stdin), or from the built-in test_input data when no argument is
    given.  Each URL is parsed and joined against the first URL seen;
    when a line carries an '= <expected>' part, a mismatch is flagged.
    """
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        try:
            import StringIO
        except ImportError:
            # Python 3: the StringIO module moved into io.
            import io as StringIO
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line:
            break
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        # Single parenthesized argument: identical output on
        # Python 2 (print statement) and Python 3 (print function).
        print('%-10s : %s' % (url, parts))
        # 'absurl' renamed from 'abs', which shadowed the builtin.
        absurl = urljoin(base, url)
        if not base:
            base = absurl
        wrapped = '<URL:%s>' % absurl
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED %s !!!!!!!!!!' % words[2])

if __name__ == '__main__':
    test()