1420216e5SHanoh Haimfrom __future__ import absolute_import
2420216e5SHanoh Haimfrom collections import namedtuple
3420216e5SHanoh Haim
4420216e5SHanoh Haimfrom ..exceptions import LocationParseError
5420216e5SHanoh Haim
6420216e5SHanoh Haim
# Field names of the :class:`Url` namedtuple, in positional order.
url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment']


class Url(namedtuple('Url', url_attrs)):
    """
    Datastructure for representing an HTTP URL. Used as a return value for
    :func:`parse_url`. Both the scheme and host are normalized as they are
    both case-insensitive according to RFC 3986.

    Every field defaults to ``None``. A non-empty ``path`` that does not
    start with ``/`` gets one prepended.
    """
    __slots__ = ()

    def __new__(cls, scheme=None, auth=None, host=None, port=None, path=None,
                query=None, fragment=None):
        # Store non-empty paths in absolute form.
        if path and not path.startswith('/'):
            path = '/' + path
        # Scheme and host are case-insensitive, so keep them lowercased.
        if scheme:
            scheme = scheme.lower()
        if host:
            host = host.lower()
        return super(Url, cls).__new__(cls, scheme, auth, host, port, path,
                                       query, fragment)

    @property
    def hostname(self):
        """For backwards-compatibility with urlparse. We're nice like that."""
        return self.host

    @property
    def request_uri(self):
        """Absolute path including the query string."""
        uri = self.path or '/'

        # An empty (but not None) query still produces a trailing '?'.
        if self.query is not None:
            uri += '?' + self.query

        return uri

    @property
    def netloc(self):
        """Network location including host and port"""
        # Compare against None rather than truthiness so that a port of 0 is
        # kept, the same way the ``url`` property keeps it.
        if self.port is not None:
            return '%s:%d' % (self.host, self.port)
        return self.host

    @property
    def url(self):
        """
        Convert self into a url

        This function should more or less round-trip with :func:`.parse_url`. The
        returned url may not be exactly the same as the url inputted to
        :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
        with a blank port will have : removed).

        Example: ::

            >>> U = parse_url('http://google.com/mail/')
            >>> U.url
            'http://google.com/mail/'
            >>> Url('http', 'username:password', 'host.com', 80,
            ... '/path', 'query', 'fragment').url
            'http://username:password@host.com:80/path?query#fragment'
        """
        scheme, auth, host, port, path, query, fragment = self
        url = ''

        # We use "is not None" because we want empty strings (or a 0 port)
        # to still be emitted.
        if scheme is not None:
            url += scheme + '://'
        if auth is not None:
            url += auth + '@'
        if host is not None:
            url += host
        if port is not None:
            url += ':' + str(port)
        if path is not None:
            url += path
        if query is not None:
            url += '?' + query
        if fragment is not None:
            url += '#' + fragment

        return url

    def __str__(self):
        return self.url
93420216e5SHanoh Haim
94420216e5SHanoh Haim
def split_first(s, delims):
    """
    Given a string and an iterable of delimiters, split on the first found
    delimiter. Return two split parts and the matched delimiter.

    If not found, then the first part is the full input string.

    Delimiters may be longer than one character; the whole matched delimiter
    is removed from the result. When several delimiters match at the same
    position, the one listed first in ``delims`` wins.

    Example::

        >>> split_first('foo/bar?baz', '?/=')
        ('foo', 'bar?baz', '/')
        >>> split_first('foo/bar?baz', '123')
        ('foo/bar?baz', '', None)
        >>> split_first('http://host/path', ['://'])
        ('http', 'host/path', '://')

    Scales linearly with number of delims. Not ideal for large number of delims.
    """
    min_idx = None
    min_delim = None
    for d in delims:
        idx = s.find(d)
        if idx < 0:
            # This delimiter does not occur in the string at all.
            continue

        if min_idx is None or idx < min_idx:
            min_idx = idx
            min_delim = d

    if min_idx is None:
        return s, '', None

    # Skip the full delimiter, not just one character, so multi-character
    # delimiters do not leak into the second part.
    return s[:min_idx], s[min_idx + len(min_delim):], min_delim
126420216e5SHanoh Haim
127420216e5SHanoh Haim
def parse_url(url):
    """
    Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
    performed to parse incomplete urls. Fields not provided will be None.

    Partly backwards-compatible with :mod:`urlparse`.

    Raises :class:`LocationParseError` when a port is present but is not a
    plain integer, or when a bracketed IPv6 host is missing its closing ``]``.

    Example::

        >>> parse_url('http://google.com/mail/')
        Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
        >>> parse_url('google.com:80')
        Url(scheme=None, host='google.com', port=80, path=None, ...)
        >>> parse_url('/foo?bar')
        Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
    """

    # While this code has overlap with stdlib's urlparse, it is much
    # simplified for our needs and less annoying.
    # Additionally, this implementation does silly things to be optimal
    # on CPython.

    if not url:
        # Empty
        return Url()

    scheme = None
    auth = None
    host = None
    port = None
    path = None
    fragment = None
    query = None

    # Scheme
    if '://' in url:
        scheme, url = url.split('://', 1)

    # Find the earliest Authority Terminator
    # (http://tools.ietf.org/html/rfc3986#section-3.2)
    url, path_, delim = split_first(url, ['/', '?', '#'])

    if delim:
        # Reassemble the path
        path = delim + path_

    # Auth
    if '@' in url:
        # Last '@' denotes end of auth part
        auth, url = url.rsplit('@', 1)

    # IPv6
    if url and url[0] == '[':
        if ']' not in url:
            # Without the closing bracket the split below would fail to
            # unpack with a bare ValueError; report a parse error instead.
            raise LocationParseError(url)
        host, url = url.split(']', 1)
        host += ']'

    # Port
    if ':' in url:
        _host, port = url.split(':', 1)

        # An IPv6 host has already been captured above; do not overwrite it.
        if not host:
            host = _host

        if port:
            # If given, ports must be integers. No whitespace, no plus or
            # minus prefixes, no non-integer digits such as ^2 (superscript).
            if not port.isdigit():
                raise LocationParseError(url)
            try:
                port = int(port)
            except ValueError:
                raise LocationParseError(url)
        else:
            # Blank ports are cool, too. (rfc3986#section-3.2.3)
            port = None

    elif not host and url:
        host = url

    if not path:
        # Nothing after the authority, so no query or fragment to extract.
        return Url(scheme, auth, host, port, path, query, fragment)

    # Fragment
    if '#' in path:
        path, fragment = path.split('#', 1)

    # Query
    if '?' in path:
        path, query = path.split('?', 1)

    return Url(scheme, auth, host, port, path, query, fragment)
219420216e5SHanoh Haim
220420216e5SHanoh Haim
def get_host(url):
    """
    Deprecated. Use :func:`parse_url` instead.

    Return a ``(scheme, hostname, port)`` tuple for ``url``. The scheme falls
    back to ``'http'`` when the parsed url has none.
    """
    parsed = parse_url(url)
    scheme = parsed.scheme or 'http'
    return scheme, parsed.hostname, parsed.port