url.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. from __future__ import absolute_import
  2. import re
  3. from collections import namedtuple
  4. from ..exceptions import LocationParseError
  5. from ..packages import six
  6. url_attrs = ["scheme", "auth", "host", "port", "path", "query", "fragment"]
  7. # We only want to normalize urls with an HTTP(S) scheme.
  8. # urllib3 infers URLs without a scheme (None) to be http.
  9. NORMALIZABLE_SCHEMES = ("http", "https", None)
  10. # Almost all of these patterns were derived from the
  11. # 'rfc3986' module: https://github.com/python-hyper/rfc3986
  12. PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")
  13. SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")
  14. URI_RE = re.compile(
  15. r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?"
  16. r"(?://([^/?#]*))?"
  17. r"([^?#]*)"
  18. r"(?:\?([^#]*))?"
  19. r"(?:#(.*))?$",
  20. re.UNICODE | re.DOTALL,
  21. )
  22. IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
  23. HEX_PAT = "[0-9A-Fa-f]{1,4}"
  24. LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=HEX_PAT, ipv4=IPV4_PAT)
  25. _subs = {"hex": HEX_PAT, "ls32": LS32_PAT}
  26. _variations = [
  27. # 6( h16 ":" ) ls32
  28. "(?:%(hex)s:){6}%(ls32)s",
  29. # "::" 5( h16 ":" ) ls32
  30. "::(?:%(hex)s:){5}%(ls32)s",
  31. # [ h16 ] "::" 4( h16 ":" ) ls32
  32. "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",
  33. # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
  34. "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",
  35. # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
  36. "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",
  37. # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
  38. "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
  39. # [ *4( h16 ":" ) h16 ] "::" ls32
  40. "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
  41. # [ *5( h16 ":" ) h16 ] "::" h16
  42. "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",
  43. # [ *6( h16 ":" ) h16 ] "::"
  44. "(?:(?:%(hex)s:){0,6}%(hex)s)?::",
  45. ]
  46. UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._!\-~"
  47. IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")"
  48. ZONE_ID_PAT = "(?:%25|%)(?:[" + UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+"
  49. IPV6_ADDRZ_PAT = r"\[" + IPV6_PAT + r"(?:" + ZONE_ID_PAT + r")?\]"
  50. REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"
  51. TARGET_RE = re.compile(r"^(/[^?]*)(?:\?([^#]+))?(?:#(.*))?$")
  52. IPV4_RE = re.compile("^" + IPV4_PAT + "$")
  53. IPV6_RE = re.compile("^" + IPV6_PAT + "$")
  54. IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$")
  55. BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$")
  56. ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$")
  57. SUBAUTHORITY_PAT = (u"^(?:(.*)@)?(%s|%s|%s)(?::([0-9]{0,5}))?$") % (
  58. REG_NAME_PAT,
  59. IPV4_PAT,
  60. IPV6_ADDRZ_PAT,
  61. )
  62. SUBAUTHORITY_RE = re.compile(SUBAUTHORITY_PAT, re.UNICODE | re.DOTALL)
  63. UNRESERVED_CHARS = set(
  64. "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~"
  65. )
  66. SUB_DELIM_CHARS = set("!$&'()*+,;=")
  67. USERINFO_CHARS = UNRESERVED_CHARS | SUB_DELIM_CHARS | {":"}
  68. PATH_CHARS = USERINFO_CHARS | {"@", "/"}
  69. QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {"?"}
  70. class Url(namedtuple("Url", url_attrs)):
  71. """
  72. Data structure for representing an HTTP URL. Used as a return value for
  73. :func:`parse_url`. Both the scheme and host are normalized as they are
  74. both case-insensitive according to RFC 3986.
  75. """
  76. __slots__ = ()
  77. def __new__(
  78. cls,
  79. scheme=None,
  80. auth=None,
  81. host=None,
  82. port=None,
  83. path=None,
  84. query=None,
  85. fragment=None,
  86. ):
  87. if path and not path.startswith("/"):
  88. path = "/" + path
  89. if scheme is not None:
  90. scheme = scheme.lower()
  91. return super(Url, cls).__new__(
  92. cls, scheme, auth, host, port, path, query, fragment
  93. )
  94. @property
  95. def hostname(self):
  96. """For backwards-compatibility with urlparse. We're nice like that."""
  97. return self.host
  98. @property
  99. def request_uri(self):
  100. """Absolute path including the query string."""
  101. uri = self.path or "/"
  102. if self.query is not None:
  103. uri += "?" + self.query
  104. return uri
  105. @property
  106. def netloc(self):
  107. """Network location including host and port"""
  108. if self.port:
  109. return "%s:%d" % (self.host, self.port)
  110. return self.host
  111. @property
  112. def url(self):
  113. """
  114. Convert self into a url
  115. This function should more or less round-trip with :func:`.parse_url`. The
  116. returned url may not be exactly the same as the url inputted to
  117. :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
  118. with a blank port will have : removed).
  119. Example: ::
  120. >>> U = parse_url('http://google.com/mail/')
  121. >>> U.url
  122. 'http://google.com/mail/'
  123. >>> Url('http', 'username:password', 'host.com', 80,
  124. ... '/path', 'query', 'fragment').url
  125. 'http://username:password@host.com:80/path?query#fragment'
  126. """
  127. scheme, auth, host, port, path, query, fragment = self
  128. url = u""
  129. # We use "is not None" we want things to happen with empty strings (or 0 port)
  130. if scheme is not None:
  131. url += scheme + u"://"
  132. if auth is not None:
  133. url += auth + u"@"
  134. if host is not None:
  135. url += host
  136. if port is not None:
  137. url += u":" + str(port)
  138. if path is not None:
  139. url += path
  140. if query is not None:
  141. url += u"?" + query
  142. if fragment is not None:
  143. url += u"#" + fragment
  144. return url
  145. def __str__(self):
  146. return self.url
  147. def split_first(s, delims):
  148. """
  149. .. deprecated:: 1.25
  150. Given a string and an iterable of delimiters, split on the first found
  151. delimiter. Return two split parts and the matched delimiter.
  152. If not found, then the first part is the full input string.
  153. Example::
  154. >>> split_first('foo/bar?baz', '?/=')
  155. ('foo', 'bar?baz', '/')
  156. >>> split_first('foo/bar?baz', '123')
  157. ('foo/bar?baz', '', None)
  158. Scales linearly with number of delims. Not ideal for large number of delims.
  159. """
  160. min_idx = None
  161. min_delim = None
  162. for d in delims:
  163. idx = s.find(d)
  164. if idx < 0:
  165. continue
  166. if min_idx is None or idx < min_idx:
  167. min_idx = idx
  168. min_delim = d
  169. if min_idx is None or min_idx < 0:
  170. return s, "", None
  171. return s[:min_idx], s[min_idx + 1 :], min_delim
  172. def _encode_invalid_chars(component, allowed_chars, encoding="utf-8"):
  173. """Percent-encodes a URI component without reapplying
  174. onto an already percent-encoded component.
  175. """
  176. if component is None:
  177. return component
  178. component = six.ensure_text(component)
  179. # Try to see if the component we're encoding is already percent-encoded
  180. # so we can skip all '%' characters but still encode all others.
  181. percent_encodings = PERCENT_RE.findall(component)
  182. # Normalize existing percent-encoded bytes.
  183. for enc in percent_encodings:
  184. if not enc.isupper():
  185. component = component.replace(enc, enc.upper())
  186. uri_bytes = component.encode("utf-8", "surrogatepass")
  187. is_percent_encoded = len(percent_encodings) == uri_bytes.count(b"%")
  188. encoded_component = bytearray()
  189. for i in range(0, len(uri_bytes)):
  190. # Will return a single character bytestring on both Python 2 & 3
  191. byte = uri_bytes[i : i + 1]
  192. byte_ord = ord(byte)
  193. if (is_percent_encoded and byte == b"%") or (
  194. byte_ord < 128 and byte.decode() in allowed_chars
  195. ):
  196. encoded_component.extend(byte)
  197. continue
  198. encoded_component.extend(b"%" + (hex(byte_ord)[2:].encode().zfill(2).upper()))
  199. return encoded_component.decode(encoding)
  200. def _remove_path_dot_segments(path):
  201. # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
  202. segments = path.split("/") # Turn the path into a list of segments
  203. output = [] # Initialize the variable to use to store output
  204. for segment in segments:
  205. # '.' is the current directory, so ignore it, it is superfluous
  206. if segment == ".":
  207. continue
  208. # Anything other than '..', should be appended to the output
  209. elif segment != "..":
  210. output.append(segment)
  211. # In this case segment == '..', if we can, we should pop the last
  212. # element
  213. elif output:
  214. output.pop()
  215. # If the path starts with '/' and the output is empty or the first string
  216. # is non-empty
  217. if path.startswith("/") and (not output or output[0]):
  218. output.insert(0, "")
  219. # If the path starts with '/.' or '/..' ensure we add one more empty
  220. # string to add a trailing '/'
  221. if path.endswith(("/.", "/..")):
  222. output.append("")
  223. return "/".join(output)
  224. def _normalize_host(host, scheme):
  225. if host:
  226. if isinstance(host, six.binary_type):
  227. host = six.ensure_str(host)
  228. if scheme in NORMALIZABLE_SCHEMES:
  229. is_ipv6 = IPV6_ADDRZ_RE.match(host)
  230. if is_ipv6:
  231. match = ZONE_ID_RE.search(host)
  232. if match:
  233. start, end = match.span(1)
  234. zone_id = host[start:end]
  235. if zone_id.startswith("%25") and zone_id != "%25":
  236. zone_id = zone_id[3:]
  237. else:
  238. zone_id = zone_id[1:]
  239. zone_id = "%" + _encode_invalid_chars(zone_id, UNRESERVED_CHARS)
  240. return host[:start].lower() + zone_id + host[end:]
  241. else:
  242. return host.lower()
  243. elif not IPV4_RE.match(host):
  244. return six.ensure_str(
  245. b".".join([_idna_encode(label) for label in host.split(".")])
  246. )
  247. return host
  248. def _idna_encode(name):
  249. if name and any([ord(x) > 128 for x in name]):
  250. try:
  251. from pip._vendor import idna
  252. except ImportError:
  253. six.raise_from(
  254. LocationParseError("Unable to parse URL without the 'idna' module"),
  255. None,
  256. )
  257. try:
  258. return idna.encode(name.lower(), strict=True, std3_rules=True)
  259. except idna.IDNAError:
  260. six.raise_from(
  261. LocationParseError(u"Name '%s' is not a valid IDNA label" % name), None
  262. )
  263. return name.lower().encode("ascii")
  264. def _encode_target(target):
  265. """Percent-encodes a request target so that there are no invalid characters"""
  266. if not target.startswith("/"):
  267. return target
  268. path, query, fragment = TARGET_RE.match(target).groups()
  269. target = _encode_invalid_chars(path, PATH_CHARS)
  270. query = _encode_invalid_chars(query, QUERY_CHARS)
  271. fragment = _encode_invalid_chars(fragment, FRAGMENT_CHARS)
  272. if query is not None:
  273. target += "?" + query
  274. if fragment is not None:
  275. target += "#" + target
  276. return target
  277. def parse_url(url):
  278. """
  279. Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
  280. performed to parse incomplete urls. Fields not provided will be None.
  281. This parser is RFC 3986 compliant.
  282. The parser logic and helper functions are based heavily on
  283. work done in the ``rfc3986`` module.
  284. :param str url: URL to parse into a :class:`.Url` namedtuple.
  285. Partly backwards-compatible with :mod:`urlparse`.
  286. Example::
  287. >>> parse_url('http://google.com/mail/')
  288. Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
  289. >>> parse_url('google.com:80')
  290. Url(scheme=None, host='google.com', port=80, path=None, ...)
  291. >>> parse_url('/foo?bar')
  292. Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
  293. """
  294. if not url:
  295. # Empty
  296. return Url()
  297. source_url = url
  298. if not SCHEME_RE.search(url):
  299. url = "//" + url
  300. try:
  301. scheme, authority, path, query, fragment = URI_RE.match(url).groups()
  302. normalize_uri = scheme is None or scheme.lower() in NORMALIZABLE_SCHEMES
  303. if scheme:
  304. scheme = scheme.lower()
  305. if authority:
  306. auth, host, port = SUBAUTHORITY_RE.match(authority).groups()
  307. if auth and normalize_uri:
  308. auth = _encode_invalid_chars(auth, USERINFO_CHARS)
  309. if port == "":
  310. port = None
  311. else:
  312. auth, host, port = None, None, None
  313. if port is not None:
  314. port = int(port)
  315. if not (0 <= port <= 65535):
  316. raise LocationParseError(url)
  317. host = _normalize_host(host, scheme)
  318. if normalize_uri and path:
  319. path = _remove_path_dot_segments(path)
  320. path = _encode_invalid_chars(path, PATH_CHARS)
  321. if normalize_uri and query:
  322. query = _encode_invalid_chars(query, QUERY_CHARS)
  323. if normalize_uri and fragment:
  324. fragment = _encode_invalid_chars(fragment, FRAGMENT_CHARS)
  325. except (ValueError, AttributeError):
  326. return six.raise_from(LocationParseError(source_url), None)
  327. # For the sake of backwards compatibility we put empty
  328. # string values for path if there are any defined values
  329. # beyond the path in the URL.
  330. # TODO: Remove this when we break backwards compatibility.
  331. if not path:
  332. if query is not None or fragment is not None:
  333. path = ""
  334. else:
  335. path = None
  336. # Ensure that each part of the URL is a `str` for
  337. # backwards compatibility.
  338. if isinstance(url, six.text_type):
  339. ensure_func = six.ensure_text
  340. else:
  341. ensure_func = six.ensure_str
  342. def ensure_type(x):
  343. return x if x is None else ensure_func(x)
  344. return Url(
  345. scheme=ensure_type(scheme),
  346. auth=ensure_type(auth),
  347. host=ensure_type(host),
  348. port=port,
  349. path=ensure_type(path),
  350. query=ensure_type(query),
  351. fragment=ensure_type(fragment),
  352. )
  353. def get_host(url):
  354. """
  355. Deprecated. Use :func:`parse_url` instead.
  356. """
  357. p = parse_url(url)
  358. return p.scheme or "http", p.hostname, p.port