PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")  # "%" followed by exactly two hex digits
+# https://url.spec.whatwg.org/#percent-encoded-bytes
+
+# The fragment percent-encode set is the C0 control percent-encode set
+# and U+0020 SPACE, U+0022 ("), U+003C (<), U+003E (>), and U+0060 (`).
+FRAG_SAFE = "".join(  # passed as `safe=` to quote() for the fragment component
+ [chr(i) for i in range(0x20, 0x7F) if i not in (0x20, 0x22, 0x3C, 0x3E, 0x60)]
+)  # i.e. ASCII 0x20-0x7E minus the five excluded code points
+
+# The query percent-encode set is the C0 control percent-encode set
+# and U+0020 SPACE, U+0022 ("), U+0023 (#), U+003C (<), and U+003E (>).
+QUERY_SAFE = "".join(  # passed as `safe=` to quote() for the query component
+ [chr(i) for i in range(0x20, 0x7F) if i not in (0x20, 0x22, 0x23, 0x3C, 0x3E)]
+)  # i.e. ASCII 0x20-0x7E minus the five excluded code points
+
+# The path percent-encode set is the query percent-encode set
+# and U+003F (?), U+0060 (`), U+007B ({), and U+007D (}).
+PATH_SAFE = "".join(  # passed as `safe=` to quote() for the path component
+ [
+ chr(i)
+ for i in range(0x20, 0x7F)
+ if i not in (0x20, 0x22, 0x23, 0x3C, 0x3E) + (0x3F, 0x60, 0x7B, 0x7D)  # query exclusions + path-only ones
+ ]
+)
+
+# The userinfo percent-encode set is the path percent-encode set
+# and U+002F (/), U+003A (:), U+003B (;), U+003D (=), U+0040 (@),
+# U+005B ([) to U+005E (^), inclusive, and U+007C (|).
+USERNAME_SAFE = "".join(  # passed as `safe=` to quote() for the "username" kwarg
+ [
+ chr(i)
+ for i in range(0x20, 0x7F)
+ if i
+ not in (0x20, 0x22, 0x23, 0x3C, 0x3E)  # same exclusions as the query set
+ + (0x3F, 0x60, 0x7B, 0x7D)  # extra exclusions of the path set
+ + (0x2F, 0x3A, 0x3B, 0x3D, 0x40, 0x5B, 0x5C, 0x5D, 0x5E, 0x7C)  # extra exclusions for userinfo
+ ]
+)
+PASSWORD_SAFE = "".join(  # passed as `safe=` to quote() for the "password" kwarg
+ [
+ chr(i)
+ for i in range(0x20, 0x7F)
+ if i
+ not in (0x20, 0x22, 0x23, 0x3C, 0x3E)  # same exclusions as the query set
+ + (0x3F, 0x60, 0x7B, 0x7D)  # extra exclusions of the path set
+ + (0x2F, 0x3A, 0x3B, 0x3D, 0x40, 0x5B, 0x5C, 0x5D, 0x5E, 0x7C)  # identical to USERNAME_SAFE's list
+ ]
+)
+# Note... The terminology 'userinfo' percent-encode set in the WHATWG document
+# is used for the username and password quoting. For the joint userinfo component
+# we drop U+003A (:) from the percent-encode set, so ":" is kept in the safe set.
+USERINFO_SAFE = "".join(  # passed as `safe=` to quote() for the combined userinfo string
+ [
+ chr(i)
+ for i in range(0x20, 0x7F)
+ if i
+ not in (0x20, 0x22, 0x23, 0x3C, 0x3E)  # same exclusions as the query set
+ + (0x3F, 0x60, 0x7B, 0x7D)  # extra exclusions of the path set
+ + (0x2F, 0x3B, 0x3D, 0x40, 0x5B, 0x5C, 0x5D, 0x5E, 0x7C)  # no 0x3A here, so ":" stays in the safe string
+ ]
+)
# {scheme}: (optional)
# //{authority} (optional)
# Replace "username" and/or "password" with "userinfo".
if "username" in kwargs or "password" in kwargs:
- username = quote(kwargs.pop("username", "") or "")
- password = quote(kwargs.pop("password", "") or "")
+ username = quote(kwargs.pop("username", "") or "", safe=USERNAME_SAFE)
+ password = quote(kwargs.pop("password", "") or "", safe=PASSWORD_SAFE)
kwargs["userinfo"] = f"{username}:{password}" if password else username
# Replace "raw_path" with "path" and "query".
authority = kwargs.get("authority", url_dict["authority"]) or ""
path = kwargs.get("path", url_dict["path"]) or ""
query = kwargs.get("query", url_dict["query"])
- fragment = kwargs.get("fragment", url_dict["fragment"])
+ frag = kwargs.get("fragment", url_dict["fragment"])
# The AUTHORITY_REGEX will always match, but may have empty components.
authority_match = AUTHORITY_REGEX.match(authority)
# We end up with a parsed representation of the URL,
# with components that are plain ASCII bytestrings.
parsed_scheme: str = scheme.lower()
- parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
+ parsed_userinfo: str = quote(userinfo, safe=USERINFO_SAFE)
parsed_host: str = encode_host(host)
parsed_port: int | None = normalize_port(port, scheme)
if has_scheme or has_authority:
path = normalize_path(path)
- # The GEN_DELIMS set is... : / ? # [ ] @
- # These do not need to be percent-quoted unless they serve as delimiters for the
- # specific component.
- WHATWG_SAFE = '`{}%|^\\"'
-
- # For 'path' we need to drop ? and # from the GEN_DELIMS set.
- parsed_path: str = quote(path, safe=SUB_DELIMS + WHATWG_SAFE + ":/[]@")
- # For 'query' we need to drop '#' from the GEN_DELIMS set.
- parsed_query: str | None = (
- None
- if query is None
- else quote(query, safe=SUB_DELIMS + WHATWG_SAFE + ":/?[]@")
- )
- # For 'fragment' we can include all of the GEN_DELIMS set.
- parsed_fragment: str | None = (
- None
- if fragment is None
- else quote(fragment, safe=SUB_DELIMS + WHATWG_SAFE + ":/?#[]@")
- )
+ parsed_path: str = quote(path, safe=PATH_SAFE)
+ parsed_query: str | None = None if query is None else quote(query, safe=QUERY_SAFE)
+ parsed_frag: str | None = None if frag is None else quote(frag, safe=FRAG_SAFE)
# The parsed ASCII bytestrings are our canonical form.
# All properties of the URL are derived from these.
parsed_port,
parsed_path,
parsed_query,
- parsed_fragment,
+ parsed_frag,
)