2 * Copyright (C) 1996-2025 The Squid Software Foundation and contributors
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
9 /* DEBUG: section 23 URL Parsing */
12 #include "anyp/Host.h"
16 #include "HttpRequest.h"
17 #include "parser/Tokenizer.h"
19 #include "SquidConfig.h"
20 #include "SquidMath.h"
22 static const char valid_hostname_chars_u
[] =
23 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
24 "abcdefghijklmnopqrstuvwxyz"
28 static const char valid_hostname_chars
[] =
29 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
30 "abcdefghijklmnopqrstuvwxyz"
35 /// Characters which are valid within a URI userinfo section
36 static const CharacterSet
&
40 * RFC 3986 section 3.2.1
42 * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
43 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
44 * pct-encoded = "%" HEXDIG HEXDIG
45 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
47 static const auto userInfoValid
= CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
54 * Governed by RFC 3986 section 2.1
57 AnyP::Uri::Encode(const SBuf
&buf
, const CharacterSet
&ignore
)
62 Parser::Tokenizer
tk(buf
);
64 // optimization for the arguably common "no encoding necessary" case
65 if (tk
.prefix(goodSection
, ignore
) && tk
.atEnd())
69 output
.reserveSpace(buf
.length() * 3); // worst case: encode all chars
70 output
.append(goodSection
); // may be empty
73 // TODO: Add Tokenizer::parseOne(void).
74 const auto ch
= tk
.remaining()[0];
75 output
.appendf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(ch
))); // TODO: Optimize using a table
78 if (tk
.prefix(goodSection
, ignore
))
79 output
.append(goodSection
);
86 AnyP::Uri::Decode(const SBuf
&buf
)
89 Parser::Tokenizer
tok(buf
);
90 while (!tok
.atEnd()) {
92 static const auto unencodedChars
= CharacterSet("percent", "%").complement("unencoded");
93 if (tok
.prefix(token
, unencodedChars
))
96 // we are either at '%' or at end of input
98 int64_t hex1
= 0, hex2
= 0;
99 if (tok
.int64(hex1
, 16, false, 1) && tok
.int64(hex2
, 16, false, 1))
100 output
.append(static_cast<char>((hex1
<< 4) | hex2
));
102 throw TextException("invalid pct-encoded triplet", Here());
109 AnyP::Uri::Asterisk()
111 static SBuf
star("*");
116 AnyP::Uri::SlashPath()
118 static SBuf
slash("/");
123 AnyP::Uri::host(const char *src
)
125 hostAddr_
.fromHost(src
);
126 if (hostAddr_
.isAnyAddr()) {
127 xstrncpy(host_
, src
, sizeof(host_
));
128 hostIsNumeric_
= false;
130 hostAddr_
.toHostStr(host_
, sizeof(host_
));
131 debugs(23, 3, "given IP: " << hostAddr_
);
137 // TODO: Replace with ToSBuf(parsedHost()) or similar.
139 AnyP::Uri::hostOrIp() const
141 if (hostIsNumeric()) {
142 static char ip
[MAX_IPSTRLEN
];
143 const auto hostStrLen
= hostIP().toHostStr(ip
, sizeof(ip
));
144 return SBuf(ip
, hostStrLen
);
149 std::optional
<AnyP::Host
>
150 AnyP::Uri::parsedHost() const
153 return Host::ParseIp(hostIP());
155 // XXX: Interpret host subcomponent as reg-name representing a DNS name. It
156 // may actually be, for example, a URN namespace ID (NID; see RFC 8141), but
157 // current Squid APIs do not support adequate representation of those cases.
158 const SBuf
regName(host());
160 if (regName
.find('%') != SBuf::npos
) {
161 debugs(23, 3, "rejecting percent-encoded reg-name: " << regName
);
162 return std::nullopt
; // TODO: Decode() instead
165 return Host::ParseSimpleDomainName(regName
);
169 AnyP::Uri::path() const
171 // RFC 3986 section 3.3 says path can be empty (path-abempty).
172 // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
173 // at least when sending and using. We must still accept path-abempty as input.
174 if (path_
.isEmpty() && (scheme_
== AnyP::PROTO_HTTP
|| scheme_
== AnyP::PROTO_HTTPS
))
183 debugs(23, 5, "urlInitialize: Initializing...");
184 /* this ensures that the number of protocol strings is the same as
185 * the enum slots allocated because the last enum is always 'MAX'.
187 assert(strcmp(AnyP::ProtocolType_str
[AnyP::PROTO_MAX
], "MAX") == 0);
189 * These test that our matchDomainName() function works the
190 * way we expect it to.
192 assert(0 == matchDomainName("foo.com", "foo.com"));
193 assert(0 == matchDomainName(".foo.com", "foo.com"));
194 assert(0 == matchDomainName("foo.com", ".foo.com"));
195 assert(0 == matchDomainName(".foo.com", ".foo.com"));
196 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
197 assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
198 assert(0 != matchDomainName("x.foo.com", "foo.com"));
199 assert(0 != matchDomainName("foo.com", "x.foo.com"));
200 assert(0 != matchDomainName("bar.com", "foo.com"));
201 assert(0 != matchDomainName(".bar.com", "foo.com"));
202 assert(0 != matchDomainName(".bar.com", ".foo.com"));
203 assert(0 != matchDomainName("bar.com", ".foo.com"));
204 assert(0 < matchDomainName("zzz.com", "foo.com"));
205 assert(0 > matchDomainName("aaa.com", "foo.com"));
206 assert(0 == matchDomainName("FOO.com", "foo.COM"));
207 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
208 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
209 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
211 assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains
));
212 assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains
));
213 assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains
));
214 assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains
));
216 assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards
));
217 assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards
));
218 assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards
));
219 assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards
));
221 assert(0 != matchDomainName("foo.com", ""));
222 assert(0 != matchDomainName("foo.com", "", mdnHonorWildcards
));
223 assert(0 != matchDomainName("foo.com", "", mdnRejectSubsubDomains
));
229 * Extract the URI scheme and ':' delimiter from the given input buffer.
231 * Schemes up to 16 characters are accepted.
233 * Governed by RFC 3986 section 3.1
235 static AnyP::UriScheme
236 uriParseScheme(Parser::Tokenizer
&tok
)
239 * RFC 3986 section 3.1 paragraph 2:
241 * Scheme names consist of a sequence of characters beginning with a
242 * letter and followed by any combination of letters, digits, plus
243 * ("+"), period ("."), or hyphen ("-").
245 static const auto schemeChars
= CharacterSet("scheme", "+.-") + CharacterSet::ALPHA
+ CharacterSet::DIGIT
;
248 if (tok
.prefix(str
, schemeChars
, 16) && tok
.skip(':') && CharacterSet::ALPHA
[str
.at(0)]) {
249 const auto protocol
= AnyP::UriScheme::FindProtocolType(str
);
250 if (protocol
== AnyP::PROTO_UNKNOWN
)
251 return AnyP::UriScheme(protocol
, str
.c_str());
252 return AnyP::UriScheme(protocol
, nullptr);
255 throw TextException("invalid URI scheme", Here());
259 * Appends configured append_domain to hostname, assuming
260 * the given buffer is at least SQUIDHOSTNAMELEN bytes long,
261 * and that the host FQDN is not a 'dotless' TLD.
263 * \returns false if and only if there is not enough space to append
266 urlAppendDomain(char *host
)
268 /* For IPv4 addresses check for a dot */
269 /* For IPv6 addresses also check for a colon */
270 if (Config
.appendDomain
&& !strchr(host
, '.') && !strchr(host
, ':')) {
271 const uint64_t dlen
= strlen(host
);
272 const uint64_t want
= dlen
+ Config
.appendDomainLen
;
273 if (want
> SQUIDHOSTNAMELEN
- 1) {
274 debugs(23, 2, "URL domain too large (" << dlen
<< " bytes)");
277 strncat(host
, Config
.appendDomain
, SQUIDHOSTNAMELEN
- dlen
- 1);
285 * It is assumed that the URL is complete -
286 * ie, the end of the string is the end of the URL. Don't pass a partial
287 * URL here as this routine doesn't have any way of knowing whether
288 * it is partial or not (ie, it handles the case of no trailing slash as
289 * being "end of host with implied path of /").
291 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
292 * then rather than a URL a hostname:port is looked for.
295 AnyP::Uri::parse(const HttpRequestMethod
& method
, const SBuf
&rawUrl
)
299 LOCAL_ARRAY(char, login
, MAX_URL
);
300 LOCAL_ARRAY(char, foundHost
, MAX_URL
);
301 LOCAL_ARRAY(char, urlpath
, MAX_URL
);
309 foundHost
[0] = urlpath
[0] = login
[0] = '\0';
311 if ((l
= rawUrl
.length()) + Config
.appendDomainLen
> (MAX_URL
- 1)) {
312 debugs(23, DBG_IMPORTANT
, MYNAME
<< "URL too large (" << l
<< " bytes)");
316 if ((method
== Http::METHOD_OPTIONS
|| method
== Http::METHOD_TRACE
) &&
317 Asterisk().cmp(rawUrl
) == 0) {
318 // XXX: these methods might also occur in HTTPS traffic. Handle this better.
319 setScheme(AnyP::PROTO_HTTP
, nullptr);
320 port(getScheme().defaultPort());
325 Parser::Tokenizer
tok(rawUrl
);
326 AnyP::UriScheme scheme
;
328 if (method
== Http::METHOD_CONNECT
) {
329 // For CONNECTs, RFC 9110 Section 9.3.6 requires "only the host and
330 // port number of the tunnel destination, separated by a colon".
332 const auto rawHost
= parseHost(tok
);
333 Assure(rawHost
.length() < sizeof(foundHost
));
334 SBufToCstring(foundHost
, rawHost
);
337 throw TextException("missing required :port in CONNECT target", Here());
338 foundPort
= parsePort(tok
);
340 if (!tok
.remaining().isEmpty())
341 throw TextException("garbage after host:port in CONNECT target", Here());
344 scheme
= uriParseScheme(tok
);
346 if (scheme
== AnyP::PROTO_NONE
)
347 return false; // invalid scheme
349 if (scheme
== AnyP::PROTO_URN
) {
350 parseUrn(tok
); // throws on any error
354 // URLs then have "//"
355 static const SBuf
doubleSlash("//");
356 if (!tok
.skip(doubleSlash
))
359 auto B
= tok
.remaining();
360 const char *url
= B
.c_str();
366 /* Then everything until first /; that's host (and port; which we'll look for here later) */
367 // bug 1881: If we don't get a "/" then we imply it was there
368 // bug 3074: We could just be given a "?" or "#". These also imply "/"
369 // bug 3233: whitespace is also a hostname delimiter.
370 for (dst
= foundHost
; i
< l
&& *src
!= '/' && *src
!= '?' && *src
!= '#' && *src
!= '\0' && !xisspace(*src
); ++i
, ++src
, ++dst
) {
375 * We can't check for "i >= l" here because we could be at the end of the line
376 * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
377 * been -given- a valid URL and the path is just '/'.
383 // We are looking at path-abempty.
385 // path-empty, including the end of the `src` c-string cases
391 /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
392 for (; i
< l
&& *src
!= '\r' && *src
!= '\n' && *src
!= '\0'; ++i
, ++src
, ++dst
) {
396 /* We -could- be at the end of the buffer here */
401 // If the parsed scheme has no (known) default port, and there is no
402 // explicit port, then we will reject the zero port during foundPort
403 // validation, often resulting in a misleading 400/ERR_INVALID_URL.
404 // TODO: Remove this hack when switching to Tokenizer-based parsing.
405 foundPort
= scheme
.defaultPort().value_or(0); // may be reset later
407 /* Is there any login information? (we should eventually parse it above) */
408 t
= strrchr(foundHost
, '@');
410 strncpy((char *) login
, (char *) foundHost
, sizeof(login
)-1);
411 login
[sizeof(login
)-1] = '\0';
412 t
= strrchr(login
, '@');
414 strncpy((char *) foundHost
, t
+ 1, sizeof(foundHost
)-1);
415 foundHost
[sizeof(foundHost
)-1] = '\0';
416 // Bug 4498: URL-unescape the login info after extraction
417 rfc1738_unescape(login
);
420 /* Is there any host information? (we should eventually parse it above) */
421 if (*foundHost
== '[') {
422 /* strip any IPA brackets. valid under IPv6. */
424 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
427 l
= strlen(foundHost
);
429 for (; i
< l
&& *src
!= ']' && *src
!= '\0'; ++i
, ++src
, ++dst
) {
433 /* we moved in-place, so truncate the actual hostname found */
437 /* skip ahead to either start of port, or original EOS */
438 while (*dst
!= '\0' && *dst
!= ':')
442 t
= strrchr(foundHost
, ':');
444 if (t
!= strchr(foundHost
,':') ) {
445 /* RFC 2732 states IPv6 "SHOULD" be bracketed, allowing for times when it is not. */
446 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
447 /* therefore we MUST accept the case where they are not bracketed at all. */
452 // Bug 3183 sanity check: If scheme is present, host must be too.
453 if (scheme
!= AnyP::PROTO_NONE
&& foundHost
[0] == '\0') {
454 debugs(23, DBG_IMPORTANT
, "SECURITY ALERT: Missing hostname in URL '" << url
<< "'. see access.log for details.");
458 if (t
&& *t
== ':') {
465 for (t
= foundHost
; *t
; ++t
)
468 if (stringHasWhitespace(foundHost
)) {
469 if (URI_WHITESPACE_STRIP
== Config
.uri_whitespace
) {
482 debugs(23, 3, "Split URL '" << rawUrl
<< "' into proto='" << scheme
.image() << "', host='" << foundHost
<< "', port='" << foundPort
<< "', path='" << urlpath
<< "'");
484 if (Config
.onoff
.check_hostnames
&&
485 strspn(foundHost
, Config
.onoff
.allow_underscore
? valid_hostname_chars_u
: valid_hostname_chars
) != strlen(foundHost
)) {
486 debugs(23, DBG_IMPORTANT
, MYNAME
<< "Illegal character in hostname '" << foundHost
<< "'");
490 if (!urlAppendDomain(foundHost
))
493 /* remove trailing dots from hostnames */
494 while ((l
= strlen(foundHost
)) > 0 && foundHost
[--l
] == '.')
497 /* reject duplicate or leading dots */
498 if (strstr(foundHost
, "..") || *foundHost
== '.') {
499 debugs(23, DBG_IMPORTANT
, MYNAME
<< "Illegal hostname '" << foundHost
<< "'");
503 if (foundPort
< 1 || foundPort
> 65535) {
504 debugs(23, 3, "Invalid port '" << foundPort
<< "'");
508 if (stringHasWhitespace(urlpath
)) {
509 debugs(23, 2, "URI has whitespace: {" << rawUrl
<< "}");
511 switch (Config
.uri_whitespace
) {
513 case URI_WHITESPACE_DENY
:
516 case URI_WHITESPACE_ALLOW
:
519 case URI_WHITESPACE_ENCODE
:
520 t
= rfc1738_escape_unescaped(urlpath
);
521 xstrncpy(urlpath
, t
, MAX_URL
);
524 case URI_WHITESPACE_CHOP
:
525 *(urlpath
+ strcspn(urlpath
, w_space
)) = '\0';
528 case URI_WHITESPACE_STRIP
:
545 userInfo(SBuf(login
));
550 debugs(23, 2, "error: " << CurrentException
<< " " << Raw("rawUrl", rawUrl
.rawContent(), rawUrl
.length()));
556 * Governed by RFC 8141 section 2:
558 * assigned-name = "urn" ":" NID ":" NSS
559 * NID = (alphanum) 0*30(ldh) (alphanum)
560 * ldh = alphanum / "-"
561 * NSS = pchar *(pchar / "/")
563 * RFC 3986 Appendix D.2 defines (as deprecated):
565 * alphanum = ALPHA / DIGIT
567 * Notice that NID is exactly 2-32 characters in length.
570 AnyP::Uri::parseUrn(Parser::Tokenizer
&tok
)
572 static const auto nidChars
= CharacterSet("NID","-") + CharacterSet::ALPHA
+ CharacterSet::DIGIT
;
573 static const auto alphanum
= (CharacterSet::ALPHA
+ CharacterSet::DIGIT
).rename("alphanum");
575 if (!tok
.prefix(nid
, nidChars
, 32))
576 throw TextException("NID not found", Here());
579 throw TextException("NID too long or missing ':' delimiter", Here());
581 if (nid
.length() < 2)
582 throw TextException("NID too short", Here());
584 if (!alphanum
[*nid
.begin()])
585 throw TextException("NID prefix is not alphanumeric", Here());
587 if (!alphanum
[*nid
.rbegin()])
588 throw TextException("NID suffix is not alphanumeric", Here());
590 setScheme(AnyP::PROTO_URN
, nullptr);
592 // TODO validate path characters
593 path(tok
.remaining());
594 debugs(23, 3, "Split URI into proto=urn, nid=" << nid
<< ", " << Raw("path",path().rawContent(),path().length()));
597 /// Extracts and returns a (suspected but only partially validated) uri-host
598 /// IPv6address, IPv4address, or reg-name component. This function uses (and
599 /// quotes) RFC 3986, Section 3.2.2 syntax rules.
601 AnyP::Uri::parseHost(Parser::Tokenizer
&tok
) const
603 // host = IP-literal / IPv4address / reg-name
605 // XXX: CharacterSets below reject uri-host values containing whitespace
606 // (e.g., "10.0.0. 1"). That is not a bug, but the uri_whitespace directive
607 // can be interpreted as if it applies to uri-host and this code. TODO: Fix
608 // uri_whitespace and the code using it to exclude uri-host (and URI scheme,
609 // port, etc.) from that directive scope.
611 // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
613 // Add "." because IPv6address in RFC 3986 includes ls32, which includes
614 // IPv4address: ls32 = ( h16 ":" h16 ) / IPv4address
615 // This set rejects IPvFuture that needs a "v" character.
616 static const CharacterSet IPv6chars
= (
617 CharacterSet::HEXDIG
+ CharacterSet("colon", ":") + CharacterSet("period", ".")).rename("IPv6");
619 if (!tok
.prefix(ipv6ish
, IPv6chars
))
620 throw TextException("malformed or unsupported bracketed IP address in uri-host", Here());
623 throw TextException("IPv6 address is missing a closing bracket in uri-host", Here());
625 // This rejects bracketed IPv4address and domain names because they lack ":".
626 if (ipv6ish
.find(':') == SBuf::npos
)
627 throw TextException("bracketed IPv6 address is missing a colon in uri-host", Here());
629 // This rejects bracketed non-IP addresses that our caller would have
630 // otherwise mistaken for a domain name (e.g., '[127.0.0:1]').
631 Ip::Address ipv6check
;
632 if (!ipv6check
.fromHost(ipv6ish
.c_str()))
633 throw TextException("malformed bracketed IPv6 address in uri-host", Here());
638 // no brackets implies we are looking at IPv4address or reg-name
640 // XXX: This code does not detect/reject some bad host values (e.g. "!#$%&"
641 // and "1.2.3.4.5"). TODO: Add more checks here, after migrating the
642 // non-CONNECT uri-host parsing code to use us.
644 SBuf otherHost
; // IPv4address-ish or reg-name-ish;
645 // ":" is not in TCHAR so we will stop before any port specification
646 if (tok
.prefix(otherHost
, CharacterSet::TCHAR
))
649 throw TextException("malformed IPv4 address or host name in uri-host", Here());
652 /// Extracts and returns an RFC 3986 URI authority port value (with additional
653 /// restrictions). The RFC defines port as a possibly empty sequence of decimal
654 /// digits. We reject certain ports (that are syntactically valid from the RFC
655 /// point of view) because we are worried that Squid and other traffic handlers
656 /// may dangerously mishandle unusual (and virtually always bogus) port numbers.
657 /// Rejected ports cannot be successfully used by Squid itself.
659 AnyP::Uri::parsePort(Parser::Tokenizer
&tok
) const
662 throw TextException("zero or zero-prefixed port", Here());
665 if (!tok
.int64(rawPort
, 10, false)) // port = *DIGIT
666 throw TextException("malformed or missing port", Here());
669 constexpr KnownPort portMax
= 65535; // TODO: Make this a class-scope constant and REuse it.
670 constexpr auto portStorageMax
= std::numeric_limits
<Port::value_type
>::max();
671 static_assert(!Less(portStorageMax
, portMax
), "Port type can represent the maximum valid port number");
672 if (Less(portMax
, rawPort
))
673 throw TextException("huge port", Here());
675 // TODO: Return KnownPort after migrating the non-CONNECT uri-host parsing
676 // code to use us (so that foundPort "int" disappears or starts using Port).
677 return NaturalCast
<int>(rawPort
);
684 authorityHttp_
.clear();
685 authorityWithPort_
.clear();
689 AnyP::Uri::authority(bool requirePort
) const
691 if (authorityHttp_
.isEmpty()) {
693 // both formats contain Host/IP
694 authorityWithPort_
.append(host());
695 authorityHttp_
= authorityWithPort_
;
697 if (port().has_value()) {
698 authorityWithPort_
.appendf(":%hu", *port());
699 // authorityHttp_ only has :port for known non-default ports
700 if (port() != getScheme().defaultPort())
701 authorityHttp_
= authorityWithPort_
;
703 // else XXX: We made authorityWithPort_ that does not have a port.
704 // TODO: Audit callers and refuse to give out broken authorityWithPort_.
707 return requirePort
? authorityWithPort_
: authorityHttp_
;
711 AnyP::Uri::absolute() const
713 if (absolute_
.isEmpty()) {
714 // TODO: most URL will be much shorter, avoid allocating this much
715 absolute_
.reserveCapacity(MAX_URL
);
717 absolute_
.append(getScheme().image());
718 absolute_
.append(":",1);
719 if (getScheme() != AnyP::PROTO_URN
) {
720 absolute_
.append("//", 2);
721 const bool allowUserInfo
= getScheme() == AnyP::PROTO_FTP
||
722 getScheme() == AnyP::PROTO_UNKNOWN
;
724 if (allowUserInfo
&& !userInfo().isEmpty()) {
725 static const CharacterSet uiChars
= CharacterSet(UserInfoChars())
727 .rename("userinfo-reserved");
728 absolute_
.append(Encode(userInfo(), uiChars
));
729 absolute_
.append("@", 1);
731 absolute_
.append(authority());
733 absolute_
.append(host());
734 absolute_
.append(":", 1);
736 absolute_
.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
742 /* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
743 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
744 * and never copy the query-string part in the first place
747 urlCanonicalCleanWithoutRequest(const SBuf
&url
, const HttpRequestMethod
&method
, const AnyP::UriScheme
&scheme
)
749 LOCAL_ARRAY(char, buf
, MAX_URL
);
751 snprintf(buf
, sizeof(buf
), SQUIDSBUFPH
, SQUIDSBUFPRINT(url
));
752 buf
[sizeof(buf
)-1] = '\0';
754 // URN, CONNECT method, and non-stripped URIs can go straight out
755 if (Config
.onoff
.strip_query_terms
&& !(method
== Http::METHOD_CONNECT
|| scheme
== AnyP::PROTO_URN
)) {
756 // strip anything AFTER a question-mark
757 // leaving the '?' in place
758 if (auto t
= strchr(buf
, '?')) {
763 if (stringHasCntl(buf
))
764 xstrncpy(buf
, rfc1738_escape_unescaped(buf
), MAX_URL
);
770 * Yet another alternative to urlCanonical.
771 * This one adds the https:// parts to Http::METHOD_CONNECT URL
772 * for use in error page outputs.
773 * Luckily we can leverage the others instead of duplicating.
776 urlCanonicalFakeHttps(const HttpRequest
* request
)
778 LOCAL_ARRAY(char, buf
, MAX_URL
);
780 // method CONNECT and port HTTPS
781 if (request
->method
== Http::METHOD_CONNECT
&& request
->url
.port() == 443) {
782 snprintf(buf
, MAX_URL
, "https://%s/*", request
->url
.host());
786 // else do the normal complete canonical thing.
787 return request
->canonicalCleanUrl();
/**
 * Test if a URL is a relative reference.
 *
 * Governed by RFC 3986 section 4.2
 *
 *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-part = "//" authority path-abempty
 *                / path-absolute
 *                / path-noscheme
 *                / path-empty
 *
 * \param url NUL-terminated URL text; may be nil
 * \returns false for a nil url and for a url whose first segment (the text
 *          before the first '/', '?' or '#') contains a colon; true otherwise
 */
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    /*
     * RFC 3986 section 5.2.3
     *
     * path          = path-abempty    ; begins with "/" or is empty
     *               / path-absolute   ; begins with "/" but not "//"
     *               / path-noscheme   ; begins with a non-colon segment
     *               / path-rootless   ; begins with a segment
     *               / path-empty      ; zero characters
     */

    if (*url == '\0')
        return true; // path-empty

    if (*url == '/') {
        // network-path reference (a.k.a. 'scheme-relative URI') or
        // path-absolute (a.k.a. 'absolute-path reference')
        return true;
    }

    // scan the first segment only; a colon there would delimit a scheme
    for (const auto *p = url; *p != '\0' && *p != '/' && *p != '?' && *p != '#'; ++p) {
        if (*p == ':')
            return false; // colon is forbidden in first segment
    }

    return true; // path-noscheme, path-abempty, path-rootless
}
836 AnyP::Uri::addRelativePath(const char *relUrl
)
838 // URN cannot be merged
839 if (getScheme() == AnyP::PROTO_URN
)
842 // TODO: Handle . and .. segment normalization
844 const auto lastSlashPos
= path_
.rfind('/');
845 // TODO: To optimize and simplify, add and use SBuf::replace().
846 const auto relUrlLength
= strlen(relUrl
);
847 if (lastSlashPos
== SBuf::npos
) {
848 // start replacing the whole path
849 path_
.reserveCapacity(1 + relUrlLength
);
850 path_
.assign("/", 1);
852 // start replacing just the last segment
853 path_
.reserveCapacity(lastSlashPos
+ 1 + relUrlLength
);
854 path_
.chop(0, lastSlashPos
+1);
856 path_
.append(relUrl
, relUrlLength
);
860 matchDomainName(const char *h
, const char *d
, MatchDomainNameFlags flags
)
865 const bool hostIncludesSubdomains
= (*h
== '.');
879 * Start at the ends of the two strings and work towards the
882 while (xtolower(h
[--hl
]) == xtolower(d
[--dl
])) {
883 if (hl
== 0 && dl
== 0) {
885 * We made it all the way to the beginning of both
886 * strings without finding any difference.
893 * The host string is shorter than the domain string.
894 * There is only one case when this can be a match.
895 * If the domain is just one character longer, and if
896 * that character is a leading '.' then we call it a
900 if (1 == dl
&& '.' == d
[0])
908 * The domain string is shorter than the host string.
909 * This is a match only if the first domain character
914 if (flags
& mdnRejectSubsubDomains
) {
915 // Check for sub-sub domain and reject
916 while(--hl
>= 0 && h
[hl
] != '.');
918 // No sub-sub domain found, but reject if there is a
919 // leading dot in given host string (which is removed
920 // before the check is started).
921 return hostIncludesSubdomains
? 1 : 0;
923 return 1; // sub-sub domain, reject
932 * We found different characters in the same position (from the end).
935 // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
936 // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
937 // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
938 if ((flags
& mdnHonorWildcards
) && h
[hl
] == '*' && h
[hl
+ 1] == '.')
942 * If one of those character is '.' then its special. In order
943 * for splay tree sorting to work properly, "x-foo.com" must
944 * be greater than ".foo.com" even though '-' is less than '.'.
952 return (xtolower(h
[hl
]) - xtolower(d
[dl
]));
956 * return true if we can serve requests for this method.
959 urlCheckRequest(const HttpRequest
* r
)
961 /* protocol "independent" methods
963 * actually these methods are specific to HTTP:
964 * they are methods we receive on our HTTP port,
965 * and if we had an FTP listener would not be relevant
968 * So, we should delegate them to HTTP. The problem is that we
969 * do not have a default protocol from the client side of HTTP.
972 if (r
->method
== Http::METHOD_CONNECT
)
975 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
976 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
977 if (r
->method
== Http::METHOD_OPTIONS
|| r
->method
== Http::METHOD_TRACE
)
978 return (r
->header
.getInt64(Http::HdrType::MAX_FORWARDS
) == 0 || r
->url
.path() != AnyP::Uri::Asterisk());
980 if (r
->method
== Http::METHOD_PURGE
)
983 /* does method match the protocol? */
984 switch (r
->url
.getScheme()) {
986 case AnyP::PROTO_URN
:
987 case AnyP::PROTO_HTTP
:
990 case AnyP::PROTO_FTP
:
991 if (r
->method
== Http::METHOD_PUT
||
992 r
->method
== Http::METHOD_GET
||
993 r
->method
== Http::METHOD_HEAD
)
997 case AnyP::PROTO_WAIS
:
998 case AnyP::PROTO_WHOIS
:
999 if (r
->method
== Http::METHOD_GET
||
1000 r
->method
== Http::METHOD_HEAD
)
1004 case AnyP::PROTO_HTTPS
:
1005 #if USE_OPENSSL || HAVE_LIBGNUTLS
1009 * Squid can't originate an SSL connection, so it should
1010 * never receive an "https:" URL. It should always be
1024 AnyP::Uri::Uri(AnyP::UriScheme
const &aScheme
) :
1026 hostIsNumeric_(false)
1031 // TODO: fix code duplication with AnyP::Uri::parse()
1033 AnyP::Uri::cleanup(const char *uri
)
1035 char *cleanedUri
= nullptr;
1036 switch (Config
.uri_whitespace
) {
1037 case URI_WHITESPACE_ALLOW
: {
1038 const auto flags
= RFC1738_ESCAPE_NOSPACE
| RFC1738_ESCAPE_UNESCAPED
;
1039 cleanedUri
= xstrndup(rfc1738_do_escape(uri
, flags
), MAX_URL
);
1043 case URI_WHITESPACE_ENCODE
:
1044 cleanedUri
= xstrndup(rfc1738_do_escape(uri
, RFC1738_ESCAPE_UNESCAPED
), MAX_URL
);
1047 case URI_WHITESPACE_CHOP
: {
1048 const auto pos
= strcspn(uri
, w_space
);
1049 char *choppedUri
= nullptr;
1050 if (pos
< strlen(uri
))
1051 choppedUri
= xstrndup(uri
, pos
+ 1);
1052 cleanedUri
= xstrndup(rfc1738_do_escape(choppedUri
? choppedUri
: uri
,
1053 RFC1738_ESCAPE_UNESCAPED
), MAX_URL
);
1054 cleanedUri
[pos
] = '\0';
1059 case URI_WHITESPACE_DENY
:
1060 case URI_WHITESPACE_STRIP
:
1062 // TODO: avoid duplication with urlParse()
1064 char *tmp_uri
= static_cast<char*>(xmalloc(strlen(uri
) + 1));
1068 if (!xisspace(*t
)) {
1075 cleanedUri
= xstrndup(rfc1738_escape_unescaped(tmp_uri
), MAX_URL
);