]> git.ipfire.org Git - thirdparty/squid.git/blame - src/anyp/Uri.cc
Simplify appending SBuf to String (#2108)
[thirdparty/squid.git] / src / anyp / Uri.cc
CommitLineData
30a4f2a8 1/*
1f7b830e 2 * Copyright (C) 1996-2025 The Squid Software Foundation and contributors
e25c139f 3 *
bbc27441
AJ
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
019dd986 7 */
ed43818f 8
bbc27441
AJ
9/* DEBUG: section 23 URL Parsing */
10
f7f3304a 11#include "squid.h"
22b2a7a0 12#include "anyp/Host.h"
c8ab5ec6 13#include "anyp/Uri.h"
675b8408 14#include "base/Raw.h"
582c2af2 15#include "globals.h"
528b2c61 16#include "HttpRequest.h"
6c880a16 17#include "parser/Tokenizer.h"
1fa9b1a7 18#include "rfc1738.h"
4d5904f7 19#include "SquidConfig.h"
963ff143 20#include "SquidMath.h"
090089c4 21
/// Characters accepted in a hostname when underscores are also permitted:
/// ASCII letters, digits, '-', '.', '_', plus '[', ':' and ']'.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "0123456789-._[:]";

/// Same as valid_hostname_chars_u, but without the underscore.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "0123456789-.[:]";
090089c4 34
614bd511
AJ
35/// Characters which are valid within a URI userinfo section
36static const CharacterSet &
37UserInfoChars()
38{
39 /*
40 * RFC 3986 section 3.2.1
41 *
42 * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
43 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
44 * pct-encoded = "%" HEXDIG HEXDIG
45 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
46 */
47 static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
48 CharacterSet::ALPHA +
49 CharacterSet::DIGIT;
50 return userInfoValid;
51}
52
53/**
54 * Governed by RFC 3986 section 2.1
55 */
56SBuf
57AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
58{
59 if (buf.isEmpty())
60 return buf;
61
62 Parser::Tokenizer tk(buf);
63 SBuf goodSection;
64 // optimization for the arguably common "no encoding necessary" case
65 if (tk.prefix(goodSection, ignore) && tk.atEnd())
66 return buf;
67
68 SBuf output;
69 output.reserveSpace(buf.length() * 3); // worst case: encode all chars
70 output.append(goodSection); // may be empty
71
72 while (!tk.atEnd()) {
73 // TODO: Add Tokenizer::parseOne(void).
74 const auto ch = tk.remaining()[0];
65d21317 75 output.appendf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(ch))); // TODO: Optimize using a table
614bd511
AJ
76 (void)tk.skip(ch);
77
78 if (tk.prefix(goodSection, ignore))
79 output.append(goodSection);
80 }
81
82 return output;
83}
84
26256f28
FC
85SBuf
86AnyP::Uri::Decode(const SBuf &buf)
87{
88 SBuf output;
89 Parser::Tokenizer tok(buf);
90 while (!tok.atEnd()) {
91 SBuf token;
92 static const auto unencodedChars = CharacterSet("percent", "%").complement("unencoded");
93 if (tok.prefix(token, unencodedChars))
94 output.append(token);
95
96 // we are either at '%' or at end of input
97 if (tok.skip('%')) {
98 int64_t hex1 = 0, hex2 = 0;
99 if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1))
100 output.append(static_cast<char>((hex1 << 4) | hex2));
101 else
102 throw TextException("invalid pct-encoded triplet", Here());
103 }
104 }
105 return output;
106}
107
2e260208 108const SBuf &
c8ab5ec6 109AnyP::Uri::Asterisk()
2e260208
AJ
110{
111 static SBuf star("*");
112 return star;
113}
114
51b5dcf5 115const SBuf &
c8ab5ec6 116AnyP::Uri::SlashPath()
51b5dcf5
AJ
117{
118 static SBuf slash("/");
119 return slash;
120}
121
5c51bffb 122void
c8ab5ec6 123AnyP::Uri::host(const char *src)
5c51bffb 124{
1560ae82 125 hostAddr_.fromHost(src);
5c51bffb
AJ
126 if (hostAddr_.isAnyAddr()) {
127 xstrncpy(host_, src, sizeof(host_));
128 hostIsNumeric_ = false;
129 } else {
130 hostAddr_.toHostStr(host_, sizeof(host_));
131 debugs(23, 3, "given IP: " << hostAddr_);
132 hostIsNumeric_ = 1;
133 }
134 touch();
135}
136
22b2a7a0 137// TODO: Replace with ToSBuf(parsedHost()) or similar.
9ce4a1eb
CT
138SBuf
139AnyP::Uri::hostOrIp() const
140{
1560ae82
O
141 if (hostIsNumeric()) {
142 static char ip[MAX_IPSTRLEN];
143 const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip));
144 return SBuf(ip, hostStrLen);
145 } else
9ce4a1eb
CT
146 return SBuf(host());
147}
148
22b2a7a0
TW
149std::optional<AnyP::Host>
150AnyP::Uri::parsedHost() const
151{
152 if (hostIsNumeric())
153 return Host::ParseIp(hostIP());
154
155 // XXX: Interpret host subcomponent as reg-name representing a DNS name. It
156 // may actually be, for example, a URN namespace ID (NID; see RFC 8141), but
157 // current Squid APIs do not support adequate representation of those cases.
158 const SBuf regName(host());
159
160 if (regName.find('%') != SBuf::npos) {
161 debugs(23, 3, "rejecting percent-encoded reg-name: " << regName);
162 return std::nullopt; // TODO: Decode() instead
163 }
164
165 return Host::ParseSimpleDomainName(regName);
166}
167
51b5dcf5 168const SBuf &
c8ab5ec6 169AnyP::Uri::path() const
51b5dcf5
AJ
170{
171 // RFC 3986 section 3.3 says path can be empty (path-abempty).
172 // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
173 // at least when sending and using. We must still accept path-abempty as input.
174 if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
175 return SlashPath();
176
177 return path_;
178}
179
b8d8561b 180void
0673c0ba 181urlInitialize(void)
090089c4 182{
bf8fe701 183 debugs(23, 5, "urlInitialize: Initializing...");
985c86bc 184 /* this ensures that the number of protocol strings is the same as
0c3d3f65 185 * the enum slots allocated because the last enum is always 'MAX'.
985c86bc 186 */
0c3d3f65 187 assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
9bc73deb 188 /*
189 * These test that our matchDomainName() function works the
190 * way we expect it to.
191 */
192 assert(0 == matchDomainName("foo.com", "foo.com"));
d20b1cd0 193 assert(0 == matchDomainName(".foo.com", "foo.com"));
9bc73deb 194 assert(0 == matchDomainName("foo.com", ".foo.com"));
195 assert(0 == matchDomainName(".foo.com", ".foo.com"));
196 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
abbd7825 197 assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
9bc73deb 198 assert(0 != matchDomainName("x.foo.com", "foo.com"));
199 assert(0 != matchDomainName("foo.com", "x.foo.com"));
200 assert(0 != matchDomainName("bar.com", "foo.com"));
201 assert(0 != matchDomainName(".bar.com", "foo.com"));
202 assert(0 != matchDomainName(".bar.com", ".foo.com"));
203 assert(0 != matchDomainName("bar.com", ".foo.com"));
204 assert(0 < matchDomainName("zzz.com", "foo.com"));
205 assert(0 > matchDomainName("aaa.com", "foo.com"));
206 assert(0 == matchDomainName("FOO.com", "foo.COM"));
aca95add 207 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
208 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
d20b1cd0 209 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
abbd7825
CT
210
211 assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
212 assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
213 assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
214 assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
215
216 assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
217 assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
218 assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
219 assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
220
b70f8649
AW
221 assert(0 != matchDomainName("foo.com", ""));
222 assert(0 != matchDomainName("foo.com", "", mdnHonorWildcards));
223 assert(0 != matchDomainName("foo.com", "", mdnRejectSubsubDomains));
224
9bc73deb 225 /* more cases? */
090089c4 226}
227
cc192b50 228/**
6c880a16
AJ
229 * Extract the URI scheme and ':' delimiter from the given input buffer.
230 *
231 * Schemes up to 16 characters are accepted.
232 *
233 * Governed by RFC 3986 section 3.1
d4a04ed5 234 */
6c880a16
AJ
235static AnyP::UriScheme
236uriParseScheme(Parser::Tokenizer &tok)
92a6f4b1 237{
6c880a16
AJ
238 /*
239 * RFC 3986 section 3.1 paragraph 2:
240 *
241 * Scheme names consist of a sequence of characters beginning with a
242 * letter and followed by any combination of letters, digits, plus
243 * ("+"), period ("."), or hyphen ("-").
244 */
7902bd5b 245 static const auto schemeChars = CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
6c880a16
AJ
246
247 SBuf str;
248 if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
249 const auto protocol = AnyP::UriScheme::FindProtocolType(str);
250 if (protocol == AnyP::PROTO_UNKNOWN)
251 return AnyP::UriScheme(protocol, str.c_str());
252 return AnyP::UriScheme(protocol, nullptr);
253 }
d31d59d8 254
6c880a16 255 throw TextException("invalid URI scheme", Here());
92a6f4b1 256}
257
38aa10ef
AJ
258/**
259 * Appends configured append_domain to hostname, assuming
260 * the given buffer is at least SQUIDHOSTNAMELEN bytes long,
261 * and that the host FQDN is not a 'dotless' TLD.
262 *
263 * \returns false if and only if there is not enough space to append
264 */
265bool
266urlAppendDomain(char *host)
267{
268 /* For IPv4 addresses check for a dot */
269 /* For IPv6 addresses also check for a colon */
270 if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
271 const uint64_t dlen = strlen(host);
272 const uint64_t want = dlen + Config.appendDomainLen;
273 if (want > SQUIDHOSTNAMELEN - 1) {
274 debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
275 return false;
276 }
277 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
278 }
279 return true;
280}
281
d4a04ed5 282/*
283 * Parse a URI/URL.
284 *
6c880a16 285 * It is assumed that the URL is complete -
cc192b50 286 * ie, the end of the string is the end of the URL. Don't pass a partial
287 * URL here as this routine doesn't have any way of knowing whether
6c880a16 288 * it is partial or not (ie, it handles the case of no trailing slash as
cc192b50 289 * being "end of host with implied path of /".
6c880a16
AJ
290 *
291 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
292 * then rather than a URL a hostname:port is looked for.
cc192b50 293 */
9157915c 294bool
6c880a16 295AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
7111c86a 296{
6c880a16
AJ
297 try {
298
77b1029d 299 LOCAL_ARRAY(char, login, MAX_URL);
300 LOCAL_ARRAY(char, foundHost, MAX_URL);
301 LOCAL_ARRAY(char, urlpath, MAX_URL);
aee3523a
AR
302 char *t = nullptr;
303 char *q = nullptr;
77b1029d 304 int foundPort;
305 int l;
306 int i;
307 const char *src;
308 char *dst;
309 foundHost[0] = urlpath[0] = login[0] = '\0';
310
311 if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
312 debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
313 return false;
314 }
6c880a16 315
77b1029d 316 if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
317 Asterisk().cmp(rawUrl) == 0) {
318 // XXX: these methods might also occur in HTTPS traffic. Handle this better.
319 setScheme(AnyP::PROTO_HTTP, nullptr);
320 port(getScheme().defaultPort());
321 path(Asterisk());
322 return true;
323 }
6c880a16 324
77b1029d 325 Parser::Tokenizer tok(rawUrl);
326 AnyP::UriScheme scheme;
cc192b50 327
77b1029d 328 if (method == Http::METHOD_CONNECT) {
963ff143
AR
329 // For CONNECTs, RFC 9110 Section 9.3.6 requires "only the host and
330 // port number of the tunnel destination, separated by a colon".
6c880a16 331
963ff143
AR
332 const auto rawHost = parseHost(tok);
333 Assure(rawHost.length() < sizeof(foundHost));
334 SBufToCstring(foundHost, rawHost);
6c880a16 335
963ff143
AR
336 if (!tok.skip(':'))
337 throw TextException("missing required :port in CONNECT target", Here());
338 foundPort = parsePort(tok);
6c880a16 339
963ff143
AR
340 if (!tok.remaining().isEmpty())
341 throw TextException("garbage after host:port in CONNECT target", Here());
77b1029d 342 } else {
cc192b50 343
77b1029d 344 scheme = uriParseScheme(tok);
6c880a16 345
77b1029d 346 if (scheme == AnyP::PROTO_NONE)
347 return false; // invalid scheme
6c880a16 348
77b1029d 349 if (scheme == AnyP::PROTO_URN) {
350 parseUrn(tok); // throws on any error
351 return true;
352 }
62e76326 353
77b1029d 354 // URLs then have "//"
355 static const SBuf doubleSlash("//");
356 if (!tok.skip(doubleSlash))
357 return false;
cc192b50 358
77b1029d 359 auto B = tok.remaining();
360 const char *url = B.c_str();
cc192b50 361
77b1029d 362 /* Parse the URL: */
363 src = url;
364 i = 0;
62e76326 365
2f8abb64 366 /* Then everything until first /; that's host (and port; which we'll look for here later) */
77b1029d 367 // bug 1881: If we don't get a "/" then we imply it was there
368 // bug 3074: We could just be given a "?" or "#". These also imply "/"
369 // bug 3233: whitespace is also a hostname delimiter.
370 for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
371 *dst = *src;
372 }
373
374 /*
375 * We can't check for "i >= l" here because we could be at the end of the line
376 * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
377 * been -given- a valid URL and the path is just '/'.
378 */
379 if (i > l)
380 return false;
381 *dst = '\0';
62e76326 382
dfd81859
AJ
383 // We are looking at path-abempty.
384 if (*src != '/') {
385 // path-empty, including the end of the `src` c-string cases
77b1029d 386 urlpath[0] = '/';
387 dst = &urlpath[1];
388 } else {
389 dst = urlpath;
390 }
2f8abb64 391 /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
77b1029d 392 for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
cc192b50 393 *dst = *src;
394 }
395
77b1029d 396 /* We -could- be at the end of the buffer here */
397 if (i > l)
398 return false;
5db6bf73 399 *dst = '\0';
cc192b50 400
380b09ae
AR
401 // If the parsed scheme has no (known) default port, and there is no
402 // explicit port, then we will reject the zero port during foundPort
403 // validation, often resulting in a misleading 400/ERR_INVALID_URL.
404 // TODO: Remove this hack when switching to Tokenizer-based parsing.
405 foundPort = scheme.defaultPort().value_or(0); // may be reset later
77b1029d 406
407 /* Is there any login information? (we should eventually parse it above) */
408 t = strrchr(foundHost, '@');
aee3523a 409 if (t != nullptr) {
77b1029d 410 strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
411 login[sizeof(login)-1] = '\0';
412 t = strrchr(login, '@');
413 *t = 0;
414 strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
415 foundHost[sizeof(foundHost)-1] = '\0';
416 // Bug 4498: URL-unescape the login info after extraction
417 rfc1738_unescape(login);
418 }
419
420 /* Is there any host information? (we should eventually parse it above) */
421 if (*foundHost == '[') {
422 /* strip any IPA brackets. valid under IPv6. */
423 dst = foundHost;
424 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
425 src = foundHost;
426 ++src;
427 l = strlen(foundHost);
428 i = 1;
429 for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
430 *dst = *src;
431 }
432
433 /* we moved in-place, so truncate the actual hostname found */
434 *dst = '\0';
5db6bf73 435 ++dst;
cc192b50 436
77b1029d 437 /* skip ahead to either start of port, or original EOS */
438 while (*dst != '\0' && *dst != ':')
439 ++dst;
440 t = dst;
441 } else {
442 t = strrchr(foundHost, ':');
443
444 if (t != strchr(foundHost,':') ) {
445 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
446 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
447 /* therefore we MUST accept the case where they are not bracketed at all. */
aee3523a 448 t = nullptr;
77b1029d 449 }
cc192b50 450 }
62e76326 451
77b1029d 452 // Bug 3183 sanity check: If scheme is present, host must be too.
453 if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
454 debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
455 return false;
456 }
b5acc277 457
77b1029d 458 if (t && *t == ':') {
459 *t = '\0';
460 ++t;
461 foundPort = atoi(t);
462 }
62e76326 463 }
62e76326 464
77b1029d 465 for (t = foundHost; *t; ++t)
466 *t = xtolower(*t);
467
468 if (stringHasWhitespace(foundHost)) {
469 if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
470 t = q = foundHost;
471 while (*t) {
472 if (!xisspace(*t)) {
473 *q = *t;
474 ++q;
475 }
476 ++t;
5db6bf73 477 }
77b1029d 478 *q = '\0';
62e76326 479 }
62e76326 480 }
62e76326 481
77b1029d 482 debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
cc192b50 483
77b1029d 484 if (Config.onoff.check_hostnames &&
485 strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
486 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
487 return false;
488 }
62e76326 489
77b1029d 490 if (!urlAppendDomain(foundHost))
491 return false;
cc192b50 492
77b1029d 493 /* remove trailing dots from hostnames */
494 while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
495 foundHost[l] = '\0';
62e76326 496
77b1029d 497 /* reject duplicate or leading dots */
498 if (strstr(foundHost, "..") || *foundHost == '.') {
499 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
500 return false;
501 }
62e76326 502
77b1029d 503 if (foundPort < 1 || foundPort > 65535) {
504 debugs(23, 3, "Invalid port '" << foundPort << "'");
505 return false;
506 }
62e76326 507
77b1029d 508 if (stringHasWhitespace(urlpath)) {
509 debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");
62e76326 510
77b1029d 511 switch (Config.uri_whitespace) {
62e76326 512
77b1029d 513 case URI_WHITESPACE_DENY:
514 return false;
62e76326 515
77b1029d 516 case URI_WHITESPACE_ALLOW:
517 break;
518
519 case URI_WHITESPACE_ENCODE:
520 t = rfc1738_escape_unescaped(urlpath);
521 xstrncpy(urlpath, t, MAX_URL);
522 break;
523
524 case URI_WHITESPACE_CHOP:
525 *(urlpath + strcspn(urlpath, w_space)) = '\0';
526 break;
527
528 case URI_WHITESPACE_STRIP:
529 default:
530 t = q = urlpath;
531 while (*t) {
532 if (!xisspace(*t)) {
533 *q = *t;
534 ++q;
535 }
536 ++t;
5db6bf73 537 }
77b1029d 538 *q = '\0';
62e76326 539 }
62e76326 540 }
62e76326 541
77b1029d 542 setScheme(scheme);
543 path(urlpath);
544 host(foundHost);
545 userInfo(SBuf(login));
546 port(foundPort);
547 return true;
6c880a16
AJ
548
549 } catch (...) {
550 debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
551 return false;
552 }
7111c86a 553}
554
6c880a16
AJ
555/**
556 * Governed by RFC 8141 section 2:
557 *
558 * assigned-name = "urn" ":" NID ":" NSS
559 * NID = (alphanum) 0*30(ldh) (alphanum)
560 * ldh = alphanum / "-"
561 * NSS = pchar *(pchar / "/")
562 *
563 * RFC 3986 Appendix D.2 defines (as deprecated):
564 *
565 * alphanum = ALPHA / DIGIT
566 *
567 * Notice that NID is exactly 2-32 characters in length.
568 */
db59367a 569void
6c880a16 570AnyP::Uri::parseUrn(Parser::Tokenizer &tok)
23d92c64 571{
6c880a16
AJ
572 static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
573 static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
574 SBuf nid;
575 if (!tok.prefix(nid, nidChars, 32))
576 throw TextException("NID not found", Here());
577
578 if (!tok.skip(':'))
579 throw TextException("NID too long or missing ':' delimiter", Here());
580
581 if (nid.length() < 2)
582 throw TextException("NID too short", Here());
583
584 if (!alphanum[*nid.begin()])
585 throw TextException("NID prefix is not alphanumeric", Here());
586
587 if (!alphanum[*nid.rbegin()])
588 throw TextException("NID suffix is not alphanumeric", Here());
589
590 setScheme(AnyP::PROTO_URN, nullptr);
591 host(nid.c_str());
592 // TODO validate path characters
593 path(tok.remaining());
594 debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
23d92c64 595}
596
963ff143
AR
597/// Extracts and returns a (suspected but only partially validated) uri-host
598/// IPv6address, IPv4address, or reg-name component. This function uses (and
599/// quotes) RFC 3986, Section 3.2.2 syntax rules.
600SBuf
601AnyP::Uri::parseHost(Parser::Tokenizer &tok) const
602{
603 // host = IP-literal / IPv4address / reg-name
604
605 // XXX: CharacterSets below reject uri-host values containing whitespace
606 // (e.g., "10.0.0. 1"). That is not a bug, but the uri_whitespace directive
607 // can be interpreted as if it applies to uri-host and this code. TODO: Fix
608 // uri_whitespace and the code using it to exclude uri-host (and URI scheme,
609 // port, etc.) from that directive scope.
610
611 // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
612 if (tok.skip('[')) {
613 // Add "." because IPv6address in RFC 3986 includes ls32, which includes
614 // IPv4address: ls32 = ( h16 ":" h16 ) / IPv4address
615 // This set rejects IPvFuture that needs a "v" character.
616 static const CharacterSet IPv6chars = (
617 CharacterSet::HEXDIG + CharacterSet("colon", ":") + CharacterSet("period", ".")).rename("IPv6");
618 SBuf ipv6ish;
619 if (!tok.prefix(ipv6ish, IPv6chars))
620 throw TextException("malformed or unsupported bracketed IP address in uri-host", Here());
621
622 if (!tok.skip(']'))
623 throw TextException("IPv6 address is missing a closing bracket in uri-host", Here());
624
625 // This rejects bracketed IPv4address and domain names because they lack ":".
626 if (ipv6ish.find(':') == SBuf::npos)
627 throw TextException("bracketed IPv6 address is missing a colon in uri-host", Here());
628
629 // This rejects bracketed non-IP addresses that our caller would have
630 // otherwise mistaken for a domain name (e.g., '[127.0.0:1]').
631 Ip::Address ipv6check;
632 if (!ipv6check.fromHost(ipv6ish.c_str()))
633 throw TextException("malformed bracketed IPv6 address in uri-host", Here());
634
635 return ipv6ish;
636 }
637
638 // no brackets implies we are looking at IPv4address or reg-name
639
640 // XXX: This code does not detect/reject some bad host values (e.g. "!#$%&"
641 // and "1.2.3.4.5"). TODO: Add more checks here, after migrating the
642 // non-CONNECT uri-host parsing code to use us.
643
644 SBuf otherHost; // IPv4address-ish or reg-name-ish;
645 // ":" is not in TCHAR so we will stop before any port specification
646 if (tok.prefix(otherHost, CharacterSet::TCHAR))
647 return otherHost;
648
649 throw TextException("malformed IPv4 address or host name in uri-host", Here());
650}
651
652/// Extracts and returns an RFC 3986 URI authority port value (with additional
653/// restrictions). The RFC defines port as a possibly empty sequence of decimal
654/// digits. We reject certain ports (that are syntactically valid from the RFC
655/// point of view) because we are worried that Squid and other traffic handlers
656/// may dangerously mishandle unusual (and virtually always bogus) port numbers.
657/// Rejected ports cannot be successfully used by Squid itself.
658int
659AnyP::Uri::parsePort(Parser::Tokenizer &tok) const
660{
661 if (tok.skip('0'))
662 throw TextException("zero or zero-prefixed port", Here());
663
664 int64_t rawPort = 0;
665 if (!tok.int64(rawPort, 10, false)) // port = *DIGIT
666 throw TextException("malformed or missing port", Here());
667
668 Assure(rawPort > 0);
669 constexpr KnownPort portMax = 65535; // TODO: Make this a class-scope constant and REuse it.
670 constexpr auto portStorageMax = std::numeric_limits<Port::value_type>::max();
671 static_assert(!Less(portStorageMax, portMax), "Port type can represent the maximum valid port number");
672 if (Less(portMax, rawPort))
673 throw TextException("huge port", Here());
674
675 // TODO: Return KnownPort after migrating the non-CONNECT uri-host parsing
676 // code to use us (so that foundPort "int" disappears or starts using Port).
677 return NaturalCast<int>(rawPort);
678}
679
5c51bffb 680void
c8ab5ec6 681AnyP::Uri::touch()
5c51bffb 682{
c823e2da 683 absolute_.clear();
5c51bffb
AJ
684 authorityHttp_.clear();
685 authorityWithPort_.clear();
686}
687
688SBuf &
c8ab5ec6 689AnyP::Uri::authority(bool requirePort) const
5c51bffb
AJ
690{
691 if (authorityHttp_.isEmpty()) {
692
693 // both formats contain Host/IP
694 authorityWithPort_.append(host());
695 authorityHttp_ = authorityWithPort_;
696
380b09ae
AR
697 if (port().has_value()) {
698 authorityWithPort_.appendf(":%hu", *port());
699 // authorityHttp_ only has :port for known non-default ports
700 if (port() != getScheme().defaultPort())
701 authorityHttp_ = authorityWithPort_;
702 }
703 // else XXX: We made authorityWithPort_ that does not have a port.
704 // TODO: Audit callers and refuse to give out broken authorityWithPort_.
5c51bffb
AJ
705 }
706
707 return requirePort ? authorityWithPort_ : authorityHttp_;
708}
709
c823e2da 710SBuf &
c8ab5ec6 711AnyP::Uri::absolute() const
c823e2da
AJ
712{
713 if (absolute_.isEmpty()) {
714 // TODO: most URL will be much shorter, avoid allocating this much
715 absolute_.reserveCapacity(MAX_URL);
716
d31d59d8
AJ
717 absolute_.append(getScheme().image());
718 absolute_.append(":",1);
c823e2da
AJ
719 if (getScheme() != AnyP::PROTO_URN) {
720 absolute_.append("//", 2);
0d0f5161
AJ
721 const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
722 getScheme() == AnyP::PROTO_UNKNOWN;
723
724 if (allowUserInfo && !userInfo().isEmpty()) {
614bd511
AJ
725 static const CharacterSet uiChars = CharacterSet(UserInfoChars())
726 .remove('%')
727 .rename("userinfo-reserved");
728 absolute_.append(Encode(userInfo(), uiChars));
c823e2da
AJ
729 absolute_.append("@", 1);
730 }
731 absolute_.append(authority());
6c880a16
AJ
732 } else {
733 absolute_.append(host());
734 absolute_.append(":", 1);
c823e2da 735 }
614bd511 736 absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
c823e2da
AJ
737 }
738
739 return absolute_;
740}
741
9837567d 742/* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
914b89a2 743 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
744 * and never copy the query-string part in the first place
745 */
88738790 746char *
bec110e4 747urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
88738790 748{
749 LOCAL_ARRAY(char, buf, MAX_URL);
62e76326 750
bec110e4 751 snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
c823e2da 752 buf[sizeof(buf)-1] = '\0';
62e76326 753
c823e2da 754 // URN, CONNECT method, and non-stripped URIs can go straight out
bec110e4 755 if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
c823e2da
AJ
756 // strip anything AFTER a question-mark
757 // leaving the '?' in place
758 if (auto t = strchr(buf, '?')) {
759 *(++t) = '\0';
e2849af8 760 }
d548ee64 761 }
62e76326 762
9bc73deb 763 if (stringHasCntl(buf))
62e76326 764 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
765
88738790 766 return buf;
767}
768
b3802bdc
AJ
769/**
770 * Yet another alternative to urlCanonical.
c2a7cefd 771 * This one adds the https:// parts to Http::METHOD_CONNECT URL
b3802bdc
AJ
772 * for use in error page outputs.
773 * Luckily we can leverage the others instead of duplicating.
774 */
775const char *
776urlCanonicalFakeHttps(const HttpRequest * request)
777{
778 LOCAL_ARRAY(char, buf, MAX_URL);
779
780 // method CONNECT and port HTTPS
5c51bffb
AJ
781 if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
782 snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
b3802bdc
AJ
783 return buf;
784 }
785
786 // else do the normal complete canonical thing.
bec110e4 787 return request->canonicalCleanUrl();
b3802bdc
AJ
788}
789
/**
 * Test if a URL is a relative reference.
 *
 * Governed by RFC 3986 section 4.2
 *
 *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-part = "//" authority path-abempty
 *                / path-absolute
 *                / path-noscheme
 *                / path-empty
 *
 * The path alternatives are those of RFC 3986 section 3.3:
 *
 *  path          = path-abempty    ; begins with "/" or is empty
 *                / path-absolute   ; begins with "/" but not "//"
 *                / path-noscheme   ; begins with a non-colon segment
 *                / path-rootless   ; begins with a segment
 *                / path-empty      ; zero characters
 *
 * \returns false for a nil url and for a url whose first segment contains
 *          a colon; true otherwise
 */
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    // path-empty, a network-path reference (a.k.a. 'scheme-relative URI'),
    // or path-absolute (a.k.a. 'absolute-path reference')
    if (*url == '\0' || *url == '/')
        return true;

    // The first segment ends at '/', '?', '#', or the end of the string.
    // A colon found before any of those is forbidden in a relative reference.
    const char firstDelimiter = url[strcspn(url, ":/?#")];
    return firstDelimiter != ':';
}
26ac0430 834
614bd511
AJ
835void
836AnyP::Uri::addRelativePath(const char *relUrl)
837{
838 // URN cannot be merged
839 if (getScheme() == AnyP::PROTO_URN)
840 return;
841
842 // TODO: Handle . and .. segment normalization
843
844 const auto lastSlashPos = path_.rfind('/');
845 // TODO: To optimize and simplify, add and use SBuf::replace().
846 const auto relUrlLength = strlen(relUrl);
847 if (lastSlashPos == SBuf::npos) {
848 // start replacing the whole path
849 path_.reserveCapacity(1 + relUrlLength);
850 path_.assign("/", 1);
6e44cca8 851 } else {
614bd511
AJ
852 // start replacing just the last segment
853 path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
854 path_.chop(0, lastSlashPos+1);
6e44cca8 855 }
614bd511 856 path_.append(relUrl, relUrlLength);
3cbbd242 857}
858
b8d8561b 859int
6c1219b9 860matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
30a4f2a8 861{
9bc73deb 862 int dl;
863 int hl;
62e76326 864
abbd7825 865 const bool hostIncludesSubdomains = (*h == '.');
d20b1cd0 866 while ('.' == *h)
5db6bf73 867 ++h;
62e76326 868
9bc73deb 869 hl = strlen(h);
62e76326 870
abbd7825
CT
871 if (hl == 0)
872 return -1;
873
9bc73deb 874 dl = strlen(d);
b70f8649
AW
875 if (dl == 0)
876 return 1;
62e76326 877
9bc73deb 878 /*
879 * Start at the ends of the two strings and work towards the
880 * beginning.
881 */
882 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
62e76326 883 if (hl == 0 && dl == 0) {
884 /*
885 * We made it all the way to the beginning of both
886 * strings without finding any difference.
887 */
888 return 0;
889 }
890
891 if (0 == hl) {
892 /*
893 * The host string is shorter than the domain string.
894 * There is only one case when this can be a match.
895 * If the domain is just one character longer, and if
896 * that character is a leading '.' then we call it a
897 * match.
898 */
899
900 if (1 == dl && '.' == d[0])
901 return 0;
902 else
903 return -1;
904 }
905
906 if (0 == dl) {
907 /*
908 * The domain string is shorter than the host string.
909 * This is a match only if the first domain character
910 * is a leading '.'.
911 */
912
abbd7825
CT
913 if ('.' == d[0]) {
914 if (flags & mdnRejectSubsubDomains) {
915 // Check for sub-sub domain and reject
916 while(--hl >= 0 && h[hl] != '.');
917 if (hl < 0) {
918 // No sub-sub domain found, but reject if there is a
919 // leading dot in given host string (which is removed
920 // before the check is started).
921 return hostIncludesSubdomains ? 1 : 0;
922 } else
923 return 1; // sub-sub domain, reject
924 } else
925 return 0;
926 } else
62e76326 927 return 1;
928 }
9bc73deb 929 }
62e76326 930
9bc73deb 931 /*
932 * We found different characters in the same position (from the end).
933 */
69f69080
CT
934
935 // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
936 // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
937 // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
abbd7825 938 if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
69f69080
CT
939 return 0;
940
d20b1cd0 941 /*
942 * If one of those character is '.' then its special. In order
943 * for splay tree sorting to work properly, "x-foo.com" must
944 * be greater than ".foo.com" even though '-' is less than '.'.
945 */
946 if ('.' == d[dl])
62e76326 947 return 1;
948
d20b1cd0 949 if ('.' == h[hl])
62e76326 950 return -1;
951
9bc73deb 952 return (xtolower(h[hl]) - xtolower(d[dl]));
30a4f2a8 953}
a8f7d3ee 954
985c86bc 955/*
610ee341 956 * return true if we can serve requests for this method.
985c86bc 957 */
8b082ed9 958bool
190154cf 959urlCheckRequest(const HttpRequest * r)
a8f7d3ee 960{
610ee341 961 /* protocol "independent" methods
962 *
963 * actually these methods are specific to HTTP:
2f8abb64 964 * they are methods we receive on our HTTP port,
610ee341 965 * and if we had a FTP listener would not be relevant
966 * there.
967 *
968 * So, we should delegate them to HTTP. The problem is that we
969 * do not have a default protocol from the client side of HTTP.
970 */
62e76326 971
c2a7cefd 972 if (r->method == Http::METHOD_CONNECT)
8b082ed9 973 return true;
62e76326 974
77ce6ba9
AR
975 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
976 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
c2a7cefd 977 if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
c8ab5ec6 978 return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
62e76326 979
c2a7cefd 980 if (r->method == Http::METHOD_PURGE)
8b082ed9 981 return true;
62e76326 982
99edd1c3 983 /* does method match the protocol? */
4e3f4dc7 984 switch (r->url.getScheme()) {
62e76326 985
0c3d3f65 986 case AnyP::PROTO_URN:
0c3d3f65 987 case AnyP::PROTO_HTTP:
8b082ed9 988 return true;
62e76326 989
0c3d3f65 990 case AnyP::PROTO_FTP:
8b082ed9
FC
991 if (r->method == Http::METHOD_PUT ||
992 r->method == Http::METHOD_GET ||
993 r->method == Http::METHOD_HEAD )
994 return true;
995 return false;
62e76326 996
0c3d3f65 997 case AnyP::PROTO_WAIS:
0c3d3f65 998 case AnyP::PROTO_WHOIS:
8b082ed9
FC
999 if (r->method == Http::METHOD_GET ||
1000 r->method == Http::METHOD_HEAD)
1001 return true;
1002 return false;
62e76326 1003
0c3d3f65 1004 case AnyP::PROTO_HTTPS:
c813943d 1005#if USE_OPENSSL || HAVE_LIBGNUTLS
8b082ed9 1006 return true;
1f7c9178 1007#else
62e76326 1008 /*
8b082ed9
FC
1009 * Squid can't originate an SSL connection, so it should
1010 * never receive an "https:" URL. It should always be
1011 * CONNECT instead.
1012 */
1013 return false;
1f7c9178 1014#endif
62e76326 1015
a8f7d3ee 1016 default:
8b082ed9 1017 return false;
a8f7d3ee 1018 }
62e76326 1019
8b082ed9
FC
1020 /* notreached */
1021 return false;
a8f7d3ee 1022}
9ce5e3e6 1023
c8ab5ec6 1024AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) :
d59e4742 1025 scheme_(aScheme),
380b09ae 1026 hostIsNumeric_(false)
d59e4742
FC
1027{
1028 *host_=0;
1029}
1a739503 1030
bec110e4
EB
1031// TODO: fix code duplication with AnyP::Uri::parse()
1032char *
1033AnyP::Uri::cleanup(const char *uri)
1034{
bec110e4
EB
1035 char *cleanedUri = nullptr;
1036 switch (Config.uri_whitespace) {
8b082ed9
FC
1037 case URI_WHITESPACE_ALLOW: {
1038 const auto flags = RFC1738_ESCAPE_NOSPACE | RFC1738_ESCAPE_UNESCAPED;
bec110e4
EB
1039 cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
1040 break;
8b082ed9
FC
1041 }
1042
1043 case URI_WHITESPACE_ENCODE:
1044 cleanedUri = xstrndup(rfc1738_do_escape(uri, RFC1738_ESCAPE_UNESCAPED), MAX_URL);
1045 break;
bec110e4
EB
1046
1047 case URI_WHITESPACE_CHOP: {
bec110e4
EB
1048 const auto pos = strcspn(uri, w_space);
1049 char *choppedUri = nullptr;
1050 if (pos < strlen(uri))
1051 choppedUri = xstrndup(uri, pos + 1);
8b082ed9
FC
1052 cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri,
1053 RFC1738_ESCAPE_UNESCAPED), MAX_URL);
bec110e4
EB
1054 cleanedUri[pos] = '\0';
1055 xfree(choppedUri);
8b082ed9 1056 break;
bec110e4 1057 }
bec110e4
EB
1058
1059 case URI_WHITESPACE_DENY:
1060 case URI_WHITESPACE_STRIP:
1061 default: {
1062 // TODO: avoid duplication with urlParse()
1063 const char *t;
1064 char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
1065 char *q = tmp_uri;
1066 t = uri;
1067 while (*t) {
1068 if (!xisspace(*t)) {
1069 *q = *t;
1070 ++q;
1071 }
1072 ++t;
1073 }
1074 *q = '\0';
1075 cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
1076 xfree(tmp_uri);
8b082ed9 1077 break;
bec110e4 1078 }
bec110e4
EB
1079 }
1080
1081 assert(cleanedUri);
1082 return cleanedUri;
1083}
279e60ef 1084