]> git.ipfire.org Git - thirdparty/squid.git/blob - src/anyp/Uri.cc
Fix SMP mgr:userhash, mgr:sourcehash, and mgr:carp reports (#1844)
[thirdparty/squid.git] / src / anyp / Uri.cc
1 /*
2 * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9 /* DEBUG: section 23 URL Parsing */
10
11 #include "squid.h"
12 #include "anyp/Uri.h"
13 #include "base/Raw.h"
14 #include "globals.h"
15 #include "HttpRequest.h"
16 #include "parser/Tokenizer.h"
17 #include "rfc1738.h"
18 #include "SquidConfig.h"
19 #include "SquidMath.h"
20
/// Characters that the check_hostnames validation accepts in a host name
/// when underscores are allowed: ASCII letters, digits, "-", ".", "_",
/// and the "[", ":", "]" characters.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"
    "0123456789" "-._"
    "[:]";

/// Same character list as valid_hostname_chars_u, minus the underscore.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"
    "0123456789" "-."
    "[:]";
33
34 /// Characters which are valid within a URI userinfo section
35 static const CharacterSet &
36 UserInfoChars()
37 {
38 /*
39 * RFC 3986 section 3.2.1
40 *
41 * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
42 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
43 * pct-encoded = "%" HEXDIG HEXDIG
44 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
45 */
46 static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
47 CharacterSet::ALPHA +
48 CharacterSet::DIGIT;
49 return userInfoValid;
50 }
51
52 /**
53 * Governed by RFC 3986 section 2.1
54 */
55 SBuf
56 AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
57 {
58 if (buf.isEmpty())
59 return buf;
60
61 Parser::Tokenizer tk(buf);
62 SBuf goodSection;
63 // optimization for the arguably common "no encoding necessary" case
64 if (tk.prefix(goodSection, ignore) && tk.atEnd())
65 return buf;
66
67 SBuf output;
68 output.reserveSpace(buf.length() * 3); // worst case: encode all chars
69 output.append(goodSection); // may be empty
70
71 while (!tk.atEnd()) {
72 // TODO: Add Tokenizer::parseOne(void).
73 const auto ch = tk.remaining()[0];
74 output.appendf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(ch))); // TODO: Optimize using a table
75 (void)tk.skip(ch);
76
77 if (tk.prefix(goodSection, ignore))
78 output.append(goodSection);
79 }
80
81 return output;
82 }
83
84 SBuf
85 AnyP::Uri::Decode(const SBuf &buf)
86 {
87 SBuf output;
88 Parser::Tokenizer tok(buf);
89 while (!tok.atEnd()) {
90 SBuf token;
91 static const auto unencodedChars = CharacterSet("percent", "%").complement("unencoded");
92 if (tok.prefix(token, unencodedChars))
93 output.append(token);
94
95 // we are either at '%' or at end of input
96 if (tok.skip('%')) {
97 int64_t hex1 = 0, hex2 = 0;
98 if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1))
99 output.append(static_cast<char>((hex1 << 4) | hex2));
100 else
101 throw TextException("invalid pct-encoded triplet", Here());
102 }
103 }
104 return output;
105 }
106
107 const SBuf &
108 AnyP::Uri::Asterisk()
109 {
110 static SBuf star("*");
111 return star;
112 }
113
114 const SBuf &
115 AnyP::Uri::SlashPath()
116 {
117 static SBuf slash("/");
118 return slash;
119 }
120
121 void
122 AnyP::Uri::host(const char *src)
123 {
124 hostAddr_.fromHost(src);
125 if (hostAddr_.isAnyAddr()) {
126 xstrncpy(host_, src, sizeof(host_));
127 hostIsNumeric_ = false;
128 } else {
129 hostAddr_.toHostStr(host_, sizeof(host_));
130 debugs(23, 3, "given IP: " << hostAddr_);
131 hostIsNumeric_ = 1;
132 }
133 touch();
134 }
135
136 SBuf
137 AnyP::Uri::hostOrIp() const
138 {
139 if (hostIsNumeric()) {
140 static char ip[MAX_IPSTRLEN];
141 const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip));
142 return SBuf(ip, hostStrLen);
143 } else
144 return SBuf(host());
145 }
146
147 const SBuf &
148 AnyP::Uri::path() const
149 {
150 // RFC 3986 section 3.3 says path can be empty (path-abempty).
151 // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
152 // at least when sending and using. We must still accept path-abempty as input.
153 if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
154 return SlashPath();
155
156 return path_;
157 }
158
159 void
160 urlInitialize(void)
161 {
162 debugs(23, 5, "urlInitialize: Initializing...");
163 /* this ensures that the number of protocol strings is the same as
164 * the enum slots allocated because the last enum is always 'MAX'.
165 */
166 assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
167 /*
168 * These test that our matchDomainName() function works the
169 * way we expect it to.
170 */
171 assert(0 == matchDomainName("foo.com", "foo.com"));
172 assert(0 == matchDomainName(".foo.com", "foo.com"));
173 assert(0 == matchDomainName("foo.com", ".foo.com"));
174 assert(0 == matchDomainName(".foo.com", ".foo.com"));
175 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
176 assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
177 assert(0 != matchDomainName("x.foo.com", "foo.com"));
178 assert(0 != matchDomainName("foo.com", "x.foo.com"));
179 assert(0 != matchDomainName("bar.com", "foo.com"));
180 assert(0 != matchDomainName(".bar.com", "foo.com"));
181 assert(0 != matchDomainName(".bar.com", ".foo.com"));
182 assert(0 != matchDomainName("bar.com", ".foo.com"));
183 assert(0 < matchDomainName("zzz.com", "foo.com"));
184 assert(0 > matchDomainName("aaa.com", "foo.com"));
185 assert(0 == matchDomainName("FOO.com", "foo.COM"));
186 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
187 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
188 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
189
190 assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
191 assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
192 assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
193 assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
194
195 assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
196 assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
197 assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
198 assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
199
200 assert(0 != matchDomainName("foo.com", ""));
201 assert(0 != matchDomainName("foo.com", "", mdnHonorWildcards));
202 assert(0 != matchDomainName("foo.com", "", mdnRejectSubsubDomains));
203
204 /* more cases? */
205 }
206
207 /**
208 * Extract the URI scheme and ':' delimiter from the given input buffer.
209 *
210 * Schemes up to 16 characters are accepted.
211 *
212 * Governed by RFC 3986 section 3.1
213 */
214 static AnyP::UriScheme
215 uriParseScheme(Parser::Tokenizer &tok)
216 {
217 /*
218 * RFC 3986 section 3.1 paragraph 2:
219 *
220 * Scheme names consist of a sequence of characters beginning with a
221 * letter and followed by any combination of letters, digits, plus
222 * ("+"), period ("."), or hyphen ("-").
223 */
224 static const auto schemeChars = CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
225
226 SBuf str;
227 if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
228 const auto protocol = AnyP::UriScheme::FindProtocolType(str);
229 if (protocol == AnyP::PROTO_UNKNOWN)
230 return AnyP::UriScheme(protocol, str.c_str());
231 return AnyP::UriScheme(protocol, nullptr);
232 }
233
234 throw TextException("invalid URI scheme", Here());
235 }
236
237 /**
238 * Appends configured append_domain to hostname, assuming
239 * the given buffer is at least SQUIDHOSTNAMELEN bytes long,
240 * and that the host FQDN is not a 'dotless' TLD.
241 *
242 * \returns false if and only if there is not enough space to append
243 */
244 bool
245 urlAppendDomain(char *host)
246 {
247 /* For IPv4 addresses check for a dot */
248 /* For IPv6 addresses also check for a colon */
249 if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
250 const uint64_t dlen = strlen(host);
251 const uint64_t want = dlen + Config.appendDomainLen;
252 if (want > SQUIDHOSTNAMELEN - 1) {
253 debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
254 return false;
255 }
256 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
257 }
258 return true;
259 }
260
261 /*
262 * Parse a URI/URL.
263 *
264 * It is assumed that the URL is complete -
265 * ie, the end of the string is the end of the URL. Don't pass a partial
266 * URL here as this routine doesn't have any way of knowing whether
267 * it is partial or not (ie, it handles the case of no trailing slash as
268 * being "end of host with implied path of /").
269 *
270 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
271 * then rather than a URL a hostname:port is looked for.
272 */
// Returns true when the input was accepted and the parsed components were
// stored in this object. Returns false when the input is rejected, including
// when any exception is thrown while parsing (see the catch block at the end).
273 bool
274 AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
275 {
276 try {
277
// Scratch c-string buffers for the userinfo, host, and path components.
278 LOCAL_ARRAY(char, login, MAX_URL);
279 LOCAL_ARRAY(char, foundHost, MAX_URL);
280 LOCAL_ARRAY(char, urlpath, MAX_URL);
281 char *t = nullptr;
282 char *q = nullptr;
283 int foundPort;
284 int l;
285 int i;
286 const char *src;
287 char *dst;
288 foundHost[0] = urlpath[0] = login[0] = '\0';
289
// Reject input that, together with the configured append_domain suffix,
// would not fit into a MAX_URL-sized buffer. Note that l keeps the length of
// the whole rawUrl (including the scheme) for the copying loops below.
290 if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
291 debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
292 return false;
293 }
294
// Special case: an OPTIONS or TRACE request for the "*" target.
295 if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
296 Asterisk().cmp(rawUrl) == 0) {
297 // XXX: these methods might also occur in HTTPS traffic. Handle this better.
298 setScheme(AnyP::PROTO_HTTP, nullptr);
299 port(getScheme().defaultPort());
300 path(Asterisk());
301 return true;
302 }
303
304 Parser::Tokenizer tok(rawUrl);
305 AnyP::UriScheme scheme;
306
307 if (method == Http::METHOD_CONNECT) {
308 // For CONNECTs, RFC 9110 Section 9.3.6 requires "only the host and
309 // port number of the tunnel destination, separated by a colon".
310
311 const auto rawHost = parseHost(tok);
312 Assure(rawHost.length() < sizeof(foundHost));
313 SBufToCstring(foundHost, rawHost);
314
315 if (!tok.skip(':'))
316 throw TextException("missing required :port in CONNECT target", Here());
317 foundPort = parsePort(tok);
318
319 if (!tok.remaining().isEmpty())
320 throw TextException("garbage after host:port in CONNECT target", Here());
321 } else {
322
323 scheme = uriParseScheme(tok);
324
325 if (scheme == AnyP::PROTO_NONE)
326 return false; // invalid scheme
327
328 if (scheme == AnyP::PROTO_URN) {
329 parseUrn(tok); // throws on any error
330 return true;
331 }
332
333 // URLs then have "//"
334 static const SBuf doubleSlash("//");
335 if (!tok.skip(doubleSlash))
336 return false;
337
// From here on, the rest of the input (everything after "scheme://") is
// processed as a c-string, one character at a time.
338 auto B = tok.remaining();
339 const char *url = B.c_str();
340
341 /* Parse the URL: */
342 src = url;
343 i = 0;
344
345 /* Then everything until first /; that's host (and port; which we'll look for here later) */
346 // bug 1881: If we don't get a "/" then we imply it was there
347 // bug 3074: We could just be given a "?" or "#". These also imply "/"
348 // bug 3233: whitespace is also a hostname delimiter.
349 for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
350 *dst = *src;
351 }
352
353 /*
354 * We can't check for "i >= l" here because we could be at the end of the line
355 * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
356 * been -given- a valid URL and the path is just '/'.
357 */
358 if (i > l)
359 return false;
360 *dst = '\0';
361
362 // We are looking at path-abempty.
363 if (*src != '/') {
364 // path-empty, including the end of the `src` c-string cases
365 urlpath[0] = '/';
366 dst = &urlpath[1];
367 } else {
368 dst = urlpath;
369 }
370 /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
371 for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
372 *dst = *src;
373 }
374
375 /* We -could- be at the end of the buffer here */
376 if (i > l)
377 return false;
378 *dst = '\0';
379
380 // If the parsed scheme has no (known) default port, and there is no
381 // explicit port, then we will reject the zero port during foundPort
382 // validation, often resulting in a misleading 400/ERR_INVALID_URL.
383 // TODO: Remove this hack when switching to Tokenizer-based parsing.
384 foundPort = scheme.defaultPort().value_or(0); // may be reset later
385
386 /* Is there any login information? (we should eventually parse it above) */
// Split at the last '@': what precedes it becomes login and what follows
// it becomes foundHost.
387 t = strrchr(foundHost, '@');
388 if (t != nullptr) {
389 strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
390 login[sizeof(login)-1] = '\0';
391 t = strrchr(login, '@');
392 *t = 0;
393 strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
394 foundHost[sizeof(foundHost)-1] = '\0';
395 // Bug 4498: URL-unescape the login info after extraction
396 rfc1738_unescape(login);
397 }
398
399 /* Is there any host information? (we should eventually parse it above) */
400 if (*foundHost == '[') {
401 /* strip any IPA brackets. valid under IPv6. */
402 dst = foundHost;
403 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
404 src = foundHost;
405 ++src;
// l is reused here: it now holds the length of foundHost, not of rawUrl
406 l = strlen(foundHost);
407 i = 1;
408 for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
409 *dst = *src;
410 }
411
412 /* we moved in-place, so truncate the actual hostname found */
413 *dst = '\0';
414 ++dst;
415
416 /* skip ahead to either start of port, or original EOS */
417 while (*dst != '\0' && *dst != ':')
418 ++dst;
419 t = dst;
420 } else {
421 t = strrchr(foundHost, ':');
422
// More than one ':' in an unbracketed host: keep the whole value as the
// host and do not look for a port.
423 if (t != strchr(foundHost,':') ) {
424 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
425 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
426 /* therefore we MUST accept the case where they are not bracketed at all. */
427 t = nullptr;
428 }
429 }
430
431 // Bug 3183 sanity check: If scheme is present, host must be too.
432 if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
433 debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
434 return false;
435 }
436
// An explicit port replaces the scheme default assigned above. The port text
// is cut off from foundHost and converted with atoi().
437 if (t && *t == ':') {
438 *t = '\0';
439 ++t;
440 foundPort = atoi(t);
441 }
442 }
443
// The code below applies to both the CONNECT and the non-CONNECT cases.
// Lowercase the host name.
444 for (t = foundHost; *t; ++t)
445 *t = xtolower(*t);
446
// Whitespace is removed from the host only when uri_whitespace is set to
// strip; for other settings, foundHost is left unchanged here.
447 if (stringHasWhitespace(foundHost)) {
448 if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
449 t = q = foundHost;
450 while (*t) {
451 if (!xisspace(*t)) {
452 *q = *t;
453 ++q;
454 }
455 ++t;
456 }
457 *q = '\0';
458 }
459 }
460
461 debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
462
// When check_hostnames is on, every host character must belong to the
// applicable valid_hostname_chars list.
463 if (Config.onoff.check_hostnames &&
464 strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
465 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
466 return false;
467 }
468
469 if (!urlAppendDomain(foundHost))
470 return false;
471
472 /* remove trailing dots from hostnames */
473 while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
474 foundHost[l] = '\0';
475
476 /* reject duplicate or leading dots */
477 if (strstr(foundHost, "..") || *foundHost == '.') {
478 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
479 return false;
480 }
481
// Only ports in the 1-65535 range are accepted.
482 if (foundPort < 1 || foundPort > 65535) {
483 debugs(23, 3, "Invalid port '" << foundPort << "'");
484 return false;
485 }
486
// Apply the configured uri_whitespace policy to the path.
487 if (stringHasWhitespace(urlpath)) {
488 debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");
489
490 switch (Config.uri_whitespace) {
491
492 case URI_WHITESPACE_DENY:
493 return false;
494
495 case URI_WHITESPACE_ALLOW:
496 break;
497
498 case URI_WHITESPACE_ENCODE:
499 t = rfc1738_escape_unescaped(urlpath);
500 xstrncpy(urlpath, t, MAX_URL);
501 break;
502
503 case URI_WHITESPACE_CHOP:
504 *(urlpath + strcspn(urlpath, w_space)) = '\0';
505 break;
506
507 case URI_WHITESPACE_STRIP:
508 default:
509 t = q = urlpath;
510 while (*t) {
511 if (!xisspace(*t)) {
512 *q = *t;
513 ++q;
514 }
515 ++t;
516 }
517 *q = '\0';
518 }
519 }
520
// All checks passed: store the parsed components in this object.
521 setScheme(scheme);
522 path(urlpath);
523 host(foundHost);
524 userInfo(SBuf(login));
525 port(foundPort);
526 return true;
527
528 } catch (...) {
// Any exception thrown above is reported as a parsing failure.
529 debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
530 return false;
531 }
532 }
533
534 /**
535 * Governed by RFC 8141 section 2:
536 *
537 * assigned-name = "urn" ":" NID ":" NSS
538 * NID = (alphanum) 0*30(ldh) (alphanum)
539 * ldh = alphanum / "-"
540 * NSS = pchar *(pchar / "/")
541 *
542 * RFC 3986 Appendix D.2 defines (as deprecated):
543 *
544 * alphanum = ALPHA / DIGIT
545 *
546 * Notice that NID is exactly 2-32 characters in length.
547 */
548 void
549 AnyP::Uri::parseUrn(Parser::Tokenizer &tok)
550 {
551 static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
552 static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
553 SBuf nid;
554 if (!tok.prefix(nid, nidChars, 32))
555 throw TextException("NID not found", Here());
556
557 if (!tok.skip(':'))
558 throw TextException("NID too long or missing ':' delimiter", Here());
559
560 if (nid.length() < 2)
561 throw TextException("NID too short", Here());
562
563 if (!alphanum[*nid.begin()])
564 throw TextException("NID prefix is not alphanumeric", Here());
565
566 if (!alphanum[*nid.rbegin()])
567 throw TextException("NID suffix is not alphanumeric", Here());
568
569 setScheme(AnyP::PROTO_URN, nullptr);
570 host(nid.c_str());
571 // TODO validate path characters
572 path(tok.remaining());
573 debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
574 }
575
576 /// Extracts and returns a (suspected but only partially validated) uri-host
577 /// IPv6address, IPv4address, or reg-name component. This function uses (and
578 /// quotes) RFC 3986, Section 3.2.2 syntax rules.
579 SBuf
580 AnyP::Uri::parseHost(Parser::Tokenizer &tok) const
581 {
582 // host = IP-literal / IPv4address / reg-name
583
584 // XXX: CharacterSets below reject uri-host values containing whitespace
585 // (e.g., "10.0.0. 1"). That is not a bug, but the uri_whitespace directive
586 // can be interpreted as if it applies to uri-host and this code. TODO: Fix
587 // uri_whitespace and the code using it to exclude uri-host (and URI scheme,
588 // port, etc.) from that directive scope.
589
590 // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
591 if (tok.skip('[')) {
592 // Add "." because IPv6address in RFC 3986 includes ls32, which includes
593 // IPv4address: ls32 = ( h16 ":" h16 ) / IPv4address
594 // This set rejects IPvFuture that needs a "v" character.
595 static const CharacterSet IPv6chars = (
596 CharacterSet::HEXDIG + CharacterSet("colon", ":") + CharacterSet("period", ".")).rename("IPv6");
597 SBuf ipv6ish;
598 if (!tok.prefix(ipv6ish, IPv6chars))
599 throw TextException("malformed or unsupported bracketed IP address in uri-host", Here());
600
601 if (!tok.skip(']'))
602 throw TextException("IPv6 address is missing a closing bracket in uri-host", Here());
603
604 // This rejects bracketed IPv4address and domain names because they lack ":".
605 if (ipv6ish.find(':') == SBuf::npos)
606 throw TextException("bracketed IPv6 address is missing a colon in uri-host", Here());
607
608 // This rejects bracketed non-IP addresses that our caller would have
609 // otherwise mistaken for a domain name (e.g., '[127.0.0:1]').
610 Ip::Address ipv6check;
611 if (!ipv6check.fromHost(ipv6ish.c_str()))
612 throw TextException("malformed bracketed IPv6 address in uri-host", Here());
613
614 return ipv6ish;
615 }
616
617 // no brackets implies we are looking at IPv4address or reg-name
618
619 // XXX: This code does not detect/reject some bad host values (e.g. "!#$%&"
620 // and "1.2.3.4.5"). TODO: Add more checks here, after migrating the
621 // non-CONNECT uri-host parsing code to use us.
622
623 SBuf otherHost; // IPv4address-ish or reg-name-ish;
624 // ":" is not in TCHAR so we will stop before any port specification
625 if (tok.prefix(otherHost, CharacterSet::TCHAR))
626 return otherHost;
627
628 throw TextException("malformed IPv4 address or host name in uri-host", Here());
629 }
630
631 /// Extracts and returns an RFC 3986 URI authority port value (with additional
632 /// restrictions). The RFC defines port as a possibly empty sequence of decimal
633 /// digits. We reject certain ports (that are syntactically valid from the RFC
634 /// point of view) because we are worried that Squid and other traffic handlers
635 /// may dangerously mishandle unusual (and virtually always bogus) port numbers.
636 /// Rejected ports cannot be successfully used by Squid itself.
637 int
638 AnyP::Uri::parsePort(Parser::Tokenizer &tok) const
639 {
640 if (tok.skip('0'))
641 throw TextException("zero or zero-prefixed port", Here());
642
643 int64_t rawPort = 0;
644 if (!tok.int64(rawPort, 10, false)) // port = *DIGIT
645 throw TextException("malformed or missing port", Here());
646
647 Assure(rawPort > 0);
648 constexpr KnownPort portMax = 65535; // TODO: Make this a class-scope constant and REuse it.
649 constexpr auto portStorageMax = std::numeric_limits<Port::value_type>::max();
650 static_assert(!Less(portStorageMax, portMax), "Port type can represent the maximum valid port number");
651 if (Less(portMax, rawPort))
652 throw TextException("huge port", Here());
653
654 // TODO: Return KnownPort after migrating the non-CONNECT uri-host parsing
655 // code to use us (so that foundPort "int" disappears or starts using Port).
656 return NaturalCast<int>(rawPort);
657 }
658
659 void
660 AnyP::Uri::touch()
661 {
662 absolute_.clear();
663 authorityHttp_.clear();
664 authorityWithPort_.clear();
665 }
666
667 SBuf &
668 AnyP::Uri::authority(bool requirePort) const
669 {
670 if (authorityHttp_.isEmpty()) {
671
672 // both formats contain Host/IP
673 authorityWithPort_.append(host());
674 authorityHttp_ = authorityWithPort_;
675
676 if (port().has_value()) {
677 authorityWithPort_.appendf(":%hu", *port());
678 // authorityHttp_ only has :port for known non-default ports
679 if (port() != getScheme().defaultPort())
680 authorityHttp_ = authorityWithPort_;
681 }
682 // else XXX: We made authorityWithPort_ that does not have a port.
683 // TODO: Audit callers and refuse to give out broken authorityWithPort_.
684 }
685
686 return requirePort ? authorityWithPort_ : authorityHttp_;
687 }
688
689 SBuf &
690 AnyP::Uri::absolute() const
691 {
692 if (absolute_.isEmpty()) {
693 // TODO: most URL will be much shorter, avoid allocating this much
694 absolute_.reserveCapacity(MAX_URL);
695
696 absolute_.append(getScheme().image());
697 absolute_.append(":",1);
698 if (getScheme() != AnyP::PROTO_URN) {
699 absolute_.append("//", 2);
700 const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
701 getScheme() == AnyP::PROTO_UNKNOWN;
702
703 if (allowUserInfo && !userInfo().isEmpty()) {
704 static const CharacterSet uiChars = CharacterSet(UserInfoChars())
705 .remove('%')
706 .rename("userinfo-reserved");
707 absolute_.append(Encode(userInfo(), uiChars));
708 absolute_.append("@", 1);
709 }
710 absolute_.append(authority());
711 } else {
712 absolute_.append(host());
713 absolute_.append(":", 1);
714 }
715 absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
716 }
717
718 return absolute_;
719 }
720
721 /* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
722 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
723 * and never copy the query-string part in the first place
724 */
725 char *
726 urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
727 {
728 LOCAL_ARRAY(char, buf, MAX_URL);
729
730 snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
731 buf[sizeof(buf)-1] = '\0';
732
733 // URN, CONNECT method, and non-stripped URIs can go straight out
734 if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
735 // strip anything AFTER a question-mark
736 // leaving the '?' in place
737 if (auto t = strchr(buf, '?')) {
738 *(++t) = '\0';
739 }
740 }
741
742 if (stringHasCntl(buf))
743 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
744
745 return buf;
746 }
747
748 /**
749 * Yet another alternative to urlCanonical.
750 * This one adds the https:// parts to Http::METHOD_CONNECT URL
751 * for use in error page outputs.
752 * Luckily we can leverage the others instead of duplicating.
753 */
754 const char *
755 urlCanonicalFakeHttps(const HttpRequest * request)
756 {
757 LOCAL_ARRAY(char, buf, MAX_URL);
758
759 // method CONNECT and port HTTPS
760 if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
761 snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
762 return buf;
763 }
764
765 // else do the normal complete canonical thing.
766 return request->canonicalCleanUrl();
767 }
768
/**
 * Test if a URL is a relative reference.
 *
 * Governed by RFC 3986 section 4.2
 *
 *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-part = "//" authority path-abempty
 *                / path-absolute
 *                / path-noscheme
 *                / path-empty
 *
 * \returns false for a nil url and for a url with a colon in its first
 *          path segment; true otherwise
 */
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    /*
     * RFC 3986 section 5.2.3
     *
     * path          = path-abempty    ; begins with "/" or is empty
     *               / path-absolute   ; begins with "/" but not "//"
     *               / path-noscheme   ; begins with a non-colon segment
     *               / path-rootless   ; begins with a segment
     *               / path-empty      ; zero characters
     */

    // path-empty, a network-path reference (a.k.a. 'scheme-relative URI'),
    // or path-absolute (a.k.a. 'absolute-path reference')
    if (*url == '\0' || *url == '/')
        return true;

    // Find what ends the scan of the first segment: a colon, one of the
    // segment delimiters, or the end of the string.
    const char stopper = url[strcspn(url, ":/?#")];

    // colon is forbidden in first segment;
    // anything else is path-noscheme, path-abempty, or path-rootless
    return stopper != ':';
}
813
814 void
815 AnyP::Uri::addRelativePath(const char *relUrl)
816 {
817 // URN cannot be merged
818 if (getScheme() == AnyP::PROTO_URN)
819 return;
820
821 // TODO: Handle . and .. segment normalization
822
823 const auto lastSlashPos = path_.rfind('/');
824 // TODO: To optimize and simplify, add and use SBuf::replace().
825 const auto relUrlLength = strlen(relUrl);
826 if (lastSlashPos == SBuf::npos) {
827 // start replacing the whole path
828 path_.reserveCapacity(1 + relUrlLength);
829 path_.assign("/", 1);
830 } else {
831 // start replacing just the last segment
832 path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
833 path_.chop(0, lastSlashPos+1);
834 }
835 path_.append(relUrl, relUrlLength);
836 }
837
838 int
839 matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
840 {
841 int dl;
842 int hl;
843
844 const bool hostIncludesSubdomains = (*h == '.');
845 while ('.' == *h)
846 ++h;
847
848 hl = strlen(h);
849
850 if (hl == 0)
851 return -1;
852
853 dl = strlen(d);
854 if (dl == 0)
855 return 1;
856
857 /*
858 * Start at the ends of the two strings and work towards the
859 * beginning.
860 */
861 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
862 if (hl == 0 && dl == 0) {
863 /*
864 * We made it all the way to the beginning of both
865 * strings without finding any difference.
866 */
867 return 0;
868 }
869
870 if (0 == hl) {
871 /*
872 * The host string is shorter than the domain string.
873 * There is only one case when this can be a match.
874 * If the domain is just one character longer, and if
875 * that character is a leading '.' then we call it a
876 * match.
877 */
878
879 if (1 == dl && '.' == d[0])
880 return 0;
881 else
882 return -1;
883 }
884
885 if (0 == dl) {
886 /*
887 * The domain string is shorter than the host string.
888 * This is a match only if the first domain character
889 * is a leading '.'.
890 */
891
892 if ('.' == d[0]) {
893 if (flags & mdnRejectSubsubDomains) {
894 // Check for sub-sub domain and reject
895 while(--hl >= 0 && h[hl] != '.');
896 if (hl < 0) {
897 // No sub-sub domain found, but reject if there is a
898 // leading dot in given host string (which is removed
899 // before the check is started).
900 return hostIncludesSubdomains ? 1 : 0;
901 } else
902 return 1; // sub-sub domain, reject
903 } else
904 return 0;
905 } else
906 return 1;
907 }
908 }
909
910 /*
911 * We found different characters in the same position (from the end).
912 */
913
914 // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
915 // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
916 // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
917 if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
918 return 0;
919
920 /*
921 * If one of those character is '.' then its special. In order
922 * for splay tree sorting to work properly, "x-foo.com" must
923 * be greater than ".foo.com" even though '-' is less than '.'.
924 */
925 if ('.' == d[dl])
926 return 1;
927
928 if ('.' == h[hl])
929 return -1;
930
931 return (xtolower(h[hl]) - xtolower(d[dl]));
932 }
933
934 /*
935 * return true if we can serve requests for this method.
936 */
937 bool
938 urlCheckRequest(const HttpRequest * r)
939 {
940 /* protocol "independent" methods
941 *
942 * actually these methods are specific to HTTP:
943 * they are methods we receive on our HTTP port,
944 * and if we had a FTP listener would not be relevant
945 * there.
946 *
947 * So, we should delegate them to HTTP. The problem is that we
948 * do not have a default protocol from the client side of HTTP.
949 */
950
951 if (r->method == Http::METHOD_CONNECT)
952 return true;
953
954 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
955 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
956 if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
957 return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
958
959 if (r->method == Http::METHOD_PURGE)
960 return true;
961
962 /* does method match the protocol? */
963 switch (r->url.getScheme()) {
964
965 case AnyP::PROTO_URN:
966 case AnyP::PROTO_HTTP:
967 return true;
968
969 case AnyP::PROTO_FTP:
970 if (r->method == Http::METHOD_PUT ||
971 r->method == Http::METHOD_GET ||
972 r->method == Http::METHOD_HEAD )
973 return true;
974 return false;
975
976 case AnyP::PROTO_WAIS:
977 case AnyP::PROTO_WHOIS:
978 if (r->method == Http::METHOD_GET ||
979 r->method == Http::METHOD_HEAD)
980 return true;
981 return false;
982
983 case AnyP::PROTO_HTTPS:
984 #if USE_OPENSSL || HAVE_LIBGNUTLS
985 return true;
986 #else
987 /*
988 * Squid can't originate an SSL connection, so it should
989 * never receive an "https:" URL. It should always be
990 * CONNECT instead.
991 */
992 return false;
993 #endif
994
995 default:
996 return false;
997 }
998
999 /* notreached */
1000 return false;
1001 }
1002
1003 AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) :
1004 scheme_(aScheme),
1005 hostIsNumeric_(false)
1006 {
1007 *host_=0;
1008 }
1009
1010 // TODO: fix code duplication with AnyP::Uri::parse()
1011 char *
1012 AnyP::Uri::cleanup(const char *uri)
1013 {
1014 char *cleanedUri = nullptr;
1015 switch (Config.uri_whitespace) {
1016 case URI_WHITESPACE_ALLOW: {
1017 const auto flags = RFC1738_ESCAPE_NOSPACE | RFC1738_ESCAPE_UNESCAPED;
1018 cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
1019 break;
1020 }
1021
1022 case URI_WHITESPACE_ENCODE:
1023 cleanedUri = xstrndup(rfc1738_do_escape(uri, RFC1738_ESCAPE_UNESCAPED), MAX_URL);
1024 break;
1025
1026 case URI_WHITESPACE_CHOP: {
1027 const auto pos = strcspn(uri, w_space);
1028 char *choppedUri = nullptr;
1029 if (pos < strlen(uri))
1030 choppedUri = xstrndup(uri, pos + 1);
1031 cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri,
1032 RFC1738_ESCAPE_UNESCAPED), MAX_URL);
1033 cleanedUri[pos] = '\0';
1034 xfree(choppedUri);
1035 break;
1036 }
1037
1038 case URI_WHITESPACE_DENY:
1039 case URI_WHITESPACE_STRIP:
1040 default: {
1041 // TODO: avoid duplication with urlParse()
1042 const char *t;
1043 char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
1044 char *q = tmp_uri;
1045 t = uri;
1046 while (*t) {
1047 if (!xisspace(*t)) {
1048 *q = *t;
1049 ++q;
1050 }
1051 ++t;
1052 }
1053 *q = '\0';
1054 cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
1055 xfree(tmp_uri);
1056 break;
1057 }
1058 }
1059
1060 assert(cleanedUri);
1061 return cleanedUri;
1062 }
1063