]> git.ipfire.org Git - thirdparty/squid.git/blob - src/anyp/Uri.cc
Source Format Enforcement (#1234)
[thirdparty/squid.git] / src / anyp / Uri.cc
1 /*
2 * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9 /* DEBUG: section 23 URL Parsing */
10
11 #include "squid.h"
12 #include "anyp/Uri.h"
13 #include "base/Raw.h"
14 #include "globals.h"
15 #include "HttpRequest.h"
16 #include "parser/Tokenizer.h"
17 #include "rfc1738.h"
18 #include "SquidConfig.h"
19 #include "SquidString.h"
20
/// Characters accepted in a hostname when underscores are tolerated.
/// AnyP::Uri::parse() checks hostnames against this set (via strspn()) when
/// both Config.onoff.check_hostnames and Config.onoff.allow_underscore are set.
/// The "[:]" characters are presumably listed for IPv6 address literals.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-._"
    "[:]"
    ;
/// Same as valid_hostname_chars_u, but without the underscore.
/// AnyP::Uri::parse() uses this set when Config.onoff.check_hostnames is set
/// and Config.onoff.allow_underscore is not.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-."
    "[:]"
    ;
33
34 /// Characters which are valid within a URI userinfo section
35 static const CharacterSet &
36 UserInfoChars()
37 {
38 /*
39 * RFC 3986 section 3.2.1
40 *
41 * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
42 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
43 * pct-encoded = "%" HEXDIG HEXDIG
44 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
45 */
46 static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
47 CharacterSet::ALPHA +
48 CharacterSet::DIGIT;
49 return userInfoValid;
50 }
51
52 /**
53 * Governed by RFC 3986 section 2.1
54 */
55 SBuf
56 AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
57 {
58 if (buf.isEmpty())
59 return buf;
60
61 Parser::Tokenizer tk(buf);
62 SBuf goodSection;
63 // optimization for the arguably common "no encoding necessary" case
64 if (tk.prefix(goodSection, ignore) && tk.atEnd())
65 return buf;
66
67 SBuf output;
68 output.reserveSpace(buf.length() * 3); // worst case: encode all chars
69 output.append(goodSection); // may be empty
70
71 while (!tk.atEnd()) {
72 // TODO: Add Tokenizer::parseOne(void).
73 const auto ch = tk.remaining()[0];
74 output.appendf("%%%02X", static_cast<unsigned int>(ch)); // TODO: Optimize using a table
75 (void)tk.skip(ch);
76
77 if (tk.prefix(goodSection, ignore))
78 output.append(goodSection);
79 }
80
81 return output;
82 }
83
84 const SBuf &
85 AnyP::Uri::Asterisk()
86 {
87 static SBuf star("*");
88 return star;
89 }
90
91 const SBuf &
92 AnyP::Uri::SlashPath()
93 {
94 static SBuf slash("/");
95 return slash;
96 }
97
98 void
99 AnyP::Uri::host(const char *src)
100 {
101 hostAddr_.fromHost(src);
102 if (hostAddr_.isAnyAddr()) {
103 xstrncpy(host_, src, sizeof(host_));
104 hostIsNumeric_ = false;
105 } else {
106 hostAddr_.toHostStr(host_, sizeof(host_));
107 debugs(23, 3, "given IP: " << hostAddr_);
108 hostIsNumeric_ = 1;
109 }
110 touch();
111 }
112
113 SBuf
114 AnyP::Uri::hostOrIp() const
115 {
116 if (hostIsNumeric()) {
117 static char ip[MAX_IPSTRLEN];
118 const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip));
119 return SBuf(ip, hostStrLen);
120 } else
121 return SBuf(host());
122 }
123
124 const SBuf &
125 AnyP::Uri::path() const
126 {
127 // RFC 3986 section 3.3 says path can be empty (path-abempty).
128 // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
129 // at least when sending and using. We must still accept path-abempty as input.
130 if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
131 return SlashPath();
132
133 return path_;
134 }
135
136 void
137 urlInitialize(void)
138 {
139 debugs(23, 5, "urlInitialize: Initializing...");
140 /* this ensures that the number of protocol strings is the same as
141 * the enum slots allocated because the last enum is always 'MAX'.
142 */
143 assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
144 /*
145 * These test that our matchDomainName() function works the
146 * way we expect it to.
147 */
148 assert(0 == matchDomainName("foo.com", "foo.com"));
149 assert(0 == matchDomainName(".foo.com", "foo.com"));
150 assert(0 == matchDomainName("foo.com", ".foo.com"));
151 assert(0 == matchDomainName(".foo.com", ".foo.com"));
152 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
153 assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
154 assert(0 != matchDomainName("x.foo.com", "foo.com"));
155 assert(0 != matchDomainName("foo.com", "x.foo.com"));
156 assert(0 != matchDomainName("bar.com", "foo.com"));
157 assert(0 != matchDomainName(".bar.com", "foo.com"));
158 assert(0 != matchDomainName(".bar.com", ".foo.com"));
159 assert(0 != matchDomainName("bar.com", ".foo.com"));
160 assert(0 < matchDomainName("zzz.com", "foo.com"));
161 assert(0 > matchDomainName("aaa.com", "foo.com"));
162 assert(0 == matchDomainName("FOO.com", "foo.COM"));
163 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
164 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
165 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
166
167 assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
168 assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
169 assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
170 assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
171
172 assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
173 assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
174 assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
175 assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
176
177 /* more cases? */
178 }
179
180 /**
181 * Extract the URI scheme and ':' delimiter from the given input buffer.
182 *
183 * Schemes up to 16 characters are accepted.
184 *
185 * Governed by RFC 3986 section 3.1
186 */
187 static AnyP::UriScheme
188 uriParseScheme(Parser::Tokenizer &tok)
189 {
190 /*
191 * RFC 3986 section 3.1 paragraph 2:
192 *
193 * Scheme names consist of a sequence of characters beginning with a
194 * letter and followed by any combination of letters, digits, plus
195 * ("+"), period ("."), or hyphen ("-").
196 *
197 * The underscore ("_") required to match "cache_object://" squid
198 * special URI scheme.
199 */
200 static const auto schemeChars =
201 #if USE_HTTP_VIOLATIONS
202 CharacterSet("special", "_") +
203 #endif
204 CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
205
206 SBuf str;
207 if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
208 const auto protocol = AnyP::UriScheme::FindProtocolType(str);
209 if (protocol == AnyP::PROTO_UNKNOWN)
210 return AnyP::UriScheme(protocol, str.c_str());
211 return AnyP::UriScheme(protocol, nullptr);
212 }
213
214 throw TextException("invalid URI scheme", Here());
215 }
216
217 /**
218 * Appends configured append_domain to hostname, assuming
219 * the given buffer is at least SQUIDHOSTNAMELEN bytes long,
220 * and that the host FQDN is not a 'dotless' TLD.
221 *
222 * \returns false if and only if there is not enough space to append
223 */
224 bool
225 urlAppendDomain(char *host)
226 {
227 /* For IPv4 addresses check for a dot */
228 /* For IPv6 addresses also check for a colon */
229 if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
230 const uint64_t dlen = strlen(host);
231 const uint64_t want = dlen + Config.appendDomainLen;
232 if (want > SQUIDHOSTNAMELEN - 1) {
233 debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
234 return false;
235 }
236 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
237 }
238 return true;
239 }
240
/*
 * Parse a URI/URL.
 *
 * It is assumed that the URL is complete -
 * i.e., the end of the string is the end of the URL. Don't pass a partial
 * URL here as this routine doesn't have any way of knowing whether
 * it is partial or not (i.e., it handles the case of no trailing slash as
 * being "end of host with implied path of /").
 *
 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
 * then rather than a URL a hostname:port is looked for.
 *
 * On success, this object's scheme, path, host, userinfo, and port are
 * replaced with the parsed values (the urn and "OPTIONS *"/"TRACE *" paths
 * set only a subset; see below).
 *
 * Returns true on success and false on any failure, including any exception
 * thrown while parsing (which is logged and swallowed here).
 */
bool
AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
{
    try {

        // NOTE(review): LOCAL_ARRAY presumably declares function-static
        // buffers; if so, this method is not reentrant -- confirm.
        LOCAL_ARRAY(char, login, MAX_URL);
        LOCAL_ARRAY(char, foundHost, MAX_URL);
        LOCAL_ARRAY(char, urlpath, MAX_URL);
        char *t = nullptr;
        char *q = nullptr;
        int foundPort;
        int l;
        int i;
        const char *src;
        char *dst;
        foundHost[0] = urlpath[0] = login[0] = '\0';

        // Reject input that, together with a possibly appended domain, would
        // not fit into the MAX_URL-sized scratch buffers above.
        if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
            return false;
        }

        // asterisk-form: "OPTIONS *" and "TRACE *" carry no scheme/host
        if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
                Asterisk().cmp(rawUrl) == 0) {
            // XXX: these methods might also occur in HTTPS traffic. Handle this better.
            setScheme(AnyP::PROTO_HTTP, nullptr);
            port(getScheme().defaultPort());
            path(Asterisk());
            return true;
        }

        Parser::Tokenizer tok(rawUrl);
        AnyP::UriScheme scheme;

        if (method == Http::METHOD_CONNECT) {
            /*
             * RFC 7230 section 5.3.3:  authority-form = authority
             *  "excluding any userinfo and its "@" delimiter"
             *
             * RFC 3986 section 3.2:    authority = [ userinfo "@" ] host [ ":" port ]
             *
             * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
             */
            foundPort = 443;

            // XXX: use tokenizer
            auto B = tok.buf();
            const char *url = B.c_str();

            // try the bracketed "[host]:port" form first, then "host:port";
            // in both forms the ":port" part is optional (foundPort keeps 443)
            if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
                if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
                    return false;

        } else {

            scheme = uriParseScheme(tok);

            if (scheme == AnyP::PROTO_NONE)
                return false; // invalid scheme

            if (scheme == AnyP::PROTO_URN) {
                parseUrn(tok); // throws on any error
                return true;
            }

            // URLs then have "//"
            static const SBuf doubleSlash("//");
            if (!tok.skip(doubleSlash))
                return false;

            auto B = tok.remaining();
            const char *url = B.c_str();

            /* Parse the URL: */
            src = url;
            i = 0;

            /* Then everything until first /; that's host (and port; which we'll look for here later) */
            // bug 1881: If we don't get a "/" then we imply it was there
            // bug 3074: We could just be given a "?" or "#". These also imply "/"
            // bug 3233: whitespace is also a hostname delimiter.
            for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
                *dst = *src;
            }

            /*
             * We can't check for "i >= l" here because we could be at the end of the line
             * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
             * been -given- a valid URL and the path is just '/'.
             */
            // NOTE(review): the loop above only increments i while i < l,
            // so this condition does not look reachable.
            if (i > l)
                return false;
            *dst = '\0';

            // We are looking at path-abempty.
            if (*src != '/') {
                // path-empty, including the end of the `src` c-string cases
                urlpath[0] = '/';
                dst = &urlpath[1];
            } else {
                dst = urlpath;
            }
            /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
            for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
                *dst = *src;
            }

            /* We -could- be at the end of the buffer here */
            // NOTE(review): same as above; i cannot exceed l after the loop.
            if (i > l)
                return false;
            *dst = '\0';

            foundPort = scheme.defaultPort(); // may be reset later

            /* Is there any login information? (we should eventually parse it above) */
            // the LAST '@' separates userinfo from the host
            t = strrchr(foundHost, '@');
            if (t != nullptr) {
                // work on a copy: login keeps what precedes the last '@' and
                // foundHost is overwritten with what follows it
                strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
                login[sizeof(login)-1] = '\0';
                t = strrchr(login, '@');
                *t = 0;
                strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
                foundHost[sizeof(foundHost)-1] = '\0';
                // Bug 4498: URL-unescape the login info after extraction
                rfc1738_unescape(login);
            }

            /* Is there any host information? (we should eventually parse it above) */
            if (*foundHost == '[') {
                /* strip any IPA brackets. valid under IPv6. */
                dst = foundHost;
                /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
                src = foundHost;
                ++src;
                l = strlen(foundHost);
                i = 1;
                // shift the bracketed text one byte to the left, dropping '['
                for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
                    *dst = *src;
                }

                /* we moved in-place, so truncate the actual hostname found */
                *dst = '\0';
                ++dst;

                /* skip ahead to either start of port, or original EOS */
                while (*dst != '\0' && *dst != ':')
                    ++dst;
                t = dst;
            } else {
                t = strrchr(foundHost, ':');

                // more than one ':' means this cannot be a host:port pair
                if (t != strchr(foundHost,':') ) {
                    /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when it's not. */
                    /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
                    /* therefore we MUST accept the case where they are not bracketed at all. */
                    t = nullptr;
                }
            }

            // Bug 3183 sanity check: If scheme is present, host must be too.
            if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
                debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
                return false;
            }

            // t, if set, points at the ':' that starts the port: cut the host
            // there and convert the rest (the range is validated further below)
            if (t && *t == ':') {
                *t = '\0';
                ++t;
                foundPort = atoi(t);
            }
        }

        // lowercase the host in place
        for (t = foundHost; *t; ++t)
            *t = xtolower(*t);

        // whitespace inside the host is removed only in "strip" mode; in the
        // other modes it is left for the checks below
        if (stringHasWhitespace(foundHost)) {
            if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
                t = q = foundHost;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");

        // optional hostname character check; underscores are allowed only
        // when Config.onoff.allow_underscore is set
        if (Config.onoff.check_hostnames &&
                strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
            return false;
        }

        if (!urlAppendDomain(foundHost))
            return false;

        /* remove trailing dots from hostnames */
        while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
            foundHost[l] = '\0';

        /* reject duplicate or leading dots */
        if (strstr(foundHost, "..") || *foundHost == '.') {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
            return false;
        }

        if (foundPort < 1 || foundPort > 65535) {
            debugs(23, 3, "Invalid port '" << foundPort << "'");
            return false;
        }

        // apply the configured uri_whitespace policy to the path
        if (stringHasWhitespace(urlpath)) {
            debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");

            switch (Config.uri_whitespace) {

            case URI_WHITESPACE_DENY:
                return false;

            case URI_WHITESPACE_ALLOW:
                break;

            case URI_WHITESPACE_ENCODE:
                t = rfc1738_escape_unescaped(urlpath);
                xstrncpy(urlpath, t, MAX_URL);
                break;

            case URI_WHITESPACE_CHOP:
                // cut the path at its first whitespace character
                *(urlpath + strcspn(urlpath, w_space)) = '\0';
                break;

            case URI_WHITESPACE_STRIP:
            default:
                // remove all whitespace characters, compacting in place
                t = q = urlpath;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        // everything checked out; commit the parsed components
        setScheme(scheme);
        path(urlpath);
        host(foundHost);
        userInfo(SBuf(login));
        port(foundPort);
        return true;

    } catch (...) {
        debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
        return false;
    }
}
514
515 /**
516 * Governed by RFC 8141 section 2:
517 *
518 * assigned-name = "urn" ":" NID ":" NSS
519 * NID = (alphanum) 0*30(ldh) (alphanum)
520 * ldh = alphanum / "-"
521 * NSS = pchar *(pchar / "/")
522 *
523 * RFC 3986 Appendix D.2 defines (as deprecated):
524 *
525 * alphanum = ALPHA / DIGIT
526 *
527 * Notice that NID is exactly 2-32 characters in length.
528 */
529 void
530 AnyP::Uri::parseUrn(Parser::Tokenizer &tok)
531 {
532 static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
533 static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
534 SBuf nid;
535 if (!tok.prefix(nid, nidChars, 32))
536 throw TextException("NID not found", Here());
537
538 if (!tok.skip(':'))
539 throw TextException("NID too long or missing ':' delimiter", Here());
540
541 if (nid.length() < 2)
542 throw TextException("NID too short", Here());
543
544 if (!alphanum[*nid.begin()])
545 throw TextException("NID prefix is not alphanumeric", Here());
546
547 if (!alphanum[*nid.rbegin()])
548 throw TextException("NID suffix is not alphanumeric", Here());
549
550 setScheme(AnyP::PROTO_URN, nullptr);
551 host(nid.c_str());
552 // TODO validate path characters
553 path(tok.remaining());
554 debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
555 }
556
557 void
558 AnyP::Uri::touch()
559 {
560 absolute_.clear();
561 authorityHttp_.clear();
562 authorityWithPort_.clear();
563 }
564
565 SBuf &
566 AnyP::Uri::authority(bool requirePort) const
567 {
568 if (authorityHttp_.isEmpty()) {
569
570 // both formats contain Host/IP
571 authorityWithPort_.append(host());
572 authorityHttp_ = authorityWithPort_;
573
574 // authorityForm_ only has :port if it is non-default
575 authorityWithPort_.appendf(":%u",port());
576 if (port() != getScheme().defaultPort())
577 authorityHttp_ = authorityWithPort_;
578 }
579
580 return requirePort ? authorityWithPort_ : authorityHttp_;
581 }
582
583 SBuf &
584 AnyP::Uri::absolute() const
585 {
586 if (absolute_.isEmpty()) {
587 // TODO: most URL will be much shorter, avoid allocating this much
588 absolute_.reserveCapacity(MAX_URL);
589
590 absolute_.append(getScheme().image());
591 absolute_.append(":",1);
592 if (getScheme() != AnyP::PROTO_URN) {
593 absolute_.append("//", 2);
594 const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
595 getScheme() == AnyP::PROTO_UNKNOWN;
596
597 if (allowUserInfo && !userInfo().isEmpty()) {
598 static const CharacterSet uiChars = CharacterSet(UserInfoChars())
599 .remove('%')
600 .rename("userinfo-reserved");
601 absolute_.append(Encode(userInfo(), uiChars));
602 absolute_.append("@", 1);
603 }
604 absolute_.append(authority());
605 } else {
606 absolute_.append(host());
607 absolute_.append(":", 1);
608 }
609 absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
610 }
611
612 return absolute_;
613 }
614
615 /* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
616 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
617 * and never copy the query-string part in the first place
618 */
619 char *
620 urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
621 {
622 LOCAL_ARRAY(char, buf, MAX_URL);
623
624 snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
625 buf[sizeof(buf)-1] = '\0';
626
627 // URN, CONNECT method, and non-stripped URIs can go straight out
628 if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
629 // strip anything AFTER a question-mark
630 // leaving the '?' in place
631 if (auto t = strchr(buf, '?')) {
632 *(++t) = '\0';
633 }
634 }
635
636 if (stringHasCntl(buf))
637 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
638
639 return buf;
640 }
641
642 /**
643 * Yet another alternative to urlCanonical.
644 * This one adds the https:// parts to Http::METHOD_CONNECT URL
645 * for use in error page outputs.
646 * Luckily we can leverage the others instead of duplicating.
647 */
648 const char *
649 urlCanonicalFakeHttps(const HttpRequest * request)
650 {
651 LOCAL_ARRAY(char, buf, MAX_URL);
652
653 // method CONNECT and port HTTPS
654 if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
655 snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
656 return buf;
657 }
658
659 // else do the normal complete canonical thing.
660 return request->canonicalCleanUrl();
661 }
662
/**
 * Test if a URL is a relative reference.
 *
 * Governed by RFC 3986 section 4.2
 *
 *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-part = "//" authority path-abempty
 *                / path-absolute
 *                / path-noscheme
 *                / path-empty
 *
 * \param url a c-string, or nullptr
 * \returns false for nullptr and for input whose first segment contains a
 *          colon; true otherwise
 */
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    /*
     * RFC 3986 section 3.3
     *
     *  path          = path-abempty    ; begins with "/" or is empty
     *                / path-absolute   ; begins with "/" but not "//"
     *                / path-noscheme   ; begins with a non-colon segment
     *                / path-rootless   ; begins with a segment
     *                / path-empty      ; zero characters
     */

    // path-empty, a network-path reference ("//..."), and path-absolute
    // ("/...") are all relative references
    if (*url == '\0' || *url == '/')
        return true;

    // Find the end of the first segment or the first colon, whichever comes
    // first. A colon is forbidden in the first segment of a relative
    // reference; anything else is path-noscheme, path-abempty, path-rootless.
    const auto stopPos = strcspn(url, ":/?#");
    return url[stopPos] != ':';
}
710
711 void
712 AnyP::Uri::addRelativePath(const char *relUrl)
713 {
714 // URN cannot be merged
715 if (getScheme() == AnyP::PROTO_URN)
716 return;
717
718 // TODO: Handle . and .. segment normalization
719
720 const auto lastSlashPos = path_.rfind('/');
721 // TODO: To optimize and simplify, add and use SBuf::replace().
722 const auto relUrlLength = strlen(relUrl);
723 if (lastSlashPos == SBuf::npos) {
724 // start replacing the whole path
725 path_.reserveCapacity(1 + relUrlLength);
726 path_.assign("/", 1);
727 } else {
728 // start replacing just the last segment
729 path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
730 path_.chop(0, lastSlashPos+1);
731 }
732 path_.append(relUrl, relUrlLength);
733 }
734
735 int
736 matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
737 {
738 int dl;
739 int hl;
740
741 const bool hostIncludesSubdomains = (*h == '.');
742 while ('.' == *h)
743 ++h;
744
745 hl = strlen(h);
746
747 if (hl == 0)
748 return -1;
749
750 dl = strlen(d);
751
752 /*
753 * Start at the ends of the two strings and work towards the
754 * beginning.
755 */
756 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
757 if (hl == 0 && dl == 0) {
758 /*
759 * We made it all the way to the beginning of both
760 * strings without finding any difference.
761 */
762 return 0;
763 }
764
765 if (0 == hl) {
766 /*
767 * The host string is shorter than the domain string.
768 * There is only one case when this can be a match.
769 * If the domain is just one character longer, and if
770 * that character is a leading '.' then we call it a
771 * match.
772 */
773
774 if (1 == dl && '.' == d[0])
775 return 0;
776 else
777 return -1;
778 }
779
780 if (0 == dl) {
781 /*
782 * The domain string is shorter than the host string.
783 * This is a match only if the first domain character
784 * is a leading '.'.
785 */
786
787 if ('.' == d[0]) {
788 if (flags & mdnRejectSubsubDomains) {
789 // Check for sub-sub domain and reject
790 while(--hl >= 0 && h[hl] != '.');
791 if (hl < 0) {
792 // No sub-sub domain found, but reject if there is a
793 // leading dot in given host string (which is removed
794 // before the check is started).
795 return hostIncludesSubdomains ? 1 : 0;
796 } else
797 return 1; // sub-sub domain, reject
798 } else
799 return 0;
800 } else
801 return 1;
802 }
803 }
804
805 /*
806 * We found different characters in the same position (from the end).
807 */
808
809 // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
810 // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
811 // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
812 if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
813 return 0;
814
815 /*
816 * If one of those character is '.' then its special. In order
817 * for splay tree sorting to work properly, "x-foo.com" must
818 * be greater than ".foo.com" even though '-' is less than '.'.
819 */
820 if ('.' == d[dl])
821 return 1;
822
823 if ('.' == h[hl])
824 return -1;
825
826 return (xtolower(h[hl]) - xtolower(d[dl]));
827 }
828
829 /*
830 * return true if we can serve requests for this method.
831 */
832 bool
833 urlCheckRequest(const HttpRequest * r)
834 {
835 /* protocol "independent" methods
836 *
837 * actually these methods are specific to HTTP:
838 * they are methods we receive on our HTTP port,
839 * and if we had a FTP listener would not be relevant
840 * there.
841 *
842 * So, we should delegate them to HTTP. The problem is that we
843 * do not have a default protocol from the client side of HTTP.
844 */
845
846 if (r->method == Http::METHOD_CONNECT)
847 return true;
848
849 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
850 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
851 if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
852 return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
853
854 if (r->method == Http::METHOD_PURGE)
855 return true;
856
857 /* does method match the protocol? */
858 switch (r->url.getScheme()) {
859
860 case AnyP::PROTO_URN:
861 case AnyP::PROTO_HTTP:
862 case AnyP::PROTO_CACHE_OBJECT:
863 return true;
864
865 case AnyP::PROTO_FTP:
866 if (r->method == Http::METHOD_PUT ||
867 r->method == Http::METHOD_GET ||
868 r->method == Http::METHOD_HEAD )
869 return true;
870 return false;
871
872 case AnyP::PROTO_WAIS:
873 case AnyP::PROTO_WHOIS:
874 if (r->method == Http::METHOD_GET ||
875 r->method == Http::METHOD_HEAD)
876 return true;
877 return false;
878
879 case AnyP::PROTO_HTTPS:
880 #if USE_OPENSSL || USE_GNUTLS
881 return true;
882 #else
883 /*
884 * Squid can't originate an SSL connection, so it should
885 * never receive an "https:" URL. It should always be
886 * CONNECT instead.
887 */
888 return false;
889 #endif
890
891 default:
892 return false;
893 }
894
895 /* notreached */
896 return false;
897 }
898
899 AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) :
900 scheme_(aScheme),
901 hostIsNumeric_(false),
902 port_(0)
903 {
904 *host_=0;
905 }
906
907 // TODO: fix code duplication with AnyP::Uri::parse()
908 char *
909 AnyP::Uri::cleanup(const char *uri)
910 {
911 char *cleanedUri = nullptr;
912 switch (Config.uri_whitespace) {
913 case URI_WHITESPACE_ALLOW: {
914 const auto flags = RFC1738_ESCAPE_NOSPACE | RFC1738_ESCAPE_UNESCAPED;
915 cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
916 break;
917 }
918
919 case URI_WHITESPACE_ENCODE:
920 cleanedUri = xstrndup(rfc1738_do_escape(uri, RFC1738_ESCAPE_UNESCAPED), MAX_URL);
921 break;
922
923 case URI_WHITESPACE_CHOP: {
924 const auto pos = strcspn(uri, w_space);
925 char *choppedUri = nullptr;
926 if (pos < strlen(uri))
927 choppedUri = xstrndup(uri, pos + 1);
928 cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri,
929 RFC1738_ESCAPE_UNESCAPED), MAX_URL);
930 cleanedUri[pos] = '\0';
931 xfree(choppedUri);
932 break;
933 }
934
935 case URI_WHITESPACE_DENY:
936 case URI_WHITESPACE_STRIP:
937 default: {
938 // TODO: avoid duplication with urlParse()
939 const char *t;
940 char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
941 char *q = tmp_uri;
942 t = uri;
943 while (*t) {
944 if (!xisspace(*t)) {
945 *q = *t;
946 ++q;
947 }
948 ++t;
949 }
950 *q = '\0';
951 cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
952 xfree(tmp_uri);
953 break;
954 }
955 }
956
957 assert(cleanedUri);
958 return cleanedUri;
959 }
960