]> git.ipfire.org Git - thirdparty/squid.git/blob - src/anyp/Uri.cc
Fix clang (with its own libc++) build after 9865de7 (#661)
[thirdparty/squid.git] / src / anyp / Uri.cc
1 /*
2 * Copyright (C) 1996-2020 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9 /* DEBUG: section 23 URL Parsing */
10
11 #include "squid.h"
12 #include "anyp/Uri.h"
13 #include "globals.h"
14 #include "HttpRequest.h"
15 #include "parser/Tokenizer.h"
16 #include "rfc1738.h"
17 #include "SquidConfig.h"
18 #include "SquidString.h"
19
/// Characters accepted in a hostname when underscores are permitted.
/// AnyP::Uri::parse() selects this table when Config.onoff.allow_underscore is set.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._"
    "[:]";

/// Characters accepted in a hostname when underscores are not permitted.
/// Identical to valid_hostname_chars_u except that '_' is absent.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-."
    "[:]";
32
33 /// Characters which are valid within a URI userinfo section
34 static const CharacterSet &
35 UserInfoChars()
36 {
37 /*
38 * RFC 3986 section 3.2.1
39 *
40 * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
41 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
42 * pct-encoded = "%" HEXDIG HEXDIG
43 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
44 */
45 static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
46 CharacterSet::ALPHA +
47 CharacterSet::DIGIT;
48 return userInfoValid;
49 }
50
51 /**
52 * Governed by RFC 3986 section 2.1
53 */
54 SBuf
55 AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
56 {
57 if (buf.isEmpty())
58 return buf;
59
60 Parser::Tokenizer tk(buf);
61 SBuf goodSection;
62 // optimization for the arguably common "no encoding necessary" case
63 if (tk.prefix(goodSection, ignore) && tk.atEnd())
64 return buf;
65
66 SBuf output;
67 output.reserveSpace(buf.length() * 3); // worst case: encode all chars
68 output.append(goodSection); // may be empty
69
70 while (!tk.atEnd()) {
71 // TODO: Add Tokenizer::parseOne(void).
72 const auto ch = tk.remaining()[0];
73 output.appendf("%%%02X", static_cast<unsigned int>(ch)); // TODO: Optimize using a table
74 (void)tk.skip(ch);
75
76 if (tk.prefix(goodSection, ignore))
77 output.append(goodSection);
78 }
79
80 return output;
81 }
82
83 const SBuf &
84 AnyP::Uri::Asterisk()
85 {
86 static SBuf star("*");
87 return star;
88 }
89
90 const SBuf &
91 AnyP::Uri::SlashPath()
92 {
93 static SBuf slash("/");
94 return slash;
95 }
96
97 void
98 AnyP::Uri::host(const char *src)
99 {
100 hostAddr_.setEmpty();
101 hostAddr_ = src;
102 if (hostAddr_.isAnyAddr()) {
103 xstrncpy(host_, src, sizeof(host_));
104 hostIsNumeric_ = false;
105 } else {
106 hostAddr_.toHostStr(host_, sizeof(host_));
107 debugs(23, 3, "given IP: " << hostAddr_);
108 hostIsNumeric_ = 1;
109 }
110 touch();
111 }
112
113 SBuf
114 AnyP::Uri::hostOrIp() const
115 {
116 static char ip[MAX_IPSTRLEN];
117 if (hostIsNumeric())
118 return SBuf(hostIP().toStr(ip, sizeof(ip)));
119 else
120 return SBuf(host());
121 }
122
123 const SBuf &
124 AnyP::Uri::path() const
125 {
126 // RFC 3986 section 3.3 says path can be empty (path-abempty).
127 // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
128 // at least when sending and using. We must still accept path-abempty as input.
129 if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
130 return SlashPath();
131
132 return path_;
133 }
134
135 void
136 urlInitialize(void)
137 {
138 debugs(23, 5, "urlInitialize: Initializing...");
139 /* this ensures that the number of protocol strings is the same as
140 * the enum slots allocated because the last enum is always 'MAX'.
141 */
142 assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
143 /*
144 * These test that our matchDomainName() function works the
145 * way we expect it to.
146 */
147 assert(0 == matchDomainName("foo.com", "foo.com"));
148 assert(0 == matchDomainName(".foo.com", "foo.com"));
149 assert(0 == matchDomainName("foo.com", ".foo.com"));
150 assert(0 == matchDomainName(".foo.com", ".foo.com"));
151 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
152 assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
153 assert(0 != matchDomainName("x.foo.com", "foo.com"));
154 assert(0 != matchDomainName("foo.com", "x.foo.com"));
155 assert(0 != matchDomainName("bar.com", "foo.com"));
156 assert(0 != matchDomainName(".bar.com", "foo.com"));
157 assert(0 != matchDomainName(".bar.com", ".foo.com"));
158 assert(0 != matchDomainName("bar.com", ".foo.com"));
159 assert(0 < matchDomainName("zzz.com", "foo.com"));
160 assert(0 > matchDomainName("aaa.com", "foo.com"));
161 assert(0 == matchDomainName("FOO.com", "foo.COM"));
162 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
163 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
164 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
165
166 assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
167 assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
168 assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
169 assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
170
171 assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
172 assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
173 assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
174 assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
175
176 /* more cases? */
177 }
178
179 /**
180 * Extract the URI scheme and ':' delimiter from the given input buffer.
181 *
182 * Schemes up to 16 characters are accepted.
183 *
184 * Governed by RFC 3986 section 3.1
185 */
186 static AnyP::UriScheme
187 uriParseScheme(Parser::Tokenizer &tok)
188 {
189 /*
190 * RFC 3986 section 3.1 paragraph 2:
191 *
192 * Scheme names consist of a sequence of characters beginning with a
193 * letter and followed by any combination of letters, digits, plus
194 * ("+"), period ("."), or hyphen ("-").
195 *
196 * The underscore ("_") required to match "cache_object://" squid
197 * special URI scheme.
198 */
199 static const auto schemeChars =
200 #if USE_HTTP_VIOLATIONS
201 CharacterSet("special", "_") +
202 #endif
203 CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
204
205 SBuf str;
206 if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
207 const auto protocol = AnyP::UriScheme::FindProtocolType(str);
208 if (protocol == AnyP::PROTO_UNKNOWN)
209 return AnyP::UriScheme(protocol, str.c_str());
210 return AnyP::UriScheme(protocol, nullptr);
211 }
212
213 throw TextException("invalid URI scheme", Here());
214 }
215
216 /**
217 * Appends configured append_domain to hostname, assuming
218 * the given buffer is at least SQUIDHOSTNAMELEN bytes long,
219 * and that the host FQDN is not a 'dotless' TLD.
220 *
221 * \returns false if and only if there is not enough space to append
222 */
223 bool
224 urlAppendDomain(char *host)
225 {
226 /* For IPv4 addresses check for a dot */
227 /* For IPv6 addresses also check for a colon */
228 if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
229 const uint64_t dlen = strlen(host);
230 const uint64_t want = dlen + Config.appendDomainLen;
231 if (want > SQUIDHOSTNAMELEN - 1) {
232 debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
233 return false;
234 }
235 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
236 }
237 return true;
238 }
239
/*
 * Parse a URI/URL.
 *
 * It is assumed that the URL is complete -
 * ie, the end of the string is the end of the URL. Don't pass a partial
 * URL here as this routine doesn't have any way of knowing whether
 * it is partial or not (ie, it handles the case of no trailing slash as
 * being "end of host with implied path of /").
 *
 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
 * then rather than a URL a hostname:port is looked for.
 *
 * Returns true and stores scheme, path, host, userinfo and port in this
 * object on success. Returns false on any parsing or validation failure,
 * including exceptions thrown by the helpers called here (they are caught
 * and logged below).
 *
 * The work buffers (login, foundHost, urlpath) are declared with
 * LOCAL_ARRAY and sized MAX_URL; the initial length check keeps the input
 * plus the configured append_domain within that size.
 */
bool
AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
{
    try {

        LOCAL_ARRAY(char, login, MAX_URL);
        LOCAL_ARRAY(char, foundHost, MAX_URL);
        LOCAL_ARRAY(char, urlpath, MAX_URL);
        char *t = NULL;
        char *q = NULL;
        int foundPort;
        int l;
        int i;
        const char *src;
        char *dst;
        foundHost[0] = urlpath[0] = login[0] = '\0';

        // l is the length of the whole raw URL; it bounds the copy loops below
        if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
            return false;
        }

        // "OPTIONS *" and "TRACE *": no authority to parse, just record the asterisk path
        if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
                Asterisk().cmp(rawUrl) == 0) {
            // XXX: these methods might also occur in HTTPS traffic. Handle this better.
            setScheme(AnyP::PROTO_HTTP, nullptr);
            port(getScheme().defaultPort());
            path(Asterisk());
            return true;
        }

        Parser::Tokenizer tok(rawUrl);
        AnyP::UriScheme scheme;

        if (method == Http::METHOD_CONNECT) {
            /*
             * RFC 7230 section 5.3.3:  authority-form = authority
             *  "excluding any userinfo and its "@" delimiter"
             *
             * RFC 3986 section 3.2:    authority = [ userinfo "@" ] host [ ":" port ]
             *
             * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
             */
            foundPort = 443;

            // XXX: use tokenizer
            auto B = tok.buf();
            const char *url = B.c_str();

            // try the bracketed "[host]:port" form first, then plain "host:port";
            // a missing port leaves foundPort at the 443 default set above
            if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
                if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
                    return false;

        } else {

            scheme = uriParseScheme(tok);

            if (scheme == AnyP::PROTO_NONE)
                return false; // invalid scheme

            if (scheme == AnyP::PROTO_URN) {
                parseUrn(tok); // throws on any error
                return true;
            }

            // URLs then have "//"
            static const SBuf doubleSlash("//");
            if (!tok.skip(doubleSlash))
                return false;

            // everything after "scheme://" is handled with C-string code from here on
            auto B = tok.remaining();
            const char *url = B.c_str();

            /* Parse the URL: */
            src = url;
            i = 0;

            /* Then everything until first /; that's host (and port; which we'll look for here later) */
            // bug 1881: If we don't get a "/" then we imply it was there
            // bug 3074: We could just be given a "?" or "#". These also imply "/"
            // bug 3233: whitespace is also a hostname delimiter.
            for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
                *dst = *src;
            }

            /*
             * We can't check for "i >= l" here because we could be at the end of the line
             * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
             * been -given- a valid URL and the path is just '/'.
             */
            // NOTE(review): the loop above stops once i reaches l, so i > l looks unreachable -- confirm
            if (i > l)
                return false;
            *dst = '\0';

            // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
            if (*src == '?' || *src == '#' || *src == '\0') {
                urlpath[0] = '/';
                dst = &urlpath[1];
            } else {
                dst = urlpath;
            }
            /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
            for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
                *dst = *src;
            }

            /* We -could- be at the end of the buffer here */
            if (i > l)
                return false;
            /* If the URL path is empty we set it to be "/" */
            if (dst == urlpath) {
                *dst = '/';
                ++dst;
            }
            *dst = '\0';

            foundPort = scheme.defaultPort(); // may be reset later

            /* Is there any login information? (we should eventually parse it above) */
            // the LAST '@' separates userinfo from host
            t = strrchr(foundHost, '@');
            if (t != NULL) {
                // copy the whole authority into login, cut it at the '@',
                // then move what follows the '@' back into foundHost
                strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
                login[sizeof(login)-1] = '\0';
                t = strrchr(login, '@');
                *t = 0;
                strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
                foundHost[sizeof(foundHost)-1] = '\0';
                // Bug 4498: URL-unescape the login info after extraction
                rfc1738_unescape(login);
            }

            /* Is there any host information? (we should eventually parse it above) */
            if (*foundHost == '[') {
                /* strip any IPA brackets. valid under IPv6. */
                dst = foundHost;
                /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
                src = foundHost;
                ++src;
                // l and i are reused here: l is now the length of foundHost
                l = strlen(foundHost);
                i = 1;
                // shift the bracketed text one position left, dropping the '['
                for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
                    *dst = *src;
                }

                /* we moved in-place, so truncate the actual hostname found */
                *dst = '\0';
                ++dst;

                /* skip ahead to either start of port, or original EOS */
                while (*dst != '\0' && *dst != ':')
                    ++dst;
                t = dst;
            } else {
                t = strrchr(foundHost, ':');

                // more than one ':' without brackets: treat as a bare IPv6 address, not host:port
                if (t != strchr(foundHost,':') ) {
                    /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when it's not. */
                    /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
                    /* therefore we MUST accept the case where they are not bracketed at all. */
                    t = NULL;
                }
            }

            // Bug 3183 sanity check: If scheme is present, host must be too.
            if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
                debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
                return false;
            }

            // t points at the ':' that introduces the port, if one was found above
            if (t && *t == ':') {
                *t = '\0';
                ++t;
                // atoi() gives 0 for non-numeric text, which the port range check below rejects
                foundPort = atoi(t);
            }
        }

        // host names are stored in lower case
        for (t = foundHost; *t; ++t)
            *t = xtolower(*t);

        // whitespace in the host is removed only in STRIP mode; in any other
        // mode the host is left as is at this point
        if (stringHasWhitespace(foundHost)) {
            if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
                t = q = foundHost;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");

        // optional character whitelist check; underscores are allowed only when configured
        if (Config.onoff.check_hostnames &&
                strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
            return false;
        }

        if (!urlAppendDomain(foundHost))
            return false;

        /* remove trailing dots from hostnames */
        while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
            foundHost[l] = '\0';

        /* reject duplicate or leading dots */
        if (strstr(foundHost, "..") || *foundHost == '.') {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
            return false;
        }

        if (foundPort < 1 || foundPort > 65535) {
            debugs(23, 3, "Invalid port '" << foundPort << "'");
            return false;
        }

#if HARDCODE_DENY_PORTS
        /* These ports are filtered in the default squid.conf, but
         * maybe someone wants them hardcoded... */
        if (foundPort == 7 || foundPort == 9 || foundPort == 19) {
            debugs(23, DBG_CRITICAL, MYNAME << "Deny access to port " << foundPort);
            return false;
        }
#endif

        // apply the configured uri_whitespace policy to the path
        if (stringHasWhitespace(urlpath)) {
            debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");

            switch (Config.uri_whitespace) {

            case URI_WHITESPACE_DENY:
                return false;

            case URI_WHITESPACE_ALLOW:
                break;

            case URI_WHITESPACE_ENCODE:
                t = rfc1738_escape_unescaped(urlpath);
                xstrncpy(urlpath, t, MAX_URL);
                break;

            case URI_WHITESPACE_CHOP:
                // cut the path at its first whitespace character
                *(urlpath + strcspn(urlpath, w_space)) = '\0';
                break;

            case URI_WHITESPACE_STRIP:
            default:
                // compact the path in place, dropping every whitespace character
                t = q = urlpath;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        // all checks passed: commit the parsed components to this object
        setScheme(scheme);
        path(urlpath);
        host(foundHost);
        userInfo(SBuf(login));
        port(foundPort);
        return true;

    } catch (...) {
        debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
        return false;
    }
}
526
527 /**
528 * Governed by RFC 8141 section 2:
529 *
530 * assigned-name = "urn" ":" NID ":" NSS
531 * NID = (alphanum) 0*30(ldh) (alphanum)
532 * ldh = alphanum / "-"
533 * NSS = pchar *(pchar / "/")
534 *
535 * RFC 3986 Appendix D.2 defines (as deprecated):
536 *
537 * alphanum = ALPHA / DIGIT
538 *
539 * Notice that NID is exactly 2-32 characters in length.
540 */
541 void
542 AnyP::Uri::parseUrn(Parser::Tokenizer &tok)
543 {
544 static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
545 static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
546 SBuf nid;
547 if (!tok.prefix(nid, nidChars, 32))
548 throw TextException("NID not found", Here());
549
550 if (!tok.skip(':'))
551 throw TextException("NID too long or missing ':' delimiter", Here());
552
553 if (nid.length() < 2)
554 throw TextException("NID too short", Here());
555
556 if (!alphanum[*nid.begin()])
557 throw TextException("NID prefix is not alphanumeric", Here());
558
559 if (!alphanum[*nid.rbegin()])
560 throw TextException("NID suffix is not alphanumeric", Here());
561
562 setScheme(AnyP::PROTO_URN, nullptr);
563 host(nid.c_str());
564 // TODO validate path characters
565 path(tok.remaining());
566 debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
567 }
568
569 void
570 AnyP::Uri::touch()
571 {
572 absolute_.clear();
573 authorityHttp_.clear();
574 authorityWithPort_.clear();
575 }
576
577 SBuf &
578 AnyP::Uri::authority(bool requirePort) const
579 {
580 if (authorityHttp_.isEmpty()) {
581
582 // both formats contain Host/IP
583 authorityWithPort_.append(host());
584 authorityHttp_ = authorityWithPort_;
585
586 // authorityForm_ only has :port if it is non-default
587 authorityWithPort_.appendf(":%u",port());
588 if (port() != getScheme().defaultPort())
589 authorityHttp_ = authorityWithPort_;
590 }
591
592 return requirePort ? authorityWithPort_ : authorityHttp_;
593 }
594
595 SBuf &
596 AnyP::Uri::absolute() const
597 {
598 if (absolute_.isEmpty()) {
599 // TODO: most URL will be much shorter, avoid allocating this much
600 absolute_.reserveCapacity(MAX_URL);
601
602 absolute_.append(getScheme().image());
603 absolute_.append(":",1);
604 if (getScheme() != AnyP::PROTO_URN) {
605 absolute_.append("//", 2);
606 const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
607 getScheme() == AnyP::PROTO_UNKNOWN;
608
609 if (allowUserInfo && !userInfo().isEmpty()) {
610 static const CharacterSet uiChars = CharacterSet(UserInfoChars())
611 .remove('%')
612 .rename("userinfo-reserved");
613 absolute_.append(Encode(userInfo(), uiChars));
614 absolute_.append("@", 1);
615 }
616 absolute_.append(authority());
617 } else {
618 absolute_.append(host());
619 absolute_.append(":", 1);
620 }
621 absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
622 }
623
624 return absolute_;
625 }
626
627 /** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
628 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
629 * and never copy the query-string part in the first place
630 */
631 char *
632 urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
633 {
634 LOCAL_ARRAY(char, buf, MAX_URL);
635
636 snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
637 buf[sizeof(buf)-1] = '\0';
638
639 // URN, CONNECT method, and non-stripped URIs can go straight out
640 if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
641 // strip anything AFTER a question-mark
642 // leaving the '?' in place
643 if (auto t = strchr(buf, '?')) {
644 *(++t) = '\0';
645 }
646 }
647
648 if (stringHasCntl(buf))
649 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
650
651 return buf;
652 }
653
654 /**
655 * Yet another alternative to urlCanonical.
656 * This one adds the https:// parts to Http::METHOD_CONNECT URL
657 * for use in error page outputs.
658 * Luckily we can leverage the others instead of duplicating.
659 */
660 const char *
661 urlCanonicalFakeHttps(const HttpRequest * request)
662 {
663 LOCAL_ARRAY(char, buf, MAX_URL);
664
665 // method CONNECT and port HTTPS
666 if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
667 snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
668 return buf;
669 }
670
671 // else do the normal complete canonical thing.
672 return request->canonicalCleanUrl();
673 }
674
/**
 * Test if a URL is a relative reference.
 *
 * Governed by RFC 3986 section 4.2
 *
 *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-part = "//" authority path-abempty
 *                / path-absolute
 *                / path-noscheme
 *                / path-empty
 *
 * \returns false for a null url and for a url whose first segment
 *          contains a colon; true otherwise
 */
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    /*
     * RFC 3986 section 5.2.3
     *
     *  path          = path-abempty    ; begins with "/" or is empty
     *                / path-absolute   ; begins with "/" but not "//"
     *                / path-noscheme   ; begins with a non-colon segment
     *                / path-rootless   ; begins with a segment
     *                / path-empty      ; zero characters
     */

    // path-empty, path-absolute ('absolute-path reference') and the "//"
    // network-path reference ('scheme-relative URI') are all relative
    if (*url == '\0' || *url == '/')
        return true;

    // Find where the first segment ends ('/', '?', '#' or end of string) or
    // where a colon appears, whichever comes first. A colon is forbidden in
    // the first segment of a relative reference.
    const auto stop = strcspn(url, ":/?#");
    return url[stop] != ':'; // path-noscheme, path-abempty, path-rootless
}
722
723 void
724 AnyP::Uri::addRelativePath(const char *relUrl)
725 {
726 // URN cannot be merged
727 if (getScheme() == AnyP::PROTO_URN)
728 return;
729
730 // TODO: Handle . and .. segment normalization
731
732 const auto lastSlashPos = path_.rfind('/');
733 // TODO: To optimize and simplify, add and use SBuf::replace().
734 const auto relUrlLength = strlen(relUrl);
735 if (lastSlashPos == SBuf::npos) {
736 // start replacing the whole path
737 path_.reserveCapacity(1 + relUrlLength);
738 path_.assign("/", 1);
739 } else {
740 // start replacing just the last segment
741 path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
742 path_.chop(0, lastSlashPos+1);
743 }
744 path_.append(relUrl, relUrlLength);
745 }
746
747 int
748 matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
749 {
750 int dl;
751 int hl;
752
753 const bool hostIncludesSubdomains = (*h == '.');
754 while ('.' == *h)
755 ++h;
756
757 hl = strlen(h);
758
759 if (hl == 0)
760 return -1;
761
762 dl = strlen(d);
763
764 /*
765 * Start at the ends of the two strings and work towards the
766 * beginning.
767 */
768 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
769 if (hl == 0 && dl == 0) {
770 /*
771 * We made it all the way to the beginning of both
772 * strings without finding any difference.
773 */
774 return 0;
775 }
776
777 if (0 == hl) {
778 /*
779 * The host string is shorter than the domain string.
780 * There is only one case when this can be a match.
781 * If the domain is just one character longer, and if
782 * that character is a leading '.' then we call it a
783 * match.
784 */
785
786 if (1 == dl && '.' == d[0])
787 return 0;
788 else
789 return -1;
790 }
791
792 if (0 == dl) {
793 /*
794 * The domain string is shorter than the host string.
795 * This is a match only if the first domain character
796 * is a leading '.'.
797 */
798
799 if ('.' == d[0]) {
800 if (flags & mdnRejectSubsubDomains) {
801 // Check for sub-sub domain and reject
802 while(--hl >= 0 && h[hl] != '.');
803 if (hl < 0) {
804 // No sub-sub domain found, but reject if there is a
805 // leading dot in given host string (which is removed
806 // before the check is started).
807 return hostIncludesSubdomains ? 1 : 0;
808 } else
809 return 1; // sub-sub domain, reject
810 } else
811 return 0;
812 } else
813 return 1;
814 }
815 }
816
817 /*
818 * We found different characters in the same position (from the end).
819 */
820
821 // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
822 // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
823 // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
824 if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
825 return 0;
826
827 /*
828 * If one of those character is '.' then its special. In order
829 * for splay tree sorting to work properly, "x-foo.com" must
830 * be greater than ".foo.com" even though '-' is less than '.'.
831 */
832 if ('.' == d[dl])
833 return 1;
834
835 if ('.' == h[hl])
836 return -1;
837
838 return (xtolower(h[hl]) - xtolower(d[dl]));
839 }
840
841 /*
842 * return true if we can serve requests for this method.
843 */
844 int
845 urlCheckRequest(const HttpRequest * r)
846 {
847 int rc = 0;
848 /* protocol "independent" methods
849 *
850 * actually these methods are specific to HTTP:
851 * they are methods we receive on our HTTP port,
852 * and if we had a FTP listener would not be relevant
853 * there.
854 *
855 * So, we should delegate them to HTTP. The problem is that we
856 * do not have a default protocol from the client side of HTTP.
857 */
858
859 if (r->method == Http::METHOD_CONNECT)
860 return 1;
861
862 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
863 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
864 if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
865 return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
866
867 if (r->method == Http::METHOD_PURGE)
868 return 1;
869
870 /* does method match the protocol? */
871 switch (r->url.getScheme()) {
872
873 case AnyP::PROTO_URN:
874
875 case AnyP::PROTO_HTTP:
876
877 case AnyP::PROTO_CACHE_OBJECT:
878 rc = 1;
879 break;
880
881 case AnyP::PROTO_FTP:
882
883 if (r->method == Http::METHOD_PUT)
884 rc = 1;
885
886 case AnyP::PROTO_GOPHER:
887
888 case AnyP::PROTO_WAIS:
889
890 case AnyP::PROTO_WHOIS:
891 if (r->method == Http::METHOD_GET)
892 rc = 1;
893 else if (r->method == Http::METHOD_HEAD)
894 rc = 1;
895
896 break;
897
898 case AnyP::PROTO_HTTPS:
899 #if USE_OPENSSL
900 rc = 1;
901 #elif USE_GNUTLS
902 rc = 1;
903 #else
904 /*
905 * Squid can't originate an SSL connection, so it should
906 * never receive an "https:" URL. It should always be
907 * CONNECT instead.
908 */
909 rc = 0;
910 #endif
911 break;
912
913 default:
914 break;
915 }
916
917 return rc;
918 }
919
920 AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) :
921 scheme_(aScheme),
922 hostIsNumeric_(false),
923 port_(0)
924 {
925 *host_=0;
926 }
927
928 // TODO: fix code duplication with AnyP::Uri::parse()
929 char *
930 AnyP::Uri::cleanup(const char *uri)
931 {
932 int flags = 0;
933 char *cleanedUri = nullptr;
934 switch (Config.uri_whitespace) {
935 case URI_WHITESPACE_ALLOW:
936 flags |= RFC1738_ESCAPE_NOSPACE;
937 // fall through to next case
938 case URI_WHITESPACE_ENCODE:
939 flags |= RFC1738_ESCAPE_UNESCAPED;
940 cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
941 break;
942
943 case URI_WHITESPACE_CHOP: {
944 flags |= RFC1738_ESCAPE_UNESCAPED;
945 const auto pos = strcspn(uri, w_space);
946 char *choppedUri = nullptr;
947 if (pos < strlen(uri))
948 choppedUri = xstrndup(uri, pos + 1);
949 cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri, flags), MAX_URL);
950 cleanedUri[pos] = '\0';
951 xfree(choppedUri);
952 }
953 break;
954
955 case URI_WHITESPACE_DENY:
956 case URI_WHITESPACE_STRIP:
957 default: {
958 // TODO: avoid duplication with urlParse()
959 const char *t;
960 char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
961 char *q = tmp_uri;
962 t = uri;
963 while (*t) {
964 if (!xisspace(*t)) {
965 *q = *t;
966 ++q;
967 }
968 ++t;
969 }
970 *q = '\0';
971 cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
972 xfree(tmp_uri);
973 }
974 break;
975 }
976
977 assert(cleanedUri);
978 return cleanedUri;
979 }
980