]> git.ipfire.org Git - thirdparty/squid.git/blame - src/anyp/Uri.cc
Cleanup: remove urlHostname hacks (#615)
[thirdparty/squid.git] / src / anyp / Uri.cc
CommitLineData
30a4f2a8 1/*
77b1029d 2 * Copyright (C) 1996-2020 The Squid Software Foundation and contributors
e25c139f 3 *
bbc27441
AJ
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
019dd986 7 */
ed43818f 8
bbc27441
AJ
9/* DEBUG: section 23 URL Parsing */
10
f7f3304a 11#include "squid.h"
c8ab5ec6 12#include "anyp/Uri.h"
582c2af2 13#include "globals.h"
528b2c61 14#include "HttpRequest.h"
6c880a16 15#include "parser/Tokenizer.h"
1fa9b1a7 16#include "rfc1738.h"
4d5904f7 17#include "SquidConfig.h"
7a707cb5 18#include "SquidString.h"
090089c4 19
/// Characters accepted in a hostname when underscores are permitted
/// (see the check_hostnames test in AnyP::Uri::parse()).
/// The trailing "[:]" is presumably there for bracketed IPv6 literals.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-._"
    "[:]"
    ;

/// Same set as valid_hostname_chars_u, minus the underscore.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-."
    "[:]"
    ;
090089c4 32
2e260208 33const SBuf &
c8ab5ec6 34AnyP::Uri::Asterisk()
2e260208
AJ
35{
36 static SBuf star("*");
37 return star;
38}
39
51b5dcf5 40const SBuf &
c8ab5ec6 41AnyP::Uri::SlashPath()
51b5dcf5
AJ
42{
43 static SBuf slash("/");
44 return slash;
45}
46
5c51bffb 47void
c8ab5ec6 48AnyP::Uri::host(const char *src)
5c51bffb
AJ
49{
50 hostAddr_.setEmpty();
51 hostAddr_ = src;
52 if (hostAddr_.isAnyAddr()) {
53 xstrncpy(host_, src, sizeof(host_));
54 hostIsNumeric_ = false;
55 } else {
56 hostAddr_.toHostStr(host_, sizeof(host_));
57 debugs(23, 3, "given IP: " << hostAddr_);
58 hostIsNumeric_ = 1;
59 }
60 touch();
61}
62
9ce4a1eb
CT
63SBuf
64AnyP::Uri::hostOrIp() const
65{
66 static char ip[MAX_IPSTRLEN];
67 if (hostIsNumeric())
68 return SBuf(hostIP().toStr(ip, sizeof(ip)));
69 else
70 return SBuf(host());
71}
72
51b5dcf5 73const SBuf &
c8ab5ec6 74AnyP::Uri::path() const
51b5dcf5
AJ
75{
76 // RFC 3986 section 3.3 says path can be empty (path-abempty).
77 // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
78 // at least when sending and using. We must still accept path-abempty as input.
79 if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
80 return SlashPath();
81
82 return path_;
83}
84
b8d8561b 85void
0673c0ba 86urlInitialize(void)
090089c4 87{
bf8fe701 88 debugs(23, 5, "urlInitialize: Initializing...");
985c86bc 89 /* this ensures that the number of protocol strings is the same as
0c3d3f65 90 * the enum slots allocated because the last enum is always 'MAX'.
985c86bc 91 */
0c3d3f65 92 assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
9bc73deb 93 /*
94 * These test that our matchDomainName() function works the
95 * way we expect it to.
96 */
97 assert(0 == matchDomainName("foo.com", "foo.com"));
d20b1cd0 98 assert(0 == matchDomainName(".foo.com", "foo.com"));
9bc73deb 99 assert(0 == matchDomainName("foo.com", ".foo.com"));
100 assert(0 == matchDomainName(".foo.com", ".foo.com"));
101 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
abbd7825 102 assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
9bc73deb 103 assert(0 != matchDomainName("x.foo.com", "foo.com"));
104 assert(0 != matchDomainName("foo.com", "x.foo.com"));
105 assert(0 != matchDomainName("bar.com", "foo.com"));
106 assert(0 != matchDomainName(".bar.com", "foo.com"));
107 assert(0 != matchDomainName(".bar.com", ".foo.com"));
108 assert(0 != matchDomainName("bar.com", ".foo.com"));
109 assert(0 < matchDomainName("zzz.com", "foo.com"));
110 assert(0 > matchDomainName("aaa.com", "foo.com"));
111 assert(0 == matchDomainName("FOO.com", "foo.COM"));
aca95add 112 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
113 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
d20b1cd0 114 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
abbd7825
CT
115
116 assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
117 assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
118 assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
119 assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
120
121 assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
122 assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
123 assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
124 assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
125
9bc73deb 126 /* more cases? */
090089c4 127}
128
cc192b50 129/**
6c880a16
AJ
130 * Extract the URI scheme and ':' delimiter from the given input buffer.
131 *
132 * Schemes up to 16 characters are accepted.
133 *
134 * Governed by RFC 3986 section 3.1
d4a04ed5 135 */
6c880a16
AJ
136static AnyP::UriScheme
137uriParseScheme(Parser::Tokenizer &tok)
92a6f4b1 138{
6c880a16
AJ
139 /*
140 * RFC 3986 section 3.1 paragraph 2:
141 *
142 * Scheme names consist of a sequence of characters beginning with a
143 * letter and followed by any combination of letters, digits, plus
144 * ("+"), period ("."), or hyphen ("-").
091213e6
CT
145 *
146 * The underscore ("_") required to match "cache_object://" squid
147 * special URI scheme.
6c880a16 148 */
091213e6
CT
149 static const auto schemeChars =
150#if USE_HTTP_VIOLATIONS
151 CharacterSet("special", "_") +
152#endif
153 CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
6c880a16
AJ
154
155 SBuf str;
156 if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
157 const auto protocol = AnyP::UriScheme::FindProtocolType(str);
158 if (protocol == AnyP::PROTO_UNKNOWN)
159 return AnyP::UriScheme(protocol, str.c_str());
160 return AnyP::UriScheme(protocol, nullptr);
161 }
d31d59d8 162
6c880a16 163 throw TextException("invalid URI scheme", Here());
92a6f4b1 164}
165
38aa10ef
AJ
166/**
167 * Appends configured append_domain to hostname, assuming
168 * the given buffer is at least SQUIDHOSTNAMELEN bytes long,
169 * and that the host FQDN is not a 'dotless' TLD.
170 *
171 * \returns false if and only if there is not enough space to append
172 */
173bool
174urlAppendDomain(char *host)
175{
176 /* For IPv4 addresses check for a dot */
177 /* For IPv6 addresses also check for a colon */
178 if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
179 const uint64_t dlen = strlen(host);
180 const uint64_t want = dlen + Config.appendDomainLen;
181 if (want > SQUIDHOSTNAMELEN - 1) {
182 debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
183 return false;
184 }
185 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
186 }
187 return true;
188}
189
d4a04ed5 190/*
191 * Parse a URI/URL.
192 *
6c880a16 193 * It is assumed that the URL is complete -
cc192b50 194 * ie, the end of the string is the end of the URL. Don't pass a partial
195 * URL here as this routine doesn't have any way of knowing whether
6c880a16 196 * it is partial or not (ie, it handles the case of no trailing slash as
cc192b50 197 * being "end of host with implied path of /".
6c880a16
AJ
198 *
199 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
200 * then rather than a URL a hostname:port is looked for.
cc192b50 201 */
// Parses rawUrl into this Uri's scheme, user-info, host, port and path.
// Returns true on success and false on any rejection; exceptions thrown
// inside (e.g. by uriParseScheme() or parseUrn()) are caught at the end,
// logged at debug level 2 and converted into a false return.
// For CONNECT, rawUrl is scanned as "host[:port]" or "[host][:port]".
// Scratch buffers are declared with LOCAL_ARRAY and sized MAX_URL.
9157915c 202bool
6c880a16 203AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
7111c86a 204{
6c880a16
AJ
205 try {
206
77b1029d 207 LOCAL_ARRAY(char, login, MAX_URL);
208 LOCAL_ARRAY(char, foundHost, MAX_URL);
209 LOCAL_ARRAY(char, urlpath, MAX_URL);
210 char *t = NULL;
211 char *q = NULL;
212 int foundPort;
213 int l;
214 int i;
215 const char *src;
216 char *dst;
217 foundHost[0] = urlpath[0] = login[0] = '\0';
218
// Reject input whose length plus the configured append_domain length
// would not fit in a MAX_URL-sized buffer.
219 if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
220 debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
221 return false;
222 }
6c880a16 223
// Special case: "OPTIONS *" and "TRACE *" become an HTTP URI whose path is "*".
77b1029d 224 if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
225 Asterisk().cmp(rawUrl) == 0) {
226 // XXX: these methods might also occur in HTTPS traffic. Handle this better.
227 setScheme(AnyP::PROTO_HTTP, nullptr);
228 port(getScheme().defaultPort());
229 path(Asterisk());
230 return true;
231 }
6c880a16 232
77b1029d 233 Parser::Tokenizer tok(rawUrl);
234 AnyP::UriScheme scheme;
cc192b50 235
77b1029d 236 if (method == Http::METHOD_CONNECT) {
237 /*
238 * RFC 7230 section 5.3.3: authority-form = authority
239 * "excluding any userinfo and its "@" delimiter"
240 *
241 * RFC 3986 section 3.2: authority = [ userinfo "@" ] host [ ":" port ]
242 *
243 * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
244 */
245 foundPort = 443;
6c880a16 246
77b1029d 247 // XXX: use tokenizer
248 auto B = tok.buf();
249 const char *url = B.c_str();
6c880a16 250
// Try the bracketed "[host]:port" form first, then plain "host:port".
// foundPort keeps its 443 default when sscanf() converts only the host.
77b1029d 251 if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
252 if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
253 return false;
6c880a16 254
77b1029d 255 } else {
cc192b50 256
77b1029d 257 scheme = uriParseScheme(tok);
6c880a16 258
77b1029d 259 if (scheme == AnyP::PROTO_NONE)
260 return false; // invalid scheme
6c880a16 261
77b1029d 262 if (scheme == AnyP::PROTO_URN) {
263 parseUrn(tok); // throws on any error
264 return true;
265 }
62e76326 266
77b1029d 267 // URLs then have "//"
268 static const SBuf doubleSlash("//");
269 if (!tok.skip(doubleSlash))
270 return false;
cc192b50 271
77b1029d 272 auto B = tok.remaining();
273 const char *url = B.c_str();
cc192b50 274
77b1029d 275 /* Parse the URL: */
276 src = url;
277 i = 0;
62e76326 278
2f8abb64 279 /* Then everything until first /; that's host (and port; which we'll look for here later) */
77b1029d 280 // bug 1881: If we don't get a "/" then we imply it was there
281 // bug 3074: We could just be given a "?" or "#". These also imply "/"
282 // bug 3233: whitespace is also a hostname delimiter.
283 for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
284 *dst = *src;
285 }
286
287 /*
288 * We can't check for "i >= l" here because we could be at the end of the line
289 * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
290 * been -given- a valid URL and the path is just '/'.
291 */
// NOTE(review): the loop above only advances while i < l, so this test
// looks unreachable; the same applies to the second "i > l" test below.
292 if (i > l)
293 return false;
294 *dst = '\0';
62e76326 295
77b1029d 296 // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
297 if (*src == '?' || *src == '#' || *src == '\0') {
298 urlpath[0] = '/';
299 dst = &urlpath[1];
300 } else {
301 dst = urlpath;
302 }
2f8abb64 303 /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
77b1029d 304 for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
cc192b50 305 *dst = *src;
306 }
307
77b1029d 308 /* We -could- be at the end of the buffer here */
309 if (i > l)
310 return false;
311 /* If the URL path is empty we set it to be "/" */
312 if (dst == urlpath) {
313 *dst = '/';
314 ++dst;
315 }
5db6bf73 316 *dst = '\0';
cc192b50 317
77b1029d 318 foundPort = scheme.defaultPort(); // may be reset later
319
320 /* Is there any login information? (we should eventually parse it above) */
// Split at the LAST '@': text before it becomes login, text after it the host.
321 t = strrchr(foundHost, '@');
322 if (t != NULL) {
323 strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
324 login[sizeof(login)-1] = '\0';
325 t = strrchr(login, '@');
326 *t = 0;
327 strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
328 foundHost[sizeof(foundHost)-1] = '\0';
329 // Bug 4498: URL-unescape the login info after extraction
330 rfc1738_unescape(login);
331 }
332
333 /* Is there any host information? (we should eventually parse it above) */
334 if (*foundHost == '[') {
335 /* strip any IPA brackets. valid under IPv6. */
336 dst = foundHost;
337 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
338 src = foundHost;
339 ++src;
340 l = strlen(foundHost);
341 i = 1;
342 for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
343 *dst = *src;
344 }
345
346 /* we moved in-place, so truncate the actual hostname found */
347 *dst = '\0';
5db6bf73 348 ++dst;
cc192b50 349
77b1029d 350 /* skip ahead to either start of port, or original EOS */
351 while (*dst != '\0' && *dst != ':')
352 ++dst;
353 t = dst;
354 } else {
355 t = strrchr(foundHost, ':');
356
// More than one ':' in an unbracketed host: treat it all as host, no port.
357 if (t != strchr(foundHost,':') ) {
358 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
359 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
360 /* therefore we MUST accept the case where they are not bracketed at all. */
361 t = NULL;
362 }
cc192b50 363 }
62e76326 364
77b1029d 365 // Bug 3183 sanity check: If scheme is present, host must be too.
366 if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
367 debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
368 return false;
369 }
b5acc277 370
// The port text is converted with atoi(); the 1..65535 range is checked further down.
77b1029d 371 if (t && *t == ':') {
372 *t = '\0';
373 ++t;
374 foundPort = atoi(t);
375 }
62e76326 376 }
62e76326 377
// From here on both the CONNECT and the URL branches share the same checks.
77b1029d 378 for (t = foundHost; *t; ++t)
379 *t = xtolower(*t);
380
// Whitespace is removed from the host only under URI_WHITESPACE_STRIP.
381 if (stringHasWhitespace(foundHost)) {
382 if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
383 t = q = foundHost;
384 while (*t) {
385 if (!xisspace(*t)) {
386 *q = *t;
387 ++q;
388 }
389 ++t;
5db6bf73 390 }
77b1029d 391 *q = '\0';
62e76326 392 }
62e76326 393 }
62e76326 394
77b1029d 395 debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
cc192b50 396
77b1029d 397 if (Config.onoff.check_hostnames &&
398 strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
399 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
400 return false;
401 }
62e76326 402
77b1029d 403 if (!urlAppendDomain(foundHost))
404 return false;
cc192b50 405
77b1029d 406 /* remove trailing dots from hostnames */
407 while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
408 foundHost[l] = '\0';
62e76326 409
77b1029d 410 /* reject duplicate or leading dots */
411 if (strstr(foundHost, "..") || *foundHost == '.') {
412 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
413 return false;
414 }
62e76326 415
77b1029d 416 if (foundPort < 1 || foundPort > 65535) {
417 debugs(23, 3, "Invalid port '" << foundPort << "'");
418 return false;
419 }
62e76326 420
32d002cb 421#if HARDCODE_DENY_PORTS
77b1029d 422 /* These ports are filtered in the default squid.conf, but
423 * maybe someone wants them hardcoded... */
424 if (foundPort == 7 || foundPort == 9 || foundPort == 19) {
425 debugs(23, DBG_CRITICAL, MYNAME << "Deny access to port " << foundPort);
426 return false;
427 }
6ef12318 428#endif
cc192b50 429
// Whitespace in the path is handled according to Config.uri_whitespace.
77b1029d 430 if (stringHasWhitespace(urlpath)) {
431 debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");
62e76326 432
77b1029d 433 switch (Config.uri_whitespace) {
62e76326 434
77b1029d 435 case URI_WHITESPACE_DENY:
436 return false;
62e76326 437
77b1029d 438 case URI_WHITESPACE_ALLOW:
439 break;
440
441 case URI_WHITESPACE_ENCODE:
442 t = rfc1738_escape_unescaped(urlpath);
443 xstrncpy(urlpath, t, MAX_URL);
444 break;
445
446 case URI_WHITESPACE_CHOP:
447 *(urlpath + strcspn(urlpath, w_space)) = '\0';
448 break;
449
450 case URI_WHITESPACE_STRIP:
451 default:
452 t = q = urlpath;
453 while (*t) {
454 if (!xisspace(*t)) {
455 *q = *t;
456 ++q;
457 }
458 ++t;
5db6bf73 459 }
77b1029d 460 *q = '\0';
62e76326 461 }
62e76326 462 }
62e76326 463
// All checks passed: commit the parsed components to this object.
77b1029d 464 setScheme(scheme);
465 path(urlpath);
466 host(foundHost);
467 userInfo(SBuf(login));
468 port(foundPort);
469 return true;
6c880a16
AJ
470
471 } catch (...) {
472 debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
473 return false;
474 }
7111c86a 475}
476
6c880a16
AJ
477/**
478 * Governed by RFC 8141 section 2:
479 *
480 * assigned-name = "urn" ":" NID ":" NSS
481 * NID = (alphanum) 0*30(ldh) (alphanum)
482 * ldh = alphanum / "-"
483 * NSS = pchar *(pchar / "/")
484 *
485 * RFC 3986 Appendix D.2 defines (as deprecated):
486 *
487 * alphanum = ALPHA / DIGIT
488 *
489 * Notice that NID is exactly 2-32 characters in length.
490 */
db59367a 491void
6c880a16 492AnyP::Uri::parseUrn(Parser::Tokenizer &tok)
23d92c64 493{
6c880a16
AJ
494 static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
495 static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
496 SBuf nid;
497 if (!tok.prefix(nid, nidChars, 32))
498 throw TextException("NID not found", Here());
499
500 if (!tok.skip(':'))
501 throw TextException("NID too long or missing ':' delimiter", Here());
502
503 if (nid.length() < 2)
504 throw TextException("NID too short", Here());
505
506 if (!alphanum[*nid.begin()])
507 throw TextException("NID prefix is not alphanumeric", Here());
508
509 if (!alphanum[*nid.rbegin()])
510 throw TextException("NID suffix is not alphanumeric", Here());
511
512 setScheme(AnyP::PROTO_URN, nullptr);
513 host(nid.c_str());
514 // TODO validate path characters
515 path(tok.remaining());
516 debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
23d92c64 517}
518
5c51bffb 519void
c8ab5ec6 520AnyP::Uri::touch()
5c51bffb 521{
c823e2da 522 absolute_.clear();
5c51bffb
AJ
523 authorityHttp_.clear();
524 authorityWithPort_.clear();
525}
526
527SBuf &
c8ab5ec6 528AnyP::Uri::authority(bool requirePort) const
5c51bffb
AJ
529{
530 if (authorityHttp_.isEmpty()) {
531
532 // both formats contain Host/IP
533 authorityWithPort_.append(host());
534 authorityHttp_ = authorityWithPort_;
535
536 // authorityForm_ only has :port if it is non-default
537 authorityWithPort_.appendf(":%u",port());
538 if (port() != getScheme().defaultPort())
539 authorityHttp_ = authorityWithPort_;
540 }
541
542 return requirePort ? authorityWithPort_ : authorityHttp_;
543}
544
c823e2da 545SBuf &
c8ab5ec6 546AnyP::Uri::absolute() const
c823e2da
AJ
547{
548 if (absolute_.isEmpty()) {
549 // TODO: most URL will be much shorter, avoid allocating this much
550 absolute_.reserveCapacity(MAX_URL);
551
d31d59d8
AJ
552 absolute_.append(getScheme().image());
553 absolute_.append(":",1);
c823e2da
AJ
554 if (getScheme() != AnyP::PROTO_URN) {
555 absolute_.append("//", 2);
0d0f5161
AJ
556 const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
557 getScheme() == AnyP::PROTO_UNKNOWN;
558
559 if (allowUserInfo && !userInfo().isEmpty()) {
c823e2da
AJ
560 absolute_.append(userInfo());
561 absolute_.append("@", 1);
562 }
563 absolute_.append(authority());
6c880a16
AJ
564 } else {
565 absolute_.append(host());
566 absolute_.append(":", 1);
c823e2da
AJ
567 }
568 absolute_.append(path());
569 }
570
571 return absolute_;
572}
573
851feda6 574/** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
914b89a2 575 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
576 * and never copy the query-string part in the first place
577 */
88738790 578char *
bec110e4 579urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
88738790 580{
581 LOCAL_ARRAY(char, buf, MAX_URL);
62e76326 582
bec110e4 583 snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
c823e2da 584 buf[sizeof(buf)-1] = '\0';
62e76326 585
c823e2da 586 // URN, CONNECT method, and non-stripped URIs can go straight out
bec110e4 587 if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
c823e2da
AJ
588 // strip anything AFTER a question-mark
589 // leaving the '?' in place
590 if (auto t = strchr(buf, '?')) {
591 *(++t) = '\0';
e2849af8 592 }
d548ee64 593 }
62e76326 594
9bc73deb 595 if (stringHasCntl(buf))
62e76326 596 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
597
88738790 598 return buf;
599}
600
b3802bdc
AJ
601/**
602 * Yet another alternative to urlCanonical.
c2a7cefd 603 * This one adds the https:// parts to Http::METHOD_CONNECT URL
b3802bdc
AJ
604 * for use in error page outputs.
605 * Luckily we can leverage the others instead of duplicating.
606 */
607const char *
608urlCanonicalFakeHttps(const HttpRequest * request)
609{
610 LOCAL_ARRAY(char, buf, MAX_URL);
611
612 // method CONNECT and port HTTPS
5c51bffb
AJ
613 if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
614 snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
b3802bdc
AJ
615 return buf;
616 }
617
618 // else do the normal complete canonical thing.
bec110e4 619 return request->canonicalCleanUrl();
b3802bdc
AJ
620}
621
bf956b0a
BR
/*
 * Test if a URL is relative.
 *
 * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 * appear before a ':'. RFC 3986 section 4.2 additionally allows a relative
 * reference to start with (or contain only) a query or fragment, so the
 * scan for a scheme-delimiting ':' also stops at '?' and '#'. A colon
 * inside the query or fragment does not make the URL absolute.
 *
 * Returns false for a NULL or empty url, false when a ':' appears before
 * any '/', '?' or '#', and true otherwise.
 */
bool
urlIsRelative(const char *url)
{
    if (url == nullptr || *url == '\0')
        return false;

    // only the first path segment can hold the scheme delimiter
    for (const char *p = url; *p != '\0' && *p != '/' && *p != '?' && *p != '#'; ++p) {
        if (*p == ':')
            return false;
    }

    return true;
}
647
648/*
71051277 649 * Convert a relative URL to an absolute URL using the context of a given
bf956b0a 650 * request.
71051277
BR
651 *
652 * It is assumed that you have already ensured that the URL is relative.
653 *
6e44cca8
BR
654 * If NULL is returned it is an indication that the method in use in the
655 * request does not distinguish between relative and absolute and you should
656 * use the url unchanged.
0376a4c9
BR
657 *
658 * If non-NULL is returned, it is up to the caller to free the resulting
659 * memory using safe_free().
bf956b0a 660 */
6e44cca8 661char *
bf956b0a 662urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
3cbbd242 663{
3cbbd242 664
c2a7cefd 665 if (req->method.id() == Http::METHOD_CONNECT) {
f3900427 666 return (NULL);
3cbbd242 667 }
26ac0430 668
6e44cca8 669 char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
26ac0430 670
4e3f4dc7 671 if (req->url.getScheme() == AnyP::PROTO_URN) {
c823e2da
AJ
672 // XXX: this is what the original code did, but it seems to break the
673 // intended behaviour of this function. It returns the stored URN path,
674 // not converting the given one into a URN...
675 snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute()));
71051277 676 return (urlbuf);
3cbbd242 677 }
26ac0430 678
5c51bffb 679 SBuf authorityForm = req->url.authority(); // host[:port]
d31d59d8
AJ
680 const SBuf &scheme = req->url.getScheme().image();
681 size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH,
682 SQUIDSBUFPRINT(scheme),
5c51bffb
AJ
683 SQUIDSBUFPRINT(req->url.userInfo()),
684 !req->url.userInfo().isEmpty() ? "@" : "",
685 SQUIDSBUFPRINT(authorityForm));
6e44cca8 686
51b5dcf5
AJ
687 // if the first char is '/' assume its a relative path
688 // XXX: this breaks on scheme-relative URLs,
689 // but we should not see those outside ESI, and rarely there.
c823e2da 690 // XXX: also breaks on any URL containing a '/' in the query-string portion
6e44cca8 691 if (relUrl[0] == '/') {
51b5dcf5 692 xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
6e44cca8 693 } else {
51b5dcf5
AJ
694 SBuf path = req->url.path();
695 SBuf::size_type lastSlashPos = path.rfind('/');
6e44cca8 696
51b5dcf5
AJ
697 if (lastSlashPos == SBuf::npos) {
698 // replace the whole path with the given bit(s)
5db6bf73
FC
699 urlbuf[urllen] = '/';
700 ++urllen;
51b5dcf5 701 xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
6e44cca8 702 } else {
51b5dcf5
AJ
703 // replace only the last (file?) segment with the given bit(s)
704 ++lastSlashPos;
705 if (lastSlashPos > MAX_URL - urllen - 1) {
706 // XXX: crops bits in the middle of the combined URL.
707 lastSlashPos = MAX_URL - urllen - 1;
6e44cca8 708 }
3f0e38d6 709 SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos));
51b5dcf5 710 urllen += lastSlashPos;
6e44cca8 711 if (urllen + 1 < MAX_URL) {
51b5dcf5 712 xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
6e44cca8
BR
713 }
714 }
715 }
3cbbd242 716
bc9ad11f 717 return (urlbuf);
3cbbd242 718}
719
b8d8561b 720int
6c1219b9 721matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
30a4f2a8 722{
9bc73deb 723 int dl;
724 int hl;
62e76326 725
abbd7825 726 const bool hostIncludesSubdomains = (*h == '.');
d20b1cd0 727 while ('.' == *h)
5db6bf73 728 ++h;
62e76326 729
9bc73deb 730 hl = strlen(h);
62e76326 731
abbd7825
CT
732 if (hl == 0)
733 return -1;
734
9bc73deb 735 dl = strlen(d);
62e76326 736
9bc73deb 737 /*
738 * Start at the ends of the two strings and work towards the
739 * beginning.
740 */
741 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
62e76326 742 if (hl == 0 && dl == 0) {
743 /*
744 * We made it all the way to the beginning of both
745 * strings without finding any difference.
746 */
747 return 0;
748 }
749
750 if (0 == hl) {
751 /*
752 * The host string is shorter than the domain string.
753 * There is only one case when this can be a match.
754 * If the domain is just one character longer, and if
755 * that character is a leading '.' then we call it a
756 * match.
757 */
758
759 if (1 == dl && '.' == d[0])
760 return 0;
761 else
762 return -1;
763 }
764
765 if (0 == dl) {
766 /*
767 * The domain string is shorter than the host string.
768 * This is a match only if the first domain character
769 * is a leading '.'.
770 */
771
abbd7825
CT
772 if ('.' == d[0]) {
773 if (flags & mdnRejectSubsubDomains) {
774 // Check for sub-sub domain and reject
775 while(--hl >= 0 && h[hl] != '.');
776 if (hl < 0) {
777 // No sub-sub domain found, but reject if there is a
778 // leading dot in given host string (which is removed
779 // before the check is started).
780 return hostIncludesSubdomains ? 1 : 0;
781 } else
782 return 1; // sub-sub domain, reject
783 } else
784 return 0;
785 } else
62e76326 786 return 1;
787 }
9bc73deb 788 }
62e76326 789
9bc73deb 790 /*
791 * We found different characters in the same position (from the end).
792 */
69f69080
CT
793
794 // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
795 // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
796 // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
abbd7825 797 if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
69f69080
CT
798 return 0;
799
d20b1cd0 800 /*
801 * If one of those character is '.' then its special. In order
802 * for splay tree sorting to work properly, "x-foo.com" must
803 * be greater than ".foo.com" even though '-' is less than '.'.
804 */
805 if ('.' == d[dl])
62e76326 806 return 1;
807
d20b1cd0 808 if ('.' == h[hl])
62e76326 809 return -1;
810
9bc73deb 811 return (xtolower(h[hl]) - xtolower(d[dl]));
30a4f2a8 812}
a8f7d3ee 813
985c86bc 814/*
610ee341 815 * return true if we can serve requests for this method.
985c86bc 816 */
b8d8561b 817int
190154cf 818urlCheckRequest(const HttpRequest * r)
a8f7d3ee 819{
820 int rc = 0;
610ee341 821 /* protocol "independent" methods
822 *
823 * actually these methods are specific to HTTP:
2f8abb64 824 * they are methods we receive on our HTTP port,
610ee341 825 * and if we had a FTP listener would not be relevant
826 * there.
827 *
828 * So, we should delegate them to HTTP. The problem is that we
829 * do not have a default protocol from the client side of HTTP.
830 */
62e76326 831
c2a7cefd 832 if (r->method == Http::METHOD_CONNECT)
62e76326 833 return 1;
834
77ce6ba9
AR
835 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
836 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
c2a7cefd 837 if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
c8ab5ec6 838 return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
62e76326 839
c2a7cefd 840 if (r->method == Http::METHOD_PURGE)
62e76326 841 return 1;
842
99edd1c3 843 /* does method match the protocol? */
4e3f4dc7 844 switch (r->url.getScheme()) {
62e76326 845
0c3d3f65 846 case AnyP::PROTO_URN:
62e76326 847
0c3d3f65 848 case AnyP::PROTO_HTTP:
62e76326 849
39a19cb7 850 case AnyP::PROTO_CACHE_OBJECT:
62e76326 851 rc = 1;
852 break;
853
0c3d3f65 854 case AnyP::PROTO_FTP:
62e76326 855
c2a7cefd 856 if (r->method == Http::METHOD_PUT)
62e76326 857 rc = 1;
858
0c3d3f65 859 case AnyP::PROTO_GOPHER:
62e76326 860
0c3d3f65 861 case AnyP::PROTO_WAIS:
62e76326 862
0c3d3f65 863 case AnyP::PROTO_WHOIS:
c2a7cefd 864 if (r->method == Http::METHOD_GET)
62e76326 865 rc = 1;
c2a7cefd 866 else if (r->method == Http::METHOD_HEAD)
62e76326 867 rc = 1;
868
869 break;
870
0c3d3f65 871 case AnyP::PROTO_HTTPS:
cb4f4424 872#if USE_OPENSSL
62e76326 873 rc = 1;
418293da
AJ
874#elif USE_GNUTLS
875 rc = 1;
1f7c9178 876#else
62e76326 877 /*
878 * Squid can't originate an SSL connection, so it should
879 * never receive an "https:" URL. It should always be
880 * CONNECT instead.
881 */
882 rc = 0;
1f7c9178 883#endif
0166128b 884 break;
62e76326 885
a8f7d3ee 886 default:
62e76326 887 break;
a8f7d3ee 888 }
62e76326 889
a8f7d3ee 890 return rc;
891}
9ce5e3e6 892
c8ab5ec6 893AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) :
d59e4742
FC
894 scheme_(aScheme),
895 hostIsNumeric_(false),
896 port_(0)
897{
898 *host_=0;
899}
1a739503 900
bec110e4
EB
901// TODO: fix code duplication with AnyP::Uri::parse()
902char *
903AnyP::Uri::cleanup(const char *uri)
904{
905 int flags = 0;
906 char *cleanedUri = nullptr;
907 switch (Config.uri_whitespace) {
908 case URI_WHITESPACE_ALLOW:
909 flags |= RFC1738_ESCAPE_NOSPACE;
279e60ef 910 // fall through to next case
bec110e4
EB
911 case URI_WHITESPACE_ENCODE:
912 flags |= RFC1738_ESCAPE_UNESCAPED;
913 cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
914 break;
915
916 case URI_WHITESPACE_CHOP: {
917 flags |= RFC1738_ESCAPE_UNESCAPED;
918 const auto pos = strcspn(uri, w_space);
919 char *choppedUri = nullptr;
920 if (pos < strlen(uri))
921 choppedUri = xstrndup(uri, pos + 1);
922 cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri, flags), MAX_URL);
923 cleanedUri[pos] = '\0';
924 xfree(choppedUri);
925 }
926 break;
927
928 case URI_WHITESPACE_DENY:
929 case URI_WHITESPACE_STRIP:
930 default: {
931 // TODO: avoid duplication with urlParse()
932 const char *t;
933 char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
934 char *q = tmp_uri;
935 t = uri;
936 while (*t) {
937 if (!xisspace(*t)) {
938 *q = *t;
939 ++q;
940 }
941 ++t;
942 }
943 *q = '\0';
944 cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
945 xfree(tmp_uri);
946 }
947 break;
948 }
949
950 assert(cleanedUri);
951 return cleanedUri;
952}
279e60ef 953