]> git.ipfire.org Git - thirdparty/squid.git/blame - src/anyp/Uri.cc
Source Format Enforcement (#244)
[thirdparty/squid.git] / src / anyp / Uri.cc
CommitLineData
30a4f2a8 1/*
5b74111a 2 * Copyright (C) 1996-2018 The Squid Software Foundation and contributors
e25c139f 3 *
bbc27441
AJ
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
019dd986 7 */
ed43818f 8
bbc27441
AJ
9/* DEBUG: section 23 URL Parsing */
10
f7f3304a 11#include "squid.h"
c8ab5ec6 12#include "anyp/Uri.h"
582c2af2 13#include "globals.h"
528b2c61 14#include "HttpRequest.h"
1fa9b1a7 15#include "rfc1738.h"
4d5904f7 16#include "SquidConfig.h"
7a707cb5 17#include "SquidString.h"
090089c4 18
/// Characters accepted in a hostname when underscores are allowed.
/// Used by AnyP::Uri::parse() when Config.onoff.allow_underscore is set.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._[:]";

/// Characters accepted in a hostname when underscores are not allowed.
/// Identical to valid_hostname_chars_u except that '_' is missing.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-.[:]";
090089c4 31
2e260208 32const SBuf &
c8ab5ec6 33AnyP::Uri::Asterisk()
2e260208
AJ
34{
35 static SBuf star("*");
36 return star;
37}
38
51b5dcf5 39const SBuf &
c8ab5ec6 40AnyP::Uri::SlashPath()
51b5dcf5
AJ
41{
42 static SBuf slash("/");
43 return slash;
44}
45
5c51bffb 46void
c8ab5ec6 47AnyP::Uri::host(const char *src)
5c51bffb
AJ
48{
49 hostAddr_.setEmpty();
50 hostAddr_ = src;
51 if (hostAddr_.isAnyAddr()) {
52 xstrncpy(host_, src, sizeof(host_));
53 hostIsNumeric_ = false;
54 } else {
55 hostAddr_.toHostStr(host_, sizeof(host_));
56 debugs(23, 3, "given IP: " << hostAddr_);
57 hostIsNumeric_ = 1;
58 }
59 touch();
60}
61
51b5dcf5 62const SBuf &
c8ab5ec6 63AnyP::Uri::path() const
51b5dcf5
AJ
64{
65 // RFC 3986 section 3.3 says path can be empty (path-abempty).
66 // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
67 // at least when sending and using. We must still accept path-abempty as input.
68 if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
69 return SlashPath();
70
71 return path_;
72}
73
b8d8561b 74void
0673c0ba 75urlInitialize(void)
090089c4 76{
bf8fe701 77 debugs(23, 5, "urlInitialize: Initializing...");
985c86bc 78 /* this ensures that the number of protocol strings is the same as
0c3d3f65 79 * the enum slots allocated because the last enum is always 'MAX'.
985c86bc 80 */
0c3d3f65 81 assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
9bc73deb 82 /*
83 * These test that our matchDomainName() function works the
84 * way we expect it to.
85 */
86 assert(0 == matchDomainName("foo.com", "foo.com"));
d20b1cd0 87 assert(0 == matchDomainName(".foo.com", "foo.com"));
9bc73deb 88 assert(0 == matchDomainName("foo.com", ".foo.com"));
89 assert(0 == matchDomainName(".foo.com", ".foo.com"));
90 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
abbd7825 91 assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
9bc73deb 92 assert(0 != matchDomainName("x.foo.com", "foo.com"));
93 assert(0 != matchDomainName("foo.com", "x.foo.com"));
94 assert(0 != matchDomainName("bar.com", "foo.com"));
95 assert(0 != matchDomainName(".bar.com", "foo.com"));
96 assert(0 != matchDomainName(".bar.com", ".foo.com"));
97 assert(0 != matchDomainName("bar.com", ".foo.com"));
98 assert(0 < matchDomainName("zzz.com", "foo.com"));
99 assert(0 > matchDomainName("aaa.com", "foo.com"));
100 assert(0 == matchDomainName("FOO.com", "foo.COM"));
aca95add 101 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
102 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
d20b1cd0 103 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
abbd7825
CT
104
105 assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
106 assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
107 assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
108 assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
109
110 assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
111 assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
112 assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
113 assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
114
9bc73deb 115 /* more cases? */
090089c4 116}
117
cc192b50 118/**
5a7fb80a
AJ
119 * Parse the scheme name from string b, into protocol type.
120 * The string must be 0-terminated.
d4a04ed5 121 */
0c3d3f65 122AnyP::ProtocolType
5a7fb80a 123urlParseProtocol(const char *b)
92a6f4b1 124{
5a7fb80a
AJ
125 // make e point to the ':' character
126 const char *e = b + strcspn(b, ":");
d4a04ed5 127 int len = e - b;
128
fcd2d3ef 129 /* test common stuff first */
62e76326 130
d4a04ed5 131 if (strncasecmp(b, "http", len) == 0)
0c3d3f65 132 return AnyP::PROTO_HTTP;
62e76326 133
d4a04ed5 134 if (strncasecmp(b, "ftp", len) == 0)
0c3d3f65 135 return AnyP::PROTO_FTP;
62e76326 136
d4a04ed5 137 if (strncasecmp(b, "https", len) == 0)
0c3d3f65 138 return AnyP::PROTO_HTTPS;
62e76326 139
d4a04ed5 140 if (strncasecmp(b, "file", len) == 0)
0c3d3f65 141 return AnyP::PROTO_FTP;
62e76326 142
330f829e
AJ
143 if (strncasecmp(b, "coap", len) == 0)
144 return AnyP::PROTO_COAP;
145
146 if (strncasecmp(b, "coaps", len) == 0)
147 return AnyP::PROTO_COAPS;
148
d4a04ed5 149 if (strncasecmp(b, "gopher", len) == 0)
0c3d3f65 150 return AnyP::PROTO_GOPHER;
62e76326 151
d4a04ed5 152 if (strncasecmp(b, "wais", len) == 0)
0c3d3f65 153 return AnyP::PROTO_WAIS;
62e76326 154
d4a04ed5 155 if (strncasecmp(b, "cache_object", len) == 0)
39a19cb7 156 return AnyP::PROTO_CACHE_OBJECT;
62e76326 157
d4a04ed5 158 if (strncasecmp(b, "urn", len) == 0)
0c3d3f65 159 return AnyP::PROTO_URN;
62e76326 160
d4a04ed5 161 if (strncasecmp(b, "whois", len) == 0)
0c3d3f65 162 return AnyP::PROTO_WHOIS;
62e76326 163
d31d59d8
AJ
164 if (len > 0)
165 return AnyP::PROTO_UNKNOWN;
166
0c3d3f65 167 return AnyP::PROTO_NONE;
92a6f4b1 168}
169
d4a04ed5 170/*
171 * Parse a URI/URL.
172 *
9157915c 173 * Stores parsed values in the `request` argument.
c21ad0f5 174 *
26ac0430 175 * This abuses HttpRequest as a way of representing the parsed url
c21ad0f5 176 * and its components.
177 * method is used to switch parsers and to init the HttpRequest.
c2a7cefd 178 * If method is Http::METHOD_CONNECT, then rather than a URL a hostname:port is
c21ad0f5 179 * looked for.
180 * The url is non const so that if its too long we can NULL-terminate it in place.
d4a04ed5 181 */
cc192b50 182
183/*
184 * This routine parses a URL. Its assumed that the URL is complete -
185 * ie, the end of the string is the end of the URL. Don't pass a partial
186 * URL here as this routine doesn't have any way of knowing whether
187 * its partial or not (ie, it handles the case of no trailing slash as
188 * being "end of host with implied path of /".
189 */
9157915c 190bool
c8ab5ec6 191AnyP::Uri::parse(const HttpRequestMethod& method, const char *url)
7111c86a 192{
f2052513 193 LOCAL_ARRAY(char, proto, MAX_URL);
194 LOCAL_ARRAY(char, login, MAX_URL);
91489e45 195 LOCAL_ARRAY(char, foundHost, MAX_URL);
f2052513 196 LOCAL_ARRAY(char, urlpath, MAX_URL);
7111c86a 197 char *t = NULL;
7e3ce7b9 198 char *q = NULL;
91489e45 199 int foundPort;
0c3d3f65 200 AnyP::ProtocolType protocol = AnyP::PROTO_NONE;
774cc2d8 201 int l;
cc192b50 202 int i;
203 const char *src;
204 char *dst;
91489e45 205 proto[0] = foundHost[0] = urlpath[0] = login[0] = '\0';
7111c86a 206
ba0fd1b6 207 if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
91489e45 208 debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
a0924f71 209 return false;
0a5b9b32 210 }
c2a7cefd 211 if (method == Http::METHOD_CONNECT) {
91489e45
AJ
212 /*
213 * RFC 7230 section 5.3.3: authority-form = authority
214 * "excluding any userinfo and its "@" delimiter"
215 *
216 * RFC 3986 section 3.2: authority = [ userinfo "@" ] host [ ":" port ]
217 *
218 * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
219 */
220 foundPort = 443;
62e76326 221
91489e45
AJ
222 if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
223 if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
a0924f71 224 return false;
cc192b50 225
c2a7cefd 226 } else if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
c8ab5ec6 227 AnyP::Uri::Asterisk().cmp(url) == 0) {
91489e45 228 parseFinish(AnyP::PROTO_HTTP, nullptr, url, foundHost, SBuf(), 80 /* HTTP default port */);
db59367a
AJ
229 return true;
230 } else if (strncmp(url, "urn:", 4) == 0) {
231 debugs(23, 3, "Split URI '" << url << "' into proto='urn', path='" << (url+4) << "'");
232 debugs(50, 5, "urn=" << (url+4));
91489e45
AJ
233 setScheme(AnyP::PROTO_URN, nullptr);
234 path(url + 4);
db59367a 235 return true;
7111c86a 236 } else {
cc192b50 237 /* Parse the URL: */
238 src = url;
239 i = 0;
240 /* Find first : - everything before is protocol */
5db6bf73 241 for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) {
cc192b50 242 *dst = *src;
243 }
244 if (i >= l)
a0924f71 245 return false;
cc192b50 246 *dst = '\0';
247
248 /* Then its :// */
5e245980 249 if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
a0924f71 250 return false;
cc192b50 251 i += 3;
252 src += 3;
62e76326 253
cc192b50 254 /* Then everything until first /; thats host (and port; which we'll look for here later) */
68338d14
F
255 // bug 1881: If we don't get a "/" then we imply it was there
256 // bug 3074: We could just be given a "?" or "#". These also imply "/"
b2ab59ad 257 // bug 3233: whitespace is also a hostname delimiter.
91489e45 258 for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
cc192b50 259 *dst = *src;
260 }
261
26ac0430 262 /*
cc192b50 263 * We can't check for "i >= l" here because we could be at the end of the line
264 * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
265 * been -given- a valid URL and the path is just '/'.
266 */
267 if (i > l)
a0924f71 268 return false;
cc192b50 269 *dst = '\0';
270
68338d14
F
271 // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
272 if (*src == '?' || *src == '#' || *src == '\0') {
273 urlpath[0] = '/';
274 dst = &urlpath[1];
275 } else {
276 dst = urlpath;
277 }
cc192b50 278 /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */
5db6bf73 279 for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
cc192b50 280 *dst = *src;
281 }
62e76326 282
cc192b50 283 /* We -could- be at the end of the buffer here */
284 if (i > l)
a0924f71 285 return false;
cc192b50 286 /* If the URL path is empty we set it to be "/" */
287 if (dst == urlpath) {
5db6bf73
FC
288 *dst = '/';
289 ++dst;
cc192b50 290 }
291 *dst = '\0';
292
293 protocol = urlParseProtocol(proto);
91489e45 294 foundPort = AnyP::UriScheme(protocol).defaultPort();
62e76326 295
cc192b50 296 /* Is there any login information? (we should eventually parse it above) */
91489e45 297 t = strrchr(foundHost, '@');
810635e3 298 if (t != NULL) {
91489e45 299 strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
0a84e4fb 300 login[sizeof(login)-1] = '\0';
62e76326 301 t = strrchr(login, '@');
302 *t = 0;
91489e45
AJ
303 strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
304 foundHost[sizeof(foundHost)-1] = '\0';
bcddfefb
AJ
305 // Bug 4498: URL-unescape the login info after extraction
306 rfc1738_unescape(login);
62e76326 307 }
308
cc192b50 309 /* Is there any host information? (we should eventually parse it above) */
91489e45 310 if (*foundHost == '[') {
cc192b50 311 /* strip any IPA brackets. valid under IPv6. */
91489e45 312 dst = foundHost;
cc192b50 313 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
91489e45 314 src = foundHost;
5db6bf73 315 ++src;
91489e45 316 l = strlen(foundHost);
cc192b50 317 i = 1;
5db6bf73 318 for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
cc192b50 319 *dst = *src;
320 }
321
322 /* we moved in-place, so truncate the actual hostname found */
5db6bf73
FC
323 *dst = '\0';
324 ++dst;
cc192b50 325
326 /* skip ahead to either start of port, or original EOS */
5db6bf73
FC
327 while (*dst != '\0' && *dst != ':')
328 ++dst;
cc192b50 329 t = dst;
330 } else {
91489e45 331 t = strrchr(foundHost, ':');
cc192b50 332
91489e45 333 if (t != strchr(foundHost,':') ) {
cc192b50 334 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
335 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
336 /* therefore we MUST accept the case where they are not bracketed at all. */
337 t = NULL;
338 }
339 }
62e76326 340
b5acc277 341 // Bug 3183 sanity check: If scheme is present, host must be too.
91489e45 342 if (protocol != AnyP::PROTO_NONE && foundHost[0] == '\0') {
ac89842b 343 debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
a0924f71 344 return false;
b5acc277
AJ
345 }
346
cc192b50 347 if (t && *t == ':') {
26ac0430 348 *t = '\0';
5db6bf73 349 ++t;
91489e45 350 foundPort = atoi(t);
62e76326 351 }
7111c86a 352 }
62e76326 353
91489e45 354 for (t = foundHost; *t; ++t)
62e76326 355 *t = xtolower(*t);
356
91489e45 357 if (stringHasWhitespace(foundHost)) {
62e76326 358 if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
91489e45 359 t = q = foundHost;
62e76326 360 while (*t) {
5db6bf73
FC
361 if (!xisspace(*t)) {
362 *q = *t;
363 ++q;
364 }
365 ++t;
62e76326 366 }
62e76326 367 *q = '\0';
368 }
d20b1cd0 369 }
62e76326 370
91489e45 371 debugs(23, 3, "Split URL '" << url << "' into proto='" << proto << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
cc192b50 372
91489e45
AJ
373 if (Config.onoff.check_hostnames &&
374 strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
375 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
a0924f71 376 return false;
b3f3dd02 377 }
62e76326 378
532e5dd4 379 /* For IPV6 addresses also check for a colon */
91489e45
AJ
380 if (Config.appendDomain && !strchr(foundHost, '.') && !strchr(foundHost, ':'))
381 strncat(foundHost, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(foundHost) - 1);
cc192b50 382
1c481e00 383 /* remove trailing dots from hostnames */
91489e45
AJ
384 while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
385 foundHost[l] = '\0';
62e76326 386
cc192b50 387 /* reject duplicate or leading dots */
91489e45
AJ
388 if (strstr(foundHost, "..") || *foundHost == '.') {
389 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
a0924f71 390 return false;
cc192b50 391 }
62e76326 392
91489e45
AJ
393 if (foundPort < 1 || foundPort > 65535) {
394 debugs(23, 3, "Invalid port '" << foundPort << "'");
a0924f71 395 return false;
7111c86a 396 }
62e76326 397
32d002cb 398#if HARDCODE_DENY_PORTS
429fdbec 399 /* These ports are filtered in the default squid.conf, but
400 * maybe someone wants them hardcoded... */
91489e45
AJ
401 if (foundPort == 7 || foundPort == 9 || foundPort == 19) {
402 debugs(23, DBG_CRITICAL, MYNAME << "Deny access to port " << foundPort);
a0924f71 403 return false;
429fdbec 404 }
6ef12318 405#endif
cc192b50 406
30abd221 407 if (stringHasWhitespace(urlpath)) {
91489e45 408 debugs(23, 2, "URI has whitespace: {" << url << "}");
62e76326 409
410 switch (Config.uri_whitespace) {
411
412 case URI_WHITESPACE_DENY:
a0924f71 413 return false;
62e76326 414
415 case URI_WHITESPACE_ALLOW:
416 break;
417
418 case URI_WHITESPACE_ENCODE:
419 t = rfc1738_escape_unescaped(urlpath);
420 xstrncpy(urlpath, t, MAX_URL);
421 break;
422
423 case URI_WHITESPACE_CHOP:
424 *(urlpath + strcspn(urlpath, w_space)) = '\0';
425 break;
426
427 case URI_WHITESPACE_STRIP:
62e76326 428 default:
429 t = q = urlpath;
62e76326 430 while (*t) {
5db6bf73
FC
431 if (!xisspace(*t)) {
432 *q = *t;
433 ++q;
434 }
435 ++t;
62e76326 436 }
62e76326 437 *q = '\0';
438 }
d548ee64 439 }
62e76326 440
91489e45 441 parseFinish(protocol, proto, urlpath, foundHost, SBuf(login), foundPort);
9157915c 442 return true;
7111c86a 443}
444
db59367a
AJ
445/// Update the URL object with parsed URI data.
446void
c8ab5ec6 447AnyP::Uri::parseFinish(const AnyP::ProtocolType protocol,
e69ca1f1 448 const char *const protoStr, // for unknown protocols
449 const char *const aUrlPath,
450 const char *const aHost,
451 const SBuf &aLogin,
452 const int aPort)
23d92c64 453{
db59367a
AJ
454 setScheme(protocol, protoStr);
455 path(aUrlPath);
456 host(aHost);
457 userInfo(aLogin);
458 port(aPort);
23d92c64 459}
460
5c51bffb 461void
c8ab5ec6 462AnyP::Uri::touch()
5c51bffb 463{
c823e2da 464 absolute_.clear();
5c51bffb
AJ
465 authorityHttp_.clear();
466 authorityWithPort_.clear();
467}
468
469SBuf &
c8ab5ec6 470AnyP::Uri::authority(bool requirePort) const
5c51bffb
AJ
471{
472 if (authorityHttp_.isEmpty()) {
473
474 // both formats contain Host/IP
475 authorityWithPort_.append(host());
476 authorityHttp_ = authorityWithPort_;
477
478 // authorityForm_ only has :port if it is non-default
479 authorityWithPort_.appendf(":%u",port());
480 if (port() != getScheme().defaultPort())
481 authorityHttp_ = authorityWithPort_;
482 }
483
484 return requirePort ? authorityWithPort_ : authorityHttp_;
485}
486
c823e2da 487SBuf &
c8ab5ec6 488AnyP::Uri::absolute() const
c823e2da
AJ
489{
490 if (absolute_.isEmpty()) {
491 // TODO: most URL will be much shorter, avoid allocating this much
492 absolute_.reserveCapacity(MAX_URL);
493
d31d59d8
AJ
494 absolute_.append(getScheme().image());
495 absolute_.append(":",1);
c823e2da
AJ
496 if (getScheme() != AnyP::PROTO_URN) {
497 absolute_.append("//", 2);
498 const bool omitUserInfo = getScheme() == AnyP::PROTO_HTTP ||
499 getScheme() != AnyP::PROTO_HTTPS ||
500 userInfo().isEmpty();
501 if (!omitUserInfo) {
502 absolute_.append(userInfo());
503 absolute_.append("@", 1);
504 }
505 absolute_.append(authority());
506 }
507 absolute_.append(path());
508 }
509
510 return absolute_;
511}
512
851feda6 513/** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
914b89a2 514 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
515 * and never copy the query-string part in the first place
516 */
88738790 517char *
190154cf 518urlCanonicalClean(const HttpRequest * request)
88738790 519{
520 LOCAL_ARRAY(char, buf, MAX_URL);
62e76326 521
851feda6 522 snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(request->effectiveRequestUri()));
c823e2da 523 buf[sizeof(buf)-1] = '\0';
62e76326 524
c823e2da 525 // URN, CONNECT method, and non-stripped URIs can go straight out
851feda6 526 if (Config.onoff.strip_query_terms && !(request->method == Http::METHOD_CONNECT || request->url.getScheme() == AnyP::PROTO_URN)) {
c823e2da
AJ
527 // strip anything AFTER a question-mark
528 // leaving the '?' in place
529 if (auto t = strchr(buf, '?')) {
530 *(++t) = '\0';
e2849af8 531 }
d548ee64 532 }
62e76326 533
9bc73deb 534 if (stringHasCntl(buf))
62e76326 535 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
536
88738790 537 return buf;
538}
539
b3802bdc
AJ
540/**
541 * Yet another alternative to urlCanonical.
c2a7cefd 542 * This one adds the https:// parts to Http::METHOD_CONNECT URL
b3802bdc
AJ
543 * for use in error page outputs.
544 * Luckily we can leverage the others instead of duplicating.
545 */
546const char *
547urlCanonicalFakeHttps(const HttpRequest * request)
548{
549 LOCAL_ARRAY(char, buf, MAX_URL);
550
551 // method CONNECT and port HTTPS
5c51bffb
AJ
552 if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
553 snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
b3802bdc
AJ
554 return buf;
555 }
556
557 // else do the normal complete canonical thing.
558 return urlCanonicalClean(request);
559}
560
/*
 * Test if a URL is relative.
 *
 * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 * appear before a ':'.
 *
 * Returns false for a NULL or empty url, and for a url whose first ':' or
 * '/' character is a ':'. Returns true otherwise, including when the url
 * contains neither character.
 */
bool
urlIsRelative(const char *url)
{
    if (!url || !*url)
        return false;

    // stop at the first ':' or '/', or at the end of the string
    const char *cursor = url;
    while (*cursor != '\0' && *cursor != ':' && *cursor != '/')
        ++cursor;

    return *cursor != ':';
}
586
587/*
71051277 588 * Convert a relative URL to an absolute URL using the context of a given
bf956b0a 589 * request.
71051277
BR
590 *
591 * It is assumed that you have already ensured that the URL is relative.
592 *
6e44cca8
BR
593 * If NULL is returned it is an indication that the method in use in the
594 * request does not distinguish between relative and absolute and you should
595 * use the url unchanged.
0376a4c9
BR
596 *
597 * If non-NULL is returned, it is up to the caller to free the resulting
598 * memory using safe_free().
bf956b0a 599 */
6e44cca8 600char *
bf956b0a 601urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
3cbbd242 602{
3cbbd242 603
c2a7cefd 604 if (req->method.id() == Http::METHOD_CONNECT) {
f3900427 605 return (NULL);
3cbbd242 606 }
26ac0430 607
6e44cca8 608 char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
26ac0430 609
4e3f4dc7 610 if (req->url.getScheme() == AnyP::PROTO_URN) {
c823e2da
AJ
611 // XXX: this is what the original code did, but it seems to break the
612 // intended behaviour of this function. It returns the stored URN path,
613 // not converting the given one into a URN...
614 snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute()));
71051277 615 return (urlbuf);
3cbbd242 616 }
26ac0430 617
5c51bffb 618 SBuf authorityForm = req->url.authority(); // host[:port]
d31d59d8
AJ
619 const SBuf &scheme = req->url.getScheme().image();
620 size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH,
621 SQUIDSBUFPRINT(scheme),
5c51bffb
AJ
622 SQUIDSBUFPRINT(req->url.userInfo()),
623 !req->url.userInfo().isEmpty() ? "@" : "",
624 SQUIDSBUFPRINT(authorityForm));
6e44cca8 625
51b5dcf5
AJ
626 // if the first char is '/' assume its a relative path
627 // XXX: this breaks on scheme-relative URLs,
628 // but we should not see those outside ESI, and rarely there.
c823e2da 629 // XXX: also breaks on any URL containing a '/' in the query-string portion
6e44cca8 630 if (relUrl[0] == '/') {
51b5dcf5 631 xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
6e44cca8 632 } else {
51b5dcf5
AJ
633 SBuf path = req->url.path();
634 SBuf::size_type lastSlashPos = path.rfind('/');
6e44cca8 635
51b5dcf5
AJ
636 if (lastSlashPos == SBuf::npos) {
637 // replace the whole path with the given bit(s)
5db6bf73
FC
638 urlbuf[urllen] = '/';
639 ++urllen;
51b5dcf5 640 xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
6e44cca8 641 } else {
51b5dcf5
AJ
642 // replace only the last (file?) segment with the given bit(s)
643 ++lastSlashPos;
644 if (lastSlashPos > MAX_URL - urllen - 1) {
645 // XXX: crops bits in the middle of the combined URL.
646 lastSlashPos = MAX_URL - urllen - 1;
6e44cca8 647 }
3f0e38d6 648 SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos));
51b5dcf5 649 urllen += lastSlashPos;
6e44cca8 650 if (urllen + 1 < MAX_URL) {
51b5dcf5 651 xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
6e44cca8
BR
652 }
653 }
654 }
3cbbd242 655
bc9ad11f 656 return (urlbuf);
3cbbd242 657}
658
b8d8561b 659int
abbd7825 660matchDomainName(const char *h, const char *d, uint flags)
30a4f2a8 661{
9bc73deb 662 int dl;
663 int hl;
62e76326 664
abbd7825 665 const bool hostIncludesSubdomains = (*h == '.');
d20b1cd0 666 while ('.' == *h)
5db6bf73 667 ++h;
62e76326 668
9bc73deb 669 hl = strlen(h);
62e76326 670
abbd7825
CT
671 if (hl == 0)
672 return -1;
673
9bc73deb 674 dl = strlen(d);
62e76326 675
9bc73deb 676 /*
677 * Start at the ends of the two strings and work towards the
678 * beginning.
679 */
680 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
62e76326 681 if (hl == 0 && dl == 0) {
682 /*
683 * We made it all the way to the beginning of both
684 * strings without finding any difference.
685 */
686 return 0;
687 }
688
689 if (0 == hl) {
690 /*
691 * The host string is shorter than the domain string.
692 * There is only one case when this can be a match.
693 * If the domain is just one character longer, and if
694 * that character is a leading '.' then we call it a
695 * match.
696 */
697
698 if (1 == dl && '.' == d[0])
699 return 0;
700 else
701 return -1;
702 }
703
704 if (0 == dl) {
705 /*
706 * The domain string is shorter than the host string.
707 * This is a match only if the first domain character
708 * is a leading '.'.
709 */
710
abbd7825
CT
711 if ('.' == d[0]) {
712 if (flags & mdnRejectSubsubDomains) {
713 // Check for sub-sub domain and reject
714 while(--hl >= 0 && h[hl] != '.');
715 if (hl < 0) {
716 // No sub-sub domain found, but reject if there is a
717 // leading dot in given host string (which is removed
718 // before the check is started).
719 return hostIncludesSubdomains ? 1 : 0;
720 } else
721 return 1; // sub-sub domain, reject
722 } else
723 return 0;
724 } else
62e76326 725 return 1;
726 }
9bc73deb 727 }
62e76326 728
9bc73deb 729 /*
730 * We found different characters in the same position (from the end).
731 */
69f69080
CT
732
733 // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
734 // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
735 // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
abbd7825 736 if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
69f69080
CT
737 return 0;
738
d20b1cd0 739 /*
740 * If one of those character is '.' then its special. In order
741 * for splay tree sorting to work properly, "x-foo.com" must
742 * be greater than ".foo.com" even though '-' is less than '.'.
743 */
744 if ('.' == d[dl])
62e76326 745 return 1;
746
d20b1cd0 747 if ('.' == h[hl])
62e76326 748 return -1;
749
9bc73deb 750 return (xtolower(h[hl]) - xtolower(d[dl]));
30a4f2a8 751}
a8f7d3ee 752
985c86bc 753/*
610ee341 754 * return true if we can serve requests for this method.
985c86bc 755 */
b8d8561b 756int
190154cf 757urlCheckRequest(const HttpRequest * r)
a8f7d3ee 758{
759 int rc = 0;
610ee341 760 /* protocol "independent" methods
761 *
762 * actually these methods are specific to HTTP:
763 * they are methods we recieve on our HTTP port,
764 * and if we had a FTP listener would not be relevant
765 * there.
766 *
767 * So, we should delegate them to HTTP. The problem is that we
768 * do not have a default protocol from the client side of HTTP.
769 */
62e76326 770
c2a7cefd 771 if (r->method == Http::METHOD_CONNECT)
62e76326 772 return 1;
773
77ce6ba9
AR
774 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
775 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
c2a7cefd 776 if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
c8ab5ec6 777 return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
62e76326 778
c2a7cefd 779 if (r->method == Http::METHOD_PURGE)
62e76326 780 return 1;
781
99edd1c3 782 /* does method match the protocol? */
4e3f4dc7 783 switch (r->url.getScheme()) {
62e76326 784
0c3d3f65 785 case AnyP::PROTO_URN:
62e76326 786
0c3d3f65 787 case AnyP::PROTO_HTTP:
62e76326 788
39a19cb7 789 case AnyP::PROTO_CACHE_OBJECT:
62e76326 790 rc = 1;
791 break;
792
0c3d3f65 793 case AnyP::PROTO_FTP:
62e76326 794
c2a7cefd 795 if (r->method == Http::METHOD_PUT)
62e76326 796 rc = 1;
797
0c3d3f65 798 case AnyP::PROTO_GOPHER:
62e76326 799
0c3d3f65 800 case AnyP::PROTO_WAIS:
62e76326 801
0c3d3f65 802 case AnyP::PROTO_WHOIS:
c2a7cefd 803 if (r->method == Http::METHOD_GET)
62e76326 804 rc = 1;
c2a7cefd 805 else if (r->method == Http::METHOD_HEAD)
62e76326 806 rc = 1;
807
808 break;
809
0c3d3f65 810 case AnyP::PROTO_HTTPS:
cb4f4424 811#if USE_OPENSSL
62e76326 812 rc = 1;
418293da
AJ
813#elif USE_GNUTLS
814 rc = 1;
1f7c9178 815#else
62e76326 816 /*
817 * Squid can't originate an SSL connection, so it should
818 * never receive an "https:" URL. It should always be
819 * CONNECT instead.
820 */
821 rc = 0;
1f7c9178 822#endif
0166128b 823 break;
62e76326 824
a8f7d3ee 825 default:
62e76326 826 break;
a8f7d3ee 827 }
62e76326 828
a8f7d3ee 829 return rc;
830}
9ce5e3e6 831
832/*
833 * Quick-n-dirty host extraction from a URL. Steps:
5999b776 834 * Look for a colon
835 * Skip any '/' after the colon
836 * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
837 * Look for an ending '/' or ':' and terminate
838 * Look for login info preceeded by '@'
9ce5e3e6 839 */
77bfc324 840
841class URLHostName
842{
843
844public:
845 char * extract(char const *url);
846
847private:
848 static char Host [SQUIDHOSTNAMELEN];
849 void init(char const *);
850 void findHostStart();
851 void trimTrailingChars();
852 void trimAuth();
853 char const *hostStart;
854 char const *url;
855};
856
9ce5e3e6 857char *
858urlHostname(const char *url)
859{
77bfc324 860 return URLHostName().extract(url);
861}
62e76326 862
77bfc324 863char URLHostName::Host[SQUIDHOSTNAMELEN];
864
865void
866URLHostName::init(char const *aUrl)
867{
868 Host[0] = '\0';
aa1cafc4 869 url = aUrl;
77bfc324 870}
62e76326 871
77bfc324 872void
873URLHostName::findHostStart()
874{
875 if (NULL == (hostStart = strchr(url, ':')))
876 return;
62e76326 877
77bfc324 878 ++hostStart;
62e76326 879
77bfc324 880 while (*hostStart != '\0' && *hostStart == '/')
881 ++hostStart;
cc192b50 882
cc192b50 883 if (*hostStart == ']')
884 ++hostStart;
77bfc324 885}
62e76326 886
77bfc324 887void
888URLHostName::trimTrailingChars()
889{
890 char *t;
891
892 if ((t = strchr(Host, '/')))
62e76326 893 *t = '\0';
894
cc192b50 895 if ((t = strrchr(Host, ':')))
62e76326 896 *t = '\0';
cc192b50 897
cc192b50 898 if ((t = strchr(Host, ']')))
899 *t = '\0';
77bfc324 900}
62e76326 901
77bfc324 902void
903URLHostName::trimAuth()
904{
905 char *t;
906
907 if ((t = strrchr(Host, '@'))) {
5db6bf73 908 ++t;
41d00cd3 909 memmove(Host, t, strlen(t) + 1);
9ce5e3e6 910 }
77bfc324 911}
912
913char *
914URLHostName::extract(char const *aUrl)
915{
916 init(aUrl);
917 findHostStart();
918
919 if (hostStart == NULL)
920 return NULL;
921
922 xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
923
924 trimTrailingChars();
925
926 trimAuth();
62e76326 927
77bfc324 928 return Host;
9ce5e3e6 929}
f53969cc 930
c8ab5ec6 931AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) :
d59e4742
FC
932 scheme_(aScheme),
933 hostIsNumeric_(false),
934 port_(0)
935{
936 *host_=0;
937}
1a739503 938