]> git.ipfire.org Git - thirdparty/squid.git/blame - src/url.cc
SourceFormat Enforcement
[thirdparty/squid.git] / src / url.cc
CommitLineData
30a4f2a8 1/*
bbc27441 2 * Copyright (C) 1996-2014 The Squid Software Foundation and contributors
e25c139f 3 *
bbc27441
AJ
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
019dd986 7 */
ed43818f 8
bbc27441
AJ
9/* DEBUG: section 23 URL Parsing */
10
#include "squid.h"
#include "globals.h"
#include "HttpRequest.h"
#include "rfc1738.h"
#include "SquidConfig.h"
#include "SquidString.h"
#include "URL.h"

#include <cctype>
090089c4 18
4d919a80 19static HttpRequest *urlParseFinish(const HttpRequestMethod& method,
0c3d3f65 20 const AnyP::ProtocolType protocol,
4d919a80
AR
21 const char *const urlpath,
22 const char *const host,
92d6986d 23 const SBuf &login,
4d919a80
AR
24 const int port,
25 HttpRequest *request);
9be14530 26static HttpRequest *urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request);
/*
 * Character sets used by urlParse() when Config.onoff.check_hostnames is on:
 * a hostname is rejected if it contains any byte outside the chosen set.
 * The trailing "[:]" group is part of both sets.
 */

/// Hostname alphabet used when Config.onoff.allow_underscore is set.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" /* letters */
    "0123456789"                                              /* digits */
    "-._"                                                     /* punctuation, with '_' */
    "[:]";

/// Hostname alphabet used otherwise: the same set without the underscore.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" /* letters */
    "0123456789"                                              /* digits */
    "-."                                                      /* punctuation */
    "[:]";
090089c4 39
2e260208
AJ
40const SBuf &
41URL::Asterisk()
42{
43 static SBuf star("*");
44 return star;
45}
46
b8d8561b 47void
0673c0ba 48urlInitialize(void)
090089c4 49{
bf8fe701 50 debugs(23, 5, "urlInitialize: Initializing...");
985c86bc 51 /* this ensures that the number of protocol strings is the same as
0c3d3f65 52 * the enum slots allocated because the last enum is always 'MAX'.
985c86bc 53 */
0c3d3f65 54 assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
9bc73deb 55 /*
56 * These test that our matchDomainName() function works the
57 * way we expect it to.
58 */
59 assert(0 == matchDomainName("foo.com", "foo.com"));
d20b1cd0 60 assert(0 == matchDomainName(".foo.com", "foo.com"));
9bc73deb 61 assert(0 == matchDomainName("foo.com", ".foo.com"));
62 assert(0 == matchDomainName(".foo.com", ".foo.com"));
63 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
64 assert(0 != matchDomainName("x.foo.com", "foo.com"));
65 assert(0 != matchDomainName("foo.com", "x.foo.com"));
66 assert(0 != matchDomainName("bar.com", "foo.com"));
67 assert(0 != matchDomainName(".bar.com", "foo.com"));
68 assert(0 != matchDomainName(".bar.com", ".foo.com"));
69 assert(0 != matchDomainName("bar.com", ".foo.com"));
70 assert(0 < matchDomainName("zzz.com", "foo.com"));
71 assert(0 > matchDomainName("aaa.com", "foo.com"));
72 assert(0 == matchDomainName("FOO.com", "foo.COM"));
aca95add 73 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
74 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
d20b1cd0 75 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
9bc73deb 76 /* more cases? */
090089c4 77}
78
cc192b50 79/**
d4a04ed5 80 * urlParseProtocol() takes begin (b) and end (e) pointers, but for
81 * backwards compatibility, e defaults to NULL, in which case we
82 * assume b is NULL-terminated.
83 */
0c3d3f65 84AnyP::ProtocolType
d4a04ed5 85urlParseProtocol(const char *b, const char *e)
92a6f4b1 86{
d4a04ed5 87 /*
88 * if e is NULL, b must be NULL terminated and we
89 * make e point to the first whitespace character
90 * after b.
91 */
92
93 if (NULL == e)
94 e = b + strcspn(b, ":");
95
96 int len = e - b;
97
fcd2d3ef 98 /* test common stuff first */
62e76326 99
d4a04ed5 100 if (strncasecmp(b, "http", len) == 0)
0c3d3f65 101 return AnyP::PROTO_HTTP;
62e76326 102
d4a04ed5 103 if (strncasecmp(b, "ftp", len) == 0)
0c3d3f65 104 return AnyP::PROTO_FTP;
62e76326 105
d4a04ed5 106 if (strncasecmp(b, "https", len) == 0)
0c3d3f65 107 return AnyP::PROTO_HTTPS;
62e76326 108
d4a04ed5 109 if (strncasecmp(b, "file", len) == 0)
0c3d3f65 110 return AnyP::PROTO_FTP;
62e76326 111
330f829e
AJ
112 if (strncasecmp(b, "coap", len) == 0)
113 return AnyP::PROTO_COAP;
114
115 if (strncasecmp(b, "coaps", len) == 0)
116 return AnyP::PROTO_COAPS;
117
d4a04ed5 118 if (strncasecmp(b, "gopher", len) == 0)
0c3d3f65 119 return AnyP::PROTO_GOPHER;
62e76326 120
d4a04ed5 121 if (strncasecmp(b, "wais", len) == 0)
0c3d3f65 122 return AnyP::PROTO_WAIS;
62e76326 123
d4a04ed5 124 if (strncasecmp(b, "cache_object", len) == 0)
39a19cb7 125 return AnyP::PROTO_CACHE_OBJECT;
62e76326 126
d4a04ed5 127 if (strncasecmp(b, "urn", len) == 0)
0c3d3f65 128 return AnyP::PROTO_URN;
62e76326 129
d4a04ed5 130 if (strncasecmp(b, "whois", len) == 0)
0c3d3f65 131 return AnyP::PROTO_WHOIS;
62e76326 132
0c3d3f65 133 return AnyP::PROTO_NONE;
92a6f4b1 134}
135
3fdadc70 136int
0c3d3f65 137urlDefaultPort(AnyP::ProtocolType p)
92a6f4b1 138{
139 switch (p) {
62e76326 140
0c3d3f65 141 case AnyP::PROTO_HTTP:
62e76326 142 return 80;
143
0c3d3f65 144 case AnyP::PROTO_HTTPS:
62e76326 145 return 443;
146
0c3d3f65 147 case AnyP::PROTO_FTP:
62e76326 148 return 21;
149
330f829e
AJ
150 case AnyP::PROTO_COAP:
151 case AnyP::PROTO_COAPS:
152 // coaps:// default is TBA as of draft-ietf-core-coap-08.
153 // Assuming IANA policy of allocating same port for base and TLS protocol versions will occur.
154 return 5683;
155
0c3d3f65 156 case AnyP::PROTO_GOPHER:
62e76326 157 return 70;
158
0c3d3f65 159 case AnyP::PROTO_WAIS:
62e76326 160 return 210;
161
39a19cb7 162 case AnyP::PROTO_CACHE_OBJECT:
62e76326 163 return CACHE_HTTP_PORT;
164
0c3d3f65 165 case AnyP::PROTO_WHOIS:
62e76326 166 return 43;
167
92a6f4b1 168 default:
62e76326 169 return 0;
92a6f4b1 170 }
171}
7111c86a 172
d4a04ed5 173/*
174 * Parse a URI/URL.
175 *
176 * If the 'request' arg is non-NULL, put parsed values there instead
177 * of allocating a new HttpRequest.
c21ad0f5 178 *
26ac0430 179 * This abuses HttpRequest as a way of representing the parsed url
c21ad0f5 180 * and its components.
181 * method is used to switch parsers and to init the HttpRequest.
c2a7cefd 182 * If method is Http::METHOD_CONNECT, then rather than a URL a hostname:port is
c21ad0f5 183 * looked for.
184 * The url is non const so that if its too long we can NULL-terminate it in place.
d4a04ed5 185 */
cc192b50 186
187/*
188 * This routine parses a URL. Its assumed that the URL is complete -
189 * ie, the end of the string is the end of the URL. Don't pass a partial
190 * URL here as this routine doesn't have any way of knowing whether
191 * its partial or not (ie, it handles the case of no trailing slash as
192 * being "end of host with implied path of /".
193 */
190154cf 194HttpRequest *
60745f24 195urlParse(const HttpRequestMethod& method, char *url, HttpRequest *request)
7111c86a 196{
f2052513 197 LOCAL_ARRAY(char, proto, MAX_URL);
198 LOCAL_ARRAY(char, login, MAX_URL);
199 LOCAL_ARRAY(char, host, MAX_URL);
200 LOCAL_ARRAY(char, urlpath, MAX_URL);
7111c86a 201 char *t = NULL;
7e3ce7b9 202 char *q = NULL;
7111c86a 203 int port;
0c3d3f65 204 AnyP::ProtocolType protocol = AnyP::PROTO_NONE;
774cc2d8 205 int l;
cc192b50 206 int i;
207 const char *src;
208 char *dst;
983061ed 209 proto[0] = host[0] = urlpath[0] = login[0] = '\0';
7111c86a 210
ba0fd1b6 211 if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
62e76326 212 /* terminate so it doesn't overflow other buffers */
213 *(url + (MAX_URL >> 1)) = '\0';
e0236918 214 debugs(23, DBG_IMPORTANT, "urlParse: URL too large (" << l << " bytes)");
62e76326 215 return NULL;
0a5b9b32 216 }
c2a7cefd 217 if (method == Http::METHOD_CONNECT) {
62e76326 218 port = CONNECT_PORT;
219
0f0affc7 220 if (sscanf(url, "[%[^]]]:%d", host, &port) < 1)
cc192b50 221 if (sscanf(url, "%[^:]:%d", host, &port) < 1)
222 return NULL;
223
c2a7cefd 224 } else if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
2e260208 225 URL::Asterisk().cmp(url) == 0) {
0c3d3f65 226 protocol = AnyP::PROTO_HTTP;
4d919a80 227 port = urlDefaultPort(protocol);
92d6986d 228 return urlParseFinish(method, protocol, url, host, SBuf(), port, request);
23d92c64 229 } else if (!strncmp(url, "urn:", 4)) {
9be14530 230 return urnParse(method, url, request);
7111c86a 231 } else {
cc192b50 232 /* Parse the URL: */
233 src = url;
234 i = 0;
235 /* Find first : - everything before is protocol */
5db6bf73 236 for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) {
cc192b50 237 *dst = *src;
238 }
239 if (i >= l)
26ac0430 240 return NULL;
cc192b50 241 *dst = '\0';
242
243 /* Then its :// */
5e245980 244 if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
62e76326 245 return NULL;
cc192b50 246 i += 3;
247 src += 3;
62e76326 248
cc192b50 249 /* Then everything until first /; thats host (and port; which we'll look for here later) */
68338d14
F
250 // bug 1881: If we don't get a "/" then we imply it was there
251 // bug 3074: We could just be given a "?" or "#". These also imply "/"
b2ab59ad 252 // bug 3233: whitespace is also a hostname delimiter.
5db6bf73 253 for (dst = host; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
cc192b50 254 *dst = *src;
255 }
256
26ac0430 257 /*
cc192b50 258 * We can't check for "i >= l" here because we could be at the end of the line
259 * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
260 * been -given- a valid URL and the path is just '/'.
261 */
262 if (i > l)
263 return NULL;
264 *dst = '\0';
265
68338d14
F
266 // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
267 if (*src == '?' || *src == '#' || *src == '\0') {
268 urlpath[0] = '/';
269 dst = &urlpath[1];
270 } else {
271 dst = urlpath;
272 }
cc192b50 273 /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */
5db6bf73 274 for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
cc192b50 275 *dst = *src;
276 }
62e76326 277
cc192b50 278 /* We -could- be at the end of the buffer here */
279 if (i > l)
280 return NULL;
281 /* If the URL path is empty we set it to be "/" */
282 if (dst == urlpath) {
5db6bf73
FC
283 *dst = '/';
284 ++dst;
cc192b50 285 }
286 *dst = '\0';
287
288 protocol = urlParseProtocol(proto);
62e76326 289 port = urlDefaultPort(protocol);
290
cc192b50 291 /* Is there any login information? (we should eventually parse it above) */
810635e3
FC
292 t = strrchr(host, '@');
293 if (t != NULL) {
0a84e4fb
AJ
294 strncpy((char *) login, (char *) host, sizeof(login)-1);
295 login[sizeof(login)-1] = '\0';
62e76326 296 t = strrchr(login, '@');
297 *t = 0;
0a84e4fb
AJ
298 strncpy((char *) host, t + 1, sizeof(host)-1);
299 host[sizeof(host)-1] = '\0';
62e76326 300 }
301
cc192b50 302 /* Is there any host information? (we should eventually parse it above) */
26ac0430 303 if (*host == '[') {
cc192b50 304 /* strip any IPA brackets. valid under IPv6. */
305 dst = host;
cc192b50 306 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
26ac0430 307 src = host;
5db6bf73 308 ++src;
cc192b50 309 l = strlen(host);
310 i = 1;
5db6bf73 311 for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
cc192b50 312 *dst = *src;
313 }
314
315 /* we moved in-place, so truncate the actual hostname found */
5db6bf73
FC
316 *dst = '\0';
317 ++dst;
cc192b50 318
319 /* skip ahead to either start of port, or original EOS */
5db6bf73
FC
320 while (*dst != '\0' && *dst != ':')
321 ++dst;
cc192b50 322 t = dst;
323 } else {
324 t = strrchr(host, ':');
325
26ac0430 326 if (t != strchr(host,':') ) {
cc192b50 327 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
328 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
329 /* therefore we MUST accept the case where they are not bracketed at all. */
330 t = NULL;
331 }
332 }
62e76326 333
b5acc277 334 // Bug 3183 sanity check: If scheme is present, host must be too.
ff8b6bcf 335 if (protocol != AnyP::PROTO_NONE && host[0] == '\0') {
ac89842b 336 debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
b5acc277
AJ
337 return NULL;
338 }
339
cc192b50 340 if (t && *t == ':') {
26ac0430 341 *t = '\0';
5db6bf73 342 ++t;
cc192b50 343 port = atoi(t);
62e76326 344 }
7111c86a 345 }
62e76326 346
5db6bf73 347 for (t = host; *t; ++t)
62e76326 348 *t = xtolower(*t);
349
30abd221 350 if (stringHasWhitespace(host)) {
62e76326 351 if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
352 t = q = host;
62e76326 353 while (*t) {
5db6bf73
FC
354 if (!xisspace(*t)) {
355 *q = *t;
356 ++q;
357 }
358 ++t;
62e76326 359 }
62e76326 360 *q = '\0';
361 }
d20b1cd0 362 }
62e76326 363
cc192b50 364 debugs(23, 3, "urlParse: Split URL '" << url << "' into proto='" << proto << "', host='" << host << "', port='" << port << "', path='" << urlpath << "'");
365
a78278e2 366 if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
e0236918 367 debugs(23, DBG_IMPORTANT, "urlParse: Illegal character in hostname '" << host << "'");
62e76326 368 return NULL;
b3f3dd02 369 }
62e76326 370
532e5dd4
AJ
371 /* For IPV6 addresses also check for a colon */
372 if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':'))
cc192b50 373 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
374
1c481e00 375 /* remove trailing dots from hostnames */
79d39a72 376 while ((l = strlen(host)) > 0 && host[--l] == '.')
62e76326 377 host[l] = '\0';
378
cc192b50 379 /* reject duplicate or leading dots */
380 if (strstr(host, "..") || *host == '.') {
e0236918 381 debugs(23, DBG_IMPORTANT, "urlParse: Illegal hostname '" << host << "'");
cc192b50 382 return NULL;
383 }
62e76326 384
3a1d4727 385 if (port < 1 || port > 65535) {
bf8fe701 386 debugs(23, 3, "urlParse: Invalid port '" << port << "'");
62e76326 387 return NULL;
7111c86a 388 }
62e76326 389
32d002cb 390#if HARDCODE_DENY_PORTS
429fdbec 391 /* These ports are filtered in the default squid.conf, but
392 * maybe someone wants them hardcoded... */
6d2eb13e 393 if (port == 7 || port == 9 || port == 19) {
fa84c01d 394 debugs(23, DBG_CRITICAL, "urlParse: Deny access to port " << port);
62e76326 395 return NULL;
429fdbec 396 }
6ef12318 397#endif
cc192b50 398
30abd221 399 if (stringHasWhitespace(urlpath)) {
bf8fe701 400 debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");
62e76326 401
402 switch (Config.uri_whitespace) {
403
404 case URI_WHITESPACE_DENY:
405 return NULL;
406
407 case URI_WHITESPACE_ALLOW:
408 break;
409
410 case URI_WHITESPACE_ENCODE:
411 t = rfc1738_escape_unescaped(urlpath);
412 xstrncpy(urlpath, t, MAX_URL);
413 break;
414
415 case URI_WHITESPACE_CHOP:
416 *(urlpath + strcspn(urlpath, w_space)) = '\0';
417 break;
418
419 case URI_WHITESPACE_STRIP:
62e76326 420 default:
421 t = q = urlpath;
62e76326 422 while (*t) {
5db6bf73
FC
423 if (!xisspace(*t)) {
424 *q = *t;
425 ++q;
426 }
427 ++t;
62e76326 428 }
62e76326 429 *q = '\0';
430 }
d548ee64 431 }
62e76326 432
92d6986d 433 return urlParseFinish(method, protocol, urlpath, host, SBuf(login), port, request);
4d919a80
AR
434}
435
436/**
437 * Update request with parsed URI data. If the request arg is
438 * non-NULL, put parsed values there instead of allocating a new
439 * HttpRequest.
440 */
441static HttpRequest *
442urlParseFinish(const HttpRequestMethod& method,
0c3d3f65 443 const AnyP::ProtocolType protocol,
4d919a80
AR
444 const char *const urlpath,
445 const char *const host,
92d6986d 446 const SBuf &login,
4d919a80
AR
447 const int port,
448 HttpRequest *request)
449{
d4a04ed5 450 if (NULL == request)
5cafad19 451 request = new HttpRequest(method, protocol, urlpath);
d4a04ed5 452 else {
0e8aad88 453 request->initHTTP(method, protocol, urlpath);
9be14530 454 safe_free(request->canonical);
d4a04ed5 455 }
456
cc192b50 457 request->SetHost(host);
92d6986d 458 request->url.userInfo(login);
f45dd259 459 request->port = (unsigned short) port;
7111c86a 460 return request;
461}
462
190154cf 463static HttpRequest *
9be14530 464urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request)
23d92c64 465{
bf8fe701 466 debugs(50, 5, "urnParse: " << urn);
9be14530
AJ
467 if (request) {
468 request->initHTTP(method, AnyP::PROTO_URN, urn + 4);
469 safe_free(request->canonical);
470 return request;
471 }
472
0c3d3f65 473 return new HttpRequest(method, AnyP::PROTO_URN, urn + 4);
23d92c64 474}
475
4aba13ed 476const char *
190154cf 477urlCanonical(HttpRequest * request)
7111c86a 478{
95d659f0 479 LOCAL_ARRAY(char, portbuf, 32);
9b5d1d21 480 LOCAL_ARRAY(char, urlbuf, MAX_URL);
62e76326 481
9b5d1d21 482 if (request->canonical)
62e76326 483 return request->canonical;
484
4e3f4dc7 485 if (request->url.getScheme() == AnyP::PROTO_URN) {
826a1fed 486 snprintf(urlbuf, MAX_URL, "urn:" SQUIDSTRINGPH,
af6a12ee 487 SQUIDSTRINGPRINT(request->urlpath));
9b5d1d21 488 } else {
9e0dafa7
AJ
489 switch (request->method.id()) {
490
491 case Http::METHOD_CONNECT:
492 snprintf(urlbuf, MAX_URL, "%s:%d", request->GetHost(), request->port);
493 break;
494
e2849af8
A
495 default: {
496 portbuf[0] = '\0';
497
498 if (request->port != urlDefaultPort(request->url.getScheme()))
499 snprintf(portbuf, 32, ":%d", request->port);
500
92d6986d 501 snprintf(urlbuf, MAX_URL, "%s://" SQUIDSBUFPH "%s%s%s" SQUIDSTRINGPH,
e2849af8 502 request->url.getScheme().c_str(),
92d6986d
AJ
503 SQUIDSBUFPRINT(request->url.userInfo()),
504 !request->url.userInfo().isEmpty() ? "@" : "",
e2849af8
A
505 request->GetHost(),
506 portbuf,
507 SQUIDSTRINGPRINT(request->urlpath));
508 }
9e0dafa7 509 }
9b5d1d21 510 }
62e76326 511
4aba13ed 512 return (request->canonical = xstrdup(urlbuf));
7111c86a 513}
30a4f2a8 514
b3802bdc 515/** \todo AYJ: Performance: This is an *almost* duplicate of urlCanonical. But elides the query-string.
914b89a2 516 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
517 * and never copy the query-string part in the first place
518 */
88738790 519char *
190154cf 520urlCanonicalClean(const HttpRequest * request)
88738790 521{
522 LOCAL_ARRAY(char, buf, MAX_URL);
523 LOCAL_ARRAY(char, portbuf, 32);
524 char *t;
62e76326 525
4e3f4dc7 526 if (request->url.getScheme() == AnyP::PROTO_URN) {
826a1fed 527 snprintf(buf, MAX_URL, "urn:" SQUIDSTRINGPH,
af6a12ee 528 SQUIDSTRINGPRINT(request->urlpath));
d548ee64 529 } else {
9e0dafa7 530 switch (request->method.id()) {
62e76326 531
9e0dafa7 532 case Http::METHOD_CONNECT:
f9d91107 533 snprintf(buf, MAX_URL, "%s:%d", request->GetHost(), request->port);
9e0dafa7 534 break;
62e76326 535
e2849af8
A
536 default: {
537 portbuf[0] = '\0';
62e76326 538
e2849af8
A
539 if (request->port != urlDefaultPort(request->url.getScheme()))
540 snprintf(portbuf, 32, ":%d", request->port);
62e76326 541
92d6986d 542 snprintf(buf, MAX_URL, "%s://" SQUIDSBUFPH "%s%s%s" SQUIDSTRINGPH,
e2849af8 543 request->url.getScheme().c_str(),
92d6986d
AJ
544 SQUIDSBUFPRINT(request->url.userInfo()),
545 (request->url.userInfo().isEmpty() ? "" : "@"),
e2849af8
A
546 request->GetHost(),
547 portbuf,
548 SQUIDSTRINGPRINT(request->urlpath));
549
550 // strip arguments AFTER a question-mark
551 if (Config.onoff.strip_query_terms)
552 if ((t = strchr(buf, '?')))
553 *(++t) = '\0';
554 }
92d6986d 555 } // switch
d548ee64 556 }
62e76326 557
9bc73deb 558 if (stringHasCntl(buf))
62e76326 559 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
560
88738790 561 return buf;
562}
563
b3802bdc
AJ
564/**
565 * Yet another alternative to urlCanonical.
c2a7cefd 566 * This one adds the https:// parts to Http::METHOD_CONNECT URL
b3802bdc
AJ
567 * for use in error page outputs.
568 * Luckily we can leverage the others instead of duplicating.
569 */
570const char *
571urlCanonicalFakeHttps(const HttpRequest * request)
572{
573 LOCAL_ARRAY(char, buf, MAX_URL);
574
575 // method CONNECT and port HTTPS
c2a7cefd 576 if (request->method == Http::METHOD_CONNECT && request->port == 443) {
b3802bdc
AJ
577 snprintf(buf, MAX_URL, "https://%s/*", request->GetHost());
578 return buf;
579 }
580
581 // else do the normal complete canonical thing.
582 return urlCanonicalClean(request);
583}
584
bf956b0a
BR
/*
 * Test if a URL is relative.
 *
 * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 * appear before a ':'. So the first ':' or '/' found decides: a ':' means
 * the URL is absolute, while a '/' (or neither) means it is relative.
 * A NULL or empty URL is never relative.
 */
bool
urlIsRelative(const char *url)
{
    if (url == NULL || *url == '\0')
        return false;

    // points at the first ':' or '/', or at the terminator if there is none
    const char *const firstDelimiter = url + strcspn(url, ":/");

    return *firstDelimiter != ':';
}
610
611/*
71051277 612 * Convert a relative URL to an absolute URL using the context of a given
bf956b0a 613 * request.
71051277
BR
614 *
615 * It is assumed that you have already ensured that the URL is relative.
616 *
6e44cca8
BR
617 * If NULL is returned it is an indication that the method in use in the
618 * request does not distinguish between relative and absolute and you should
619 * use the url unchanged.
0376a4c9
BR
620 *
621 * If non-NULL is returned, it is up to the caller to free the resulting
622 * memory using safe_free().
bf956b0a 623 */
6e44cca8 624char *
bf956b0a 625urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
3cbbd242 626{
3cbbd242 627
c2a7cefd 628 if (req->method.id() == Http::METHOD_CONNECT) {
f3900427 629 return (NULL);
3cbbd242 630 }
26ac0430 631
6e44cca8 632 char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
26ac0430 633
4e3f4dc7 634 if (req->url.getScheme() == AnyP::PROTO_URN) {
826a1fed 635 snprintf(urlbuf, MAX_URL, "urn:" SQUIDSTRINGPH,
af6a12ee 636 SQUIDSTRINGPRINT(req->urlpath));
71051277 637 return (urlbuf);
3cbbd242 638 }
26ac0430 639
6e44cca8
BR
640 size_t urllen;
641
4e3f4dc7 642 if (req->port != urlDefaultPort(req->url.getScheme())) {
92d6986d 643 urllen = snprintf(urlbuf, MAX_URL, "%s://" SQUIDSBUFPH "%s%s:%d",
4e3f4dc7 644 req->url.getScheme().c_str(),
92d6986d
AJ
645 SQUIDSBUFPRINT(req->url.userInfo()),
646 !req->url.userInfo().isEmpty() ? "@" : "",
26ac0430
AJ
647 req->GetHost(),
648 req->port
649 );
6e44cca8 650 } else {
92d6986d 651 urllen = snprintf(urlbuf, MAX_URL, "%s://" SQUIDSBUFPH "%s%s",
4e3f4dc7 652 req->url.getScheme().c_str(),
92d6986d
AJ
653 SQUIDSBUFPRINT(req->url.userInfo()),
654 !req->url.userInfo().isEmpty() ? "@" : "",
26ac0430
AJ
655 req->GetHost()
656 );
6e44cca8
BR
657 }
658
659 if (relUrl[0] == '/') {
660 strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
661 } else {
b4f2886c 662 const char *path = req->urlpath.termedBuf();
6e44cca8
BR
663 const char *last_slash = strrchr(path, '/');
664
665 if (last_slash == NULL) {
5db6bf73
FC
666 urlbuf[urllen] = '/';
667 ++urllen;
6e44cca8
BR
668 strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
669 } else {
5db6bf73 670 ++last_slash;
6e44cca8
BR
671 size_t pathlen = last_slash - path;
672 if (pathlen > MAX_URL - urllen - 1) {
673 pathlen = MAX_URL - urllen - 1;
674 }
675 strncpy(&urlbuf[urllen], path, pathlen);
676 urllen += pathlen;
677 if (urllen + 1 < MAX_URL) {
678 strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
679 }
680 }
681 }
3cbbd242 682
bc9ad11f 683 return (urlbuf);
3cbbd242 684}
685
/// Lower-cases one character; the cast keeps bytes >= 0x80 out of the
/// negative range, which tolower() does not accept.
static inline int
domainCharLower(const char c)
{
    return tolower(static_cast<unsigned char>(c));
}

/*
 * matchDomainName() compares a hostname with a domainname according
 * to the following rules:
 *
 *    HOST          DOMAIN        MATCH?
 * ------------- -------------    ------
 *    foo.com       foo.com         YES
 *   .foo.com       foo.com         YES
 *  x.foo.com       foo.com          NO
 *    foo.com      .foo.com         YES
 *   .foo.com      .foo.com         YES
 *  x.foo.com      .foo.com         YES
 *
 *  We strip leading dots on hosts (but not domains!) so that
 *  ".foo.com" is always the same as "foo.com".
 *
 *  The comparison ignores case.
 *
 *  An empty host (including one made only of dots) never matches and
 *  sorts before every domain; an empty domain never matches a non-empty
 *  host and sorts before it.
 *
 *  Return values:
 *     0 means the host matches the domain
 *     a positive value means the host is greater than the domain
 *     a negative value means the host is less than the domain
 */
int
matchDomainName(const char *h, const char *d)
{
    while ('.' == *h)
        ++h;

    int hl = strlen(h);

    /* nothing to compare: bail out before indexing h[-1] below */
    if (hl == 0)
        return -1;

    int dl = strlen(d);

    /* likewise for d[-1] */
    if (dl == 0)
        return 1;

    /*
     * Start at the ends of the two strings and work towards the
     * beginning.
     */
    while (domainCharLower(h[--hl]) == domainCharLower(d[--dl])) {
        if (hl == 0 && dl == 0) {
            /*
             * We made it all the way to the beginning of both
             * strings without finding any difference.
             */
            return 0;
        }

        if (0 == hl) {
            /*
             * The host string is shorter than the domain string.
             * There is only one case when this can be a match.
             * If the domain is just one character longer, and if
             * that character is a leading '.' then we call it a
             * match.
             */

            if (1 == dl && '.' == d[0])
                return 0;
            else
                return -1;
        }

        if (0 == dl) {
            /*
             * The domain string is shorter than the host string.
             * This is a match only if the first domain character
             * is a leading '.'.
             */

            if ('.' == d[0])
                return 0;
            else
                return 1;
        }
    }

    /*
     * We found different characters in the same position (from the end).
     */
    /*
     * If one of those characters is '.' then it is special. In order
     * for splay tree sorting to work properly, "x-foo.com" must
     * be greater than ".foo.com" even though '-' is less than '.'.
     */
    if ('.' == d[dl])
        return 1;

    if ('.' == h[hl])
        return -1;

    return (domainCharLower(h[hl]) - domainCharLower(d[dl]));
}
a8f7d3ee 779
985c86bc 780/*
610ee341 781 * return true if we can serve requests for this method.
985c86bc 782 */
b8d8561b 783int
190154cf 784urlCheckRequest(const HttpRequest * r)
a8f7d3ee 785{
786 int rc = 0;
610ee341 787 /* protocol "independent" methods
788 *
789 * actually these methods are specific to HTTP:
790 * they are methods we recieve on our HTTP port,
791 * and if we had a FTP listener would not be relevant
792 * there.
793 *
794 * So, we should delegate them to HTTP. The problem is that we
795 * do not have a default protocol from the client side of HTTP.
796 */
62e76326 797
c2a7cefd 798 if (r->method == Http::METHOD_CONNECT)
62e76326 799 return 1;
800
77ce6ba9
AR
801 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
802 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
c2a7cefd 803 if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE)
2e260208 804 return (r->header.getInt64(HDR_MAX_FORWARDS) == 0 || URL::Asterisk().cmp(r->urlpath.rawBuf(), r->urlpath.size()) != 0);
62e76326 805
c2a7cefd 806 if (r->method == Http::METHOD_PURGE)
62e76326 807 return 1;
808
99edd1c3 809 /* does method match the protocol? */
4e3f4dc7 810 switch (r->url.getScheme()) {
62e76326 811
0c3d3f65 812 case AnyP::PROTO_URN:
62e76326 813
0c3d3f65 814 case AnyP::PROTO_HTTP:
62e76326 815
39a19cb7 816 case AnyP::PROTO_CACHE_OBJECT:
62e76326 817 rc = 1;
818 break;
819
0c3d3f65 820 case AnyP::PROTO_FTP:
62e76326 821
c2a7cefd 822 if (r->method == Http::METHOD_PUT)
62e76326 823 rc = 1;
824
0c3d3f65 825 case AnyP::PROTO_GOPHER:
62e76326 826
0c3d3f65 827 case AnyP::PROTO_WAIS:
62e76326 828
0c3d3f65 829 case AnyP::PROTO_WHOIS:
c2a7cefd 830 if (r->method == Http::METHOD_GET)
62e76326 831 rc = 1;
c2a7cefd 832 else if (r->method == Http::METHOD_HEAD)
62e76326 833 rc = 1;
834
835 break;
836
0c3d3f65 837 case AnyP::PROTO_HTTPS:
cb4f4424 838#if USE_OPENSSL
62e76326 839
840 rc = 1;
841
842 break;
843
1f7c9178 844#else
62e76326 845 /*
846 * Squid can't originate an SSL connection, so it should
847 * never receive an "https:" URL. It should always be
848 * CONNECT instead.
849 */
850 rc = 0;
851
1f7c9178 852#endif
62e76326 853
a8f7d3ee 854 default:
62e76326 855 break;
a8f7d3ee 856 }
62e76326 857
a8f7d3ee 858 return rc;
859}
9ce5e3e6 860
861/*
862 * Quick-n-dirty host extraction from a URL. Steps:
5999b776 863 * Look for a colon
864 * Skip any '/' after the colon
865 * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
866 * Look for an ending '/' or ':' and terminate
867 * Look for login info preceeded by '@'
9ce5e3e6 868 */
77bfc324 869
870class URLHostName
871{
872
873public:
874 char * extract(char const *url);
875
876private:
877 static char Host [SQUIDHOSTNAMELEN];
878 void init(char const *);
879 void findHostStart();
880 void trimTrailingChars();
881 void trimAuth();
882 char const *hostStart;
883 char const *url;
884};
885
9ce5e3e6 886char *
887urlHostname(const char *url)
888{
77bfc324 889 return URLHostName().extract(url);
890}
62e76326 891
77bfc324 892char URLHostName::Host[SQUIDHOSTNAMELEN];
893
894void
895URLHostName::init(char const *aUrl)
896{
897 Host[0] = '\0';
aa1cafc4 898 url = aUrl;
77bfc324 899}
62e76326 900
77bfc324 901void
902URLHostName::findHostStart()
903{
904 if (NULL == (hostStart = strchr(url, ':')))
905 return;
62e76326 906
77bfc324 907 ++hostStart;
62e76326 908
77bfc324 909 while (*hostStart != '\0' && *hostStart == '/')
910 ++hostStart;
cc192b50 911
cc192b50 912 if (*hostStart == ']')
913 ++hostStart;
77bfc324 914}
62e76326 915
77bfc324 916void
917URLHostName::trimTrailingChars()
918{
919 char *t;
920
921 if ((t = strchr(Host, '/')))
62e76326 922 *t = '\0';
923
cc192b50 924 if ((t = strrchr(Host, ':')))
62e76326 925 *t = '\0';
cc192b50 926
cc192b50 927 if ((t = strchr(Host, ']')))
928 *t = '\0';
77bfc324 929}
62e76326 930
77bfc324 931void
932URLHostName::trimAuth()
933{
934 char *t;
935
936 if ((t = strrchr(Host, '@'))) {
5db6bf73 937 ++t;
41d00cd3 938 memmove(Host, t, strlen(t) + 1);
9ce5e3e6 939 }
77bfc324 940}
941
942char *
943URLHostName::extract(char const *aUrl)
944{
945 init(aUrl);
946 findHostStart();
947
948 if (hostStart == NULL)
949 return NULL;
950
951 xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
952
953 trimTrailingChars();
954
955 trimAuth();
62e76326 956
77bfc324 957 return Host;
9ce5e3e6 958}
f53969cc 959