]> git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
Removed CVS $ markers
[thirdparty/squid.git] / src / url.cc
1
2 /*
3 * DEBUG: section 23 URL Parsing
4 * AUTHOR: Duane Wessels
5 *
6 * SQUID Web Proxy Cache http://www.squid-cache.org/
7 * ----------------------------------------------------------
8 *
9 * Squid is the result of efforts by numerous individuals from
10 * the Internet community; see the CONTRIBUTORS file for full
11 * details. Many organizations have provided support for Squid's
12 * development; see the SPONSORS file for full details. Squid is
13 * Copyrighted (C) 2001 by the Regents of the University of
14 * California; see the COPYRIGHT file for full details. Squid
15 * incorporates software developed and/or copyrighted by other
16 * sources; see the CREDITS file for full details.
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 *
28 * You should have received a copy of the GNU General Public License
29 * along with this program; if not, write to the Free Software
30 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
31 *
32 */
33
34 #include "squid.h"
35 #include "globals.h"
36 #include "HttpRequest.h"
37 #include "rfc1738.h"
38 #include "SquidString.h"
39 #include "URL.h"
40 #include "URLScheme.h"
41
/* Forward declarations of the file-local helpers that urlParse() hands its results to. */

/// Stores already-split URL components in a new or caller-supplied HttpRequest.
static HttpRequest *urlParseFinish(const HttpRequestMethod& method,
                                   const AnyP::ProtocolType protocol,
                                   const char *const urlpath,
                                   const char *const host,
                                   const char *const login,
                                   const int port,
                                   HttpRequest *request);
/// Builds a PROTO_URN request from a string that starts with "urn:".
static HttpRequest *urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request);
/// Characters urlParse() accepts in a hostname when
/// Config.onoff.allow_underscore is set: same as valid_hostname_chars plus '_'.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-._"
    "[:]"
    ;
/// Characters urlParse() accepts in a hostname when
/// Config.onoff.check_hostnames is set; "[:]" is included alongside letters,
/// digits, '-' and '.'.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-."
    "[:]"
    ;
62
/**
 * Start-up self-check for this module: asserts that the protocol name
 * table is in step with its enum and that matchDomainName() orders and
 * matches a set of sample names as expected.
 */
void
urlInitialize(void)
{
    debugs(23, 5, "urlInitialize: Initializing...");
    /* this ensures that the number of protocol strings is the same as
     * the enum slots allocated because the last enum is always 'MAX'.
     */
    assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
    /*
     * These test that our matchDomainName() function works the
     * way we expect it to.
     */
    /* matches: identical names, with or without a leading dot */
    assert(0 == matchDomainName("foo.com", "foo.com"));
    assert(0 == matchDomainName(".foo.com", "foo.com"));
    assert(0 == matchDomainName("foo.com", ".foo.com"));
    assert(0 == matchDomainName(".foo.com", ".foo.com"));
    /* a sub-domain matches only a dot-prefixed domain */
    assert(0 == matchDomainName("x.foo.com", ".foo.com"));
    assert(0 != matchDomainName("x.foo.com", "foo.com"));
    assert(0 != matchDomainName("foo.com", "x.foo.com"));
    /* unrelated names never match */
    assert(0 != matchDomainName("bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", ".foo.com"));
    assert(0 != matchDomainName("bar.com", ".foo.com"));
    /* sign of the result for non-matching names */
    assert(0 < matchDomainName("zzz.com", "foo.com"));
    assert(0 > matchDomainName("aaa.com", "foo.com"));
    /* comparison ignores case */
    assert(0 == matchDomainName("FOO.com", "foo.COM"));
    assert(0 < matchDomainName("bfoo.com", "afoo.com"));
    assert(0 > matchDomainName("afoo.com", "bfoo.com"));
    /* a '.' in the domain sorts below any other character in the host */
    assert(0 < matchDomainName("x-foo.com", ".foo.com"));
    /* more cases? */
}
94
95 /**
96 * urlParseProtocol() takes begin (b) and end (e) pointers, but for
97 * backwards compatibility, e defaults to NULL, in which case we
98 * assume b is NULL-terminated.
99 */
100 AnyP::ProtocolType
101 urlParseProtocol(const char *b, const char *e)
102 {
103 /*
104 * if e is NULL, b must be NULL terminated and we
105 * make e point to the first whitespace character
106 * after b.
107 */
108
109 if (NULL == e)
110 e = b + strcspn(b, ":");
111
112 int len = e - b;
113
114 /* test common stuff first */
115
116 if (strncasecmp(b, "http", len) == 0)
117 return AnyP::PROTO_HTTP;
118
119 if (strncasecmp(b, "ftp", len) == 0)
120 return AnyP::PROTO_FTP;
121
122 if (strncasecmp(b, "https", len) == 0)
123 return AnyP::PROTO_HTTPS;
124
125 if (strncasecmp(b, "file", len) == 0)
126 return AnyP::PROTO_FTP;
127
128 if (strncasecmp(b, "coap", len) == 0)
129 return AnyP::PROTO_COAP;
130
131 if (strncasecmp(b, "coaps", len) == 0)
132 return AnyP::PROTO_COAPS;
133
134 if (strncasecmp(b, "gopher", len) == 0)
135 return AnyP::PROTO_GOPHER;
136
137 if (strncasecmp(b, "wais", len) == 0)
138 return AnyP::PROTO_WAIS;
139
140 if (strncasecmp(b, "cache_object", len) == 0)
141 return AnyP::PROTO_CACHE_OBJECT;
142
143 if (strncasecmp(b, "urn", len) == 0)
144 return AnyP::PROTO_URN;
145
146 if (strncasecmp(b, "whois", len) == 0)
147 return AnyP::PROTO_WHOIS;
148
149 if (strncasecmp(b, "internal", len) == 0)
150 return AnyP::PROTO_INTERNAL;
151
152 return AnyP::PROTO_NONE;
153 }
154
155 int
156 urlDefaultPort(AnyP::ProtocolType p)
157 {
158 switch (p) {
159
160 case AnyP::PROTO_HTTP:
161 return 80;
162
163 case AnyP::PROTO_HTTPS:
164 return 443;
165
166 case AnyP::PROTO_FTP:
167 return 21;
168
169 case AnyP::PROTO_COAP:
170 case AnyP::PROTO_COAPS:
171 // coaps:// default is TBA as of draft-ietf-core-coap-08.
172 // Assuming IANA policy of allocating same port for base and TLS protocol versions will occur.
173 return 5683;
174
175 case AnyP::PROTO_GOPHER:
176 return 70;
177
178 case AnyP::PROTO_WAIS:
179 return 210;
180
181 case AnyP::PROTO_CACHE_OBJECT:
182
183 case AnyP::PROTO_INTERNAL:
184 return CACHE_HTTP_PORT;
185
186 case AnyP::PROTO_WHOIS:
187 return 43;
188
189 default:
190 return 0;
191 }
192 }
193
194 /*
195 * Parse a URI/URL.
196 *
197 * If the 'request' arg is non-NULL, put parsed values there instead
198 * of allocating a new HttpRequest.
199 *
200 * This abuses HttpRequest as a way of representing the parsed url
201 * and its components.
202 * method is used to switch parsers and to init the HttpRequest.
203 * If method is METHOD_CONNECT, then rather than a URL a hostname:port is
204 * looked for.
205 * The url is non const so that if its too long we can NULL-terminate it in place.
206 */
207
208 /*
209 * This routine parses a URL. Its assumed that the URL is complete -
210 * ie, the end of the string is the end of the URL. Don't pass a partial
211 * URL here as this routine doesn't have any way of knowing whether
212 * its partial or not (ie, it handles the case of no trailing slash as
213 * being "end of host with implied path of /".
214 */
215 HttpRequest *
216 urlParse(const HttpRequestMethod& method, char *url, HttpRequest *request)
217 {
218 LOCAL_ARRAY(char, proto, MAX_URL);
219 LOCAL_ARRAY(char, login, MAX_URL);
220 LOCAL_ARRAY(char, host, MAX_URL);
221 LOCAL_ARRAY(char, urlpath, MAX_URL);
222 char *t = NULL;
223 char *q = NULL;
224 int port;
225 AnyP::ProtocolType protocol = AnyP::PROTO_NONE;
226 int l;
227 int i;
228 const char *src;
229 char *dst;
230 proto[0] = host[0] = urlpath[0] = login[0] = '\0';
231
232 if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
233 /* terminate so it doesn't overflow other buffers */
234 *(url + (MAX_URL >> 1)) = '\0';
235 debugs(23, DBG_IMPORTANT, "urlParse: URL too large (" << l << " bytes)");
236 return NULL;
237 }
238 if (method == METHOD_CONNECT) {
239 port = CONNECT_PORT;
240
241 if (sscanf(url, "[%[^]]]:%d", host, &port) < 1)
242 if (sscanf(url, "%[^:]:%d", host, &port) < 1)
243 return NULL;
244
245 } else if ((method == METHOD_OPTIONS || method == METHOD_TRACE) &&
246 strcmp(url, "*") == 0) {
247 protocol = AnyP::PROTO_HTTP;
248 port = urlDefaultPort(protocol);
249 return urlParseFinish(method, protocol, url, host, login, port, request);
250 } else if (!strncmp(url, "urn:", 4)) {
251 return urnParse(method, url, request);
252 } else {
253 /* Parse the URL: */
254 src = url;
255 i = 0;
256 /* Find first : - everything before is protocol */
257 for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) {
258 *dst = *src;
259 }
260 if (i >= l)
261 return NULL;
262 *dst = '\0';
263
264 /* Then its :// */
265 if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
266 return NULL;
267 i += 3;
268 src += 3;
269
270 /* Then everything until first /; thats host (and port; which we'll look for here later) */
271 // bug 1881: If we don't get a "/" then we imply it was there
272 // bug 3074: We could just be given a "?" or "#". These also imply "/"
273 // bug 3233: whitespace is also a hostname delimiter.
274 for (dst = host; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
275 *dst = *src;
276 }
277
278 /*
279 * We can't check for "i >= l" here because we could be at the end of the line
280 * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
281 * been -given- a valid URL and the path is just '/'.
282 */
283 if (i > l)
284 return NULL;
285 *dst = '\0';
286
287 // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
288 if (*src == '?' || *src == '#' || *src == '\0') {
289 urlpath[0] = '/';
290 dst = &urlpath[1];
291 } else {
292 dst = urlpath;
293 }
294 /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */
295 for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
296 *dst = *src;
297 }
298
299 /* We -could- be at the end of the buffer here */
300 if (i > l)
301 return NULL;
302 /* If the URL path is empty we set it to be "/" */
303 if (dst == urlpath) {
304 *dst = '/';
305 ++dst;
306 }
307 *dst = '\0';
308
309 protocol = urlParseProtocol(proto);
310 port = urlDefaultPort(protocol);
311
312 /* Is there any login information? (we should eventually parse it above) */
313 if ((t = strrchr(host, '@'))) {
314 strcpy((char *) login, (char *) host);
315 t = strrchr(login, '@');
316 *t = 0;
317 strcpy((char *) host, t + 1);
318 }
319
320 /* Is there any host information? (we should eventually parse it above) */
321 if (*host == '[') {
322 /* strip any IPA brackets. valid under IPv6. */
323 dst = host;
324 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
325 src = host;
326 ++src;
327 l = strlen(host);
328 i = 1;
329 for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
330 *dst = *src;
331 }
332
333 /* we moved in-place, so truncate the actual hostname found */
334 *dst = '\0';
335 ++dst;
336
337 /* skip ahead to either start of port, or original EOS */
338 while (*dst != '\0' && *dst != ':')
339 ++dst;
340 t = dst;
341 } else {
342 t = strrchr(host, ':');
343
344 if (t != strchr(host,':') ) {
345 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
346 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
347 /* therefore we MUST accept the case where they are not bracketed at all. */
348 t = NULL;
349 }
350 }
351
352 // Bug 3183 sanity check: If scheme is present, host must be too.
353 if (protocol != AnyP::PROTO_NONE && (host == NULL || *host == '\0')) {
354 debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
355 return NULL;
356 }
357
358 if (t && *t == ':') {
359 *t = '\0';
360 ++t;
361 port = atoi(t);
362 }
363 }
364
365 for (t = host; *t; ++t)
366 *t = xtolower(*t);
367
368 if (stringHasWhitespace(host)) {
369 if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
370 t = q = host;
371 while (*t) {
372 if (!xisspace(*t)) {
373 *q = *t;
374 ++q;
375 }
376 ++t;
377 }
378 *q = '\0';
379 }
380 }
381
382 debugs(23, 3, "urlParse: Split URL '" << url << "' into proto='" << proto << "', host='" << host << "', port='" << port << "', path='" << urlpath << "'");
383
384 if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
385 debugs(23, DBG_IMPORTANT, "urlParse: Illegal character in hostname '" << host << "'");
386 return NULL;
387 }
388
389 /* For IPV6 addresses also check for a colon */
390 if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':'))
391 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
392
393 /* remove trailing dots from hostnames */
394 while ((l = strlen(host)) > 0 && host[--l] == '.')
395 host[l] = '\0';
396
397 /* reject duplicate or leading dots */
398 if (strstr(host, "..") || *host == '.') {
399 debugs(23, DBG_IMPORTANT, "urlParse: Illegal hostname '" << host << "'");
400 return NULL;
401 }
402
403 if (port < 1 || port > 65535) {
404 debugs(23, 3, "urlParse: Invalid port '" << port << "'");
405 return NULL;
406 }
407
408 #if HARDCODE_DENY_PORTS
409 /* These ports are filtered in the default squid.conf, but
410 * maybe someone wants them hardcoded... */
411 if (port == 7 || port == 9 || port == 19) {
412 debugs(23, DBG_CRITICAL, "urlParse: Deny access to port " << port);
413 return NULL;
414 }
415 #endif
416
417 if (stringHasWhitespace(urlpath)) {
418 debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");
419
420 switch (Config.uri_whitespace) {
421
422 case URI_WHITESPACE_DENY:
423 return NULL;
424
425 case URI_WHITESPACE_ALLOW:
426 break;
427
428 case URI_WHITESPACE_ENCODE:
429 t = rfc1738_escape_unescaped(urlpath);
430 xstrncpy(urlpath, t, MAX_URL);
431 break;
432
433 case URI_WHITESPACE_CHOP:
434 *(urlpath + strcspn(urlpath, w_space)) = '\0';
435 break;
436
437 case URI_WHITESPACE_STRIP:
438 default:
439 t = q = urlpath;
440 while (*t) {
441 if (!xisspace(*t)) {
442 *q = *t;
443 ++q;
444 }
445 ++t;
446 }
447 *q = '\0';
448 }
449 }
450
451 return urlParseFinish(method, protocol, urlpath, host, login, port, request);
452 }
453
454 /**
455 * Update request with parsed URI data. If the request arg is
456 * non-NULL, put parsed values there instead of allocating a new
457 * HttpRequest.
458 */
459 static HttpRequest *
460 urlParseFinish(const HttpRequestMethod& method,
461 const AnyP::ProtocolType protocol,
462 const char *const urlpath,
463 const char *const host,
464 const char *const login,
465 const int port,
466 HttpRequest *request)
467 {
468 if (NULL == request)
469 request = new HttpRequest(method, protocol, urlpath);
470 else {
471 request->initHTTP(method, protocol, urlpath);
472 safe_free(request->canonical);
473 }
474
475 request->SetHost(host);
476 xstrncpy(request->login, login, MAX_LOGIN_SZ);
477 request->port = (unsigned short) port;
478 return request;
479 }
480
481 static HttpRequest *
482 urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request)
483 {
484 debugs(50, 5, "urnParse: " << urn);
485 if (request) {
486 request->initHTTP(method, AnyP::PROTO_URN, urn + 4);
487 safe_free(request->canonical);
488 return request;
489 }
490
491 return new HttpRequest(method, AnyP::PROTO_URN, urn + 4);
492 }
493
494 const char *
495 urlCanonical(HttpRequest * request)
496 {
497 LOCAL_ARRAY(char, portbuf, 32);
498 /// \todo AYJ: Performance: making this a ptr and allocating when needed will be better than a write and future xstrdup().
499 LOCAL_ARRAY(char, urlbuf, MAX_URL);
500
501 if (request->canonical)
502 return request->canonical;
503
504 if (request->protocol == AnyP::PROTO_URN) {
505 snprintf(urlbuf, MAX_URL, "urn:" SQUIDSTRINGPH,
506 SQUIDSTRINGPRINT(request->urlpath));
507 } else {
508 /// \todo AYJ: this could use "if..else and method == METHOD_CONNECT" easier.
509 switch (request->method.id()) {
510
511 case METHOD_CONNECT:
512 snprintf(urlbuf, MAX_URL, "%s:%d", request->GetHost(), request->port);
513 break;
514
515 default:
516 portbuf[0] = '\0';
517
518 if (request->port != urlDefaultPort(request->protocol))
519 snprintf(portbuf, 32, ":%d", request->port);
520
521 const URLScheme sch = request->protocol; // temporary, until bug 1961 URL handling is fixed.
522 snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s" SQUIDSTRINGPH,
523 sch.const_str(),
524 request->login,
525 *request->login ? "@" : null_string,
526 request->GetHost(),
527 portbuf,
528 SQUIDSTRINGPRINT(request->urlpath));
529
530 break;
531 }
532 }
533
534 return (request->canonical = xstrdup(urlbuf));
535 }
536
537 /** \todo AYJ: Performance: This is an *almost* duplicate of urlCanonical. But elides the query-string.
538 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
539 * and never copy the query-string part in the first place
540 */
541 char *
542 urlCanonicalClean(const HttpRequest * request)
543 {
544 LOCAL_ARRAY(char, buf, MAX_URL);
545 LOCAL_ARRAY(char, portbuf, 32);
546 LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
547 char *t;
548
549 if (request->protocol == AnyP::PROTO_URN) {
550 snprintf(buf, MAX_URL, "urn:" SQUIDSTRINGPH,
551 SQUIDSTRINGPRINT(request->urlpath));
552 } else {
553 /// \todo AYJ: this could use "if..else and method == METHOD_CONNECT" easier.
554 switch (request->method.id()) {
555
556 case METHOD_CONNECT:
557 snprintf(buf, MAX_URL, "%s:%d",
558 request->GetHost(),
559 request->port);
560 break;
561
562 default:
563 portbuf[0] = '\0';
564
565 if (request->port != urlDefaultPort(request->protocol))
566 snprintf(portbuf, 32, ":%d", request->port);
567
568 loginbuf[0] = '\0';
569
570 if ((int) strlen(request->login) > 0) {
571 strcpy(loginbuf, request->login);
572
573 if ((t = strchr(loginbuf, ':')))
574 *t = '\0';
575
576 strcat(loginbuf, "@");
577 }
578
579 const URLScheme sch = request->protocol; // temporary, until bug 1961 URL handling is fixed.
580 snprintf(buf, MAX_URL, "%s://%s%s%s" SQUIDSTRINGPH,
581 sch.const_str(),
582 loginbuf,
583 request->GetHost(),
584 portbuf,
585 SQUIDSTRINGPRINT(request->urlpath));
586 /*
587 * strip arguments AFTER a question-mark
588 */
589
590 if (Config.onoff.strip_query_terms)
591 if ((t = strchr(buf, '?'))) {
592 ++t;
593 *t = '\0';
594 }
595
596 break;
597 }
598 }
599
600 if (stringHasCntl(buf))
601 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
602
603 return buf;
604 }
605
606 /**
607 * Yet another alternative to urlCanonical.
608 * This one addes the https:// parts to METHOD_CONNECT URL
609 * for use in error page outputs.
610 * Luckily we can leverage the others instead of duplicating.
611 */
612 const char *
613 urlCanonicalFakeHttps(const HttpRequest * request)
614 {
615 LOCAL_ARRAY(char, buf, MAX_URL);
616
617 // method CONNECT and port HTTPS
618 if (request->method == METHOD_CONNECT && request->port == 443) {
619 snprintf(buf, MAX_URL, "https://%s/*", request->GetHost());
620 return buf;
621 }
622
623 // else do the normal complete canonical thing.
624 return urlCanonicalClean(request);
625 }
626
/*
 * Test if a URL is relative.
 *
 * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 * appear before a ':'. So the URL is treated as absolute exactly when the
 * first ':' or '/' found in it is a ':'.
 *
 * A NULL or empty url is reported as not relative.
 */
bool
urlIsRelative(const char *url)
{
    if (url == NULL || *url == '\0')
        return false;

    /* first of ':' or '/', or the terminating NUL when there is neither */
    const char *const firstDelimiter = url + strcspn(url, ":/");

    return *firstDelimiter != ':';
}
652
653 /*
654 * Convert a relative URL to an absolute URL using the context of a given
655 * request.
656 *
657 * It is assumed that you have already ensured that the URL is relative.
658 *
659 * If NULL is returned it is an indication that the method in use in the
660 * request does not distinguish between relative and absolute and you should
661 * use the url unchanged.
662 *
663 * If non-NULL is returned, it is up to the caller to free the resulting
664 * memory using safe_free().
665 */
666 char *
667 urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
668 {
669
670 if (req->method.id() == METHOD_CONNECT) {
671 return (NULL);
672 }
673
674 char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
675
676 if (req->protocol == AnyP::PROTO_URN) {
677 snprintf(urlbuf, MAX_URL, "urn:" SQUIDSTRINGPH,
678 SQUIDSTRINGPRINT(req->urlpath));
679 return (urlbuf);
680 }
681
682 size_t urllen;
683
684 const URLScheme sch = req->protocol; // temporary, until bug 1961 URL handling is fixed.
685 if (req->port != urlDefaultPort(req->protocol)) {
686 urllen = snprintf(urlbuf, MAX_URL, "%s://%s%s%s:%d",
687 sch.const_str(),
688 req->login,
689 *req->login ? "@" : null_string,
690 req->GetHost(),
691 req->port
692 );
693 } else {
694 urllen = snprintf(urlbuf, MAX_URL, "%s://%s%s%s",
695 sch.const_str(),
696 req->login,
697 *req->login ? "@" : null_string,
698 req->GetHost()
699 );
700 }
701
702 if (relUrl[0] == '/') {
703 strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
704 } else {
705 const char *path = req->urlpath.termedBuf();
706 const char *last_slash = strrchr(path, '/');
707
708 if (last_slash == NULL) {
709 urlbuf[urllen] = '/';
710 ++urllen;
711 strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
712 } else {
713 ++last_slash;
714 size_t pathlen = last_slash - path;
715 if (pathlen > MAX_URL - urllen - 1) {
716 pathlen = MAX_URL - urllen - 1;
717 }
718 strncpy(&urlbuf[urllen], path, pathlen);
719 urllen += pathlen;
720 if (urllen + 1 < MAX_URL) {
721 strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
722 }
723 }
724 }
725
726 return (urlbuf);
727 }
728
729 /*
730 * matchDomainName() compares a hostname with a domainname according
731 * to the following rules:
732 *
733 * HOST DOMAIN MATCH?
734 * ------------- ------------- ------
735 * foo.com foo.com YES
736 * .foo.com foo.com YES
737 * x.foo.com foo.com NO
738 * foo.com .foo.com YES
739 * .foo.com .foo.com YES
740 * x.foo.com .foo.com YES
741 *
742 * We strip leading dots on hosts (but not domains!) so that
743 * ".foo.com" is is always the same as "foo.com".
744 *
745 * Return values:
746 * 0 means the host matches the domain
747 * 1 means the host is greater than the domain
748 * -1 means the host is less than the domain
749 */
750
751 int
752 matchDomainName(const char *h, const char *d)
753 {
754 int dl;
755 int hl;
756
757 while ('.' == *h)
758 ++h;
759
760 hl = strlen(h);
761
762 dl = strlen(d);
763
764 /*
765 * Start at the ends of the two strings and work towards the
766 * beginning.
767 */
768 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
769 if (hl == 0 && dl == 0) {
770 /*
771 * We made it all the way to the beginning of both
772 * strings without finding any difference.
773 */
774 return 0;
775 }
776
777 if (0 == hl) {
778 /*
779 * The host string is shorter than the domain string.
780 * There is only one case when this can be a match.
781 * If the domain is just one character longer, and if
782 * that character is a leading '.' then we call it a
783 * match.
784 */
785
786 if (1 == dl && '.' == d[0])
787 return 0;
788 else
789 return -1;
790 }
791
792 if (0 == dl) {
793 /*
794 * The domain string is shorter than the host string.
795 * This is a match only if the first domain character
796 * is a leading '.'.
797 */
798
799 if ('.' == d[0])
800 return 0;
801 else
802 return 1;
803 }
804 }
805
806 /*
807 * We found different characters in the same position (from the end).
808 */
809 /*
810 * If one of those character is '.' then its special. In order
811 * for splay tree sorting to work properly, "x-foo.com" must
812 * be greater than ".foo.com" even though '-' is less than '.'.
813 */
814 if ('.' == d[dl])
815 return 1;
816
817 if ('.' == h[hl])
818 return -1;
819
820 return (xtolower(h[hl]) - xtolower(d[dl]));
821 }
822
823 /*
824 * return true if we can serve requests for this method.
825 */
826 int
827 urlCheckRequest(const HttpRequest * r)
828 {
829 int rc = 0;
830 /* protocol "independent" methods
831 *
832 * actually these methods are specific to HTTP:
833 * they are methods we recieve on our HTTP port,
834 * and if we had a FTP listener would not be relevant
835 * there.
836 *
837 * So, we should delegate them to HTTP. The problem is that we
838 * do not have a default protocol from the client side of HTTP.
839 */
840
841 if (r->method == METHOD_CONNECT)
842 return 1;
843
844 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
845 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
846 if (r->method == METHOD_OPTIONS || r->method == METHOD_TRACE)
847 return (r->header.getInt64(HDR_MAX_FORWARDS) == 0 || r->urlpath != "*");
848
849 if (r->method == METHOD_PURGE)
850 return 1;
851
852 /* does method match the protocol? */
853 switch (r->protocol) {
854
855 case AnyP::PROTO_URN:
856
857 case AnyP::PROTO_HTTP:
858
859 case AnyP::PROTO_CACHE_OBJECT:
860 rc = 1;
861 break;
862
863 case AnyP::PROTO_FTP:
864
865 if (r->method == METHOD_PUT)
866 rc = 1;
867
868 case AnyP::PROTO_GOPHER:
869
870 case AnyP::PROTO_WAIS:
871
872 case AnyP::PROTO_WHOIS:
873 if (r->method == METHOD_GET)
874 rc = 1;
875 else if (r->method == METHOD_HEAD)
876 rc = 1;
877
878 break;
879
880 case AnyP::PROTO_HTTPS:
881 #if USE_SSL
882
883 rc = 1;
884
885 break;
886
887 #else
888 /*
889 * Squid can't originate an SSL connection, so it should
890 * never receive an "https:" URL. It should always be
891 * CONNECT instead.
892 */
893 rc = 0;
894
895 #endif
896
897 default:
898 break;
899 }
900
901 return rc;
902 }
903
904 /*
905 * Quick-n-dirty host extraction from a URL. Steps:
906 * Look for a colon
907 * Skip any '/' after the colon
908 * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
909 * Look for an ending '/' or ':' and terminate
910 * Look for login info preceeded by '@'
911 */
912
913 class URLHostName
914 {
915
916 public:
917 char * extract(char const *url);
918
919 private:
920 static char Host [SQUIDHOSTNAMELEN];
921 void init(char const *);
922 void findHostStart();
923 void trimTrailingChars();
924 void trimAuth();
925 char const *hostStart;
926 char const *url;
927 };
928
929 char *
930 urlHostname(const char *url)
931 {
932 return URLHostName().extract(url);
933 }
934
935 char URLHostName::Host[SQUIDHOSTNAMELEN];
936
937 void
938 URLHostName::init(char const *aUrl)
939 {
940 Host[0] = '\0';
941 url = aUrl;
942 }
943
944 void
945 URLHostName::findHostStart()
946 {
947 if (NULL == (hostStart = strchr(url, ':')))
948 return;
949
950 ++hostStart;
951
952 while (*hostStart != '\0' && *hostStart == '/')
953 ++hostStart;
954
955 if (*hostStart == ']')
956 ++hostStart;
957 }
958
959 void
960 URLHostName::trimTrailingChars()
961 {
962 char *t;
963
964 if ((t = strchr(Host, '/')))
965 *t = '\0';
966
967 if ((t = strrchr(Host, ':')))
968 *t = '\0';
969
970 if ((t = strchr(Host, ']')))
971 *t = '\0';
972 }
973
974 void
975 URLHostName::trimAuth()
976 {
977 char *t;
978
979 if ((t = strrchr(Host, '@'))) {
980 ++t;
981 memmove(Host, t, strlen(t) + 1);
982 }
983 }
984
985 char *
986 URLHostName::extract(char const *aUrl)
987 {
988 init(aUrl);
989 findHostStart();
990
991 if (hostStart == NULL)
992 return NULL;
993
994 xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
995
996 trimTrailingChars();
997
998 trimAuth();
999
1000 return Host;
1001 }
1002
/// Creates a URL whose scheme member is initialised without arguments.
URL::URL() : scheme()
{}

/// Creates a URL whose scheme member is initialised from aScheme.
URL::URL(URLScheme const &aScheme): scheme(aScheme)
{}