]> git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
Move clientCachable logic onto HttpRequest, as all the used fields in the
[thirdparty/squid.git] / src / url.cc
1
2 /*
3 * $Id: url.cc,v 1.155 2006/05/29 21:44:18 robertc Exp $
4 *
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
7 *
8 * SQUID Web Proxy Cache http://www.squid-cache.org/
9 * ----------------------------------------------------------
10 *
11 * Squid is the result of efforts by numerous individuals from
12 * the Internet community; see the CONTRIBUTORS file for full
13 * details. Many organizations have provided support for Squid's
14 * development; see the SPONSORS file for full details. Squid is
15 * Copyrighted (C) 2001 by the Regents of the University of
16 * California; see the COPYRIGHT file for full details. Squid
17 * incorporates software developed and/or copyrighted by other
18 * sources; see the CREDITS file for full details.
19 *
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
33 *
34 */
35
36 #include "URL.h"
37 #include "HttpRequest.h"
38 #include "URLScheme.h"
39
40 static HttpRequest *urnParse(method_t method, char *urn);
41 #if CHECK_HOSTNAMES
42 static const char *const valid_hostname_chars =
43 #if ALLOW_HOSTNAME_UNDERSCORES
44 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
45 "abcdefghijklmnopqrstuvwxyz"
46 "0123456789-._";
47 #else
48 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
49 "abcdefghijklmnopqrstuvwxyz"
50 "0123456789-."
51 ;
52 #endif
53 #endif /* CHECK_HOSTNAMES */
54
55 /* convert %xx in url string to a character
56 * Allocate a new string and return a pointer to converted string */
57
58 char *
59 url_convert_hex(char *org_url, int allocate)
60 {
61 static char code[] = "00";
62 char *url = NULL;
63 char *s = NULL;
64 char *t = NULL;
65 url = allocate ? (char *) xstrdup(org_url) : org_url;
66
67 if ((int) strlen(url) < 3 || !strchr(url, '%'))
68 return url;
69
70 for (s = t = url; *s; s++) {
71 if (*s == '%' && *(s + 1) && *(s + 2)) {
72 code[0] = *(++s);
73 code[1] = *(++s);
74 *t++ = (char) strtol(code, NULL, 16);
75 } else {
76 *t++ = *s;
77 }
78 }
79
80 do {
81 *t++ = *s;
82 } while (*s++);
83
84 return url;
85 }
86
87 void
88 urlInitialize(void)
89 {
90 debug(23, 5) ("urlInitialize: Initializing...\n");
91 /* this ensures that the number of protocol strings is the same as
92 * the enum slots allocated because the last enum is always 'TOTAL'.
93 */
94 assert(strcmp(ProtocolStr[PROTO_MAX], "TOTAL") == 0);
95 /*
96 * These test that our matchDomainName() function works the
97 * way we expect it to.
98 */
99 assert(0 == matchDomainName("foo.com", "foo.com"));
100 assert(0 == matchDomainName(".foo.com", "foo.com"));
101 assert(0 == matchDomainName("foo.com", ".foo.com"));
102 assert(0 == matchDomainName(".foo.com", ".foo.com"));
103 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
104 assert(0 != matchDomainName("x.foo.com", "foo.com"));
105 assert(0 != matchDomainName("foo.com", "x.foo.com"));
106 assert(0 != matchDomainName("bar.com", "foo.com"));
107 assert(0 != matchDomainName(".bar.com", "foo.com"));
108 assert(0 != matchDomainName(".bar.com", ".foo.com"));
109 assert(0 != matchDomainName("bar.com", ".foo.com"));
110 assert(0 < matchDomainName("zzz.com", "foo.com"));
111 assert(0 > matchDomainName("aaa.com", "foo.com"));
112 assert(0 == matchDomainName("FOO.com", "foo.COM"));
113 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
114 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
115 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
116 /* more cases? */
117 }
118
119 /*
120 * urlParseProtocol() takes begin (b) and end (e) pointers, but for
121 * backwards compatibility, e defaults to NULL, in which case we
122 * assume b is NULL-terminated.
123 */
124 protocol_t
125 urlParseProtocol(const char *b, const char *e)
126 {
127 /*
128 * if e is NULL, b must be NULL terminated and we
129 * make e point to the first whitespace character
130 * after b.
131 */
132
133 if (NULL == e)
134 e = b + strcspn(b, ":");
135
136 int len = e - b;
137
138 /* test common stuff first */
139
140 if (strncasecmp(b, "http", len) == 0)
141 return PROTO_HTTP;
142
143 if (strncasecmp(b, "ftp", len) == 0)
144 return PROTO_FTP;
145
146 if (strncasecmp(b, "https", len) == 0)
147 return PROTO_HTTPS;
148
149 if (strncasecmp(b, "file", len) == 0)
150 return PROTO_FTP;
151
152 if (strncasecmp(b, "gopher", len) == 0)
153 return PROTO_GOPHER;
154
155 if (strncasecmp(b, "wais", len) == 0)
156 return PROTO_WAIS;
157
158 if (strncasecmp(b, "cache_object", len) == 0)
159 return PROTO_CACHEOBJ;
160
161 if (strncasecmp(b, "urn", len) == 0)
162 return PROTO_URN;
163
164 if (strncasecmp(b, "whois", len) == 0)
165 return PROTO_WHOIS;
166
167 if (strncasecmp(b, "internal", len) == 0)
168 return PROTO_INTERNAL;
169
170 return PROTO_NONE;
171 }
172
173 int
174 urlDefaultPort(protocol_t p)
175 {
176 switch (p) {
177
178 case PROTO_HTTP:
179 return 80;
180
181 case PROTO_HTTPS:
182 return 443;
183
184 case PROTO_FTP:
185 return 21;
186
187 case PROTO_GOPHER:
188 return 70;
189
190 case PROTO_WAIS:
191 return 210;
192
193 case PROTO_CACHEOBJ:
194
195 case PROTO_INTERNAL:
196 return CACHE_HTTP_PORT;
197
198 case PROTO_WHOIS:
199 return 43;
200
201 default:
202 return 0;
203 }
204 }
205
206 /*
207 * Parse a URI/URL.
208 *
209 * If the 'request' arg is non-NULL, put parsed values there instead
210 * of allocating a new HttpRequest.
211 *
212 * This abuses HttpRequest as a way of representing the parsed url
213 * and its components.
214 * method is used to switch parsers and to init the HttpRequest.
215 * If method is METHOD_CONNECT, then rather than a URL a hostname:port is
216 * looked for.
217 * The url is non const so that if its too long we can NULL-terminate it in place.
218 */
219 HttpRequest *
220 urlParse(method_t method, char *url, HttpRequest *request)
221 {
222 LOCAL_ARRAY(char, proto, MAX_URL);
223 LOCAL_ARRAY(char, login, MAX_URL);
224 LOCAL_ARRAY(char, host, MAX_URL);
225 LOCAL_ARRAY(char, urlpath, MAX_URL);
226 char *t = NULL;
227 char *q = NULL;
228 int port;
229 protocol_t protocol = PROTO_NONE;
230 int l;
231 proto[0] = host[0] = urlpath[0] = login[0] = '\0';
232
233 if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
234 /* terminate so it doesn't overflow other buffers */
235 *(url + (MAX_URL >> 1)) = '\0';
236 debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l);
237 return NULL;
238 }
239
240 if (method == METHOD_CONNECT) {
241 port = CONNECT_PORT;
242
243 if (sscanf(url, "%[^:]:%d", host, &port) < 1)
244 return NULL;
245 } else if (!strncmp(url, "urn:", 4)) {
246 return urnParse(method, url);
247 } else {
248 if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
249 return NULL;
250
251 protocol = urlParseProtocol(proto);
252
253 port = urlDefaultPort(protocol);
254
255 /* Is there any login informaiton? */
256 if ((t = strrchr(host, '@'))) {
257 strcpy((char *) login, (char *) host);
258 t = strrchr(login, '@');
259 *t = 0;
260 strcpy((char *) host, t + 1);
261 }
262
263 if ((t = strrchr(host, ':'))) {
264 *t++ = '\0';
265
266 if (*t != '\0')
267 port = atoi(t);
268 }
269 }
270
271 for (t = host; *t; t++)
272 *t = xtolower(*t);
273
274 if (stringHasWhitespace(host)) {
275 if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
276 t = q = host;
277
278 while (*t) {
279 if (!xisspace(*t))
280 *q++ = *t;
281
282 t++;
283 }
284
285 *q = '\0';
286 }
287 }
288
289 #if CHECK_HOSTNAMES
290 if (Config.onoff.check_hostnames && strspn(host, valid_hostname_chars) != strlen(host)) {
291 debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host);
292 return NULL;
293 }
294
295 #endif
296 #if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
297 /* remove trailing dots from hostnames */
298 while ((l = strlen(host)) > 0 && host[--l] == '.')
299 host[l] = '\0';
300
301 /* remove duplicate dots */
302 while ((t = strstr(host, "..")))
303 xmemmove(t, t + 1, strlen(t));
304
305 #endif
306
307 if (Config.appendDomain && !strchr(host, '.'))
308 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
309
310 if (port < 1 || port > 65535) {
311 debug(23, 3) ("urlParse: Invalid port '%d'\n", port);
312 return NULL;
313 }
314
315 #ifdef HARDCODE_DENY_PORTS
316 /* These ports are filtered in the default squid.conf, but
317 * maybe someone wants them hardcoded... */
318 if (port == 7 || port == 9 || port == 19) {
319 debug(23, 0) ("urlParse: Deny access to port %d\n", port);
320 return NULL;
321 }
322
323 #endif
324 if (stringHasWhitespace(urlpath)) {
325 debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url);
326
327 switch (Config.uri_whitespace) {
328
329 case URI_WHITESPACE_DENY:
330 return NULL;
331
332 case URI_WHITESPACE_ALLOW:
333 break;
334
335 case URI_WHITESPACE_ENCODE:
336 t = rfc1738_escape_unescaped(urlpath);
337 xstrncpy(urlpath, t, MAX_URL);
338 break;
339
340 case URI_WHITESPACE_CHOP:
341 *(urlpath + strcspn(urlpath, w_space)) = '\0';
342 break;
343
344 case URI_WHITESPACE_STRIP:
345
346 default:
347 t = q = urlpath;
348
349 while (*t) {
350 if (!xisspace(*t))
351 *q++ = *t;
352
353 t++;
354 }
355
356 *q = '\0';
357 }
358 }
359
360 if (NULL == request)
361 request = new HttpRequest(method, protocol, urlpath);
362 else {
363 request->initHTTP(method, protocol, urlpath);
364 }
365
366 xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
367 xstrncpy(request->login, login, MAX_LOGIN_SZ);
368 request->port = (u_short) port;
369 return request;
370 }
371
372 static HttpRequest *
373 urnParse(method_t method, char *urn)
374 {
375 debug(50, 5) ("urnParse: %s\n", urn);
376 return new HttpRequest(method, PROTO_URN, urn + 4);
377 }
378
379 const char *
380 urlCanonical(HttpRequest * request)
381 {
382 LOCAL_ARRAY(char, portbuf, 32);
383 LOCAL_ARRAY(char, urlbuf, MAX_URL);
384
385 if (request->canonical)
386 return request->canonical;
387
388 if (request->protocol == PROTO_URN) {
389 snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.buf());
390 } else {
391 switch (request->method) {
392
393 case METHOD_CONNECT:
394 snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
395 break;
396
397 default:
398 portbuf[0] = '\0';
399
400 if (request->port != urlDefaultPort(request->protocol))
401 snprintf(portbuf, 32, ":%d", request->port);
402
403 snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
404 ProtocolStr[request->protocol],
405 request->login,
406 *request->login ? "@" : null_string,
407 request->host,
408 portbuf,
409 request->urlpath.buf());
410
411 break;
412 }
413 }
414
415 return (request->canonical = xstrdup(urlbuf));
416 }
417
418 char *
419 urlCanonicalClean(const HttpRequest * request)
420 {
421 LOCAL_ARRAY(char, buf, MAX_URL);
422 LOCAL_ARRAY(char, portbuf, 32);
423 LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
424 char *t;
425
426 if (request->protocol == PROTO_URN) {
427 snprintf(buf, MAX_URL, "urn:%s", request->urlpath.buf());
428 } else {
429 switch (request->method) {
430
431 case METHOD_CONNECT:
432 snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
433 break;
434
435 default:
436 portbuf[0] = '\0';
437
438 if (request->port != urlDefaultPort(request->protocol))
439 snprintf(portbuf, 32, ":%d", request->port);
440
441 loginbuf[0] = '\0';
442
443 if ((int) strlen(request->login) > 0) {
444 strcpy(loginbuf, request->login);
445
446 if ((t = strchr(loginbuf, ':')))
447 *t = '\0';
448
449 strcat(loginbuf, "@");
450 }
451
452 snprintf(buf, MAX_URL, "%s://%s%s%s%s",
453 ProtocolStr[request->protocol],
454 loginbuf,
455 request->host,
456 portbuf,
457 request->urlpath.buf());
458 /*
459 * strip arguments AFTER a question-mark
460 */
461
462 if (Config.onoff.strip_query_terms)
463 if ((t = strchr(buf, '?')))
464 *(++t) = '\0';
465
466 break;
467 }
468 }
469
470 if (stringHasCntl(buf))
471 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
472
473 return buf;
474 }
475
476 /*
477 * matchDomainName() compares a hostname with a domainname according
478 * to the following rules:
479 *
480 * HOST DOMAIN MATCH?
481 * ------------- ------------- ------
482 * foo.com foo.com YES
483 * .foo.com foo.com YES
484 * x.foo.com foo.com NO
485 * foo.com .foo.com YES
486 * .foo.com .foo.com YES
487 * x.foo.com .foo.com YES
488 *
489 * We strip leading dots on hosts (but not domains!) so that
490 * ".foo.com" is is always the same as "foo.com".
491 *
492 * Return values:
493 * 0 means the host matches the domain
494 * 1 means the host is greater than the domain
495 * -1 means the host is less than the domain
496 */
497
498 int
499 matchDomainName(const char *h, const char *d)
500 {
501 int dl;
502 int hl;
503
504 while ('.' == *h)
505 h++;
506
507 hl = strlen(h);
508
509 dl = strlen(d);
510
511 /*
512 * Start at the ends of the two strings and work towards the
513 * beginning.
514 */
515 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
516 if (hl == 0 && dl == 0) {
517 /*
518 * We made it all the way to the beginning of both
519 * strings without finding any difference.
520 */
521 return 0;
522 }
523
524 if (0 == hl) {
525 /*
526 * The host string is shorter than the domain string.
527 * There is only one case when this can be a match.
528 * If the domain is just one character longer, and if
529 * that character is a leading '.' then we call it a
530 * match.
531 */
532
533 if (1 == dl && '.' == d[0])
534 return 0;
535 else
536 return -1;
537 }
538
539 if (0 == dl) {
540 /*
541 * The domain string is shorter than the host string.
542 * This is a match only if the first domain character
543 * is a leading '.'.
544 */
545
546 if ('.' == d[0])
547 return 0;
548 else
549 return 1;
550 }
551 }
552
553 /*
554 * We found different characters in the same position (from the end).
555 */
556 /*
557 * If one of those character is '.' then its special. In order
558 * for splay tree sorting to work properly, "x-foo.com" must
559 * be greater than ".foo.com" even though '-' is less than '.'.
560 */
561 if ('.' == d[dl])
562 return 1;
563
564 if ('.' == h[hl])
565 return -1;
566
567 return (xtolower(h[hl]) - xtolower(d[dl]));
568 }
569
570
571 /*
572 * return true if we can serve requests for this method.
573 */
574 int
575 urlCheckRequest(const HttpRequest * r)
576 {
577 int rc = 0;
578 /* protocol "independent" methods
579 *
580 * actually these methods are specific to HTTP:
581 * they are methods we recieve on our HTTP port,
582 * and if we had a FTP listener would not be relevant
583 * there.
584 *
585 * So, we should delegate them to HTTP. The problem is that we
586 * do not have a default protocol from the client side of HTTP.
587 */
588
589 if (r->method == METHOD_CONNECT)
590 return 1;
591
592 if (r->method == METHOD_TRACE)
593 return 1;
594
595 if (r->method == METHOD_PURGE)
596 return 1;
597
598 /* does method match the protocol? */
599 switch (r->protocol) {
600
601 case PROTO_URN:
602
603 case PROTO_HTTP:
604
605 case PROTO_CACHEOBJ:
606 rc = 1;
607 break;
608
609 case PROTO_FTP:
610
611 if (r->method == METHOD_PUT)
612 rc = 1;
613
614 case PROTO_GOPHER:
615
616 case PROTO_WAIS:
617
618 case PROTO_WHOIS:
619 if (r->method == METHOD_GET)
620 rc = 1;
621 else if (r->method == METHOD_HEAD)
622 rc = 1;
623
624 break;
625
626 case PROTO_HTTPS:
627 #ifdef USE_SSL
628
629 rc = 1;
630
631 break;
632
633 #else
634 /*
635 * Squid can't originate an SSL connection, so it should
636 * never receive an "https:" URL. It should always be
637 * CONNECT instead.
638 */
639 rc = 0;
640
641 #endif
642
643 default:
644 break;
645 }
646
647 return rc;
648 }
649
650 /*
651 * Quick-n-dirty host extraction from a URL. Steps:
652 * Look for a colon
653 * Skip any '/' after the colon
654 * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
655 * Look for an ending '/' or ':' and terminate
656 * Look for login info preceeded by '@'
657 */
658
659 class URLHostName
660 {
661
662 public:
663 char * extract(char const *url);
664
665 private:
666 static char Host [SQUIDHOSTNAMELEN];
667 void init(char const *);
668 void findHostStart();
669 void trimTrailingChars();
670 void trimAuth();
671 char const *hostStart;
672 char const *url;
673 };
674
675 char *
676 urlHostname(const char *url)
677 {
678 return URLHostName().extract(url);
679 }
680
681 char URLHostName::Host[SQUIDHOSTNAMELEN];
682
683 void
684 URLHostName::init(char const *aUrl)
685 {
686 Host[0] = '\0';
687 url = url;
688 }
689
690 void
691 URLHostName::findHostStart()
692 {
693 if (NULL == (hostStart = strchr(url, ':')))
694 return;
695
696 ++hostStart;
697
698 while (*hostStart != '\0' && *hostStart == '/')
699 ++hostStart;
700 }
701
702 void
703 URLHostName::trimTrailingChars()
704 {
705 char *t;
706
707 if ((t = strchr(Host, '/')))
708 *t = '\0';
709
710 if ((t = strchr(Host, ':')))
711 *t = '\0';
712 }
713
714 void
715 URLHostName::trimAuth()
716 {
717 char *t;
718
719 if ((t = strrchr(Host, '@'))) {
720 t++;
721 xmemmove(Host, t, strlen(t) + 1);
722 }
723 }
724
725 char *
726 URLHostName::extract(char const *aUrl)
727 {
728 init(aUrl);
729 findHostStart();
730
731 if (hostStart == NULL)
732 return NULL;
733
734 xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
735
736 trimTrailingChars();
737
738 trimAuth();
739
740 return Host;
741 }
742
/* Default constructor: value-initializes the scheme member. */
URL::URL() : scheme()
{}

/* Construct a URL whose scheme member is a copy of aScheme. */
URL::URL(URLScheme const &aScheme): scheme(aScheme)
{}