]> git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
BUGFIX: max_user_ip was broken: initialising to -1 meant that the ACL appeared
[thirdparty/squid.git] / src / url.cc
1
2 /*
3 * $Id: url.cc,v 1.152 2006/04/23 11:10:32 robertc Exp $
4 *
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
7 *
8 * SQUID Web Proxy Cache http://www.squid-cache.org/
9 * ----------------------------------------------------------
10 *
11 * Squid is the result of efforts by numerous individuals from
12 * the Internet community; see the CONTRIBUTORS file for full
13 * details. Many organizations have provided support for Squid's
14 * development; see the SPONSORS file for full details. Squid is
15 * Copyrighted (C) 2001 by the Regents of the University of
16 * California; see the COPYRIGHT file for full details. Squid
17 * incorporates software developed and/or copyrighted by other
18 * sources; see the CREDITS file for full details.
19 *
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
33 *
34 */
35
36 #include "squid.h"
37 #include "HttpRequest.h"
38 #include "wordlist.h"
39
/*
 * Printable request method names.  The table is indexed with method_t
 * values (see urlParseMethod() and urlExtMethodAdd()).  Entries of the
 * form "%EXTnn" are placeholders that urlExtMethodAdd() overwrites with
 * configured extension method names.
 */
const char *RequestMethodStr[] = {
    "NONE",

    "GET", "POST", "PUT", "HEAD", "CONNECT",
    "TRACE", "PURGE", "OPTIONS", "DELETE",

    "PROPFIND", "PROPPATCH", "MKCOL", "COPY", "MOVE",
    "LOCK", "UNLOCK",

    "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH", "BCOPY",

    "SEARCH", "SUBSCRIBE", "UNSUBSCRIBE", "POLL", "REPORT",

    /* extension method placeholders */
    "%EXT00", "%EXT01", "%EXT02", "%EXT03", "%EXT04",
    "%EXT05", "%EXT06", "%EXT07", "%EXT08", "%EXT09",
    "%EXT10", "%EXT11", "%EXT12", "%EXT13", "%EXT14",
    "%EXT15", "%EXT16", "%EXT17", "%EXT18", "%EXT19",

    "ERROR"
};
91
/*
 * Printable protocol names, indexed with protocol_t values (see
 * urlCanonical()).  urlInitialize() asserts that this table holds
 * exactly PROTO_MAX + 1 entries.
 */
const char *ProtocolStr[] = {
    "NONE",
    "http", "ftp", "gopher", "wais",
    "cache_object",
    "icp",
#if USE_HTCP
    "htcp",
#endif
    "urn", "whois", "internal", "https",
    "TOTAL"
};
110
111 static HttpRequest *urnParse(method_t method, char *urn);
112 #if CHECK_HOSTNAMES
113 static const char *const valid_hostname_chars =
114 #if ALLOW_HOSTNAME_UNDERSCORES
115 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
116 "abcdefghijklmnopqrstuvwxyz"
117 "0123456789-._";
118 #else
119 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
120 "abcdefghijklmnopqrstuvwxyz"
121 "0123456789-."
122 ;
123 #endif
124 #endif /* CHECK_HOSTNAMES */
125
126 /* convert %xx in url string to a character
127 * Allocate a new string and return a pointer to converted string */
128
129 char *
130 url_convert_hex(char *org_url, int allocate)
131 {
132 static char code[] = "00";
133 char *url = NULL;
134 char *s = NULL;
135 char *t = NULL;
136 url = allocate ? (char *) xstrdup(org_url) : org_url;
137
138 if ((int) strlen(url) < 3 || !strchr(url, '%'))
139 return url;
140
141 for (s = t = url; *s; s++) {
142 if (*s == '%' && *(s + 1) && *(s + 2)) {
143 code[0] = *(++s);
144 code[1] = *(++s);
145 *t++ = (char) strtol(code, NULL, 16);
146 } else {
147 *t++ = *s;
148 }
149 }
150
151 do {
152 *t++ = *s;
153 } while (*s++);
154
155 return url;
156 }
157
158 void
159 urlInitialize(void)
160 {
161 debug(23, 5) ("urlInitialize: Initializing...\n");
162 assert(sizeof(ProtocolStr) == (PROTO_MAX + 1) * sizeof(char *));
163 /*
164 * These test that our matchDomainName() function works the
165 * way we expect it to.
166 */
167 assert(0 == matchDomainName("foo.com", "foo.com"));
168 assert(0 == matchDomainName(".foo.com", "foo.com"));
169 assert(0 == matchDomainName("foo.com", ".foo.com"));
170 assert(0 == matchDomainName(".foo.com", ".foo.com"));
171 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
172 assert(0 != matchDomainName("x.foo.com", "foo.com"));
173 assert(0 != matchDomainName("foo.com", "x.foo.com"));
174 assert(0 != matchDomainName("bar.com", "foo.com"));
175 assert(0 != matchDomainName(".bar.com", "foo.com"));
176 assert(0 != matchDomainName(".bar.com", ".foo.com"));
177 assert(0 != matchDomainName("bar.com", ".foo.com"));
178 assert(0 < matchDomainName("zzz.com", "foo.com"));
179 assert(0 > matchDomainName("aaa.com", "foo.com"));
180 assert(0 == matchDomainName("FOO.com", "foo.COM"));
181 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
182 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
183 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
184 /* more cases? */
185 }
186
187 method_t &operator++ (method_t &aMethod)
188 {
189 int tmp = (int)aMethod;
190 aMethod = (method_t)(++tmp);
191 return aMethod;
192 }
193
194 /*
195 * urlParseMethod() takes begin and end pointers, but for backwards
196 * compatibility, end defaults to NULL, in which case we assume begin
197 * is NULL-terminated.
198 */
199 method_t
200 urlParseMethod(const char *b, const char *e)
201 {
202 method_t method = METHOD_NONE;
203 /*
204 * This check for '%' makes sure that we don't
205 * match one of the extension method placeholders,
206 * which have the form %EXT[0-9][0-9]
207 */
208
209 if (*b == '%')
210 return METHOD_NONE;
211
212 /*
213 * if e is NULL, b must be NULL terminated and we
214 * make e point to the first whitespace character
215 * after b.
216 */
217 if (NULL == e)
218 e = b + strcspn(b, w_space);
219
220 for (++method; method < METHOD_ENUM_END; ++method) {
221 if (0 == strncasecmp(b, RequestMethodStr[method], e-b))
222 return method;
223 }
224
225 return METHOD_NONE;
226 }
227
228 /*
229 * urlParseProtocol() takes begin (b) and end (e) pointers, but for
230 * backwards compatibility, e defaults to NULL, in which case we
231 * assume b is NULL-terminated.
232 */
233 protocol_t
234 urlParseProtocol(const char *b, const char *e)
235 {
236 /*
237 * if e is NULL, b must be NULL terminated and we
238 * make e point to the first whitespace character
239 * after b.
240 */
241
242 if (NULL == e)
243 e = b + strcspn(b, ":");
244
245 int len = e - b;
246
247 /* test common stuff first */
248
249 if (strncasecmp(b, "http", len) == 0)
250 return PROTO_HTTP;
251
252 if (strncasecmp(b, "ftp", len) == 0)
253 return PROTO_FTP;
254
255 if (strncasecmp(b, "https", len) == 0)
256 return PROTO_HTTPS;
257
258 if (strncasecmp(b, "file", len) == 0)
259 return PROTO_FTP;
260
261 if (strncasecmp(b, "gopher", len) == 0)
262 return PROTO_GOPHER;
263
264 if (strncasecmp(b, "wais", len) == 0)
265 return PROTO_WAIS;
266
267 if (strncasecmp(b, "cache_object", len) == 0)
268 return PROTO_CACHEOBJ;
269
270 if (strncasecmp(b, "urn", len) == 0)
271 return PROTO_URN;
272
273 if (strncasecmp(b, "whois", len) == 0)
274 return PROTO_WHOIS;
275
276 if (strncasecmp(b, "internal", len) == 0)
277 return PROTO_INTERNAL;
278
279 return PROTO_NONE;
280 }
281
282 int
283 urlDefaultPort(protocol_t p)
284 {
285 switch (p) {
286
287 case PROTO_HTTP:
288 return 80;
289
290 case PROTO_HTTPS:
291 return 443;
292
293 case PROTO_FTP:
294 return 21;
295
296 case PROTO_GOPHER:
297 return 70;
298
299 case PROTO_WAIS:
300 return 210;
301
302 case PROTO_CACHEOBJ:
303
304 case PROTO_INTERNAL:
305 return CACHE_HTTP_PORT;
306
307 case PROTO_WHOIS:
308 return 43;
309
310 default:
311 return 0;
312 }
313 }
314
315 /*
316 * Parse a URI/URL.
317 *
318 * If the 'request' arg is non-NULL, put parsed values there instead
319 * of allocating a new HttpRequest.
320 */
321 HttpRequest *
322 urlParse(method_t method, char *url, HttpRequest *request)
323 {
324 LOCAL_ARRAY(char, proto, MAX_URL);
325 LOCAL_ARRAY(char, login, MAX_URL);
326 LOCAL_ARRAY(char, host, MAX_URL);
327 LOCAL_ARRAY(char, urlpath, MAX_URL);
328 char *t = NULL;
329 char *q = NULL;
330 int port;
331 protocol_t protocol = PROTO_NONE;
332 int l;
333 proto[0] = host[0] = urlpath[0] = login[0] = '\0';
334
335 if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
336 /* terminate so it doesn't overflow other buffers */
337 *(url + (MAX_URL >> 1)) = '\0';
338 debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l);
339 return NULL;
340 }
341
342 if (method == METHOD_CONNECT) {
343 port = CONNECT_PORT;
344
345 if (sscanf(url, "%[^:]:%d", host, &port) < 1)
346 return NULL;
347 } else if (!strncmp(url, "urn:", 4)) {
348 return urnParse(method, url);
349 } else {
350 if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
351 return NULL;
352
353 protocol = urlParseProtocol(proto);
354
355 port = urlDefaultPort(protocol);
356
357 /* Is there any login informaiton? */
358 if ((t = strrchr(host, '@'))) {
359 strcpy((char *) login, (char *) host);
360 t = strrchr(login, '@');
361 *t = 0;
362 strcpy((char *) host, t + 1);
363 }
364
365 if ((t = strrchr(host, ':'))) {
366 *t++ = '\0';
367
368 if (*t != '\0')
369 port = atoi(t);
370 }
371 }
372
373 for (t = host; *t; t++)
374 *t = xtolower(*t);
375
376 if (stringHasWhitespace(host)) {
377 if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
378 t = q = host;
379
380 while (*t) {
381 if (!xisspace(*t))
382 *q++ = *t;
383
384 t++;
385 }
386
387 *q = '\0';
388 }
389 }
390
391 #if CHECK_HOSTNAMES
392 if (Config.onoff.check_hostnames && strspn(host, valid_hostname_chars) != strlen(host)) {
393 debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host);
394 return NULL;
395 }
396
397 #endif
398 #if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
399 /* remove trailing dots from hostnames */
400 while ((l = strlen(host)) > 0 && host[--l] == '.')
401 host[l] = '\0';
402
403 /* remove duplicate dots */
404 while ((t = strstr(host, "..")))
405 xmemmove(t, t + 1, strlen(t));
406
407 #endif
408
409 if (Config.appendDomain && !strchr(host, '.'))
410 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
411
412 if (port < 1 || port > 65535) {
413 debug(23, 3) ("urlParse: Invalid port '%d'\n", port);
414 return NULL;
415 }
416
417 #ifdef HARDCODE_DENY_PORTS
418 /* These ports are filtered in the default squid.conf, but
419 * maybe someone wants them hardcoded... */
420 if (port == 7 || port == 9 || port == 19) {
421 debug(23, 0) ("urlParse: Deny access to port %d\n", port);
422 return NULL;
423 }
424
425 #endif
426 if (stringHasWhitespace(urlpath)) {
427 debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url);
428
429 switch (Config.uri_whitespace) {
430
431 case URI_WHITESPACE_DENY:
432 return NULL;
433
434 case URI_WHITESPACE_ALLOW:
435 break;
436
437 case URI_WHITESPACE_ENCODE:
438 t = rfc1738_escape_unescaped(urlpath);
439 xstrncpy(urlpath, t, MAX_URL);
440 break;
441
442 case URI_WHITESPACE_CHOP:
443 *(urlpath + strcspn(urlpath, w_space)) = '\0';
444 break;
445
446 case URI_WHITESPACE_STRIP:
447
448 default:
449 t = q = urlpath;
450
451 while (*t) {
452 if (!xisspace(*t))
453 *q++ = *t;
454
455 t++;
456 }
457
458 *q = '\0';
459 }
460 }
461
462 if (NULL == request)
463 request = new HttpRequest(method, protocol, urlpath);
464 else {
465 request->initHTTP(method, protocol, urlpath);
466 }
467
468 xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
469 xstrncpy(request->login, login, MAX_LOGIN_SZ);
470 request->port = (u_short) port;
471 return request;
472 }
473
474 static HttpRequest *
475 urnParse(method_t method, char *urn)
476 {
477 debug(50, 5) ("urnParse: %s\n", urn);
478 return new HttpRequest(method, PROTO_URN, urn + 4);
479 }
480
481 const char *
482 urlCanonical(HttpRequest * request)
483 {
484 LOCAL_ARRAY(char, portbuf, 32);
485 LOCAL_ARRAY(char, urlbuf, MAX_URL);
486
487 if (request->canonical)
488 return request->canonical;
489
490 if (request->protocol == PROTO_URN) {
491 snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.buf());
492 } else {
493 switch (request->method) {
494
495 case METHOD_CONNECT:
496 snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
497 break;
498
499 default:
500 portbuf[0] = '\0';
501
502 if (request->port != urlDefaultPort(request->protocol))
503 snprintf(portbuf, 32, ":%d", request->port);
504
505 snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
506 ProtocolStr[request->protocol],
507 request->login,
508 *request->login ? "@" : null_string,
509 request->host,
510 portbuf,
511 request->urlpath.buf());
512
513 break;
514 }
515 }
516
517 return (request->canonical = xstrdup(urlbuf));
518 }
519
520 char *
521 urlCanonicalClean(const HttpRequest * request)
522 {
523 LOCAL_ARRAY(char, buf, MAX_URL);
524 LOCAL_ARRAY(char, portbuf, 32);
525 LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
526 char *t;
527
528 if (request->protocol == PROTO_URN) {
529 snprintf(buf, MAX_URL, "urn:%s", request->urlpath.buf());
530 } else {
531 switch (request->method) {
532
533 case METHOD_CONNECT:
534 snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
535 break;
536
537 default:
538 portbuf[0] = '\0';
539
540 if (request->port != urlDefaultPort(request->protocol))
541 snprintf(portbuf, 32, ":%d", request->port);
542
543 loginbuf[0] = '\0';
544
545 if ((int) strlen(request->login) > 0) {
546 strcpy(loginbuf, request->login);
547
548 if ((t = strchr(loginbuf, ':')))
549 *t = '\0';
550
551 strcat(loginbuf, "@");
552 }
553
554 snprintf(buf, MAX_URL, "%s://%s%s%s%s",
555 ProtocolStr[request->protocol],
556 loginbuf,
557 request->host,
558 portbuf,
559 request->urlpath.buf());
560 /*
561 * strip arguments AFTER a question-mark
562 */
563
564 if (Config.onoff.strip_query_terms)
565 if ((t = strchr(buf, '?')))
566 *(++t) = '\0';
567
568 break;
569 }
570 }
571
572 if (stringHasCntl(buf))
573 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
574
575 return buf;
576 }
577
578 /*
579 * matchDomainName() compares a hostname with a domainname according
580 * to the following rules:
581 *
582 * HOST DOMAIN MATCH?
583 * ------------- ------------- ------
584 * foo.com foo.com YES
585 * .foo.com foo.com YES
586 * x.foo.com foo.com NO
587 * foo.com .foo.com YES
588 * .foo.com .foo.com YES
589 * x.foo.com .foo.com YES
590 *
591 * We strip leading dots on hosts (but not domains!) so that
592 * ".foo.com" is is always the same as "foo.com".
593 *
594 * Return values:
595 * 0 means the host matches the domain
596 * 1 means the host is greater than the domain
597 * -1 means the host is less than the domain
598 */
599
600 int
601 matchDomainName(const char *h, const char *d)
602 {
603 int dl;
604 int hl;
605
606 while ('.' == *h)
607 h++;
608
609 hl = strlen(h);
610
611 dl = strlen(d);
612
613 /*
614 * Start at the ends of the two strings and work towards the
615 * beginning.
616 */
617 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
618 if (hl == 0 && dl == 0) {
619 /*
620 * We made it all the way to the beginning of both
621 * strings without finding any difference.
622 */
623 return 0;
624 }
625
626 if (0 == hl) {
627 /*
628 * The host string is shorter than the domain string.
629 * There is only one case when this can be a match.
630 * If the domain is just one character longer, and if
631 * that character is a leading '.' then we call it a
632 * match.
633 */
634
635 if (1 == dl && '.' == d[0])
636 return 0;
637 else
638 return -1;
639 }
640
641 if (0 == dl) {
642 /*
643 * The domain string is shorter than the host string.
644 * This is a match only if the first domain character
645 * is a leading '.'.
646 */
647
648 if ('.' == d[0])
649 return 0;
650 else
651 return 1;
652 }
653 }
654
655 /*
656 * We found different characters in the same position (from the end).
657 */
658 /*
659 * If one of those character is '.' then its special. In order
660 * for splay tree sorting to work properly, "x-foo.com" must
661 * be greater than ".foo.com" even though '-' is less than '.'.
662 */
663 if ('.' == d[dl])
664 return 1;
665
666 if ('.' == h[hl])
667 return -1;
668
669 return (xtolower(h[hl]) - xtolower(d[dl]));
670 }
671
672 int
673 urlCheckRequest(const HttpRequest * r)
674 {
675 int rc = 0;
676 /* protocol "independent" methods */
677
678 if (r->method == METHOD_CONNECT)
679 return 1;
680
681 if (r->method == METHOD_TRACE)
682 return 1;
683
684 if (r->method == METHOD_PURGE)
685 return 1;
686
687 /* does method match the protocol? */
688 switch (r->protocol) {
689
690 case PROTO_URN:
691
692 case PROTO_HTTP:
693
694 case PROTO_CACHEOBJ:
695 rc = 1;
696 break;
697
698 case PROTO_FTP:
699
700 if (r->method == METHOD_PUT)
701 rc = 1;
702
703 case PROTO_GOPHER:
704
705 case PROTO_WAIS:
706
707 case PROTO_WHOIS:
708 if (r->method == METHOD_GET)
709 rc = 1;
710 else if (r->method == METHOD_HEAD)
711 rc = 1;
712
713 break;
714
715 case PROTO_HTTPS:
716 #ifdef USE_SSL
717
718 rc = 1;
719
720 break;
721
722 #else
723 /*
724 * Squid can't originate an SSL connection, so it should
725 * never receive an "https:" URL. It should always be
726 * CONNECT instead.
727 */
728 rc = 0;
729
730 #endif
731
732 default:
733 break;
734 }
735
736 return rc;
737 }
738
739 /*
740 * Quick-n-dirty host extraction from a URL. Steps:
741 * Look for a colon
742 * Skip any '/' after the colon
743 * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
744 * Look for an ending '/' or ':' and terminate
745 * Look for login info preceeded by '@'
746 */
747
748 class URLHostName
749 {
750
751 public:
752 char * extract(char const *url);
753
754 private:
755 static char Host [SQUIDHOSTNAMELEN];
756 void init(char const *);
757 void findHostStart();
758 void trimTrailingChars();
759 void trimAuth();
760 char const *hostStart;
761 char const *url;
762 };
763
764 char *
765 urlHostname(const char *url)
766 {
767 return URLHostName().extract(url);
768 }
769
770 char URLHostName::Host[SQUIDHOSTNAMELEN];
771
772 void
773 URLHostName::init(char const *aUrl)
774 {
775 Host[0] = '\0';
776 url = url;
777 }
778
779 void
780 URLHostName::findHostStart()
781 {
782 if (NULL == (hostStart = strchr(url, ':')))
783 return;
784
785 ++hostStart;
786
787 while (*hostStart != '\0' && *hostStart == '/')
788 ++hostStart;
789 }
790
791 void
792 URLHostName::trimTrailingChars()
793 {
794 char *t;
795
796 if ((t = strchr(Host, '/')))
797 *t = '\0';
798
799 if ((t = strchr(Host, ':')))
800 *t = '\0';
801 }
802
803 void
804 URLHostName::trimAuth()
805 {
806 char *t;
807
808 if ((t = strrchr(Host, '@'))) {
809 t++;
810 xmemmove(Host, t, strlen(t) + 1);
811 }
812 }
813
814 char *
815 URLHostName::extract(char const *aUrl)
816 {
817 init(aUrl);
818 findHostStart();
819
820 if (hostStart == NULL)
821 return NULL;
822
823 xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
824
825 trimTrailingChars();
826
827 trimAuth();
828
829 return Host;
830 }
831
832 static void
833 urlExtMethodAdd(const char *mstr)
834 {
835 method_t method = METHOD_NONE;
836
837 for (++method; method < METHOD_ENUM_END; ++method) {
838 if (0 == strcmp(mstr, RequestMethodStr[method])) {
839 debug(23, 2) ("Extension method '%s' already exists\n", mstr);
840 return;
841 }
842
843 if (0 != strncmp("%EXT", RequestMethodStr[method], 4))
844 continue;
845
846 /* Don't free statically allocated "%EXTnn" string */
847 RequestMethodStr[method] = xstrdup(mstr);
848
849 debug(23, 1) ("Extension method '%s' added, enum=%d\n", mstr, (int) method);
850
851 return;
852 }
853
854 debug(23, 1) ("WARNING: Could not add new extension method '%s' due to lack of array space\n", mstr);
855 }
856
857 void
858 urlExtMethodConfigure(void)
859 {
860 wordlist *w = Config.ext_methods;
861
862 while (w) {
863 char *s;
864
865 for (s = w->key; *s; s++)
866 *s = xtoupper(*s);
867
868 urlExtMethodAdd(w->key);
869
870 w = w->next;
871 }
872 }