]> git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
Bug #1434: Fails to process long host names
[thirdparty/squid.git] / src / url.cc
1
2 /*
3 * $Id: url.cc,v 1.151 2006/04/23 09:02:13 serassio Exp $
4 *
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
7 *
8 * SQUID Web Proxy Cache http://www.squid-cache.org/
9 * ----------------------------------------------------------
10 *
11 * Squid is the result of efforts by numerous individuals from
12 * the Internet community; see the CONTRIBUTORS file for full
13 * details. Many organizations have provided support for Squid's
14 * development; see the SPONSORS file for full details. Squid is
15 * Copyrighted (C) 2001 by the Regents of the University of
16 * California; see the COPYRIGHT file for full details. Squid
17 * incorporates software developed and/or copyrighted by other
18 * sources; see the CREDITS file for full details.
19 *
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
33 *
34 */
35
#include "squid.h"
#include "HttpRequest.h"

#include <cctype>
38
/*
 * Printable names of the request methods, indexed by method_t value.
 * The "%EXTnn" entries are placeholders that urlExtMethodAdd() replaces
 * with configured extension method names.
 */
const char *RequestMethodStr[] =
    {
        "NONE",

        /* basic methods */
        "GET", "POST", "PUT", "HEAD", "CONNECT", "TRACE", "PURGE",
        "OPTIONS", "DELETE",

        /* remaining built-in methods */
        "PROPFIND", "PROPPATCH", "MKCOL", "COPY", "MOVE", "LOCK", "UNLOCK",
        "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH", "BCOPY",
        "SEARCH", "SUBSCRIBE", "UNSUBSCRIBE", "POLL", "REPORT",

        /* extension method placeholders */
        "%EXT00", "%EXT01", "%EXT02", "%EXT03", "%EXT04",
        "%EXT05", "%EXT06", "%EXT07", "%EXT08", "%EXT09",
        "%EXT10", "%EXT11", "%EXT12", "%EXT13", "%EXT14",
        "%EXT15", "%EXT16", "%EXT17", "%EXT18", "%EXT19",

        "ERROR"
    };
90
/*
 * Printable protocol names. urlInitialize() asserts that this table
 * has exactly PROTO_MAX + 1 entries.
 */
const char *ProtocolStr[] =
    {
        "NONE",
        "http", "ftp", "gopher", "wais",
        "cache_object",
        "icp",
#if USE_HTCP
        "htcp",
#endif
        "urn", "whois", "internal", "https",
        "TOTAL"
    };
109
110 static HttpRequest *urnParse(method_t method, char *urn);
111 #if CHECK_HOSTNAMES
112 static const char *const valid_hostname_chars =
113 #if ALLOW_HOSTNAME_UNDERSCORES
114 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
115 "abcdefghijklmnopqrstuvwxyz"
116 "0123456789-._";
117 #else
118 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
119 "abcdefghijklmnopqrstuvwxyz"
120 "0123456789-."
121 ;
122 #endif
123 #endif /* CHECK_HOSTNAMES */
124
125 /* convert %xx in url string to a character
126 * Allocate a new string and return a pointer to converted string */
127
128 char *
129 url_convert_hex(char *org_url, int allocate)
130 {
131 static char code[] = "00";
132 char *url = NULL;
133 char *s = NULL;
134 char *t = NULL;
135 url = allocate ? (char *) xstrdup(org_url) : org_url;
136
137 if ((int) strlen(url) < 3 || !strchr(url, '%'))
138 return url;
139
140 for (s = t = url; *s; s++) {
141 if (*s == '%' && *(s + 1) && *(s + 2)) {
142 code[0] = *(++s);
143 code[1] = *(++s);
144 *t++ = (char) strtol(code, NULL, 16);
145 } else {
146 *t++ = *s;
147 }
148 }
149
150 do {
151 *t++ = *s;
152 } while (*s++);
153
154 return url;
155 }
156
157 void
158 urlInitialize(void)
159 {
160 debug(23, 5) ("urlInitialize: Initializing...\n");
161 assert(sizeof(ProtocolStr) == (PROTO_MAX + 1) * sizeof(char *));
162 /*
163 * These test that our matchDomainName() function works the
164 * way we expect it to.
165 */
166 assert(0 == matchDomainName("foo.com", "foo.com"));
167 assert(0 == matchDomainName(".foo.com", "foo.com"));
168 assert(0 == matchDomainName("foo.com", ".foo.com"));
169 assert(0 == matchDomainName(".foo.com", ".foo.com"));
170 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
171 assert(0 != matchDomainName("x.foo.com", "foo.com"));
172 assert(0 != matchDomainName("foo.com", "x.foo.com"));
173 assert(0 != matchDomainName("bar.com", "foo.com"));
174 assert(0 != matchDomainName(".bar.com", "foo.com"));
175 assert(0 != matchDomainName(".bar.com", ".foo.com"));
176 assert(0 != matchDomainName("bar.com", ".foo.com"));
177 assert(0 < matchDomainName("zzz.com", "foo.com"));
178 assert(0 > matchDomainName("aaa.com", "foo.com"));
179 assert(0 == matchDomainName("FOO.com", "foo.COM"));
180 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
181 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
182 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
183 /* more cases? */
184 }
185
186 method_t &operator++ (method_t &aMethod)
187 {
188 int tmp = (int)aMethod;
189 aMethod = (method_t)(++tmp);
190 return aMethod;
191 }
192
193 /*
194 * urlParseMethod() takes begin and end pointers, but for backwards
195 * compatibility, end defaults to NULL, in which case we assume begin
196 * is NULL-terminated.
197 */
198 method_t
199 urlParseMethod(const char *b, const char *e)
200 {
201 method_t method = METHOD_NONE;
202 /*
203 * This check for '%' makes sure that we don't
204 * match one of the extension method placeholders,
205 * which have the form %EXT[0-9][0-9]
206 */
207
208 if (*b == '%')
209 return METHOD_NONE;
210
211 /*
212 * if e is NULL, b must be NULL terminated and we
213 * make e point to the first whitespace character
214 * after b.
215 */
216 if (NULL == e)
217 e = b + strcspn(b, w_space);
218
219 for (++method; method < METHOD_ENUM_END; ++method) {
220 if (0 == strncasecmp(b, RequestMethodStr[method], e-b))
221 return method;
222 }
223
224 return METHOD_NONE;
225 }
226
227 /*
228 * urlParseProtocol() takes begin (b) and end (e) pointers, but for
229 * backwards compatibility, e defaults to NULL, in which case we
230 * assume b is NULL-terminated.
231 */
232 protocol_t
233 urlParseProtocol(const char *b, const char *e)
234 {
235 /*
236 * if e is NULL, b must be NULL terminated and we
237 * make e point to the first whitespace character
238 * after b.
239 */
240
241 if (NULL == e)
242 e = b + strcspn(b, ":");
243
244 int len = e - b;
245
246 /* test common stuff first */
247
248 if (strncasecmp(b, "http", len) == 0)
249 return PROTO_HTTP;
250
251 if (strncasecmp(b, "ftp", len) == 0)
252 return PROTO_FTP;
253
254 if (strncasecmp(b, "https", len) == 0)
255 return PROTO_HTTPS;
256
257 if (strncasecmp(b, "file", len) == 0)
258 return PROTO_FTP;
259
260 if (strncasecmp(b, "gopher", len) == 0)
261 return PROTO_GOPHER;
262
263 if (strncasecmp(b, "wais", len) == 0)
264 return PROTO_WAIS;
265
266 if (strncasecmp(b, "cache_object", len) == 0)
267 return PROTO_CACHEOBJ;
268
269 if (strncasecmp(b, "urn", len) == 0)
270 return PROTO_URN;
271
272 if (strncasecmp(b, "whois", len) == 0)
273 return PROTO_WHOIS;
274
275 if (strncasecmp(b, "internal", len) == 0)
276 return PROTO_INTERNAL;
277
278 return PROTO_NONE;
279 }
280
281 int
282 urlDefaultPort(protocol_t p)
283 {
284 switch (p) {
285
286 case PROTO_HTTP:
287 return 80;
288
289 case PROTO_HTTPS:
290 return 443;
291
292 case PROTO_FTP:
293 return 21;
294
295 case PROTO_GOPHER:
296 return 70;
297
298 case PROTO_WAIS:
299 return 210;
300
301 case PROTO_CACHEOBJ:
302
303 case PROTO_INTERNAL:
304 return CACHE_HTTP_PORT;
305
306 case PROTO_WHOIS:
307 return 43;
308
309 default:
310 return 0;
311 }
312 }
313
314 /*
315 * Parse a URI/URL.
316 *
317 * If the 'request' arg is non-NULL, put parsed values there instead
318 * of allocating a new HttpRequest.
319 */
320 HttpRequest *
321 urlParse(method_t method, char *url, HttpRequest *request)
322 {
323 LOCAL_ARRAY(char, proto, MAX_URL);
324 LOCAL_ARRAY(char, login, MAX_URL);
325 LOCAL_ARRAY(char, host, MAX_URL);
326 LOCAL_ARRAY(char, urlpath, MAX_URL);
327 char *t = NULL;
328 char *q = NULL;
329 int port;
330 protocol_t protocol = PROTO_NONE;
331 int l;
332 proto[0] = host[0] = urlpath[0] = login[0] = '\0';
333
334 if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
335 /* terminate so it doesn't overflow other buffers */
336 *(url + (MAX_URL >> 1)) = '\0';
337 debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l);
338 return NULL;
339 }
340
341 if (method == METHOD_CONNECT) {
342 port = CONNECT_PORT;
343
344 if (sscanf(url, "%[^:]:%d", host, &port) < 1)
345 return NULL;
346 } else if (!strncmp(url, "urn:", 4)) {
347 return urnParse(method, url);
348 } else {
349 if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
350 return NULL;
351
352 protocol = urlParseProtocol(proto);
353
354 port = urlDefaultPort(protocol);
355
356 /* Is there any login informaiton? */
357 if ((t = strrchr(host, '@'))) {
358 strcpy((char *) login, (char *) host);
359 t = strrchr(login, '@');
360 *t = 0;
361 strcpy((char *) host, t + 1);
362 }
363
364 if ((t = strrchr(host, ':'))) {
365 *t++ = '\0';
366
367 if (*t != '\0')
368 port = atoi(t);
369 }
370 }
371
372 for (t = host; *t; t++)
373 *t = xtolower(*t);
374
375 if (stringHasWhitespace(host)) {
376 if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
377 t = q = host;
378
379 while (*t) {
380 if (!xisspace(*t))
381 *q++ = *t;
382
383 t++;
384 }
385
386 *q = '\0';
387 }
388 }
389
390 #if CHECK_HOSTNAMES
391 if (Config.onoff.check_hostnames && strspn(host, valid_hostname_chars) != strlen(host)) {
392 debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host);
393 return NULL;
394 }
395
396 #endif
397 #if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
398 /* remove trailing dots from hostnames */
399 while ((l = strlen(host)) > 0 && host[--l] == '.')
400 host[l] = '\0';
401
402 /* remove duplicate dots */
403 while ((t = strstr(host, "..")))
404 xmemmove(t, t + 1, strlen(t));
405
406 #endif
407
408 if (Config.appendDomain && !strchr(host, '.'))
409 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
410
411 if (port < 1 || port > 65535) {
412 debug(23, 3) ("urlParse: Invalid port '%d'\n", port);
413 return NULL;
414 }
415
416 #ifdef HARDCODE_DENY_PORTS
417 /* These ports are filtered in the default squid.conf, but
418 * maybe someone wants them hardcoded... */
419 if (port == 7 || port == 9 || port == 19) {
420 debug(23, 0) ("urlParse: Deny access to port %d\n", port);
421 return NULL;
422 }
423
424 #endif
425 if (stringHasWhitespace(urlpath)) {
426 debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url);
427
428 switch (Config.uri_whitespace) {
429
430 case URI_WHITESPACE_DENY:
431 return NULL;
432
433 case URI_WHITESPACE_ALLOW:
434 break;
435
436 case URI_WHITESPACE_ENCODE:
437 t = rfc1738_escape_unescaped(urlpath);
438 xstrncpy(urlpath, t, MAX_URL);
439 break;
440
441 case URI_WHITESPACE_CHOP:
442 *(urlpath + strcspn(urlpath, w_space)) = '\0';
443 break;
444
445 case URI_WHITESPACE_STRIP:
446
447 default:
448 t = q = urlpath;
449
450 while (*t) {
451 if (!xisspace(*t))
452 *q++ = *t;
453
454 t++;
455 }
456
457 *q = '\0';
458 }
459 }
460
461 if (NULL == request)
462 request = new HttpRequest(method, protocol, urlpath);
463 else {
464 request->initHTTP(method, protocol, urlpath);
465 }
466
467 xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
468 xstrncpy(request->login, login, MAX_LOGIN_SZ);
469 request->port = (u_short) port;
470 return request;
471 }
472
473 static HttpRequest *
474 urnParse(method_t method, char *urn)
475 {
476 debug(50, 5) ("urnParse: %s\n", urn);
477 return new HttpRequest(method, PROTO_URN, urn + 4);
478 }
479
480 const char *
481 urlCanonical(HttpRequest * request)
482 {
483 LOCAL_ARRAY(char, portbuf, 32);
484 LOCAL_ARRAY(char, urlbuf, MAX_URL);
485
486 if (request->canonical)
487 return request->canonical;
488
489 if (request->protocol == PROTO_URN) {
490 snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.buf());
491 } else {
492 switch (request->method) {
493
494 case METHOD_CONNECT:
495 snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
496 break;
497
498 default:
499 portbuf[0] = '\0';
500
501 if (request->port != urlDefaultPort(request->protocol))
502 snprintf(portbuf, 32, ":%d", request->port);
503
504 snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
505 ProtocolStr[request->protocol],
506 request->login,
507 *request->login ? "@" : null_string,
508 request->host,
509 portbuf,
510 request->urlpath.buf());
511
512 break;
513 }
514 }
515
516 return (request->canonical = xstrdup(urlbuf));
517 }
518
519 char *
520 urlCanonicalClean(const HttpRequest * request)
521 {
522 LOCAL_ARRAY(char, buf, MAX_URL);
523 LOCAL_ARRAY(char, portbuf, 32);
524 LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
525 char *t;
526
527 if (request->protocol == PROTO_URN) {
528 snprintf(buf, MAX_URL, "urn:%s", request->urlpath.buf());
529 } else {
530 switch (request->method) {
531
532 case METHOD_CONNECT:
533 snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
534 break;
535
536 default:
537 portbuf[0] = '\0';
538
539 if (request->port != urlDefaultPort(request->protocol))
540 snprintf(portbuf, 32, ":%d", request->port);
541
542 loginbuf[0] = '\0';
543
544 if ((int) strlen(request->login) > 0) {
545 strcpy(loginbuf, request->login);
546
547 if ((t = strchr(loginbuf, ':')))
548 *t = '\0';
549
550 strcat(loginbuf, "@");
551 }
552
553 snprintf(buf, MAX_URL, "%s://%s%s%s%s",
554 ProtocolStr[request->protocol],
555 loginbuf,
556 request->host,
557 portbuf,
558 request->urlpath.buf());
559 /*
560 * strip arguments AFTER a question-mark
561 */
562
563 if (Config.onoff.strip_query_terms)
564 if ((t = strchr(buf, '?')))
565 *(++t) = '\0';
566
567 break;
568 }
569 }
570
571 if (stringHasCntl(buf))
572 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
573
574 return buf;
575 }
576
/*
 * matchDomainName() compares a hostname with a domainname according
 * to the following rules:
 *
 *    HOST          DOMAIN        MATCH?
 * ------------- -------------    ------
 *    foo.com       foo.com        YES
 *   .foo.com       foo.com        YES
 *  x.foo.com       foo.com        NO
 *    foo.com      .foo.com        YES
 *   .foo.com      .foo.com        YES
 *  x.foo.com      .foo.com        YES
 *
 * We strip leading dots on hosts (but not domains!) so that
 * ".foo.com" is always the same as "foo.com".
 *
 * The comparison ignores case.
 *
 * Return values:
 *  0 means the host matches the domain
 *  > 0 means the host is greater than the domain
 *  < 0 means the host is less than the domain
 *
 * A host that is empty once its leading dots are stripped compares
 * less than any domain (-1), and a non-empty host compares greater
 * than an empty domain (1). Both cases used to read the character
 * before the start of the string.
 */

int
matchDomainName(const char *h, const char *d)
{
    while ('.' == *h)
        h++;

    int hl = strlen(h);

    /* nothing to compare; the loop below would start at h[-1] */
    if (0 == hl)
        return -1;

    int dl = strlen(d);

    /* likewise for d[-1] */
    if (0 == dl)
        return 1;

    /*
     * Start at the ends of the two strings and work towards the
     * beginning.
     */
    while (tolower((unsigned char) h[--hl]) == tolower((unsigned char) d[--dl])) {
        if (hl == 0 && dl == 0) {
            /*
             * We made it all the way to the beginning of both
             * strings without finding any difference.
             */
            return 0;
        }

        if (0 == hl) {
            /*
             * The host string is shorter than the domain string.
             * There is only one case when this can be a match.
             * If the domain is just one character longer, and if
             * that character is a leading '.' then we call it a
             * match.
             */
            if (1 == dl && '.' == d[0])
                return 0;
            else
                return -1;
        }

        if (0 == dl) {
            /*
             * The domain string is shorter than the host string.
             * This is a match only if the first domain character
             * is a leading '.'.
             */
            if ('.' == d[0])
                return 0;
            else
                return 1;
        }
    }

    /*
     * We found different characters in the same position (from the end).
     *
     * If one of those characters is '.' then it is special. In order
     * for splay tree sorting to work properly, "x-foo.com" must
     * be greater than ".foo.com" even though '-' is less than '.'.
     */
    if ('.' == d[dl])
        return 1;

    if ('.' == h[hl])
        return -1;

    return (tolower((unsigned char) h[hl]) - tolower((unsigned char) d[dl]));
}
670
671 int
672 urlCheckRequest(const HttpRequest * r)
673 {
674 int rc = 0;
675 /* protocol "independent" methods */
676
677 if (r->method == METHOD_CONNECT)
678 return 1;
679
680 if (r->method == METHOD_TRACE)
681 return 1;
682
683 if (r->method == METHOD_PURGE)
684 return 1;
685
686 /* does method match the protocol? */
687 switch (r->protocol) {
688
689 case PROTO_URN:
690
691 case PROTO_HTTP:
692
693 case PROTO_CACHEOBJ:
694 rc = 1;
695 break;
696
697 case PROTO_FTP:
698
699 if (r->method == METHOD_PUT)
700 rc = 1;
701
702 case PROTO_GOPHER:
703
704 case PROTO_WAIS:
705
706 case PROTO_WHOIS:
707 if (r->method == METHOD_GET)
708 rc = 1;
709 else if (r->method == METHOD_HEAD)
710 rc = 1;
711
712 break;
713
714 case PROTO_HTTPS:
715 #ifdef USE_SSL
716
717 rc = 1;
718
719 break;
720
721 #else
722 /*
723 * Squid can't originate an SSL connection, so it should
724 * never receive an "https:" URL. It should always be
725 * CONNECT instead.
726 */
727 rc = 0;
728
729 #endif
730
731 default:
732 break;
733 }
734
735 return rc;
736 }
737
738 /*
739 * Quick-n-dirty host extraction from a URL. Steps:
740 * Look for a colon
741 * Skip any '/' after the colon
742 * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
743 * Look for an ending '/' or ':' and terminate
744 * Look for login info preceeded by '@'
745 */
746
747 class URLHostName
748 {
749
750 public:
751 char * extract(char const *url);
752
753 private:
754 static char Host [SQUIDHOSTNAMELEN];
755 void init(char const *);
756 void findHostStart();
757 void trimTrailingChars();
758 void trimAuth();
759 char const *hostStart;
760 char const *url;
761 };
762
763 char *
764 urlHostname(const char *url)
765 {
766 return URLHostName().extract(url);
767 }
768
769 char URLHostName::Host[SQUIDHOSTNAMELEN];
770
771 void
772 URLHostName::init(char const *aUrl)
773 {
774 Host[0] = '\0';
775 url = url;
776 }
777
778 void
779 URLHostName::findHostStart()
780 {
781 if (NULL == (hostStart = strchr(url, ':')))
782 return;
783
784 ++hostStart;
785
786 while (*hostStart != '\0' && *hostStart == '/')
787 ++hostStart;
788 }
789
790 void
791 URLHostName::trimTrailingChars()
792 {
793 char *t;
794
795 if ((t = strchr(Host, '/')))
796 *t = '\0';
797
798 if ((t = strchr(Host, ':')))
799 *t = '\0';
800 }
801
802 void
803 URLHostName::trimAuth()
804 {
805 char *t;
806
807 if ((t = strrchr(Host, '@'))) {
808 t++;
809 xmemmove(Host, t, strlen(t) + 1);
810 }
811 }
812
813 char *
814 URLHostName::extract(char const *aUrl)
815 {
816 init(aUrl);
817 findHostStart();
818
819 if (hostStart == NULL)
820 return NULL;
821
822 xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
823
824 trimTrailingChars();
825
826 trimAuth();
827
828 return Host;
829 }
830
831 static void
832 urlExtMethodAdd(const char *mstr)
833 {
834 method_t method = METHOD_NONE;
835
836 for (++method; method < METHOD_ENUM_END; ++method) {
837 if (0 == strcmp(mstr, RequestMethodStr[method])) {
838 debug(23, 2) ("Extension method '%s' already exists\n", mstr);
839 return;
840 }
841
842 if (0 != strncmp("%EXT", RequestMethodStr[method], 4))
843 continue;
844
845 /* Don't free statically allocated "%EXTnn" string */
846 RequestMethodStr[method] = xstrdup(mstr);
847
848 debug(23, 1) ("Extension method '%s' added, enum=%d\n", mstr, (int) method);
849
850 return;
851 }
852
853 debug(23, 1) ("WARNING: Could not add new extension method '%s' due to lack of array space\n", mstr);
854 }
855
856 void
857 urlExtMethodConfigure(void)
858 {
859 wordlist *w = Config.ext_methods;
860
861 while (w) {
862 char *s;
863
864 for (s = w->key; *s; s++)
865 *s = xtoupper(*s);
866
867 urlExtMethodAdd(w->key);
868
869 w = w->next;
870 }
871 }