]> git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
Summary: Final MSVC fixups.
[thirdparty/squid.git] / src / url.cc
1
2 /*
3 * $Id: url.cc,v 1.146 2003/08/10 11:00:45 robertc Exp $
4 *
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
7 *
8 * SQUID Web Proxy Cache http://www.squid-cache.org/
9 * ----------------------------------------------------------
10 *
11 * Squid is the result of efforts by numerous individuals from
12 * the Internet community; see the CONTRIBUTORS file for full
13 * details. Many organizations have provided support for Squid's
14 * development; see the SPONSORS file for full details. Squid is
15 * Copyrighted (C) 2001 by the Regents of the University of
16 * California; see the COPYRIGHT file for full details. Squid
17 * incorporates software developed and/or copyrighted by other
18 * sources; see the CREDITS file for full details.
19 *
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
33 *
34 */
35
36 #include "squid.h"
37 #include "HttpRequest.h"
38
/*
 * Printable request method names. The table is indexed with method_t
 * values (see urlParseMethod() and urlExtMethodAdd() below).
 *
 * The "%EXTnn" entries are placeholders: urlExtMethodAdd() overwrites the
 * first remaining placeholder with a configured extension method name.
 * urlParseMethod() rejects any input starting with '%' so that a
 * placeholder is never matched by a parsed method string.
 *
 * The table is deliberately not const-qualified at the pointer level
 * because its slots are reassigned at run time.
 */
const char *RequestMethodStr[] =
    {
        "NONE",
        "GET",
        "POST",
        "PUT",
        "HEAD",
        "CONNECT",
        "TRACE",
        "PURGE",
        "OPTIONS",
        "DELETE",
        "PROPFIND",
        "PROPPATCH",
        "MKCOL",
        "COPY",
        "MOVE",
        "LOCK",
        "UNLOCK",
        "BMOVE",
        "BDELETE",
        "BPROPFIND",
        "BPROPPATCH",
        "BCOPY",
        "SEARCH",
        "SUBSCRIBE",
        "UNSUBSCRIBE",
        "POLL",
        /* extension method placeholders, filled in by urlExtMethodAdd() */
        "%EXT00",
        "%EXT01",
        "%EXT02",
        "%EXT03",
        "%EXT04",
        "%EXT05",
        "%EXT06",
        "%EXT07",
        "%EXT08",
        "%EXT09",
        "%EXT10",
        "%EXT11",
        "%EXT12",
        "%EXT13",
        "%EXT14",
        "%EXT15",
        "%EXT16",
        "%EXT17",
        "%EXT18",
        "%EXT19",
        "ERROR"
    };
89
/*
 * Printable protocol (URL scheme) names, indexed with a request's
 * protocol value when canonical URLs are built further down.
 *
 * urlInitialize() asserts that this table holds exactly PROTO_MAX + 1
 * entries, so entries must be added or removed together with the
 * corresponding enum values. The "htcp" entry is only compiled in when
 * USE_HTCP is set.
 */
const char *ProtocolStr[] =
    {
        "NONE",
        "http",
        "ftp",
        "gopher",
        "wais",
        "cache_object",
        "icp",
#if USE_HTCP
        "htcp",
#endif
        "urn",
        "whois",
        "internal",
        "https",
        "TOTAL"
    };
108
/* Forward declaration: urlParse() hands "urn:" URLs to urnParse(). */
static HttpRequest *urnParse(method_t method, char *urn);
#if CHECK_HOSTNAMES
/*
 * Set of characters accepted in a hostname. urlParse() rejects a host
 * containing any other character when Config.onoff.check_hostnames is on.
 * The underscore is accepted only when ALLOW_HOSTNAME_UNDERSCORES is set.
 */
static const char *const valid_hostname_chars =
#if ALLOW_HOSTNAME_UNDERSCORES
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-._";
#else
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-."
    ;
#endif
#endif /* CHECK_HOSTNAMES */
123
124 /* convert %xx in url string to a character
125 * Allocate a new string and return a pointer to converted string */
126
127 char *
128 url_convert_hex(char *org_url, int allocate)
129 {
130 static char code[] = "00";
131 char *url = NULL;
132 char *s = NULL;
133 char *t = NULL;
134 url = allocate ? (char *) xstrdup(org_url) : org_url;
135
136 if ((int) strlen(url) < 3 || !strchr(url, '%'))
137 return url;
138
139 for (s = t = url; *s; s++) {
140 if (*s == '%' && *(s + 1) && *(s + 2)) {
141 code[0] = *(++s);
142 code[1] = *(++s);
143 *t++ = (char) strtol(code, NULL, 16);
144 } else {
145 *t++ = *s;
146 }
147 }
148
149 do {
150 *t++ = *s;
151 } while (*s++);
152
153 return url;
154 }
155
/*
 * Self checks for the URL module. Logs a debug line, then asserts that
 * ProtocolStr has the expected number of entries and that
 * matchDomainName() gives the expected result for a fixed set of
 * host/domain pairs. A failing check trips assert().
 */
void
urlInitialize(void)
{
    debug(23, 5) ("urlInitialize: Initializing...\n");
    /* ProtocolStr must hold exactly PROTO_MAX + 1 names */
    assert(sizeof(ProtocolStr) == (PROTO_MAX + 1) * sizeof(char *));
    /*
     * These test that our matchDomainName() function works the
     * way we expect it to.
     */
    /* matches: identical names, leading dots, and sub-domain of ".domain" */
    assert(0 == matchDomainName("foo.com", "foo.com"));
    assert(0 == matchDomainName(".foo.com", "foo.com"));
    assert(0 == matchDomainName("foo.com", ".foo.com"));
    assert(0 == matchDomainName(".foo.com", ".foo.com"));
    assert(0 == matchDomainName("x.foo.com", ".foo.com"));
    /* non-matches */
    assert(0 != matchDomainName("x.foo.com", "foo.com"));
    assert(0 != matchDomainName("foo.com", "x.foo.com"));
    assert(0 != matchDomainName("bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", ".foo.com"));
    assert(0 != matchDomainName("bar.com", ".foo.com"));
    /* sign of the result for non-matching names */
    assert(0 < matchDomainName("zzz.com", "foo.com"));
    assert(0 > matchDomainName("aaa.com", "foo.com"));
    /* comparison is case-insensitive */
    assert(0 == matchDomainName("FOO.com", "foo.COM"));
    assert(0 < matchDomainName("bfoo.com", "afoo.com"));
    assert(0 > matchDomainName("afoo.com", "bfoo.com"));
    /* a '.' in the domain sorts below any other character in the host */
    assert(0 < matchDomainName("x-foo.com", ".foo.com"));
    /* more cases? */
}
184
185 method_t &operator++ (method_t &aMethod)
186 {
187 int tmp = (int)aMethod;
188 aMethod = (method_t)(++tmp);
189 return aMethod;
190 }
191
192
193 method_t
194 urlParseMethod(const char *s)
195 {
196 method_t method = METHOD_NONE;
197 /*
198 * This check for '%' makes sure that we don't
199 * match one of the extension method placeholders,
200 * which have the form %EXT[0-9][0-9]
201 */
202
203 if (*s == '%')
204 return METHOD_NONE;
205
206 for (++method; method < METHOD_ENUM_END; ++method) {
207 if (0 == strcasecmp(s, RequestMethodStr[method]))
208 return method;
209 }
210
211 return METHOD_NONE;
212 }
213
214
215 protocol_t
216 urlParseProtocol(const char *s)
217 {
218 /* test common stuff first */
219
220 if (strcasecmp(s, "http") == 0)
221 return PROTO_HTTP;
222
223 if (strcasecmp(s, "ftp") == 0)
224 return PROTO_FTP;
225
226 if (strcasecmp(s, "https") == 0)
227 return PROTO_HTTPS;
228
229 if (strcasecmp(s, "file") == 0)
230 return PROTO_FTP;
231
232 if (strcasecmp(s, "gopher") == 0)
233 return PROTO_GOPHER;
234
235 if (strcasecmp(s, "wais") == 0)
236 return PROTO_WAIS;
237
238 if (strcasecmp(s, "cache_object") == 0)
239 return PROTO_CACHEOBJ;
240
241 if (strcasecmp(s, "urn") == 0)
242 return PROTO_URN;
243
244 if (strcasecmp(s, "whois") == 0)
245 return PROTO_WHOIS;
246
247 if (strcasecmp(s, "internal") == 0)
248 return PROTO_INTERNAL;
249
250 return PROTO_NONE;
251 }
252
253
254 int
255 urlDefaultPort(protocol_t p)
256 {
257 switch (p) {
258
259 case PROTO_HTTP:
260 return 80;
261
262 case PROTO_HTTPS:
263 return 443;
264
265 case PROTO_FTP:
266 return 21;
267
268 case PROTO_GOPHER:
269 return 70;
270
271 case PROTO_WAIS:
272 return 210;
273
274 case PROTO_CACHEOBJ:
275
276 case PROTO_INTERNAL:
277 return CACHE_HTTP_PORT;
278
279 case PROTO_WHOIS:
280 return 43;
281
282 default:
283 return 0;
284 }
285 }
286
/*
 * Parse a URL (or, for CONNECT, a "host[:port]" string) into a new
 * HttpRequest.
 *
 * method  request method; METHOD_CONNECT selects the host:port parser.
 * url     NUL-terminated string to parse. It may be modified: an
 *         over-long URL is truncated in place before NULL is returned.
 *
 * Returns the object created by requestCreate() with host, login and
 * port filled in, or NULL when the input is too long, cannot be
 * scanned, has an illegal hostname character (CHECK_HOSTNAMES builds),
 * has a port outside 1..65535, or contains whitespace in the path
 * while Config.uri_whitespace is URI_WHITESPACE_DENY.
 * URLs starting with "urn:" are delegated to urnParse().
 */
HttpRequest *
urlParse(method_t method, char *url)
{
    /* scratch buffers; NOTE(review): LOCAL_ARRAY is defined elsewhere — presumably static storage, confirm */
    LOCAL_ARRAY(char, proto, MAX_URL);
    LOCAL_ARRAY(char, login, MAX_URL);
    LOCAL_ARRAY(char, host, MAX_URL);
    LOCAL_ARRAY(char, urlpath, MAX_URL);
    HttpRequest *request = NULL;
    char *t = NULL;
    char *q = NULL;
    int port;
    protocol_t protocol = PROTO_NONE;
    int l;
    proto[0] = host[0] = urlpath[0] = login[0] = '\0';

    /* leave room for the domain that may be appended to the host below */
    if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
        /* terminate so it doesn't overflow other buffers */
        *(url + (MAX_URL >> 1)) = '\0';
        debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l);
        return NULL;
    }

    if (method == METHOD_CONNECT) {
        /* CONNECT carries "host[:port]"; port keeps CONNECT_PORT if absent */
        port = CONNECT_PORT;

        if (sscanf(url, "%[^:]:%d", host, &port) < 1)
            return NULL;
    } else if (!strncmp(url, "urn:", 4)) {
        return urnParse(method, url);
    } else {
        /* scheme://host[path]; the path is optional, scheme and host are not */
        if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
            return NULL;

        protocol = urlParseProtocol(proto);

        port = urlDefaultPort(protocol);

        /* Is there any login information? */
        if ((t = strrchr(host, '@'))) {
            /* everything before the last '@' is the login, the rest is the host */
            strcpy((char *) login, (char *) host);
            t = strrchr(login, '@');
            *t = 0;
            strcpy((char *) host, t + 1);
        }

        /* split an explicit ":port" off the host; an empty port keeps the default */
        if ((t = strrchr(host, ':'))) {
            *t++ = '\0';

            if (*t != '\0')
                port = atoi(t);
        }
    }

    /* host names are stored in lower case */
    for (t = host; *t; t++)
        *t = xtolower(*t);

    /* whitespace in the host is only removed in STRIP mode; other modes leave it alone here */
    if (stringHasWhitespace(host)) {
        if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
            t = q = host;

            while (*t) {
                if (!xisspace(*t))
                    *q++ = *t;

                t++;
            }

            *q = '\0';
        }
    }

#if CHECK_HOSTNAMES
    /* reject the URL if the host contains a character outside valid_hostname_chars */
    if (Config.onoff.check_hostnames && strspn(host, valid_hostname_chars) != strlen(host)) {
        debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host);
        return NULL;
    }

#endif
#if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
    /* remove trailing dots from hostnames */
    while ((l = strlen(host)) > 0 && host[--l] == '.')
        host[l] = '\0';

    /* remove duplicate dots */
    while ((t = strstr(host, "..")))
        xmemmove(t, t + 1, strlen(t));

#endif

    /*
     * Unqualified host names get the configured domain appended.
     * NOTE(review): the third strncat() argument limits the number of
     * characters appended, not the total size of host; the length check
     * at the top of the function appears to be what keeps the result
     * within the buffer — confirm.
     */
    if (Config.appendDomain && !strchr(host, '.'))
        strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN);

    if (port < 1 || port > 65535) {
        debug(23, 3) ("urlParse: Invalid port '%d'\n", port);
        return NULL;
    }

#ifdef HARDCODE_DENY_PORTS
    /* These ports are filtered in the default squid.conf, but
     * maybe someone wants them hardcoded... */
    if (port == 7 || port == 9 || port == 19) {
        debug(23, 0) ("urlParse: Deny access to port %d\n", port);
        return NULL;
    }

#endif
    /* whitespace in the path is handled according to Config.uri_whitespace */
    if (stringHasWhitespace(urlpath)) {
        debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url);

        switch (Config.uri_whitespace) {

        case URI_WHITESPACE_DENY:
            return NULL;

        case URI_WHITESPACE_ALLOW:
            break;

        case URI_WHITESPACE_ENCODE:
            /* replace the path with its escaped form */
            t = rfc1738_escape_unescaped(urlpath);
            xstrncpy(urlpath, t, MAX_URL);
            break;

        case URI_WHITESPACE_CHOP:
            /* cut the path at the first whitespace character */
            *(urlpath + strcspn(urlpath, w_space)) = '\0';
            break;

        case URI_WHITESPACE_STRIP:

        default:
            /* squeeze every whitespace character out of the path */
            t = q = urlpath;

            while (*t) {
                if (!xisspace(*t))
                    *q++ = *t;

                t++;
            }

            *q = '\0';
        }
    }

    request = requestCreate(method, protocol, urlpath);
    /* host and login are truncated to the sizes given here */
    xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
    xstrncpy(request->login, login, MAX_LOGIN_SZ);
    request->port = (u_short) port;
    return request;
}
435
436 static HttpRequest *
437 urnParse(method_t method, char *urn)
438 {
439 debug(50, 5) ("urnParse: %s\n", urn);
440 return requestCreate(method, PROTO_URN, urn + 4);
441 }
442
443 const char *
444 urlCanonical(HttpRequest * request)
445 {
446 LOCAL_ARRAY(char, portbuf, 32);
447 LOCAL_ARRAY(char, urlbuf, MAX_URL);
448
449 if (request->canonical)
450 return request->canonical;
451
452 if (request->protocol == PROTO_URN) {
453 snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.buf());
454 } else {
455 switch (request->method) {
456
457 case METHOD_CONNECT:
458 snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
459 break;
460
461 default:
462 portbuf[0] = '\0';
463
464 if (request->port != urlDefaultPort(request->protocol))
465 snprintf(portbuf, 32, ":%d", request->port);
466
467 snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
468 ProtocolStr[request->protocol],
469 request->login,
470 *request->login ? "@" : null_string,
471 request->host,
472 portbuf,
473 request->urlpath.buf());
474
475 break;
476 }
477 }
478
479 return (request->canonical = xstrdup(urlbuf));
480 }
481
482 char *
483 urlCanonicalClean(const HttpRequest * request)
484 {
485 LOCAL_ARRAY(char, buf, MAX_URL);
486 LOCAL_ARRAY(char, portbuf, 32);
487 LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
488 char *t;
489
490 if (request->protocol == PROTO_URN) {
491 snprintf(buf, MAX_URL, "urn:%s", request->urlpath.buf());
492 } else {
493 switch (request->method) {
494
495 case METHOD_CONNECT:
496 snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
497 break;
498
499 default:
500 portbuf[0] = '\0';
501
502 if (request->port != urlDefaultPort(request->protocol))
503 snprintf(portbuf, 32, ":%d", request->port);
504
505 loginbuf[0] = '\0';
506
507 if ((int) strlen(request->login) > 0) {
508 strcpy(loginbuf, request->login);
509
510 if ((t = strchr(loginbuf, ':')))
511 *t = '\0';
512
513 strcat(loginbuf, "@");
514 }
515
516 snprintf(buf, MAX_URL, "%s://%s%s%s%s",
517 ProtocolStr[request->protocol],
518 loginbuf,
519 request->host,
520 portbuf,
521 request->urlpath.buf());
522 /*
523 * strip arguments AFTER a question-mark
524 */
525
526 if (Config.onoff.strip_query_terms)
527 if ((t = strchr(buf, '?')))
528 *(++t) = '\0';
529
530 break;
531 }
532 }
533
534 if (stringHasCntl(buf))
535 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
536
537 return buf;
538 }
539
540 /*
541 * matchDomainName() compares a hostname with a domainname according
542 * to the following rules:
543 *
544 * HOST DOMAIN MATCH?
545 * ------------- ------------- ------
546 * foo.com foo.com YES
547 * .foo.com foo.com YES
548 * x.foo.com foo.com NO
549 * foo.com .foo.com YES
550 * .foo.com .foo.com YES
551 * x.foo.com .foo.com YES
552 *
553 * We strip leading dots on hosts (but not domains!) so that
554 * ".foo.com" is is always the same as "foo.com".
555 *
556 * Return values:
557 * 0 means the host matches the domain
558 * 1 means the host is greater than the domain
559 * -1 means the host is less than the domain
560 */
561
562 int
563 matchDomainName(const char *h, const char *d)
564 {
565 int dl;
566 int hl;
567
568 while ('.' == *h)
569 h++;
570
571 hl = strlen(h);
572
573 dl = strlen(d);
574
575 /*
576 * Start at the ends of the two strings and work towards the
577 * beginning.
578 */
579 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
580 if (hl == 0 && dl == 0) {
581 /*
582 * We made it all the way to the beginning of both
583 * strings without finding any difference.
584 */
585 return 0;
586 }
587
588 if (0 == hl) {
589 /*
590 * The host string is shorter than the domain string.
591 * There is only one case when this can be a match.
592 * If the domain is just one character longer, and if
593 * that character is a leading '.' then we call it a
594 * match.
595 */
596
597 if (1 == dl && '.' == d[0])
598 return 0;
599 else
600 return -1;
601 }
602
603 if (0 == dl) {
604 /*
605 * The domain string is shorter than the host string.
606 * This is a match only if the first domain character
607 * is a leading '.'.
608 */
609
610 if ('.' == d[0])
611 return 0;
612 else
613 return 1;
614 }
615 }
616
617 /*
618 * We found different characters in the same position (from the end).
619 */
620 /*
621 * If one of those character is '.' then its special. In order
622 * for splay tree sorting to work properly, "x-foo.com" must
623 * be greater than ".foo.com" even though '-' is less than '.'.
624 */
625 if ('.' == d[dl])
626 return 1;
627
628 if ('.' == h[hl])
629 return -1;
630
631 return (xtolower(h[hl]) - xtolower(d[dl]));
632 }
633
634 int
635 urlCheckRequest(const HttpRequest * r)
636 {
637 int rc = 0;
638 /* protocol "independent" methods */
639
640 if (r->method == METHOD_CONNECT)
641 return 1;
642
643 if (r->method == METHOD_TRACE)
644 return 1;
645
646 if (r->method == METHOD_PURGE)
647 return 1;
648
649 /* does method match the protocol? */
650 switch (r->protocol) {
651
652 case PROTO_URN:
653
654 case PROTO_HTTP:
655
656 case PROTO_CACHEOBJ:
657 rc = 1;
658 break;
659
660 case PROTO_FTP:
661
662 if (r->method == METHOD_PUT)
663 rc = 1;
664
665 case PROTO_GOPHER:
666
667 case PROTO_WAIS:
668
669 case PROTO_WHOIS:
670 if (r->method == METHOD_GET)
671 rc = 1;
672 else if (r->method == METHOD_HEAD)
673 rc = 1;
674
675 break;
676
677 case PROTO_HTTPS:
678 #ifdef USE_SSL
679
680 rc = 1;
681
682 break;
683
684 #else
685 /*
686 * Squid can't originate an SSL connection, so it should
687 * never receive an "https:" URL. It should always be
688 * CONNECT instead.
689 */
690 rc = 0;
691
692 #endif
693
694 default:
695 break;
696 }
697
698 return rc;
699 }
700
/*
 * Quick-n-dirty host extraction from a URL.  Steps:
 *      Look for a colon
 *      Skip any '/' after the colon
 *      Copy up to SQUIDHOSTNAMELEN bytes from there to Host[]
 *      Look for an ending '/' or ':' and terminate
 *      Look for login info preceded by '@' and remove it
 *
 * The result lives in a single static buffer shared by all instances,
 * so every call to extract() overwrites the previous result.
 */

class URLHostName
{

public:
    /* Returns the host part of url in the shared Host buffer, or NULL if url has no ':' */
    char * extract(char const *url);

private:
    /* shared result buffer returned by extract() */
    static char Host [SQUIDHOSTNAMELEN];
    /* reset state for a new URL */
    void init(char const *);
    /* point hostStart just past the first ':' and any '/' following it */
    void findHostStart();
    /* cut Host at a '/' or ':' */
    void trimTrailingChars();
    /* drop everything up to and including the last '@' in Host */
    void trimAuth();
    /* where the host begins inside url; NULL when url contains no ':' */
    char const *hostStart;
    /* the URL being examined */
    char const *url;
};
725
726 char *
727 urlHostname(const char *url)
728 {
729 return URLHostName().extract(url);
730 }
731
/* Storage for the static result buffer declared in class URLHostName. */
char URLHostName::Host[SQUIDHOSTNAMELEN];
733
734 void
735 URLHostName::init(char const *aUrl)
736 {
737 Host[0] = '\0';
738 url = url;
739 }
740
741 void
742 URLHostName::findHostStart()
743 {
744 if (NULL == (hostStart = strchr(url, ':')))
745 return;
746
747 ++hostStart;
748
749 while (*hostStart != '\0' && *hostStart == '/')
750 ++hostStart;
751 }
752
753 void
754 URLHostName::trimTrailingChars()
755 {
756 char *t;
757
758 if ((t = strchr(Host, '/')))
759 *t = '\0';
760
761 if ((t = strchr(Host, ':')))
762 *t = '\0';
763 }
764
765 void
766 URLHostName::trimAuth()
767 {
768 char *t;
769
770 if ((t = strrchr(Host, '@'))) {
771 t++;
772 xmemmove(Host, t, strlen(t) + 1);
773 }
774 }
775
776 char *
777 URLHostName::extract(char const *aUrl)
778 {
779 init(aUrl);
780 findHostStart();
781
782 if (hostStart == NULL)
783 return NULL;
784
785 xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
786
787 trimTrailingChars();
788
789 trimAuth();
790
791 return Host;
792 }
793
794 static void
795 urlExtMethodAdd(const char *mstr)
796 {
797 method_t method = METHOD_NONE;
798
799 for (++method; method < METHOD_ENUM_END; ++method) {
800 if (0 == strcmp(mstr, RequestMethodStr[method])) {
801 debug(23, 2) ("Extension method '%s' already exists\n", mstr);
802 return;
803 }
804
805 if (0 != strncmp("%EXT", RequestMethodStr[method], 4))
806 continue;
807
808 /* Don't free statically allocated "%EXTnn" string */
809 RequestMethodStr[method] = xstrdup(mstr);
810
811 debug(23, 1) ("Extension method '%s' added, enum=%d\n", mstr, (int) method);
812
813 return;
814 }
815
816 debug(23, 1) ("WARNING: Could not add new extension method '%s' due to lack of array space\n", mstr);
817 }
818
819 void
820 urlExtMethodConfigure(void)
821 {
822 wordlist *w = Config.ext_methods;
823
824 while (w) {
825 char *s;
826
827 for (s = w->key; *s; s++)
828 *s = xtoupper(*s);
829
830 urlExtMethodAdd(w->key);
831
832 w = w->next;
833 }
834 }