]> git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
Author: Tsantilas Christos <chtsanti@users.sourceforge.net>
[thirdparty/squid.git] / src / url.cc
1
2 /*
3 * $Id: url.cc,v 1.157 2007/04/28 22:26:38 hno Exp $
4 *
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
7 *
8 * SQUID Web Proxy Cache http://www.squid-cache.org/
9 * ----------------------------------------------------------
10 *
11 * Squid is the result of efforts by numerous individuals from
12 * the Internet community; see the CONTRIBUTORS file for full
13 * details. Many organizations have provided support for Squid's
14 * development; see the SPONSORS file for full details. Squid is
15 * Copyrighted (C) 2001 by the Regents of the University of
16 * California; see the COPYRIGHT file for full details. Squid
17 * incorporates software developed and/or copyrighted by other
18 * sources; see the CREDITS file for full details.
19 *
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
33 *
34 */
35
36 #include "URL.h"
37 #include "HttpRequest.h"
38 #include "URLScheme.h"
39
/* Forward declaration: handles "urn:" URIs on behalf of urlParse(); defined further down. */
static HttpRequest *urnParse(method_t method, char *urn);

/*
 * Characters accepted in a hostname when underscores are tolerated.
 * urlParse() picks this table when Config.onoff.allow_underscore is set.
 */
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-._"
    ;

/*
 * Stricter table used by urlParse() otherwise: letters, digits,
 * '-' and '.' only (no underscore).
 */
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-."
    ;
51
52 /* convert %xx in url string to a character
53 * Allocate a new string and return a pointer to converted string */
54
55 char *
56 url_convert_hex(char *org_url, int allocate)
57 {
58 static char code[] = "00";
59 char *url = NULL;
60 char *s = NULL;
61 char *t = NULL;
62 url = allocate ? (char *) xstrdup(org_url) : org_url;
63
64 if ((int) strlen(url) < 3 || !strchr(url, '%'))
65 return url;
66
67 for (s = t = url; *s; s++) {
68 if (*s == '%' && *(s + 1) && *(s + 2)) {
69 code[0] = *(++s);
70 code[1] = *(++s);
71 *t++ = (char) strtol(code, NULL, 16);
72 } else {
73 *t++ = *s;
74 }
75 }
76
77 do {
78 *t++ = *s;
79 } while (*s++);
80
81 return url;
82 }
83
84 void
85 urlInitialize(void)
86 {
87 debugs(23, 5, "urlInitialize: Initializing...");
88 /* this ensures that the number of protocol strings is the same as
89 * the enum slots allocated because the last enum is always 'TOTAL'.
90 */
91 assert(strcmp(ProtocolStr[PROTO_MAX], "TOTAL") == 0);
92 /*
93 * These test that our matchDomainName() function works the
94 * way we expect it to.
95 */
96 assert(0 == matchDomainName("foo.com", "foo.com"));
97 assert(0 == matchDomainName(".foo.com", "foo.com"));
98 assert(0 == matchDomainName("foo.com", ".foo.com"));
99 assert(0 == matchDomainName(".foo.com", ".foo.com"));
100 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
101 assert(0 != matchDomainName("x.foo.com", "foo.com"));
102 assert(0 != matchDomainName("foo.com", "x.foo.com"));
103 assert(0 != matchDomainName("bar.com", "foo.com"));
104 assert(0 != matchDomainName(".bar.com", "foo.com"));
105 assert(0 != matchDomainName(".bar.com", ".foo.com"));
106 assert(0 != matchDomainName("bar.com", ".foo.com"));
107 assert(0 < matchDomainName("zzz.com", "foo.com"));
108 assert(0 > matchDomainName("aaa.com", "foo.com"));
109 assert(0 == matchDomainName("FOO.com", "foo.COM"));
110 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
111 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
112 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
113 /* more cases? */
114 }
115
116 /*
117 * urlParseProtocol() takes begin (b) and end (e) pointers, but for
118 * backwards compatibility, e defaults to NULL, in which case we
119 * assume b is NULL-terminated.
120 */
121 protocol_t
122 urlParseProtocol(const char *b, const char *e)
123 {
124 /*
125 * if e is NULL, b must be NULL terminated and we
126 * make e point to the first whitespace character
127 * after b.
128 */
129
130 if (NULL == e)
131 e = b + strcspn(b, ":");
132
133 int len = e - b;
134
135 /* test common stuff first */
136
137 if (strncasecmp(b, "http", len) == 0)
138 return PROTO_HTTP;
139
140 if (strncasecmp(b, "ftp", len) == 0)
141 return PROTO_FTP;
142
143 if (strncasecmp(b, "https", len) == 0)
144 return PROTO_HTTPS;
145
146 if (strncasecmp(b, "file", len) == 0)
147 return PROTO_FTP;
148
149 if (strncasecmp(b, "gopher", len) == 0)
150 return PROTO_GOPHER;
151
152 if (strncasecmp(b, "wais", len) == 0)
153 return PROTO_WAIS;
154
155 if (strncasecmp(b, "cache_object", len) == 0)
156 return PROTO_CACHEOBJ;
157
158 if (strncasecmp(b, "urn", len) == 0)
159 return PROTO_URN;
160
161 if (strncasecmp(b, "whois", len) == 0)
162 return PROTO_WHOIS;
163
164 if (strncasecmp(b, "internal", len) == 0)
165 return PROTO_INTERNAL;
166
167 return PROTO_NONE;
168 }
169
170 int
171 urlDefaultPort(protocol_t p)
172 {
173 switch (p) {
174
175 case PROTO_HTTP:
176 return 80;
177
178 case PROTO_HTTPS:
179 return 443;
180
181 case PROTO_FTP:
182 return 21;
183
184 case PROTO_GOPHER:
185 return 70;
186
187 case PROTO_WAIS:
188 return 210;
189
190 case PROTO_CACHEOBJ:
191
192 case PROTO_INTERNAL:
193 return CACHE_HTTP_PORT;
194
195 case PROTO_WHOIS:
196 return 43;
197
198 default:
199 return 0;
200 }
201 }
202
203 /*
204 * Parse a URI/URL.
205 *
206 * If the 'request' arg is non-NULL, put parsed values there instead
207 * of allocating a new HttpRequest.
208 *
209 * This abuses HttpRequest as a way of representing the parsed url
210 * and its components.
211 * method is used to switch parsers and to init the HttpRequest.
212 * If method is METHOD_CONNECT, then rather than a URL a hostname:port is
213 * looked for.
214 * The url is non const so that if its too long we can NULL-terminate it in place.
215 */
216 HttpRequest *
217 urlParse(method_t method, char *url, HttpRequest *request)
218 {
219 LOCAL_ARRAY(char, proto, MAX_URL);
220 LOCAL_ARRAY(char, login, MAX_URL);
221 LOCAL_ARRAY(char, host, MAX_URL);
222 LOCAL_ARRAY(char, urlpath, MAX_URL);
223 char *t = NULL;
224 char *q = NULL;
225 int port;
226 protocol_t protocol = PROTO_NONE;
227 int l;
228 proto[0] = host[0] = urlpath[0] = login[0] = '\0';
229
230 if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
231 /* terminate so it doesn't overflow other buffers */
232 *(url + (MAX_URL >> 1)) = '\0';
233 debugs(23, 1, "urlParse: URL too large (" << l << " bytes)");
234 return NULL;
235 }
236
237 if (method == METHOD_CONNECT) {
238 port = CONNECT_PORT;
239
240 if (sscanf(url, "%[^:]:%d", host, &port) < 1)
241 return NULL;
242 } else if (!strncmp(url, "urn:", 4)) {
243 return urnParse(method, url);
244 } else {
245 if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
246 return NULL;
247
248 protocol = urlParseProtocol(proto);
249
250 port = urlDefaultPort(protocol);
251
252 /* Is there any login informaiton? */
253 if ((t = strrchr(host, '@'))) {
254 strcpy((char *) login, (char *) host);
255 t = strrchr(login, '@');
256 *t = 0;
257 strcpy((char *) host, t + 1);
258 }
259
260 if ((t = strrchr(host, ':'))) {
261 *t++ = '\0';
262
263 if (*t != '\0')
264 port = atoi(t);
265 }
266 }
267
268 for (t = host; *t; t++)
269 *t = xtolower(*t);
270
271 if (stringHasWhitespace(host)) {
272 if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
273 t = q = host;
274
275 while (*t) {
276 if (!xisspace(*t))
277 *q++ = *t;
278
279 t++;
280 }
281
282 *q = '\0';
283 }
284 }
285
286 if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
287 debugs(23, 1, "urlParse: Illegal character in hostname '" << host << "'");
288 return NULL;
289 }
290
291 #if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
292 /* remove trailing dots from hostnames */
293 while ((l = strlen(host)) > 0 && host[--l] == '.')
294 host[l] = '\0';
295
296 /* remove duplicate dots */
297 while ((t = strstr(host, "..")))
298 xmemmove(t, t + 1, strlen(t));
299
300 #endif
301
302 if (Config.appendDomain && !strchr(host, '.'))
303 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
304
305 if (port < 1 || port > 65535) {
306 debugs(23, 3, "urlParse: Invalid port '" << port << "'");
307 return NULL;
308 }
309
310 #ifdef HARDCODE_DENY_PORTS
311 /* These ports are filtered in the default squid.conf, but
312 * maybe someone wants them hardcoded... */
313 if (port == 7 || port == 9 || port == 19) {
314 debugs(23, 0, "urlParse: Deny access to port " << port);
315 return NULL;
316 }
317
318 #endif
319 if (stringHasWhitespace(urlpath)) {
320 debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");
321
322 switch (Config.uri_whitespace) {
323
324 case URI_WHITESPACE_DENY:
325 return NULL;
326
327 case URI_WHITESPACE_ALLOW:
328 break;
329
330 case URI_WHITESPACE_ENCODE:
331 t = rfc1738_escape_unescaped(urlpath);
332 xstrncpy(urlpath, t, MAX_URL);
333 break;
334
335 case URI_WHITESPACE_CHOP:
336 *(urlpath + strcspn(urlpath, w_space)) = '\0';
337 break;
338
339 case URI_WHITESPACE_STRIP:
340
341 default:
342 t = q = urlpath;
343
344 while (*t) {
345 if (!xisspace(*t))
346 *q++ = *t;
347
348 t++;
349 }
350
351 *q = '\0';
352 }
353 }
354
355 if (NULL == request)
356 request = new HttpRequest(method, protocol, urlpath);
357 else {
358 request->initHTTP(method, protocol, urlpath);
359 }
360
361 xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
362 xstrncpy(request->login, login, MAX_LOGIN_SZ);
363 request->port = (u_short) port;
364 return request;
365 }
366
367 static HttpRequest *
368 urnParse(method_t method, char *urn)
369 {
370 debugs(50, 5, "urnParse: " << urn);
371 return new HttpRequest(method, PROTO_URN, urn + 4);
372 }
373
374 const char *
375 urlCanonical(HttpRequest * request)
376 {
377 LOCAL_ARRAY(char, portbuf, 32);
378 LOCAL_ARRAY(char, urlbuf, MAX_URL);
379
380 if (request->canonical)
381 return request->canonical;
382
383 if (request->protocol == PROTO_URN) {
384 snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.buf());
385 } else {
386 switch (request->method) {
387
388 case METHOD_CONNECT:
389 snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
390 break;
391
392 default:
393 portbuf[0] = '\0';
394
395 if (request->port != urlDefaultPort(request->protocol))
396 snprintf(portbuf, 32, ":%d", request->port);
397
398 snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
399 ProtocolStr[request->protocol],
400 request->login,
401 *request->login ? "@" : null_string,
402 request->host,
403 portbuf,
404 request->urlpath.buf());
405
406 break;
407 }
408 }
409
410 return (request->canonical = xstrdup(urlbuf));
411 }
412
413 char *
414 urlCanonicalClean(const HttpRequest * request)
415 {
416 LOCAL_ARRAY(char, buf, MAX_URL);
417 LOCAL_ARRAY(char, portbuf, 32);
418 LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
419 char *t;
420
421 if (request->protocol == PROTO_URN) {
422 snprintf(buf, MAX_URL, "urn:%s", request->urlpath.buf());
423 } else {
424 switch (request->method) {
425
426 case METHOD_CONNECT:
427 snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
428 break;
429
430 default:
431 portbuf[0] = '\0';
432
433 if (request->port != urlDefaultPort(request->protocol))
434 snprintf(portbuf, 32, ":%d", request->port);
435
436 loginbuf[0] = '\0';
437
438 if ((int) strlen(request->login) > 0) {
439 strcpy(loginbuf, request->login);
440
441 if ((t = strchr(loginbuf, ':')))
442 *t = '\0';
443
444 strcat(loginbuf, "@");
445 }
446
447 snprintf(buf, MAX_URL, "%s://%s%s%s%s",
448 ProtocolStr[request->protocol],
449 loginbuf,
450 request->host,
451 portbuf,
452 request->urlpath.buf());
453 /*
454 * strip arguments AFTER a question-mark
455 */
456
457 if (Config.onoff.strip_query_terms)
458 if ((t = strchr(buf, '?')))
459 *(++t) = '\0';
460
461 break;
462 }
463 }
464
465 if (stringHasCntl(buf))
466 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
467
468 return buf;
469 }
470
471 /*
472 * matchDomainName() compares a hostname with a domainname according
473 * to the following rules:
474 *
475 * HOST DOMAIN MATCH?
476 * ------------- ------------- ------
477 * foo.com foo.com YES
478 * .foo.com foo.com YES
479 * x.foo.com foo.com NO
480 * foo.com .foo.com YES
481 * .foo.com .foo.com YES
482 * x.foo.com .foo.com YES
483 *
484 * We strip leading dots on hosts (but not domains!) so that
485 * ".foo.com" is is always the same as "foo.com".
486 *
487 * Return values:
488 * 0 means the host matches the domain
489 * 1 means the host is greater than the domain
490 * -1 means the host is less than the domain
491 */
492
493 int
494 matchDomainName(const char *h, const char *d)
495 {
496 int dl;
497 int hl;
498
499 while ('.' == *h)
500 h++;
501
502 hl = strlen(h);
503
504 dl = strlen(d);
505
506 /*
507 * Start at the ends of the two strings and work towards the
508 * beginning.
509 */
510 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
511 if (hl == 0 && dl == 0) {
512 /*
513 * We made it all the way to the beginning of both
514 * strings without finding any difference.
515 */
516 return 0;
517 }
518
519 if (0 == hl) {
520 /*
521 * The host string is shorter than the domain string.
522 * There is only one case when this can be a match.
523 * If the domain is just one character longer, and if
524 * that character is a leading '.' then we call it a
525 * match.
526 */
527
528 if (1 == dl && '.' == d[0])
529 return 0;
530 else
531 return -1;
532 }
533
534 if (0 == dl) {
535 /*
536 * The domain string is shorter than the host string.
537 * This is a match only if the first domain character
538 * is a leading '.'.
539 */
540
541 if ('.' == d[0])
542 return 0;
543 else
544 return 1;
545 }
546 }
547
548 /*
549 * We found different characters in the same position (from the end).
550 */
551 /*
552 * If one of those character is '.' then its special. In order
553 * for splay tree sorting to work properly, "x-foo.com" must
554 * be greater than ".foo.com" even though '-' is less than '.'.
555 */
556 if ('.' == d[dl])
557 return 1;
558
559 if ('.' == h[hl])
560 return -1;
561
562 return (xtolower(h[hl]) - xtolower(d[dl]));
563 }
564
565
566 /*
567 * return true if we can serve requests for this method.
568 */
569 int
570 urlCheckRequest(const HttpRequest * r)
571 {
572 int rc = 0;
573 /* protocol "independent" methods
574 *
575 * actually these methods are specific to HTTP:
576 * they are methods we recieve on our HTTP port,
577 * and if we had a FTP listener would not be relevant
578 * there.
579 *
580 * So, we should delegate them to HTTP. The problem is that we
581 * do not have a default protocol from the client side of HTTP.
582 */
583
584 if (r->method == METHOD_CONNECT)
585 return 1;
586
587 if (r->method == METHOD_TRACE)
588 return 1;
589
590 if (r->method == METHOD_PURGE)
591 return 1;
592
593 /* does method match the protocol? */
594 switch (r->protocol) {
595
596 case PROTO_URN:
597
598 case PROTO_HTTP:
599
600 case PROTO_CACHEOBJ:
601 rc = 1;
602 break;
603
604 case PROTO_FTP:
605
606 if (r->method == METHOD_PUT)
607 rc = 1;
608
609 case PROTO_GOPHER:
610
611 case PROTO_WAIS:
612
613 case PROTO_WHOIS:
614 if (r->method == METHOD_GET)
615 rc = 1;
616 else if (r->method == METHOD_HEAD)
617 rc = 1;
618
619 break;
620
621 case PROTO_HTTPS:
622 #ifdef USE_SSL
623
624 rc = 1;
625
626 break;
627
628 #else
629 /*
630 * Squid can't originate an SSL connection, so it should
631 * never receive an "https:" URL. It should always be
632 * CONNECT instead.
633 */
634 rc = 0;
635
636 #endif
637
638 default:
639 break;
640 }
641
642 return rc;
643 }
644
/*
 * Quick-n-dirty host extraction from a URL.  Steps:
 *      Look for a colon
 *      Skip any '/' after the colon
 *      Copy the following text into Host[] (bounded by SQUIDHOSTNAMELEN)
 *      Look for an ending '/' or ':' and terminate
 *      Look for login info preceded by '@'
 */

class URLHostName
{

public:
    /* Run the steps above on 'url'; returns Host, or NULL when 'url' contains no ':'. */
    char * extract(char const *url);

private:
    /* Result buffer. It is static, i.e. shared by every URLHostName object. */
    static char Host [SQUIDHOSTNAMELEN];
    /* Reset the state for a new URL. */
    void init(char const *);
    /* Point hostStart just past the first ':' and any '/' characters following it. */
    void findHostStart();
    /* Terminate Host at a '/' or ':'. */
    void trimTrailingChars();
    /* Drop everything in Host up to and including the last '@'. */
    void trimAuth();
    /* Where the host text begins inside 'url'; NULL when no ':' was found. */
    char const *hostStart;
    /* The URL being examined. */
    char const *url;
};
669
670 char *
671 urlHostname(const char *url)
672 {
673 return URLHostName().extract(url);
674 }
675
/* Storage for the static result buffer declared in class URLHostName. */
char URLHostName::Host[SQUIDHOSTNAMELEN];
677
678 void
679 URLHostName::init(char const *aUrl)
680 {
681 Host[0] = '\0';
682 url = url;
683 }
684
685 void
686 URLHostName::findHostStart()
687 {
688 if (NULL == (hostStart = strchr(url, ':')))
689 return;
690
691 ++hostStart;
692
693 while (*hostStart != '\0' && *hostStart == '/')
694 ++hostStart;
695 }
696
697 void
698 URLHostName::trimTrailingChars()
699 {
700 char *t;
701
702 if ((t = strchr(Host, '/')))
703 *t = '\0';
704
705 if ((t = strchr(Host, ':')))
706 *t = '\0';
707 }
708
709 void
710 URLHostName::trimAuth()
711 {
712 char *t;
713
714 if ((t = strrchr(Host, '@'))) {
715 t++;
716 xmemmove(Host, t, strlen(t) + 1);
717 }
718 }
719
720 char *
721 URLHostName::extract(char const *aUrl)
722 {
723 init(aUrl);
724 findHostStart();
725
726 if (hostStart == NULL)
727 return NULL;
728
729 xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
730
731 trimTrailingChars();
732
733 trimAuth();
734
735 return Host;
736 }
737
738 URL::URL() : scheme()
739 {}
740
741 URL::URL(URLScheme const &aScheme): scheme(aScheme)
742 {}