]> git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
Kill the redundant url_convert_hex function. Equivalent to rfc1738_unescape
[thirdparty/squid.git] / src / url.cc
1
2 /*
3 * $Id: url.cc,v 1.161 2007/05/23 21:10:07 hno Exp $
4 *
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
7 *
8 * SQUID Web Proxy Cache http://www.squid-cache.org/
9 * ----------------------------------------------------------
10 *
11 * Squid is the result of efforts by numerous individuals from
12 * the Internet community; see the CONTRIBUTORS file for full
13 * details. Many organizations have provided support for Squid's
14 * development; see the SPONSORS file for full details. Squid is
15 * Copyrighted (C) 2001 by the Regents of the University of
16 * California; see the COPYRIGHT file for full details. Squid
17 * incorporates software developed and/or copyrighted by other
18 * sources; see the CREDITS file for full details.
19 *
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
33 *
34 */
35
36 #include "URL.h"
37 #include "HttpRequest.h"
38 #include "URLScheme.h"
39
40 static HttpRequest *urnParse(method_t method, char *urn);
41 static const char valid_hostname_chars_u[] =
42 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
43 "abcdefghijklmnopqrstuvwxyz"
44 "0123456789-._"
45 ;
46 static const char valid_hostname_chars[] =
47 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
48 "abcdefghijklmnopqrstuvwxyz"
49 "0123456789-."
50 ;
51
52 void
53 urlInitialize(void)
54 {
55 debugs(23, 5, "urlInitialize: Initializing...");
56 /* this ensures that the number of protocol strings is the same as
57 * the enum slots allocated because the last enum is always 'TOTAL'.
58 */
59 assert(strcmp(ProtocolStr[PROTO_MAX], "TOTAL") == 0);
60 /*
61 * These test that our matchDomainName() function works the
62 * way we expect it to.
63 */
64 assert(0 == matchDomainName("foo.com", "foo.com"));
65 assert(0 == matchDomainName(".foo.com", "foo.com"));
66 assert(0 == matchDomainName("foo.com", ".foo.com"));
67 assert(0 == matchDomainName(".foo.com", ".foo.com"));
68 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
69 assert(0 != matchDomainName("x.foo.com", "foo.com"));
70 assert(0 != matchDomainName("foo.com", "x.foo.com"));
71 assert(0 != matchDomainName("bar.com", "foo.com"));
72 assert(0 != matchDomainName(".bar.com", "foo.com"));
73 assert(0 != matchDomainName(".bar.com", ".foo.com"));
74 assert(0 != matchDomainName("bar.com", ".foo.com"));
75 assert(0 < matchDomainName("zzz.com", "foo.com"));
76 assert(0 > matchDomainName("aaa.com", "foo.com"));
77 assert(0 == matchDomainName("FOO.com", "foo.COM"));
78 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
79 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
80 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
81 /* more cases? */
82 }
83
84 /*
85 * urlParseProtocol() takes begin (b) and end (e) pointers, but for
86 * backwards compatibility, e defaults to NULL, in which case we
87 * assume b is NULL-terminated.
88 */
89 protocol_t
90 urlParseProtocol(const char *b, const char *e)
91 {
92 /*
93 * if e is NULL, b must be NULL terminated and we
94 * make e point to the first whitespace character
95 * after b.
96 */
97
98 if (NULL == e)
99 e = b + strcspn(b, ":");
100
101 int len = e - b;
102
103 /* test common stuff first */
104
105 if (strncasecmp(b, "http", len) == 0)
106 return PROTO_HTTP;
107
108 if (strncasecmp(b, "ftp", len) == 0)
109 return PROTO_FTP;
110
111 if (strncasecmp(b, "https", len) == 0)
112 return PROTO_HTTPS;
113
114 if (strncasecmp(b, "file", len) == 0)
115 return PROTO_FTP;
116
117 if (strncasecmp(b, "gopher", len) == 0)
118 return PROTO_GOPHER;
119
120 if (strncasecmp(b, "wais", len) == 0)
121 return PROTO_WAIS;
122
123 if (strncasecmp(b, "cache_object", len) == 0)
124 return PROTO_CACHEOBJ;
125
126 if (strncasecmp(b, "urn", len) == 0)
127 return PROTO_URN;
128
129 if (strncasecmp(b, "whois", len) == 0)
130 return PROTO_WHOIS;
131
132 if (strncasecmp(b, "internal", len) == 0)
133 return PROTO_INTERNAL;
134
135 return PROTO_NONE;
136 }
137
138 int
139 urlDefaultPort(protocol_t p)
140 {
141 switch (p) {
142
143 case PROTO_HTTP:
144 return 80;
145
146 case PROTO_HTTPS:
147 return 443;
148
149 case PROTO_FTP:
150 return 21;
151
152 case PROTO_GOPHER:
153 return 70;
154
155 case PROTO_WAIS:
156 return 210;
157
158 case PROTO_CACHEOBJ:
159
160 case PROTO_INTERNAL:
161 return CACHE_HTTP_PORT;
162
163 case PROTO_WHOIS:
164 return 43;
165
166 default:
167 return 0;
168 }
169 }
170
171 /*
172 * Parse a URI/URL.
173 *
174 * If the 'request' arg is non-NULL, put parsed values there instead
175 * of allocating a new HttpRequest.
176 *
177 * This abuses HttpRequest as a way of representing the parsed url
178 * and its components.
179 * method is used to switch parsers and to init the HttpRequest.
180 * If method is METHOD_CONNECT, then rather than a URL a hostname:port is
181 * looked for.
182 * The url is non const so that if its too long we can NULL-terminate it in place.
183 */
184 HttpRequest *
185 urlParse(method_t method, char *url, HttpRequest *request)
186 {
187 LOCAL_ARRAY(char, proto, MAX_URL);
188 LOCAL_ARRAY(char, login, MAX_URL);
189 LOCAL_ARRAY(char, host, MAX_URL);
190 LOCAL_ARRAY(char, urlpath, MAX_URL);
191 char *t = NULL;
192 char *q = NULL;
193 int port;
194 protocol_t protocol = PROTO_NONE;
195 int l;
196 proto[0] = host[0] = urlpath[0] = login[0] = '\0';
197
198 if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
199 /* terminate so it doesn't overflow other buffers */
200 *(url + (MAX_URL >> 1)) = '\0';
201 debugs(23, 1, "urlParse: URL too large (" << l << " bytes)");
202 return NULL;
203 }
204
205 if (method == METHOD_CONNECT) {
206 port = CONNECT_PORT;
207
208 if (sscanf(url, "%[^:]:%d", host, &port) < 1)
209 return NULL;
210 } else if (!strncmp(url, "urn:", 4)) {
211 return urnParse(method, url);
212 } else {
213 if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
214 return NULL;
215
216 protocol = urlParseProtocol(proto);
217
218 port = urlDefaultPort(protocol);
219
220 /* Is there any login informaiton? */
221 if ((t = strrchr(host, '@'))) {
222 strcpy((char *) login, (char *) host);
223 t = strrchr(login, '@');
224 *t = 0;
225 strcpy((char *) host, t + 1);
226 }
227
228 if ((t = strrchr(host, ':'))) {
229 *t++ = '\0';
230
231 if (*t != '\0')
232 port = atoi(t);
233 }
234 }
235
236 for (t = host; *t; t++)
237 *t = xtolower(*t);
238
239 if (strpbrk(host, w_space) != NULL) {
240 if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
241 t = q = host;
242
243 while (*t) {
244 if (!xisspace(*t))
245 *q++ = *t;
246
247 t++;
248 }
249
250 *q = '\0';
251 }
252 }
253
254 if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) {
255 debugs(23, 1, "urlParse: Illegal character in hostname '" << host << "'");
256 return NULL;
257 }
258
259 #if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
260 /* remove trailing dots from hostnames */
261 while ((l = strlen(host)) > 0 && host[--l] == '.')
262 host[l] = '\0';
263
264 /* remove duplicate dots */
265 while ((t = strstr(host, "..")))
266 xmemmove(t, t + 1, strlen(t));
267
268 #endif
269
270 if (Config.appendDomain && !strchr(host, '.'))
271 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1);
272
273 if (port < 1 || port > 65535) {
274 debugs(23, 3, "urlParse: Invalid port '" << port << "'");
275 return NULL;
276 }
277
278 #ifdef HARDCODE_DENY_PORTS
279 /* These ports are filtered in the default squid.conf, but
280 * maybe someone wants them hardcoded... */
281 if (port == 7 || port == 9 || port == 19) {
282 debugs(23, 0, "urlParse: Deny access to port " << port);
283 return NULL;
284 }
285
286 #endif
287 if (strpbrk(urlpath, w_space) != NULL) {
288 debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}");
289
290 switch (Config.uri_whitespace) {
291
292 case URI_WHITESPACE_DENY:
293 return NULL;
294
295 case URI_WHITESPACE_ALLOW:
296 break;
297
298 case URI_WHITESPACE_ENCODE:
299 t = rfc1738_escape_unescaped(urlpath);
300 xstrncpy(urlpath, t, MAX_URL);
301 break;
302
303 case URI_WHITESPACE_CHOP:
304 *(urlpath + strcspn(urlpath, w_space)) = '\0';
305 break;
306
307 case URI_WHITESPACE_STRIP:
308
309 default:
310 t = q = urlpath;
311
312 while (*t) {
313 if (!xisspace(*t))
314 *q++ = *t;
315
316 t++;
317 }
318
319 *q = '\0';
320 }
321 }
322
323 if (NULL == request)
324 request = new HttpRequest(method, protocol, urlpath);
325 else {
326 request->initHTTP(method, protocol, urlpath);
327 }
328
329 xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
330 xstrncpy(request->login, login, MAX_LOGIN_SZ);
331 request->port = (u_short) port;
332 return request;
333 }
334
335 static HttpRequest *
336 urnParse(method_t method, char *urn)
337 {
338 debugs(50, 5, "urnParse: " << urn);
339 return new HttpRequest(method, PROTO_URN, urn + 4);
340 }
341
342 const char *
343 urlCanonical(HttpRequest * request)
344 {
345 LOCAL_ARRAY(char, portbuf, 32);
346 LOCAL_ARRAY(char, urlbuf, MAX_URL);
347
348 if (request->canonical)
349 return request->canonical;
350
351 if (request->protocol == PROTO_URN) {
352 snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.c_str());
353 } else {
354 switch (request->method) {
355
356 case METHOD_CONNECT:
357 snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
358 break;
359
360 default:
361 portbuf[0] = '\0';
362
363 if (request->port != urlDefaultPort(request->protocol))
364 snprintf(portbuf, 32, ":%d", request->port);
365
366 snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
367 ProtocolStr[request->protocol],
368 request->login,
369 *request->login ? "@" : null_string,
370 request->host,
371 portbuf,
372 request->urlpath.c_str());
373
374 break;
375 }
376 }
377
378 return (request->canonical = xstrdup(urlbuf));
379 }
380
/*
 * Returns 1 if s contains a byte in the range 0x01-0x1f or 0x7f-0x9f,
 * and 0 otherwise.  Bytes are examined as unsigned values.
 */
int
stringHasCntl(const char *s)
{
    const unsigned char *p = reinterpret_cast<const unsigned char *>(s);

    for (; *p != '\0'; ++p) {
        const bool lowControl = (*p <= 0x1f);
        const bool highControl = (*p >= 0x7f && *p <= 0x9f);

        if (lowControl || highControl)
            return 1;
    }

    return 0;
}
396
397 char *
398 urlCanonicalClean(const HttpRequest * request)
399 {
400 LOCAL_ARRAY(char, buf, MAX_URL);
401 LOCAL_ARRAY(char, portbuf, 32);
402 LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
403 char *t;
404
405 if (request->protocol == PROTO_URN) {
406 snprintf(buf, MAX_URL, "urn:%s", request->urlpath.c_str());
407 } else {
408 switch (request->method) {
409
410 case METHOD_CONNECT:
411 snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
412 break;
413
414 default:
415 portbuf[0] = '\0';
416
417 if (request->port != urlDefaultPort(request->protocol))
418 snprintf(portbuf, 32, ":%d", request->port);
419
420 loginbuf[0] = '\0';
421
422 if ((int) strlen(request->login) > 0) {
423 strcpy(loginbuf, request->login);
424
425 if ((t = strchr(loginbuf, ':')))
426 *t = '\0';
427
428 strcat(loginbuf, "@");
429 }
430
431 snprintf(buf, MAX_URL, "%s://%s%s%s%s",
432 ProtocolStr[request->protocol],
433 loginbuf,
434 request->host,
435 portbuf,
436 request->urlpath.c_str());
437 /*
438 * strip arguments AFTER a question-mark
439 */
440
441 if (Config.onoff.strip_query_terms)
442 if ((t = strchr(buf, '?')))
443 *(++t) = '\0';
444
445 break;
446 }
447 }
448
449 if (stringHasCntl(buf))
450 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
451
452 return buf;
453 }
454
455 /*
456 * matchDomainName() compares a hostname with a domainname according
457 * to the following rules:
458 *
459 * HOST DOMAIN MATCH?
460 * ------------- ------------- ------
461 * foo.com foo.com YES
462 * .foo.com foo.com YES
463 * x.foo.com foo.com NO
464 * foo.com .foo.com YES
465 * .foo.com .foo.com YES
466 * x.foo.com .foo.com YES
467 *
468 * We strip leading dots on hosts (but not domains!) so that
469 * ".foo.com" is is always the same as "foo.com".
470 *
471 * Return values:
472 * 0 means the host matches the domain
473 * 1 means the host is greater than the domain
474 * -1 means the host is less than the domain
475 */
476
477 int
478 matchDomainName(const char *h, const char *d)
479 {
480 int dl;
481 int hl;
482
483 while ('.' == *h)
484 h++;
485
486 hl = strlen(h);
487
488 dl = strlen(d);
489
490 /*
491 * Start at the ends of the two strings and work towards the
492 * beginning.
493 */
494 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
495 if (hl == 0 && dl == 0) {
496 /*
497 * We made it all the way to the beginning of both
498 * strings without finding any difference.
499 */
500 return 0;
501 }
502
503 if (0 == hl) {
504 /*
505 * The host string is shorter than the domain string.
506 * There is only one case when this can be a match.
507 * If the domain is just one character longer, and if
508 * that character is a leading '.' then we call it a
509 * match.
510 */
511
512 if (1 == dl && '.' == d[0])
513 return 0;
514 else
515 return -1;
516 }
517
518 if (0 == dl) {
519 /*
520 * The domain string is shorter than the host string.
521 * This is a match only if the first domain character
522 * is a leading '.'.
523 */
524
525 if ('.' == d[0])
526 return 0;
527 else
528 return 1;
529 }
530 }
531
532 /*
533 * We found different characters in the same position (from the end).
534 */
535 /*
536 * If one of those character is '.' then its special. In order
537 * for splay tree sorting to work properly, "x-foo.com" must
538 * be greater than ".foo.com" even though '-' is less than '.'.
539 */
540 if ('.' == d[dl])
541 return 1;
542
543 if ('.' == h[hl])
544 return -1;
545
546 return (xtolower(h[hl]) - xtolower(d[dl]));
547 }
548
549
550 /*
551 * return true if we can serve requests for this method.
552 */
553 int
554 urlCheckRequest(const HttpRequest * r)
555 {
556 int rc = 0;
557 /* protocol "independent" methods
558 *
559 * actually these methods are specific to HTTP:
560 * they are methods we recieve on our HTTP port,
561 * and if we had a FTP listener would not be relevant
562 * there.
563 *
564 * So, we should delegate them to HTTP. The problem is that we
565 * do not have a default protocol from the client side of HTTP.
566 */
567
568 if (r->method == METHOD_CONNECT)
569 return 1;
570
571 if (r->method == METHOD_TRACE)
572 return 1;
573
574 if (r->method == METHOD_PURGE)
575 return 1;
576
577 /* does method match the protocol? */
578 switch (r->protocol) {
579
580 case PROTO_URN:
581
582 case PROTO_HTTP:
583
584 case PROTO_CACHEOBJ:
585 rc = 1;
586 break;
587
588 case PROTO_FTP:
589
590 if (r->method == METHOD_PUT)
591 rc = 1;
592
593 case PROTO_GOPHER:
594
595 case PROTO_WAIS:
596
597 case PROTO_WHOIS:
598 if (r->method == METHOD_GET)
599 rc = 1;
600 else if (r->method == METHOD_HEAD)
601 rc = 1;
602
603 break;
604
605 case PROTO_HTTPS:
606 #ifdef USE_SSL
607
608 rc = 1;
609
610 break;
611
612 #else
613 /*
614 * Squid can't originate an SSL connection, so it should
615 * never receive an "https:" URL. It should always be
616 * CONNECT instead.
617 */
618 rc = 0;
619
620 #endif
621
622 default:
623 break;
624 }
625
626 return rc;
627 }
628
629 /*
630 * Quick-n-dirty host extraction from a URL. Steps:
631 * Look for a colon
632 * Skip any '/' after the colon
633 * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
634 * Look for an ending '/' or ':' and terminate
635 * Look for login info preceeded by '@'
636 */
637
638 class URLHostName
639 {
640
641 public:
642 char * extract(char const *url);
643
644 private:
645 static char Host [SQUIDHOSTNAMELEN];
646 void init(char const *);
647 void findHostStart();
648 void trimTrailingChars();
649 void trimAuth();
650 char const *hostStart;
651 char const *url;
652 };
653
654 char *
655 urlHostname(const char *url)
656 {
657 return URLHostName().extract(url);
658 }
659
660 char URLHostName::Host[SQUIDHOSTNAMELEN];
661
662 void
663 URLHostName::init(char const *aUrl)
664 {
665 Host[0] = '\0';
666 url = url;
667 }
668
669 void
670 URLHostName::findHostStart()
671 {
672 if (NULL == (hostStart = strchr(url, ':')))
673 return;
674
675 ++hostStart;
676
677 while (*hostStart != '\0' && *hostStart == '/')
678 ++hostStart;
679 }
680
681 void
682 URLHostName::trimTrailingChars()
683 {
684 char *t;
685
686 if ((t = strchr(Host, '/')))
687 *t = '\0';
688
689 if ((t = strchr(Host, ':')))
690 *t = '\0';
691 }
692
693 void
694 URLHostName::trimAuth()
695 {
696 char *t;
697
698 if ((t = strrchr(Host, '@'))) {
699 t++;
700 xmemmove(Host, t, strlen(t) + 1);
701 }
702 }
703
704 char *
705 URLHostName::extract(char const *aUrl)
706 {
707 init(aUrl);
708 findHostStart();
709
710 if (hostStart == NULL)
711 return NULL;
712
713 xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
714
715 trimTrailingChars();
716
717 trimAuth();
718
719 return Host;
720 }
721
722 URL::URL() : scheme()
723 {}
724
725 URL::URL(URLScheme const &aScheme): scheme(aScheme)
726 {}