]> git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
Import of fix-ranges branch
[thirdparty/squid.git] / src / url.cc
1
2 /*
3 * $Id: url.cc,v 1.142 2003/01/23 00:37:29 robertc Exp $
4 *
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
7 *
8 * SQUID Web Proxy Cache http://www.squid-cache.org/
9 * ----------------------------------------------------------
10 *
11 * Squid is the result of efforts by numerous individuals from
12 * the Internet community; see the CONTRIBUTORS file for full
13 * details. Many organizations have provided support for Squid's
14 * development; see the SPONSORS file for full details. Squid is
15 * Copyrighted (C) 2001 by the Regents of the University of
16 * California; see the COPYRIGHT file for full details. Squid
17 * incorporates software developed and/or copyrighted by other
18 * sources; see the CREDITS file for full details.
19 *
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
33 *
34 */
35
#include "squid.h"
#include "HttpRequest.h"
#include <ctype.h>
38
/*
 * Printable names of the request methods, indexed by method_t value.
 *
 * The "%EXTnn" entries are placeholders: urlExtMethodAdd() replaces them
 * with configured extension method names, which is why the element
 * pointers themselves are not const.  urlParseMethod() refuses any input
 * starting with '%' so a placeholder can never be matched by a request.
 */
const char *RequestMethodStr[] =
{
    "NONE",

    /* core HTTP methods */
    "GET", "POST", "PUT", "HEAD", "CONNECT", "TRACE", "PURGE",
    "OPTIONS", "DELETE",

    /* WebDAV-style methods */
    "PROPFIND", "PROPPATCH", "MKCOL", "COPY", "MOVE", "LOCK", "UNLOCK",

    /* batch and miscellaneous methods */
    "BMOVE", "BDELETE", "BPROPFIND", "BPROPPATCH", "BCOPY",
    "SEARCH", "SUBSCRIBE", "UNSUBSCRIBE", "POLL",

    /* free slots for configured extension methods */
    "%EXT00", "%EXT01", "%EXT02", "%EXT03", "%EXT04",
    "%EXT05", "%EXT06", "%EXT07", "%EXT08", "%EXT09",
    "%EXT10", "%EXT11", "%EXT12", "%EXT13", "%EXT14",
    "%EXT15", "%EXT16", "%EXT17", "%EXT18", "%EXT19",

    "ERROR"
};
89
/*
 * Printable scheme names, indexed by protocol_t value.
 *
 * urlInitialize() asserts that this table has exactly PROTO_MAX + 1
 * entries, so it must be kept in step with the protocol_t enumeration
 * (including the conditional HTCP entry).
 */
const char *ProtocolStr[] =
{
    "NONE",
    "http", "ftp", "gopher", "wais",
    "cache_object",
    "icp",
#if USE_HTCP
    "htcp",
#endif
    "urn", "whois", "internal", "https",
    "TOTAL"
};
108
109 static request_t *urnParse(method_t method, char *urn);
110 #if CHECK_HOSTNAMES
111 static const char *const valid_hostname_chars =
112 #if ALLOW_HOSTNAME_UNDERSCORES
113 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
114 "abcdefghijklmnopqrstuvwxyz"
115 "0123456789-._";
116 #else
117 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
118 "abcdefghijklmnopqrstuvwxyz"
119 "0123456789-.";
120 #endif
121 #endif /* CHECK_HOSTNAMES */
122
123 /* convert %xx in url string to a character
124 * Allocate a new string and return a pointer to converted string */
125
126 char *
127 url_convert_hex(char *org_url, int allocate)
128 {
129 static char code[] = "00";
130 char *url = NULL;
131 char *s = NULL;
132 char *t = NULL;
133 url = allocate ? (char *) xstrdup(org_url) : org_url;
134 if ((int) strlen(url) < 3 || !strchr(url, '%'))
135 return url;
136 for (s = t = url; *s; s++) {
137 if (*s == '%' && *(s + 1) && *(s + 2)) {
138 code[0] = *(++s);
139 code[1] = *(++s);
140 *t++ = (char) strtol(code, NULL, 16);
141 } else {
142 *t++ = *s;
143 }
144 }
145 do {
146 *t++ = *s;
147 } while (*s++);
148 return url;
149 }
150
151 void
152 urlInitialize(void)
153 {
154 debug(23, 5) ("urlInitialize: Initializing...\n");
155 assert(sizeof(ProtocolStr) == (PROTO_MAX + 1) * sizeof(char *));
156 /*
157 * These test that our matchDomainName() function works the
158 * way we expect it to.
159 */
160 assert(0 == matchDomainName("foo.com", "foo.com"));
161 assert(0 == matchDomainName(".foo.com", "foo.com"));
162 assert(0 == matchDomainName("foo.com", ".foo.com"));
163 assert(0 == matchDomainName(".foo.com", ".foo.com"));
164 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
165 assert(0 != matchDomainName("x.foo.com", "foo.com"));
166 assert(0 != matchDomainName("foo.com", "x.foo.com"));
167 assert(0 != matchDomainName("bar.com", "foo.com"));
168 assert(0 != matchDomainName(".bar.com", "foo.com"));
169 assert(0 != matchDomainName(".bar.com", ".foo.com"));
170 assert(0 != matchDomainName("bar.com", ".foo.com"));
171 assert(0 < matchDomainName("zzz.com", "foo.com"));
172 assert(0 > matchDomainName("aaa.com", "foo.com"));
173 assert(0 == matchDomainName("FOO.com", "foo.COM"));
174 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
175 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
176 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
177 /* more cases? */
178 }
179
180 method_t &operator++ (method_t &aMethod)
181 {
182 aMethod = (method_t)(++(int)aMethod);
183 return aMethod;
184 }
185
186
187 method_t
188 urlParseMethod(const char *s)
189 {
190 method_t method = METHOD_NONE;
191 /*
192 * This check for '%' makes sure that we don't
193 * match one of the extension method placeholders,
194 * which have the form %EXT[0-9][0-9]
195 */
196 if (*s == '%')
197 return METHOD_NONE;
198 for (++method; method < METHOD_ENUM_END; ++method) {
199 if (0 == strcasecmp(s, RequestMethodStr[method]))
200 return method;
201 }
202 return METHOD_NONE;
203 }
204
205
206 protocol_t
207 urlParseProtocol(const char *s)
208 {
209 /* test common stuff first */
210 if (strcasecmp(s, "http") == 0)
211 return PROTO_HTTP;
212 if (strcasecmp(s, "ftp") == 0)
213 return PROTO_FTP;
214 if (strcasecmp(s, "https") == 0)
215 return PROTO_HTTPS;
216 if (strcasecmp(s, "file") == 0)
217 return PROTO_FTP;
218 if (strcasecmp(s, "gopher") == 0)
219 return PROTO_GOPHER;
220 if (strcasecmp(s, "wais") == 0)
221 return PROTO_WAIS;
222 if (strcasecmp(s, "cache_object") == 0)
223 return PROTO_CACHEOBJ;
224 if (strcasecmp(s, "urn") == 0)
225 return PROTO_URN;
226 if (strcasecmp(s, "whois") == 0)
227 return PROTO_WHOIS;
228 if (strcasecmp(s, "internal") == 0)
229 return PROTO_INTERNAL;
230 return PROTO_NONE;
231 }
232
233
234 int
235 urlDefaultPort(protocol_t p)
236 {
237 switch (p) {
238 case PROTO_HTTP:
239 return 80;
240 case PROTO_HTTPS:
241 return 443;
242 case PROTO_FTP:
243 return 21;
244 case PROTO_GOPHER:
245 return 70;
246 case PROTO_WAIS:
247 return 210;
248 case PROTO_CACHEOBJ:
249 case PROTO_INTERNAL:
250 return CACHE_HTTP_PORT;
251 case PROTO_WHOIS:
252 return 43;
253 default:
254 return 0;
255 }
256 }
257
/*
 * Parse a request URL into a new request_t.
 *
 * method  request method; METHOD_CONNECT switches to "host[:port]" parsing.
 * url     NUL-terminated URL.  It is modified in one case only: when it is
 *         rejected as too long it is truncated at MAX_URL / 2.
 *
 * Returns the request built by requestCreate() with host, login and port
 * filled in, or NULL when the URL is too long, cannot be split into
 * scheme and host, has an illegal hostname (CHECK_HOSTNAMES builds), has a
 * port outside 1..65535, or contains whitespace while
 * Config.uri_whitespace is URI_WHITESPACE_DENY.
 *
 * URLs starting with "urn:" are handed to urnParse().
 */
request_t *
urlParse(method_t method, char *url)
{
    /* NOTE(review): LOCAL_ARRAY is defined elsewhere; presumably these are
     * static scratch buffers of MAX_URL bytes each -- confirm. */
    LOCAL_ARRAY(char, proto, MAX_URL);
    LOCAL_ARRAY(char, login, MAX_URL);
    LOCAL_ARRAY(char, host, MAX_URL);
    LOCAL_ARRAY(char, urlpath, MAX_URL);
    request_t *request = NULL;
    char *t = NULL;
    char *q = NULL;
    int port;
    protocol_t protocol = PROTO_NONE;
    int l;
    proto[0] = host[0] = urlpath[0] = login[0] = '\0';

    /* leave room for the domain that may be appended to the host below */
    if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
        /* terminate so it doesn't overflow other buffers */
        *(url + (MAX_URL >> 1)) = '\0';
        debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l);
        return NULL;
    }
    if (method == METHOD_CONNECT) {
        /* CONNECT target is "host[:port]"; protocol stays PROTO_NONE */
        port = CONNECT_PORT;
        if (sscanf(url, "%[^:]:%d", host, &port) < 1)
            return NULL;
    } else if (!strncmp(url, "urn:", 4)) {
        return urnParse(method, url);
    } else {
        /* scheme "://" authority [path]; scheme and authority are required */
        if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
            return NULL;
        protocol = urlParseProtocol(proto);
        port = urlDefaultPort(protocol);
        /* Is there any login information? */
        if ((t = strrchr(host, '@'))) {
            /* split "login@host" at the last '@' */
            strcpy((char *) login, (char *) host);
            t = strrchr(login, '@');
            *t = 0;
            strcpy((char *) host, t + 1);
        }
        /* an explicit ":port" overrides the scheme default; a bare
         * trailing ':' keeps the default */
        if ((t = strrchr(host, ':'))) {
            *t++ = '\0';
            if (*t != '\0')
                port = atoi(t);
        }
    }
    /* hostnames are stored lower-cased */
    for (t = host; *t; t++)
        *t = xtolower(*t);
    /* whitespace inside the host is only removed in STRIP mode; in every
     * other mode the host is left as parsed */
    if (stringHasWhitespace(host)) {
        if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
            t = q = host;
            while (*t) {
                if (!xisspace(*t))
                    *q++ = *t;
                t++;
            }
            *q = '\0';
        }
    }
#if CHECK_HOSTNAMES
    /* reject the URL if any character is outside valid_hostname_chars */
    if (Config.onoff.check_hostnames && strspn(host, valid_hostname_chars) != strlen(host)) {
        debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host);
        return NULL;
    }
#endif
#if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
    /* remove trailing dots from hostnames */
    while ((l = strlen(host)) > 0 && host[--l] == '.')
        host[l] = '\0';
    /* remove duplicate dots */
    while ((t = strstr(host, "..")))
        xmemmove(t, t + 1, strlen(t));
#endif
    /* unqualified host (no dot): append the configured default domain.
     * NOTE(review): strncat()'s count limits the characters appended, not
     * the size of host; the length check at the top of this function is
     * presumably what keeps the result inside the buffer -- confirm. */
    if (Config.appendDomain && !strchr(host, '.'))
        strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN);
    if (port < 1 || port > 65535) {
        debug(23, 3) ("urlParse: Invalid port '%d'\n", port);
        return NULL;
    }
#ifdef HARDCODE_DENY_PORTS
    /* These ports are filtered in the default squid.conf, but
     * maybe someone wants them hardcoded... */
    if (port == 7 || port == 9 || port == 19) {
        debug(23, 0) ("urlParse: Deny access to port %d\n", port);
        return NULL;
    }
#endif
    /* apply the configured whitespace policy to the path */
    if (stringHasWhitespace(urlpath)) {
        debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url);
        switch (Config.uri_whitespace) {
        case URI_WHITESPACE_DENY:
            return NULL;
        case URI_WHITESPACE_ALLOW:
            break;
        case URI_WHITESPACE_ENCODE:
            /* replace the path with its escaped form */
            t = rfc1738_escape_unescaped(urlpath);
            xstrncpy(urlpath, t, MAX_URL);
            break;
        case URI_WHITESPACE_CHOP:
            /* cut the path at the first w_space character */
            *(urlpath + strcspn(urlpath, w_space)) = '\0';
            break;
        case URI_WHITESPACE_STRIP:
        default:
            /* drop every whitespace character, compacting in place */
            t = q = urlpath;
            while (*t) {
                if (!xisspace(*t))
                    *q++ = *t;
                t++;
            }
            *q = '\0';
        }
    }
    request = requestCreate(method, protocol, urlpath);
    xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
    xstrncpy(request->login, login, MAX_LOGIN_SZ);
    request->port = (u_short) port;
    return request;
}
375
376 static request_t *
377 urnParse(method_t method, char *urn)
378 {
379 debug(50, 5) ("urnParse: %s\n", urn);
380 return requestCreate(method, PROTO_URN, urn + 4);
381 }
382
383 const char *
384 urlCanonical(request_t * request)
385 {
386 LOCAL_ARRAY(char, portbuf, 32);
387 LOCAL_ARRAY(char, urlbuf, MAX_URL);
388 if (request->canonical)
389 return request->canonical;
390 if (request->protocol == PROTO_URN) {
391 snprintf(urlbuf, MAX_URL, "urn:%s", request->urlpath.buf());
392 } else {
393 switch (request->method) {
394 case METHOD_CONNECT:
395 snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
396 break;
397 default:
398 portbuf[0] = '\0';
399 if (request->port != urlDefaultPort(request->protocol))
400 snprintf(portbuf, 32, ":%d", request->port);
401 snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
402 ProtocolStr[request->protocol],
403 request->login,
404 *request->login ? "@" : null_string,
405 request->host,
406 portbuf,
407 request->urlpath.buf());
408 break;
409 }
410 }
411 return (request->canonical = xstrdup(urlbuf));
412 }
413
414 char *
415 urlCanonicalClean(const request_t * request)
416 {
417 LOCAL_ARRAY(char, buf, MAX_URL);
418 LOCAL_ARRAY(char, portbuf, 32);
419 LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
420 char *t;
421 if (request->protocol == PROTO_URN) {
422 snprintf(buf, MAX_URL, "urn:%s", request->urlpath.buf());
423 } else {
424 switch (request->method) {
425 case METHOD_CONNECT:
426 snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
427 break;
428 default:
429 portbuf[0] = '\0';
430 if (request->port != urlDefaultPort(request->protocol))
431 snprintf(portbuf, 32, ":%d", request->port);
432 loginbuf[0] = '\0';
433 if ((int) strlen(request->login) > 0) {
434 strcpy(loginbuf, request->login);
435 if ((t = strchr(loginbuf, ':')))
436 *t = '\0';
437 strcat(loginbuf, "@");
438 }
439 snprintf(buf, MAX_URL, "%s://%s%s%s%s",
440 ProtocolStr[request->protocol],
441 loginbuf,
442 request->host,
443 portbuf,
444 request->urlpath.buf());
445 /*
446 * strip arguments AFTER a question-mark
447 */
448 if (Config.onoff.strip_query_terms)
449 if ((t = strchr(buf, '?')))
450 *(++t) = '\0';
451 break;
452 }
453 }
454 if (stringHasCntl(buf))
455 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
456 return buf;
457 }
458
/*
 * matchDomainName() compares a hostname with a domainname according
 * to the following rules:
 *
 *    HOST          DOMAIN        MATCH?
 *    ------------- ------------- ------
 *    foo.com       foo.com       YES
 *    .foo.com      foo.com       YES
 *    x.foo.com     foo.com       NO
 *    foo.com       .foo.com      YES
 *    .foo.com      .foo.com      YES
 *    x.foo.com     .foo.com      YES
 *
 * We strip leading dots on hosts (but not domains!) so that
 * ".foo.com" is always the same as "foo.com".
 *
 * The comparison is case-insensitive and runs from the end of both
 * strings towards the beginning.
 *
 * Return values:
 *     0 means the host matches the domain
 *    >0 means the host is greater than the domain
 *    <0 means the host is less than the domain
 *
 * An empty host (after stripping dots) is less than any non-empty
 * domain and equal to an empty domain; a non-empty host is greater than
 * an empty domain.  These cases are decided up front because the
 * backwards scan below must not start on an empty string.
 */

int
matchDomainName(const char *h, const char *d)
{
    int dl;
    int hl;

    while ('.' == *h)
        h++;

    hl = (int) strlen(h);
    dl = (int) strlen(d);

    /* never index before the start of an empty string */
    if (0 == hl)
        return (0 == dl) ? 0 : -1;
    if (0 == dl)
        return 1;

    /*
     * Start at the ends of the two strings and work towards the
     * beginning.
     */
    while (tolower((unsigned char) h[--hl]) == tolower((unsigned char) d[--dl])) {
        if (hl == 0 && dl == 0) {
            /*
             * We made it all the way to the beginning of both
             * strings without finding any difference.
             */
            return 0;
        }
        if (0 == hl) {
            /*
             * The host string is shorter than the domain string.
             * There is only one case when this can be a match.
             * If the domain is just one character longer, and if
             * that character is a leading '.' then we call it a
             * match.
             */
            if (1 == dl && '.' == d[0])
                return 0;
            else
                return -1;
        }
        if (0 == dl) {
            /*
             * The domain string is shorter than the host string.
             * This is a match only if the first domain character
             * is a leading '.'.
             */
            if ('.' == d[0])
                return 0;
            else
                return 1;
        }
    }

    /*
     * We found different characters in the same position (from the end).
     *
     * If one of those characters is '.' then it's special.  In order
     * for splay tree sorting to work properly, "x-foo.com" must
     * be greater than ".foo.com" even though '-' is less than '.'.
     */
    if ('.' == d[dl])
        return 1;
    if ('.' == h[hl])
        return -1;

    return tolower((unsigned char) h[hl]) - tolower((unsigned char) d[dl]);
}
541
542 int
543 urlCheckRequest(const request_t * r)
544 {
545 int rc = 0;
546 /* protocol "independent" methods */
547 if (r->method == METHOD_CONNECT)
548 return 1;
549 if (r->method == METHOD_TRACE)
550 return 1;
551 if (r->method == METHOD_PURGE)
552 return 1;
553 /* does method match the protocol? */
554 switch (r->protocol) {
555 case PROTO_URN:
556 case PROTO_HTTP:
557 case PROTO_CACHEOBJ:
558 rc = 1;
559 break;
560 case PROTO_FTP:
561 if (r->method == METHOD_PUT)
562 rc = 1;
563 case PROTO_GOPHER:
564 case PROTO_WAIS:
565 case PROTO_WHOIS:
566 if (r->method == METHOD_GET)
567 rc = 1;
568 else if (r->method == METHOD_HEAD)
569 rc = 1;
570 break;
571 case PROTO_HTTPS:
572 #ifdef USE_SSL
573 rc = 1;
574 break;
575 #else
576 /*
577 * Squid can't originate an SSL connection, so it should
578 * never receive an "https:" URL. It should always be
579 * CONNECT instead.
580 */
581 rc = 0;
582 #endif
583 default:
584 break;
585 }
586 return rc;
587 }
588
589 /*
590 * Quick-n-dirty host extraction from a URL. Steps:
591 * Look for a colon
592 * Skip any '/' after the colon
593 * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
594 * Look for an ending '/' or ':' and terminate
595 * Look for login info preceeded by '@'
596 */
597 char *
598 urlHostname(const char *url)
599 {
600 LOCAL_ARRAY(char, host, SQUIDHOSTNAMELEN);
601 char *t;
602 host[0] = '\0';
603 if (NULL == (t = strchr(url, ':')))
604 return NULL;
605 t++;
606 while (*t != '\0' && *t == '/')
607 t++;
608 xstrncpy(host, t, SQUIDHOSTNAMELEN);
609 if ((t = strchr(host, '/')))
610 *t = '\0';
611 if ((t = strchr(host, ':')))
612 *t = '\0';
613 if ((t = strrchr(host, '@'))) {
614 t++;
615 xmemmove(host, t, strlen(t) + 1);
616 }
617 return host;
618 }
619
620 static void
621 urlExtMethodAdd(const char *mstr)
622 {
623 method_t method = METHOD_NONE;
624 for (++method; method < METHOD_ENUM_END; ++method) {
625 if (0 == strcmp(mstr, RequestMethodStr[method])) {
626 debug(23, 2) ("Extension method '%s' already exists\n", mstr);
627 return;
628 }
629 if (0 != strncmp("%EXT", RequestMethodStr[method], 4))
630 continue;
631 /* Don't free statically allocated "%EXTnn" string */
632 RequestMethodStr[method] = xstrdup(mstr);
633 debug(23, 1) ("Extension method '%s' added, enum=%d\n", mstr, (int) method);
634 return;
635 }
636 debug(23, 1) ("WARNING: Could not add new extension method '%s' due to lack of array space\n", mstr);
637 }
638
639 void
640 urlExtMethodConfigure(void)
641 {
642 wordlist *w = Config.ext_methods;
643 while (w) {
644 char *s;
645 for (s = w->key; *s; s++)
646 *s = xtoupper(*s);
647 urlExtMethodAdd(w->key);
648 w = w->next;
649 }
650 }