]> git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
DW:
[thirdparty/squid.git] / src / url.cc
1
2 /*
3 * $Id: url.cc,v 1.124 2000/05/12 00:29:10 wessels Exp $
4 *
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
7 *
8 * SQUID Internet Object Cache http://squid.nlanr.net/Squid/
9 * ----------------------------------------------------------
10 *
11 * Squid is the result of efforts by numerous individuals from the
12 * Internet community. Development is led by Duane Wessels of the
13 * National Laboratory for Applied Network Research and funded by the
14 * National Science Foundation. Squid is Copyrighted (C) 1998 by
15 * the Regents of the University of California. Please see the
16 * COPYRIGHT file for full details. Squid incorporates software
17 * developed and/or copyrighted by other sources. Please see the
18 * CREDITS file for full details.
19 *
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
33 *
34 */
35
36 #include "squid.h"
37
/*
 * Printable names of the HTTP request methods.  urlParseMethod() indexes
 * this table with method_t values, so the entries must stay in the same
 * order as that enumeration.
 */
const char *RequestMethodStr[] = {
    "NONE",
    /* basic methods */
    "GET", "POST", "PUT", "HEAD",
    "CONNECT", "TRACE", "PURGE",
    "OPTIONS", "DELETE",
    /* extension methods */
    "PROPFIND", "PROPPATCH", "MKCOL",
    "COPY", "MOVE", "LOCK", "UNLOCK",
    "ERROR"
};
59
/*
 * Printable scheme names, indexed by protocol_t (see urlCanonical()).
 * urlInitialize() asserts that this table holds PROTO_MAX + 1 entries.
 * The "htcp" entry only exists when USE_HTCP is enabled.
 */
const char *ProtocolStr[] = {
    "NONE",
    "http",
    "ftp",
    "gopher",
    "wais",
    "cache_object",
    "icp",
#if USE_HTCP
    "htcp",
#endif
    "urn",
    "whois",
    "internal",
    "https",
    "TOTAL"
};
78
79 static request_t *urnParse(method_t method, char *urn);
80 static const char *const valid_hostname_chars =
81 #if ALLOW_HOSTNAME_UNDERSCORES
82 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
83 "abcdefghijklmnopqrstuvwxyz"
84 "0123456789-._";
85 #else
86 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
87 "abcdefghijklmnopqrstuvwxyz"
88 "0123456789-.";
89 #endif
90
/*
 * Map one hexadecimal digit to its value (0..15).
 * Returns -1 when 'c' is not a hexadecimal digit.
 */
static int
url_hex_digit_value(int c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    return -1;
}

/*
 * url_convert_hex - convert %xx escapes in a URL string to characters.
 *
 * org_url   NUL-terminated string to decode.
 * allocate  when non-zero, the decoding is done on an xstrdup() copy of
 *           org_url and the copy is returned; when zero, org_url itself
 *           is rewritten in place and returned.
 *
 * A '%' is treated as an escape only when it is followed by two
 * hexadecimal digits.  Any other '%' (truncated or malformed escape) is
 * copied through unchanged, so the scan never looks beyond the
 * terminating NUL and garbage escapes cannot inject a NUL byte.
 * The decoded string is never longer than the input.
 */
char *
url_convert_hex(char *org_url, int allocate)
{
    char *url = allocate ? (char *) xstrdup(org_url) : org_url;
    const char *s;
    char *t;

    /* nothing to decode */
    if (NULL == strchr(url, '%'))
        return url;
    s = t = url;
    while (*s != '\0') {
        int hi = -1;
        int lo = -1;
        /*
         * s[2] is only examined after s[1] was found to be a hex
         * digit, i.e. after s[1] is known not to be the terminator.
         */
        if ('%' == *s
            && (hi = url_hex_digit_value((unsigned char) s[1])) >= 0
            && (lo = url_hex_digit_value((unsigned char) s[2])) >= 0) {
            *t++ = (char) ((hi << 4) | lo);
            s += 3;
        } else {
            *t++ = *s++;
        }
    }
    *t = '\0';
    return url;
}
118
119 void
120 urlInitialize(void)
121 {
122 debug(23, 5) ("urlInitialize: Initializing...\n");
123 assert(sizeof(ProtocolStr) == (PROTO_MAX + 1) * sizeof(char *));
124 memset(&null_request_flags, '\0', sizeof(null_request_flags));
125 /*
126 * These test that our matchDomainName() function works the
127 * way we expect it to.
128 */
129 assert(0 == matchDomainName("foo.com", "foo.com"));
130 assert(0 < matchDomainName(".foo.com", "foo.com"));
131 assert(0 == matchDomainName("foo.com", ".foo.com"));
132 assert(0 == matchDomainName(".foo.com", ".foo.com"));
133 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
134 assert(0 != matchDomainName("x.foo.com", "foo.com"));
135 assert(0 != matchDomainName("foo.com", "x.foo.com"));
136 assert(0 != matchDomainName("bar.com", "foo.com"));
137 assert(0 != matchDomainName(".bar.com", "foo.com"));
138 assert(0 != matchDomainName(".bar.com", ".foo.com"));
139 assert(0 != matchDomainName("bar.com", ".foo.com"));
140 assert(0 < matchDomainName("zzz.com", "foo.com"));
141 assert(0 > matchDomainName("aaa.com", "foo.com"));
142 assert(0 == matchDomainName("FOO.com", "foo.COM"));
143 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
144 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
145 /* more cases? */
146 }
147
148 method_t
149 urlParseMethod(const char *s)
150 {
151 method_t method = METHOD_NONE;
152 for (method++; method < METHOD_ENUM_END; method++) {
153 if (0 == strcasecmp(s, RequestMethodStr[method]))
154 return method;
155 }
156 return METHOD_NONE;
157 }
158
159
160 protocol_t
161 urlParseProtocol(const char *s)
162 {
163 /* test common stuff first */
164 if (strcasecmp(s, "http") == 0)
165 return PROTO_HTTP;
166 if (strcasecmp(s, "ftp") == 0)
167 return PROTO_FTP;
168 if (strcasecmp(s, "https") == 0)
169 return PROTO_HTTPS;
170 if (strcasecmp(s, "file") == 0)
171 return PROTO_FTP;
172 if (strcasecmp(s, "gopher") == 0)
173 return PROTO_GOPHER;
174 if (strcasecmp(s, "wais") == 0)
175 return PROTO_WAIS;
176 if (strcasecmp(s, "cache_object") == 0)
177 return PROTO_CACHEOBJ;
178 if (strcasecmp(s, "urn") == 0)
179 return PROTO_URN;
180 if (strcasecmp(s, "whois") == 0)
181 return PROTO_WHOIS;
182 if (strcasecmp(s, "internal") == 0)
183 return PROTO_INTERNAL;
184 return PROTO_NONE;
185 }
186
187
188 int
189 urlDefaultPort(protocol_t p)
190 {
191 switch (p) {
192 case PROTO_HTTP:
193 return 80;
194 case PROTO_HTTPS:
195 return 443;
196 case PROTO_FTP:
197 return 21;
198 case PROTO_GOPHER:
199 return 70;
200 case PROTO_WAIS:
201 return 210;
202 case PROTO_CACHEOBJ:
203 case PROTO_INTERNAL:
204 return CACHE_HTTP_PORT;
205 case PROTO_WHOIS:
206 return 43;
207 default:
208 return 0;
209 }
210 }
211
212 request_t *
213 urlParse(method_t method, char *url)
214 {
215 LOCAL_ARRAY(char, proto, MAX_URL);
216 LOCAL_ARRAY(char, login, MAX_URL);
217 LOCAL_ARRAY(char, host, MAX_URL);
218 LOCAL_ARRAY(char, urlpath, MAX_URL);
219 request_t *request = NULL;
220 char *t = NULL;
221 char *q = NULL;
222 int port;
223 protocol_t protocol = PROTO_NONE;
224 int l;
225 proto[0] = host[0] = urlpath[0] = login[0] = '\0';
226
227 if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
228 /* terminate so it doesn't overflow other buffers */
229 *(url + (MAX_URL >> 1)) = '\0';
230 debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l);
231 return NULL;
232 }
233 if (method == METHOD_CONNECT) {
234 port = CONNECT_PORT;
235 if (sscanf(url, "%[^:]:%d", host, &port) < 1)
236 return NULL;
237 } else if (!strncmp(url, "urn:", 4)) {
238 return urnParse(method, url);
239 } else {
240 if (sscanf(url, "%[^:]://%[^/]%[^\r\n]", proto, host, urlpath) < 2)
241 return NULL;
242 protocol = urlParseProtocol(proto);
243 port = urlDefaultPort(protocol);
244 /* Is there any login informaiton? */
245 if ((t = strrchr(host, '@'))) {
246 strcpy((char *) login, (char *) host);
247 t = strrchr(login, '@');
248 *t = 0;
249 strcpy((char *) host, t + 1);
250 }
251 if ((t = strrchr(host, ':'))) {
252 *t++ = '\0';
253 if (*t != '\0')
254 port = atoi(t);
255 }
256 }
257 for (t = host; *t; t++)
258 *t = xtolower(*t);
259 if (strspn(host, valid_hostname_chars) != strlen(host)) {
260 debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host);
261 return NULL;
262 }
263 /* remove trailing dots from hostnames */
264 while ((l = strlen(host)) > 0 && host[--l] == '.')
265 host[l] = '\0';
266 if (Config.appendDomain && !strchr(host, '.'))
267 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN);
268 if (port == 0) {
269 debug(23, 3) ("urlParse: Invalid port == 0\n");
270 return NULL;
271 }
272 #ifdef HARDCODE_DENY_PORTS
273 /* These ports are filtered in the default squid.conf, but
274 * maybe someone wants them hardcoded... */
275 if (port == 7 || port == 9 || port = 19) {
276 debug(23, 0) ("urlParse: Deny access to port %d\n", port);
277 return NULL;
278 }
279 #endif
280 if (stringHasWhitespace(urlpath)) {
281 debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url);
282 switch (Config.uri_whitespace) {
283 case URI_WHITESPACE_DENY:
284 return NULL;
285 case URI_WHITESPACE_ALLOW:
286 break;
287 case URI_WHITESPACE_ENCODE:
288 t = rfc1738_escape_unescaped(urlpath);
289 xstrncpy(urlpath, t, MAX_URL);
290 break;
291 case URI_WHITESPACE_CHOP:
292 *(urlpath + strcspn(urlpath, w_space)) = '\0';
293 break;
294 case URI_WHITESPACE_STRIP:
295 default:
296 t = q = urlpath;
297 while (*t) {
298 if (!xisspace(*t))
299 *q++ = *t;
300 t++;
301 }
302 *q = '\0';
303 }
304 }
305 request = requestCreate(method, protocol, urlpath);
306 xstrncpy(request->host, host, SQUIDHOSTNAMELEN);
307 xstrncpy(request->login, login, MAX_LOGIN_SZ);
308 request->port = (u_short) port;
309 return request;
310 }
311
312 static request_t *
313 urnParse(method_t method, char *urn)
314 {
315 debug(50, 5) ("urnParse: %s\n", urn);
316 return requestCreate(method, PROTO_URN, urn + 4);
317 }
318
319 const char *
320 urlCanonical(request_t * request)
321 {
322 LOCAL_ARRAY(char, portbuf, 32);
323 LOCAL_ARRAY(char, urlbuf, MAX_URL);
324 if (request->canonical)
325 return request->canonical;
326 if (request->protocol == PROTO_URN) {
327 snprintf(urlbuf, MAX_URL, "urn:%s", strBuf(request->urlpath));
328 } else {
329 switch (request->method) {
330 case METHOD_CONNECT:
331 snprintf(urlbuf, MAX_URL, "%s:%d", request->host, request->port);
332 break;
333 default:
334 portbuf[0] = '\0';
335 if (request->port != urlDefaultPort(request->protocol))
336 snprintf(portbuf, 32, ":%d", request->port);
337 snprintf(urlbuf, MAX_URL, "%s://%s%s%s%s%s",
338 ProtocolStr[request->protocol],
339 request->login,
340 *request->login ? "@" : null_string,
341 request->host,
342 portbuf,
343 strBuf(request->urlpath));
344 break;
345 }
346 }
347 return (request->canonical = xstrdup(urlbuf));
348 }
349
350 char *
351 urlCanonicalClean(const request_t * request)
352 {
353 LOCAL_ARRAY(char, buf, MAX_URL);
354 LOCAL_ARRAY(char, portbuf, 32);
355 LOCAL_ARRAY(char, loginbuf, MAX_LOGIN_SZ + 1);
356 char *t;
357 if (request->protocol == PROTO_URN) {
358 snprintf(buf, MAX_URL, "urn:%s", strBuf(request->urlpath));
359 } else {
360 switch (request->method) {
361 case METHOD_CONNECT:
362 snprintf(buf, MAX_URL, "%s:%d", request->host, request->port);
363 break;
364 default:
365 portbuf[0] = '\0';
366 if (request->port != urlDefaultPort(request->protocol))
367 snprintf(portbuf, 32, ":%d", request->port);
368 loginbuf[0] = '\0';
369 if ((int) strlen(request->login) > 0) {
370 strcpy(loginbuf, request->login);
371 if ((t = strchr(loginbuf, ':')))
372 *t = '\0';
373 strcat(loginbuf, "@");
374 }
375 snprintf(buf, MAX_URL, "%s://%s%s%s%s",
376 ProtocolStr[request->protocol],
377 loginbuf,
378 request->host,
379 portbuf,
380 strBuf(request->urlpath));
381 /*
382 * strip arguments AFTER a question-mark
383 */
384 if (Config.onoff.strip_query_terms)
385 if ((t = strchr(buf, '?')))
386 *(++t) = '\0';
387 break;
388 }
389 }
390 if (stringHasCntl(buf))
391 xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL);
392 return buf;
393 }
394
/*
 * matchDomainName() compares a hostname with a domainname according
 * to the following rules:
 *
 *    HOST          DOMAIN        MATCH?
 * ------------- -------------    ------
 *    foo.com       foo.com         YES
 *   .foo.com       foo.com         NO
 *  x.foo.com       foo.com         NO
 *    foo.com      .foo.com         YES
 *   .foo.com      .foo.com         YES
 *  x.foo.com      .foo.com         YES
 *
 * The comparison is case-insensitive and runs from the end of both
 * strings towards their beginning.
 *
 * Return values:
 *     0 means the host matches the domain
 *   > 0 means the host is greater than the domain
 *   < 0 means the host is less than the domain
 *
 * An empty string only matches another empty string; otherwise the
 * empty side is the lesser one.
 */

int
matchDomainName(const char *h, const char *d)
{
    int dl;
    int hl;
    hl = strlen(h);
    dl = strlen(d);
    /*
     * The loop below pre-decrements both lengths before indexing, so
     * an empty string would be read at index -1.  Settle those cases
     * here.
     */
    if (0 == hl)
        return (0 == dl) ? 0 : -1;
    if (0 == dl)
        return 1;
    /*
     * Start at the ends of the two strings and work towards the
     * beginning.
     */
    while (xtolower(h[--hl]) == xtolower(d[--dl])) {
        if (hl == 0 && dl == 0) {
            /*
             * We made it all the way to the beginning of both
             * strings without finding any difference.
             */
            return 0;
        }
        if (0 == hl) {
            /*
             * The host string is shorter than the domain string.
             * There is only one case when this can be a match.
             * If the domain is just one character longer, and if
             * that character is a leading '.' then we call it a
             * match.
             */
            if (1 == dl && '.' == d[0])
                return 0;
            else
                return -1;
        }
        if (0 == dl) {
            /*
             * The domain string is shorter than the host string.
             * This is a match only if the first domain character
             * is a leading '.'.
             */
            if ('.' == d[0])
                return 0;
            else
                return 1;
        }
    }
    /*
     * We found different characters in the same position (from the end).
     */
    return (xtolower(h[hl]) - xtolower(d[dl]));
}
463
464 int
465 urlCheckRequest(const request_t * r)
466 {
467 int rc = 0;
468 /* protocol "independent" methods */
469 if (r->method == METHOD_CONNECT)
470 return 1;
471 if (r->method == METHOD_TRACE)
472 return 1;
473 if (r->method == METHOD_PURGE)
474 return 1;
475 /* does method match the protocol? */
476 switch (r->protocol) {
477 case PROTO_URN:
478 case PROTO_HTTP:
479 case PROTO_CACHEOBJ:
480 rc = 1;
481 break;
482 case PROTO_FTP:
483 if (r->method == METHOD_PUT)
484 rc = 1;
485 case PROTO_GOPHER:
486 case PROTO_WAIS:
487 case PROTO_WHOIS:
488 if (r->method == METHOD_GET)
489 rc = 1;
490 else if (r->method == METHOD_HEAD)
491 rc = 1;
492 break;
493 case PROTO_HTTPS:
494 /*
495 * Squid can't originate an SSL connection, so it should
496 * never receive an "https:" URL. It should always be
497 * CONNECT instead.
498 */
499 rc = 0;
500 default:
501 break;
502 }
503 return rc;
504 }
505
506 /*
507 * Quick-n-dirty host extraction from a URL. Steps:
508 * Look for a colon
509 * Skip any '/' after the colon
510 * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
511 * Look for an ending '/' or ':' and terminate
512 * Look for login info preceeded by '@'
513 */
514 char *
515 urlHostname(const char *url)
516 {
517 LOCAL_ARRAY(char, host, SQUIDHOSTNAMELEN);
518 char *t;
519 host[0] = '\0';
520 if (NULL == (t = strchr(url, ':')))
521 return NULL;
522 t++;
523 while (*t != '\0' && *t == '/')
524 t++;
525 xstrncpy(host, t, SQUIDHOSTNAMELEN);
526 if ((t = strchr(host, '/')))
527 *t = '\0';
528 if ((t = strchr(host, ':')))
529 *t = '\0';
530 if ((t = strrchr(host, '@'))) {
531 t++;
532 xmemmove(host, t, strlen(t) + 1);
533 }
534 return host;
535 }