]>
git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
3 * $Id: url.cc,v 1.142 2003/01/23 00:37:29 robertc Exp $
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
8 * SQUID Web Proxy Cache http://www.squid-cache.org/
9 * ----------------------------------------------------------
11 * Squid is the result of efforts by numerous individuals from
12 * the Internet community; see the CONTRIBUTORS file for full
13 * details. Many organizations have provided support for Squid's
14 * development; see the SPONSORS file for full details. Squid is
15 * Copyrighted (C) 2001 by the Regents of the University of
16 * California; see the COPYRIGHT file for full details. Squid
17 * incorporates software developed and/or copyrighted by other
18 * sources; see the CREDITS file for full details.
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
37 #include "HttpRequest.h"
39 const char *RequestMethodStr
[] =
90 const char *ProtocolStr
[] =
109 static request_t
*urnParse(method_t method
, char *urn
);
111 static const char *const valid_hostname_chars
=
112 #if ALLOW_HOSTNAME_UNDERSCORES
113 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
114 "abcdefghijklmnopqrstuvwxyz"
117 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
118 "abcdefghijklmnopqrstuvwxyz"
121 #endif /* CHECK_HOSTNAMES */
123 /* convert %xx in url string to a character
124 * Allocate a new string and return a pointer to converted string */
127 url_convert_hex(char *org_url
, int allocate
)
129 static char code
[] = "00";
133 url
= allocate
? (char *) xstrdup(org_url
) : org_url
;
134 if ((int) strlen(url
) < 3 || !strchr(url
, '%'))
136 for (s
= t
= url
; *s
; s
++) {
137 if (*s
== '%' && *(s
+ 1) && *(s
+ 2)) {
140 *t
++ = (char) strtol(code
, NULL
, 16);
154 debug(23, 5) ("urlInitialize: Initializing...\n");
155 assert(sizeof(ProtocolStr
) == (PROTO_MAX
+ 1) * sizeof(char *));
157 * These test that our matchDomainName() function works the
158 * way we expect it to.
160 assert(0 == matchDomainName("foo.com", "foo.com"));
161 assert(0 == matchDomainName(".foo.com", "foo.com"));
162 assert(0 == matchDomainName("foo.com", ".foo.com"));
163 assert(0 == matchDomainName(".foo.com", ".foo.com"));
164 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
165 assert(0 != matchDomainName("x.foo.com", "foo.com"));
166 assert(0 != matchDomainName("foo.com", "x.foo.com"));
167 assert(0 != matchDomainName("bar.com", "foo.com"));
168 assert(0 != matchDomainName(".bar.com", "foo.com"));
169 assert(0 != matchDomainName(".bar.com", ".foo.com"));
170 assert(0 != matchDomainName("bar.com", ".foo.com"));
171 assert(0 < matchDomainName("zzz.com", "foo.com"));
172 assert(0 > matchDomainName("aaa.com", "foo.com"));
173 assert(0 == matchDomainName("FOO.com", "foo.COM"));
174 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
175 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
176 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
180 method_t
&operator++ (method_t
&aMethod
)
182 aMethod
= (method_t
)(++(int)aMethod
);
188 urlParseMethod(const char *s
)
190 method_t method
= METHOD_NONE
;
192 * This check for '%' makes sure that we don't
193 * match one of the extension method placeholders,
194 * which have the form %EXT[0-9][0-9]
198 for (++method
; method
< METHOD_ENUM_END
; ++method
) {
199 if (0 == strcasecmp(s
, RequestMethodStr
[method
]))
207 urlParseProtocol(const char *s
)
209 /* test common stuff first */
210 if (strcasecmp(s
, "http") == 0)
212 if (strcasecmp(s
, "ftp") == 0)
214 if (strcasecmp(s
, "https") == 0)
216 if (strcasecmp(s
, "file") == 0)
218 if (strcasecmp(s
, "gopher") == 0)
220 if (strcasecmp(s
, "wais") == 0)
222 if (strcasecmp(s
, "cache_object") == 0)
223 return PROTO_CACHEOBJ
;
224 if (strcasecmp(s
, "urn") == 0)
226 if (strcasecmp(s
, "whois") == 0)
228 if (strcasecmp(s
, "internal") == 0)
229 return PROTO_INTERNAL
;
235 urlDefaultPort(protocol_t p
)
250 return CACHE_HTTP_PORT
;
259 urlParse(method_t method
, char *url
)
261 LOCAL_ARRAY(char, proto
, MAX_URL
);
262 LOCAL_ARRAY(char, login
, MAX_URL
);
263 LOCAL_ARRAY(char, host
, MAX_URL
);
264 LOCAL_ARRAY(char, urlpath
, MAX_URL
);
265 request_t
*request
= NULL
;
269 protocol_t protocol
= PROTO_NONE
;
271 proto
[0] = host
[0] = urlpath
[0] = login
[0] = '\0';
273 if ((l
= strlen(url
)) + Config
.appendDomainLen
> (MAX_URL
- 1)) {
274 /* terminate so it doesn't overflow other buffers */
275 *(url
+ (MAX_URL
>> 1)) = '\0';
276 debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l
);
279 if (method
== METHOD_CONNECT
) {
281 if (sscanf(url
, "%[^:]:%d", host
, &port
) < 1)
283 } else if (!strncmp(url
, "urn:", 4)) {
284 return urnParse(method
, url
);
286 if (sscanf(url
, "%[^:]://%[^/]%[^\r\n]", proto
, host
, urlpath
) < 2)
288 protocol
= urlParseProtocol(proto
);
289 port
= urlDefaultPort(protocol
);
290 /* Is there any login information? */
291 if ((t
= strrchr(host
, '@'))) {
292 strcpy((char *) login
, (char *) host
);
293 t
= strrchr(login
, '@');
295 strcpy((char *) host
, t
+ 1);
297 if ((t
= strrchr(host
, ':'))) {
303 for (t
= host
; *t
; t
++)
305 if (stringHasWhitespace(host
)) {
306 if (URI_WHITESPACE_STRIP
== Config
.uri_whitespace
) {
317 if (Config
.onoff
.check_hostnames
&& strspn(host
, valid_hostname_chars
) != strlen(host
)) {
318 debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host
);
322 #if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
323 /* remove trailing dots from hostnames */
324 while ((l
= strlen(host
)) > 0 && host
[--l
] == '.')
326 /* remove duplicate dots */
327 while ((t
= strstr(host
, "..")))
328 xmemmove(t
, t
+ 1, strlen(t
));
330 if (Config
.appendDomain
&& !strchr(host
, '.'))
331 strncat(host
, Config
.appendDomain
, SQUIDHOSTNAMELEN
);
332 if (port
< 1 || port
> 65535) {
333 debug(23, 3) ("urlParse: Invalid port '%d'\n", port
);
336 #ifdef HARDCODE_DENY_PORTS
337 /* These ports are filtered in the default squid.conf, but
338 * maybe someone wants them hardcoded... */
339 if (port
== 7 || port
== 9 || port
== 19) {
340 debug(23, 0) ("urlParse: Deny access to port %d\n", port
);
344 if (stringHasWhitespace(urlpath
)) {
345 debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url
);
346 switch (Config
.uri_whitespace
) {
347 case URI_WHITESPACE_DENY
:
349 case URI_WHITESPACE_ALLOW
:
351 case URI_WHITESPACE_ENCODE
:
352 t
= rfc1738_escape_unescaped(urlpath
);
353 xstrncpy(urlpath
, t
, MAX_URL
);
355 case URI_WHITESPACE_CHOP
:
356 *(urlpath
+ strcspn(urlpath
, w_space
)) = '\0';
358 case URI_WHITESPACE_STRIP
:
369 request
= requestCreate(method
, protocol
, urlpath
);
370 xstrncpy(request
->host
, host
, SQUIDHOSTNAMELEN
);
371 xstrncpy(request
->login
, login
, MAX_LOGIN_SZ
);
372 request
->port
= (u_short
) port
;
377 urnParse(method_t method
, char *urn
)
379 debug(50, 5) ("urnParse: %s\n", urn
);
380 return requestCreate(method
, PROTO_URN
, urn
+ 4);
384 urlCanonical(request_t
* request
)
386 LOCAL_ARRAY(char, portbuf
, 32);
387 LOCAL_ARRAY(char, urlbuf
, MAX_URL
);
388 if (request
->canonical
)
389 return request
->canonical
;
390 if (request
->protocol
== PROTO_URN
) {
391 snprintf(urlbuf
, MAX_URL
, "urn:%s", request
->urlpath
.buf());
393 switch (request
->method
) {
395 snprintf(urlbuf
, MAX_URL
, "%s:%d", request
->host
, request
->port
);
399 if (request
->port
!= urlDefaultPort(request
->protocol
))
400 snprintf(portbuf
, 32, ":%d", request
->port
);
401 snprintf(urlbuf
, MAX_URL
, "%s://%s%s%s%s%s",
402 ProtocolStr
[request
->protocol
],
404 *request
->login
? "@" : null_string
,
407 request
->urlpath
.buf());
411 return (request
->canonical
= xstrdup(urlbuf
));
415 urlCanonicalClean(const request_t
* request
)
417 LOCAL_ARRAY(char, buf
, MAX_URL
);
418 LOCAL_ARRAY(char, portbuf
, 32);
419 LOCAL_ARRAY(char, loginbuf
, MAX_LOGIN_SZ
+ 1);
421 if (request
->protocol
== PROTO_URN
) {
422 snprintf(buf
, MAX_URL
, "urn:%s", request
->urlpath
.buf());
424 switch (request
->method
) {
426 snprintf(buf
, MAX_URL
, "%s:%d", request
->host
, request
->port
);
430 if (request
->port
!= urlDefaultPort(request
->protocol
))
431 snprintf(portbuf
, 32, ":%d", request
->port
);
433 if ((int) strlen(request
->login
) > 0) {
434 strcpy(loginbuf
, request
->login
);
435 if ((t
= strchr(loginbuf
, ':')))
437 strcat(loginbuf
, "@");
439 snprintf(buf
, MAX_URL
, "%s://%s%s%s%s",
440 ProtocolStr
[request
->protocol
],
444 request
->urlpath
.buf());
446 * strip arguments AFTER a question-mark
448 if (Config
.onoff
.strip_query_terms
)
449 if ((t
= strchr(buf
, '?')))
454 if (stringHasCntl(buf
))
455 xstrncpy(buf
, rfc1738_escape_unescaped(buf
), MAX_URL
);
460 * matchDomainName() compares a hostname with a domainname according
461 * to the following rules:
464 * ------------- ------------- ------
465 * foo.com foo.com YES
466 * .foo.com foo.com YES
467 * x.foo.com foo.com NO
468 * foo.com .foo.com YES
469 * .foo.com .foo.com YES
470 * x.foo.com .foo.com YES
472 * We strip leading dots on hosts (but not domains!) so that
473 * ".foo.com" is always the same as "foo.com".
476 * 0 means the host matches the domain
477 * 1 means the host is greater than the domain
478 * -1 means the host is less than the domain
482 matchDomainName(const char *h
, const char *d
)
491 * Start at the ends of the two strings and work towards the
494 while (xtolower(h
[--hl
]) == xtolower(d
[--dl
])) {
495 if (hl
== 0 && dl
== 0) {
497 * We made it all the way to the beginning of both
498 * strings without finding any difference.
504 * The host string is shorter than the domain string.
505 * There is only one case when this can be a match.
506 * If the domain is just one character longer, and if
507 * that character is a leading '.' then we call it a
510 if (1 == dl
&& '.' == d
[0])
517 * The domain string is shorter than the host string.
518 * This is a match only if the first domain character
528 * We found different characters in the same position (from the end).
531 * If one of those characters is '.' then it's special. In order
532 * for splay tree sorting to work properly, "x-foo.com" must
533 * be greater than ".foo.com" even though '-' is less than '.'.
539 return (xtolower(h
[hl
]) - xtolower(d
[dl
]));
543 urlCheckRequest(const request_t
* r
)
546 /* protocol "independent" methods */
547 if (r
->method
== METHOD_CONNECT
)
549 if (r
->method
== METHOD_TRACE
)
551 if (r
->method
== METHOD_PURGE
)
553 /* does method match the protocol? */
554 switch (r
->protocol
) {
561 if (r
->method
== METHOD_PUT
)
566 if (r
->method
== METHOD_GET
)
568 else if (r
->method
== METHOD_HEAD
)
577 * Squid can't originate an SSL connection, so it should
578 * never receive an "https:" URL. It should always be
590 * Quick-n-dirty host extraction from a URL. Steps:
592 * Skip any '/' after the colon
593 * Copy the next SQUIDHOSTNAMELEN bytes to host[]
594 * Look for an ending '/' or ':' and terminate
595 * Look for login info preceded by '@'
598 urlHostname(const char *url
)
600 LOCAL_ARRAY(char, host
, SQUIDHOSTNAMELEN
);
603 if (NULL
== (t
= strchr(url
, ':')))
606 while (*t
!= '\0' && *t
== '/')
608 xstrncpy(host
, t
, SQUIDHOSTNAMELEN
);
609 if ((t
= strchr(host
, '/')))
611 if ((t
= strchr(host
, ':')))
613 if ((t
= strrchr(host
, '@'))) {
615 xmemmove(host
, t
, strlen(t
) + 1);
621 urlExtMethodAdd(const char *mstr
)
623 method_t method
= METHOD_NONE
;
624 for (++method
; method
< METHOD_ENUM_END
; ++method
) {
625 if (0 == strcmp(mstr
, RequestMethodStr
[method
])) {
626 debug(23, 2) ("Extension method '%s' already exists\n", mstr
);
629 if (0 != strncmp("%EXT", RequestMethodStr
[method
], 4))
631 /* Don't free statically allocated "%EXTnn" string */
632 RequestMethodStr
[method
] = xstrdup(mstr
);
633 debug(23, 1) ("Extension method '%s' added, enum=%d\n", mstr
, (int) method
);
636 debug(23, 1) ("WARNING: Could not add new extension method '%s' due to lack of array space\n", mstr
);
640 urlExtMethodConfigure(void)
642 wordlist
*w
= Config
.ext_methods
;
645 for (s
= w
->key
; *s
; s
++)
647 urlExtMethodAdd(w
->key
);