]>
git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
3 * $Id: url.cc,v 1.157 2007/04/28 22:26:38 hno Exp $
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
8 * SQUID Web Proxy Cache http://www.squid-cache.org/
9 * ----------------------------------------------------------
11 * Squid is the result of efforts by numerous individuals from
12 * the Internet community; see the CONTRIBUTORS file for full
13 * details. Many organizations have provided support for Squid's
14 * development; see the SPONSORS file for full details. Squid is
15 * Copyrighted (C) 2001 by the Regents of the University of
16 * California; see the COPYRIGHT file for full details. Squid
17 * incorporates software developed and/or copyrighted by other
18 * sources; see the CREDITS file for full details.
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
37 #include "HttpRequest.h"
38 #include "URLScheme.h"
40 static HttpRequest
*urnParse(method_t method
, char *urn
);
41 static const char valid_hostname_chars_u
[] =
42 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
43 "abcdefghijklmnopqrstuvwxyz"
46 static const char valid_hostname_chars
[] =
47 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
48 "abcdefghijklmnopqrstuvwxyz"
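52 /* Convert each %xx escape in the url string to the character it encodes.
53 * If 'allocate' is non-zero, work on a newly allocated copy; otherwise convert org_url in place. Returns a pointer to the converted string. */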
56 url_convert_hex(char *org_url
, int allocate
)
58 static char code
[] = "00";
62 url
= allocate
? (char *) xstrdup(org_url
) : org_url
;
64 if ((int) strlen(url
) < 3 || !strchr(url
, '%'))
67 for (s
= t
= url
; *s
; s
++) {
68 if (*s
== '%' && *(s
+ 1) && *(s
+ 2)) {
71 *t
++ = (char) strtol(code
, NULL
, 16);
87 debugs(23, 5, "urlInitialize: Initializing...");
88 /* this ensures that the number of protocol strings is the same as
89 * the enum slots allocated because the last enum is always 'TOTAL'.
91 assert(strcmp(ProtocolStr
[PROTO_MAX
], "TOTAL") == 0);
93 * These test that our matchDomainName() function works the
94 * way we expect it to.
96 assert(0 == matchDomainName("foo.com", "foo.com"));
97 assert(0 == matchDomainName(".foo.com", "foo.com"));
98 assert(0 == matchDomainName("foo.com", ".foo.com"));
99 assert(0 == matchDomainName(".foo.com", ".foo.com"));
100 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
101 assert(0 != matchDomainName("x.foo.com", "foo.com"));
102 assert(0 != matchDomainName("foo.com", "x.foo.com"));
103 assert(0 != matchDomainName("bar.com", "foo.com"));
104 assert(0 != matchDomainName(".bar.com", "foo.com"));
105 assert(0 != matchDomainName(".bar.com", ".foo.com"));
106 assert(0 != matchDomainName("bar.com", ".foo.com"));
107 assert(0 < matchDomainName("zzz.com", "foo.com"));
108 assert(0 > matchDomainName("aaa.com", "foo.com"));
109 assert(0 == matchDomainName("FOO.com", "foo.COM"));
110 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
111 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
112 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
117 * urlParseProtocol() takes begin (b) and end (e) pointers, but for
118 * backwards compatibility, e defaults to NULL, in which case we
119 * assume b is NUL-terminated.
122 urlParseProtocol(const char *b
, const char *e
)
125 * if e is NULL, b must be NUL-terminated and we
126 * make e point to the first ':' character (or the end of the string)
131 e
= b
+ strcspn(b
, ":");
135 /* test common stuff first */
137 if (strncasecmp(b
, "http", len
) == 0)
140 if (strncasecmp(b
, "ftp", len
) == 0)
143 if (strncasecmp(b
, "https", len
) == 0)
146 if (strncasecmp(b
, "file", len
) == 0)
149 if (strncasecmp(b
, "gopher", len
) == 0)
152 if (strncasecmp(b
, "wais", len
) == 0)
155 if (strncasecmp(b
, "cache_object", len
) == 0)
156 return PROTO_CACHEOBJ
;
158 if (strncasecmp(b
, "urn", len
) == 0)
161 if (strncasecmp(b
, "whois", len
) == 0)
164 if (strncasecmp(b
, "internal", len
) == 0)
165 return PROTO_INTERNAL
;
171 urlDefaultPort(protocol_t p
)
193 return CACHE_HTTP_PORT
;
206 * If the 'request' arg is non-NULL, put parsed values there instead
207 * of allocating a new HttpRequest.
209 * This abuses HttpRequest as a way of representing the parsed url
210 * and its components.
211 * method is used to switch parsers and to init the HttpRequest.
212 * If method is METHOD_CONNECT, then rather than a URL a hostname:port is
214 * The url is non-const so that if it's too long we can NUL-terminate it in place.
217 urlParse(method_t method
, char *url
, HttpRequest
*request
)
219 LOCAL_ARRAY(char, proto
, MAX_URL
);
220 LOCAL_ARRAY(char, login
, MAX_URL
);
221 LOCAL_ARRAY(char, host
, MAX_URL
);
222 LOCAL_ARRAY(char, urlpath
, MAX_URL
);
226 protocol_t protocol
= PROTO_NONE
;
228 proto
[0] = host
[0] = urlpath
[0] = login
[0] = '\0';
230 if ((l
= strlen(url
)) + Config
.appendDomainLen
> (MAX_URL
- 1)) {
231 /* terminate so it doesn't overflow other buffers */
232 *(url
+ (MAX_URL
>> 1)) = '\0';
233 debugs(23, 1, "urlParse: URL too large (" << l
<< " bytes)");
237 if (method
== METHOD_CONNECT
) {
240 if (sscanf(url
, "%[^:]:%d", host
, &port
) < 1)
242 } else if (!strncmp(url
, "urn:", 4)) {
243 return urnParse(method
, url
);
245 if (sscanf(url
, "%[^:]://%[^/]%[^\r\n]", proto
, host
, urlpath
) < 2)
248 protocol
= urlParseProtocol(proto
);
250 port
= urlDefaultPort(protocol
);
252 /* Is there any login information? */
253 if ((t
= strrchr(host
, '@'))) {
254 strcpy((char *) login
, (char *) host
);
255 t
= strrchr(login
, '@');
257 strcpy((char *) host
, t
+ 1);
260 if ((t
= strrchr(host
, ':'))) {
268 for (t
= host
; *t
; t
++)
271 if (stringHasWhitespace(host
)) {
272 if (URI_WHITESPACE_STRIP
== Config
.uri_whitespace
) {
286 if (Config
.onoff
.check_hostnames
&& strspn(host
, Config
.onoff
.allow_underscore
? valid_hostname_chars_u
: valid_hostname_chars
) != strlen(host
)) {
287 debugs(23, 1, "urlParse: Illegal character in hostname '" << host
<< "'");
291 #if DONT_DO_THIS_IT_BREAKS_SEMANTIC_TRANSPARENCY
292 /* remove trailing dots from hostnames */
293 while ((l
= strlen(host
)) > 0 && host
[--l
] == '.')
296 /* remove duplicate dots */
297 while ((t
= strstr(host
, "..")))
298 xmemmove(t
, t
+ 1, strlen(t
));
302 if (Config
.appendDomain
&& !strchr(host
, '.'))
303 strncat(host
, Config
.appendDomain
, SQUIDHOSTNAMELEN
- strlen(host
) - 1);
305 if (port
< 1 || port
> 65535) {
306 debugs(23, 3, "urlParse: Invalid port '" << port
<< "'");
310 #ifdef HARDCODE_DENY_PORTS
311 /* These ports are filtered in the default squid.conf, but
312 * maybe someone wants them hardcoded... */
313 if (port
== 7 || port
== 9 || port
== 19) {
314 debugs(23, 0, "urlParse: Deny access to port " << port
);
319 if (stringHasWhitespace(urlpath
)) {
320 debugs(23, 2, "urlParse: URI has whitespace: {" << url
<< "}");
322 switch (Config
.uri_whitespace
) {
324 case URI_WHITESPACE_DENY
:
327 case URI_WHITESPACE_ALLOW
:
330 case URI_WHITESPACE_ENCODE
:
331 t
= rfc1738_escape_unescaped(urlpath
);
332 xstrncpy(urlpath
, t
, MAX_URL
);
335 case URI_WHITESPACE_CHOP
:
336 *(urlpath
+ strcspn(urlpath
, w_space
)) = '\0';
339 case URI_WHITESPACE_STRIP
:
356 request
= new HttpRequest(method
, protocol
, urlpath
);
358 request
->initHTTP(method
, protocol
, urlpath
);
361 xstrncpy(request
->host
, host
, SQUIDHOSTNAMELEN
);
362 xstrncpy(request
->login
, login
, MAX_LOGIN_SZ
);
363 request
->port
= (u_short
) port
;
368 urnParse(method_t method
, char *urn
)
370 debugs(50, 5, "urnParse: " << urn
);
371 return new HttpRequest(method
, PROTO_URN
, urn
+ 4);
375 urlCanonical(HttpRequest
* request
)
377 LOCAL_ARRAY(char, portbuf
, 32);
378 LOCAL_ARRAY(char, urlbuf
, MAX_URL
);
380 if (request
->canonical
)
381 return request
->canonical
;
383 if (request
->protocol
== PROTO_URN
) {
384 snprintf(urlbuf
, MAX_URL
, "urn:%s", request
->urlpath
.buf());
386 switch (request
->method
) {
389 snprintf(urlbuf
, MAX_URL
, "%s:%d", request
->host
, request
->port
);
395 if (request
->port
!= urlDefaultPort(request
->protocol
))
396 snprintf(portbuf
, 32, ":%d", request
->port
);
398 snprintf(urlbuf
, MAX_URL
, "%s://%s%s%s%s%s",
399 ProtocolStr
[request
->protocol
],
401 *request
->login
? "@" : null_string
,
404 request
->urlpath
.buf());
410 return (request
->canonical
= xstrdup(urlbuf
));
414 urlCanonicalClean(const HttpRequest
* request
)
416 LOCAL_ARRAY(char, buf
, MAX_URL
);
417 LOCAL_ARRAY(char, portbuf
, 32);
418 LOCAL_ARRAY(char, loginbuf
, MAX_LOGIN_SZ
+ 1);
421 if (request
->protocol
== PROTO_URN
) {
422 snprintf(buf
, MAX_URL
, "urn:%s", request
->urlpath
.buf());
424 switch (request
->method
) {
427 snprintf(buf
, MAX_URL
, "%s:%d", request
->host
, request
->port
);
433 if (request
->port
!= urlDefaultPort(request
->protocol
))
434 snprintf(portbuf
, 32, ":%d", request
->port
);
438 if ((int) strlen(request
->login
) > 0) {
439 strcpy(loginbuf
, request
->login
);
441 if ((t
= strchr(loginbuf
, ':')))
444 strcat(loginbuf
, "@");
447 snprintf(buf
, MAX_URL
, "%s://%s%s%s%s",
448 ProtocolStr
[request
->protocol
],
452 request
->urlpath
.buf());
454 * strip arguments AFTER a question-mark
457 if (Config
.onoff
.strip_query_terms
)
458 if ((t
= strchr(buf
, '?')))
465 if (stringHasCntl(buf
))
466 xstrncpy(buf
, rfc1738_escape_unescaped(buf
), MAX_URL
);
472 * matchDomainName() compares a hostname with a domainname according
473 * to the following rules:
476 * ------------- ------------- ------
477 * foo.com foo.com YES
478 * .foo.com foo.com YES
479 * x.foo.com foo.com NO
480 * foo.com .foo.com YES
481 * .foo.com .foo.com YES
482 * x.foo.com .foo.com YES
484 * We strip leading dots on hosts (but not domains!) so that
485 * ".foo.com" is always the same as "foo.com".
488 * 0 means the host matches the domain
489 * 1 means the host is greater than the domain
490 * -1 means the host is less than the domain
494 matchDomainName(const char *h
, const char *d
)
507 * Start at the ends of the two strings and work towards the
510 while (xtolower(h
[--hl
]) == xtolower(d
[--dl
])) {
511 if (hl
== 0 && dl
== 0) {
513 * We made it all the way to the beginning of both
514 * strings without finding any difference.
521 * The host string is shorter than the domain string.
522 * There is only one case when this can be a match.
523 * If the domain is just one character longer, and if
524 * that character is a leading '.' then we call it a
528 if (1 == dl
&& '.' == d
[0])
536 * The domain string is shorter than the host string.
537 * This is a match only if the first domain character
549 * We found different characters in the same position (from the end).
552 * If one of those characters is '.' then it's special. In order
553 * for splay tree sorting to work properly, "x-foo.com" must
554 * be greater than ".foo.com" even though '-' is less than '.'.
562 return (xtolower(h
[hl
]) - xtolower(d
[dl
]));
567 * return true if we can serve requests for this method.
570 urlCheckRequest(const HttpRequest
* r
)
573 /* protocol "independent" methods
575 * actually these methods are specific to HTTP:
576 * they are methods we receive on our HTTP port,
577 * and if we had an FTP listener they would not be relevant
580 * So, we should delegate them to HTTP. The problem is that we
581 * do not have a default protocol from the client side of HTTP.
584 if (r
->method
== METHOD_CONNECT
)
587 if (r
->method
== METHOD_TRACE
)
590 if (r
->method
== METHOD_PURGE
)
593 /* does method match the protocol? */
594 switch (r
->protocol
) {
606 if (r
->method
== METHOD_PUT
)
614 if (r
->method
== METHOD_GET
)
616 else if (r
->method
== METHOD_HEAD
)
630 * Squid can't originate an SSL connection, so it should
631 * never receive an "https:" URL. It should always be
646 * Quick-n-dirty host extraction from a URL. Steps:
648 * Skip any '/' after the colon
649 * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
650 * Look for an ending '/' or ':' and terminate
651 * Look for login info preceded by '@'
658 char * extract(char const *url
);
661 static char Host
[SQUIDHOSTNAMELEN
];
662 void init(char const *);
663 void findHostStart();
664 void trimTrailingChars();
666 char const *hostStart
;
671 urlHostname(const char *url
)
673 return URLHostName().extract(url
);
676 char URLHostName::Host
[SQUIDHOSTNAMELEN
];
679 URLHostName::init(char const *aUrl
)
686 URLHostName::findHostStart()
688 if (NULL
== (hostStart
= strchr(url
, ':')))
693 while (*hostStart
!= '\0' && *hostStart
== '/')
698 URLHostName::trimTrailingChars()
702 if ((t
= strchr(Host
, '/')))
705 if ((t
= strchr(Host
, ':')))
710 URLHostName::trimAuth()
714 if ((t
= strrchr(Host
, '@'))) {
716 xmemmove(Host
, t
, strlen(t
) + 1);
721 URLHostName::extract(char const *aUrl
)
726 if (hostStart
== NULL
)
729 xstrncpy(Host
, hostStart
, SQUIDHOSTNAMELEN
);
738 URL::URL() : scheme()
741 URL::URL(URLScheme
const &aScheme
): scheme(aScheme
)