]>
git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
3 * $Id: url.cc,v 1.165 2008/02/03 10:00:30 amosjeffries Exp $
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
8 * SQUID Web Proxy Cache http://www.squid-cache.org/
9 * ----------------------------------------------------------
11 * Squid is the result of efforts by numerous individuals from
12 * the Internet community; see the CONTRIBUTORS file for full
13 * details. Many organizations have provided support for Squid's
14 * development; see the SPONSORS file for full details. Squid is
15 * Copyrighted (C) 2001 by the Regents of the University of
16 * California; see the COPYRIGHT file for full details. Squid
17 * incorporates software developed and/or copyrighted by other
18 * sources; see the CREDITS file for full details.
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
37 #include "HttpRequest.h"
38 #include "URLScheme.h"
40 static HttpRequest
*urnParse(const HttpRequestMethod
& method
, char *urn
);
41 static const char valid_hostname_chars_u
[] =
42 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
43 "abcdefghijklmnopqrstuvwxyz"
49 static const char valid_hostname_chars
[] =
50 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
51 "abcdefghijklmnopqrstuvwxyz"
61 debugs(23, 5, "urlInitialize: Initializing...");
62 /* this ensures that the number of protocol strings is the same as
63 * the enum slots allocated because the last enum is always 'TOTAL'.
65 assert(strcmp(ProtocolStr
[PROTO_MAX
], "TOTAL") == 0);
67 * These test that our matchDomainName() function works the
68 * way we expect it to.
70 assert(0 == matchDomainName("foo.com", "foo.com"));
71 assert(0 == matchDomainName(".foo.com", "foo.com"));
72 assert(0 == matchDomainName("foo.com", ".foo.com"));
73 assert(0 == matchDomainName(".foo.com", ".foo.com"));
74 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
75 assert(0 != matchDomainName("x.foo.com", "foo.com"));
76 assert(0 != matchDomainName("foo.com", "x.foo.com"));
77 assert(0 != matchDomainName("bar.com", "foo.com"));
78 assert(0 != matchDomainName(".bar.com", "foo.com"));
79 assert(0 != matchDomainName(".bar.com", ".foo.com"));
80 assert(0 != matchDomainName("bar.com", ".foo.com"));
81 assert(0 < matchDomainName("zzz.com", "foo.com"));
82 assert(0 > matchDomainName("aaa.com", "foo.com"));
83 assert(0 == matchDomainName("FOO.com", "foo.COM"));
84 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
85 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
86 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
91 * urlParseProtocol() takes begin (b) and end (e) pointers, but for
92 * backwards compatibility, e defaults to NULL, in which case we
93 * assume b is NULL-terminated.
96 urlParseProtocol(const char *b
, const char *e
)
99 * if e is NULL, b must be NUL-terminated and we
100 * make e point to the first ':' character (or to the terminating NUL if there is none)
105 e
= b
+ strcspn(b
, ":");
109 /* test common stuff first */
111 if (strncasecmp(b
, "http", len
) == 0)
114 if (strncasecmp(b
, "ftp", len
) == 0)
117 if (strncasecmp(b
, "https", len
) == 0)
120 if (strncasecmp(b
, "file", len
) == 0)
123 if (strncasecmp(b
, "gopher", len
) == 0)
126 if (strncasecmp(b
, "wais", len
) == 0)
129 if (strncasecmp(b
, "cache_object", len
) == 0)
130 return PROTO_CACHEOBJ
;
132 if (strncasecmp(b
, "urn", len
) == 0)
135 if (strncasecmp(b
, "whois", len
) == 0)
138 if (strncasecmp(b
, "internal", len
) == 0)
139 return PROTO_INTERNAL
;
145 urlDefaultPort(protocol_t p
)
167 return CACHE_HTTP_PORT
;
180 * If the 'request' arg is non-NULL, put parsed values there instead
181 * of allocating a new HttpRequest.
183 * This abuses HttpRequest as a way of representing the parsed url
184 * and its components.
185 * method is used to switch parsers and to init the HttpRequest.
186 * If method is METHOD_CONNECT, then rather than a URL a hostname:port is
188 * The url is non-const so that if it's too long we can NUL-terminate it in place.
192 * This routine parses a URL. It's assumed that the URL is complete -
193 * i.e., the end of the string is the end of the URL. Don't pass a partial
194 * URL here as this routine doesn't have any way of knowing whether
195 * it's partial or not (i.e., it handles the case of no trailing slash as
196 * being "end of host with implied path of /").
199 urlParse(const HttpRequestMethod
& method
, char *url
, HttpRequest
*request
)
201 LOCAL_ARRAY(char, proto
, MAX_URL
);
202 LOCAL_ARRAY(char, login
, MAX_URL
);
203 LOCAL_ARRAY(char, host
, MAX_URL
);
204 LOCAL_ARRAY(char, urlpath
, MAX_URL
);
208 protocol_t protocol
= PROTO_NONE
;
213 proto
[0] = host
[0] = urlpath
[0] = login
[0] = '\0';
215 if ((l
= strlen(url
)) + Config
.appendDomainLen
> (MAX_URL
- 1)) {
216 /* terminate so it doesn't overflow other buffers */
217 *(url
+ (MAX_URL
>> 1)) = '\0';
218 debugs(23, 1, "urlParse: URL too large (" << l
<< " bytes)");
221 if (method
== METHOD_CONNECT
) {
224 if (sscanf(url
, "[%[^]]]:%d", host
, &port
) < 1)
225 if (sscanf(url
, "%[^:]:%d", host
, &port
) < 1)
228 } else if (!strncmp(url
, "urn:", 4)) {
229 return urnParse(method
, url
);
234 /* Find first : - everything before is protocol */
235 for (i
= 0, dst
= proto
; i
< l
&& *src
!= ':'; i
++, src
++, dst
++) {
243 /* (XXX yah, I'm not checking we've got enough data left before checking the array..) */
244 if (*src
!= ':' || *(src
+ 1) != '/' || *(src
+ 2) != '/')
249 /* Then everything until first /; thats host (and port; which we'll look for here later) */
250 /* bug 1881: If we don't get a "/" then we imply it was there */
251 for (dst
= host
; i
< l
&& *src
!= '/' && *src
!= '\0'; i
++, src
++, dst
++) {
256 * We can't check for "i >= l" here because we could be at the end of the line
257 * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
258 * been -given- a valid URL and the path is just '/'.
264 /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */
265 for (dst
= urlpath
; i
< l
&& *src
!= '\r' && *src
!= '\n' && *src
!= '\0'; i
++, src
++, dst
++) {
269 /* We -could- be at the end of the buffer here */
272 /* If the URL path is empty we set it to be "/" */
273 if (dst
== urlpath
) {
278 protocol
= urlParseProtocol(proto
);
279 port
= urlDefaultPort(protocol
);
281 /* Is there any login information? (we should eventually parse it above) */
282 if ((t
= strrchr(host
, '@'))) {
283 strcpy((char *) login
, (char *) host
);
284 t
= strrchr(login
, '@');
286 strcpy((char *) host
, t
+ 1);
289 /* Is there any host information? (we should eventually parse it above) */
291 /* strip any IPA brackets. valid under IPv6. */
294 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
299 for (; i
< l
&& *src
!= ']' && *src
!= '\0'; i
++, src
++, dst
++) {
303 /* we moved in-place, so truncate the actual hostname found */
306 /* IPv4-pure needs to skip the whole hostname to ']' inclusive for now */
307 while (*dst
!= '\0' && *dst
!= ']') dst
++;
310 /* skip ahead to either start of port, or original EOS */
311 while (*dst
!= '\0' && *dst
!= ':') dst
++;
314 t
= strrchr(host
, ':');
316 if (t
!= strchr(host
,':') ) {
317 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
318 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
319 /* therefore we MUST accept the case where they are not bracketed at all. */
324 if (t
&& *t
== ':') {
331 for (t
= host
; *t
; t
++)
334 if (stringHasWhitespace(host
)) {
335 if (URI_WHITESPACE_STRIP
== Config
.uri_whitespace
) {
346 debugs(23, 3, "urlParse: Split URL '" << url
<< "' into proto='" << proto
<< "', host='" << host
<< "', port='" << port
<< "', path='" << urlpath
<< "'");
348 if (Config
.onoff
.check_hostnames
&& strspn(host
, Config
.onoff
.allow_underscore
? valid_hostname_chars_u
: valid_hostname_chars
) != strlen(host
)) {
349 debugs(23, 1, "urlParse: Illegal character in hostname '" << host
<< "'");
353 if (Config
.appendDomain
&& !strchr(host
, '.'))
354 strncat(host
, Config
.appendDomain
, SQUIDHOSTNAMELEN
- strlen(host
) - 1);
356 /* remove trailing dots from hostnames */
357 while ((l
= strlen(host
)) > 0 && host
[--l
] == '.')
360 /* reject duplicate or leading dots */
361 if (strstr(host
, "..") || *host
== '.') {
362 debugs(23, 1, "urlParse: Illegal hostname '" << host
<< "'");
366 if (port
< 1 || port
> 65535) {
367 debugs(23, 3, "urlParse: Invalid port '" << port
<< "'");
371 #ifdef HARDCODE_DENY_PORTS
372 /* These ports are filtered in the default squid.conf, but
373 * maybe someone wants them hardcoded... */
374 if (port
== 7 || port
== 9 || port
== 19) {
375 debugs(23, 0, "urlParse: Deny access to port " << port
);
380 if (stringHasWhitespace(urlpath
)) {
381 debugs(23, 2, "urlParse: URI has whitespace: {" << url
<< "}");
383 switch (Config
.uri_whitespace
) {
385 case URI_WHITESPACE_DENY
:
388 case URI_WHITESPACE_ALLOW
:
391 case URI_WHITESPACE_ENCODE
:
392 t
= rfc1738_escape_unescaped(urlpath
);
393 xstrncpy(urlpath
, t
, MAX_URL
);
396 case URI_WHITESPACE_CHOP
:
397 *(urlpath
+ strcspn(urlpath
, w_space
)) = '\0';
400 case URI_WHITESPACE_STRIP
:
413 request
= new HttpRequest(method
, protocol
, urlpath
);
415 request
->initHTTP(method
, protocol
, urlpath
);
418 request
->SetHost(host
);
419 xstrncpy(request
->login
, login
, MAX_LOGIN_SZ
);
420 request
->port
= (u_short
) port
;
425 urnParse(const HttpRequestMethod
& method
, char *urn
)
427 debugs(50, 5, "urnParse: " << urn
);
428 return new HttpRequest(method
, PROTO_URN
, urn
+ 4);
432 urlCanonical(HttpRequest
* request
)
434 LOCAL_ARRAY(char, portbuf
, 32);
435 /// \todo AYJ: Performance: making this a ptr and allocating when needed will be better than a write and future xstrdup().
436 LOCAL_ARRAY(char, urlbuf
, MAX_URL
);
438 if (request
->canonical
)
439 return request
->canonical
;
441 if (request
->protocol
== PROTO_URN
) {
442 snprintf(urlbuf
, MAX_URL
, "urn:%s", request
->urlpath
.buf());
444 /// \todo AYJ: this could use "if..else and method == METHOD_CONNECT" easier.
445 switch (request
->method
.id()) {
448 snprintf(urlbuf
, MAX_URL
, "%s:%d", request
->GetHost(), request
->port
);
454 if (request
->port
!= urlDefaultPort(request
->protocol
))
455 snprintf(portbuf
, 32, ":%d", request
->port
);
457 snprintf(urlbuf
, MAX_URL
, "%s://%s%s%s%s%s",
458 ProtocolStr
[request
->protocol
],
460 *request
->login
? "@" : null_string
,
463 request
->urlpath
.buf());
469 return (request
->canonical
= xstrdup(urlbuf
));
472 /** \todo AYJ: Performance: This is an *almost* duplicate of urlCanonical. But elides the query-string.
473 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
474 * and never copy the query-string part in the first place
477 urlCanonicalClean(const HttpRequest
* request
)
479 LOCAL_ARRAY(char, buf
, MAX_URL
);
480 LOCAL_ARRAY(char, portbuf
, 32);
481 LOCAL_ARRAY(char, loginbuf
, MAX_LOGIN_SZ
+ 1);
484 if (request
->protocol
== PROTO_URN
) {
485 snprintf(buf
, MAX_URL
, "urn:%s", request
->urlpath
.buf());
487 /// \todo AYJ: this could use "if..else and method == METHOD_CONNECT" easier.
488 switch (request
->method
.id()) {
491 snprintf(buf
, MAX_URL
, "%s:%d",
499 if (request
->port
!= urlDefaultPort(request
->protocol
))
500 snprintf(portbuf
, 32, ":%d", request
->port
);
504 if ((int) strlen(request
->login
) > 0) {
505 strcpy(loginbuf
, request
->login
);
507 if ((t
= strchr(loginbuf
, ':')))
510 strcat(loginbuf
, "@");
513 snprintf(buf
, MAX_URL
, "%s://%s%s%s%s",
514 ProtocolStr
[request
->protocol
],
518 request
->urlpath
.buf());
520 * strip arguments AFTER a question-mark
523 if (Config
.onoff
.strip_query_terms
)
524 if ((t
= strchr(buf
, '?')))
531 if (stringHasCntl(buf
))
532 xstrncpy(buf
, rfc1738_escape_unescaped(buf
), MAX_URL
);
538 * Test if a URL is relative.
540 * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
541 * appear before a ':'.
544 urlIsRelative(const char *url
)
555 for (p
= url
; *p
!= '\0' && *p
!= ':' && *p
!= '/'; p
++);
564 * Convert a relative URL to an absolute URL using the context of a given
567 * It is assumed that you have already ensured that the URL is relative.
569 * If NULL is returned it is an indication that the method in use in the
570 * request does not distinguish between relative and absolute and you should
571 * use the url unchanged.
573 * If non-NULL is returned, it is up to the caller to free the resulting
574 * memory using safe_free().
577 urlMakeAbsolute(const HttpRequest
* req
, const char *relUrl
)
580 if (req
->method
.id() == METHOD_CONNECT
) {
584 char *urlbuf
= (char *)xmalloc(MAX_URL
* sizeof(char));
586 if (req
->protocol
== PROTO_URN
) {
587 snprintf(urlbuf
, MAX_URL
, "urn:%s", req
->urlpath
.buf());
593 if (req
->port
!= urlDefaultPort(req
->protocol
)) {
594 urllen
= snprintf(urlbuf
, MAX_URL
, "%s://%s%s%s:%d",
595 ProtocolStr
[req
->protocol
],
597 *req
->login
? "@" : null_string
,
602 urllen
= snprintf(urlbuf
, MAX_URL
, "%s://%s%s%s",
603 ProtocolStr
[req
->protocol
],
605 *req
->login
? "@" : null_string
,
610 if (relUrl
[0] == '/') {
611 strncpy(&urlbuf
[urllen
], relUrl
, MAX_URL
- urllen
- 1);
613 const char *path
= req
->urlpath
.buf();
614 const char *last_slash
= strrchr(path
, '/');
616 if (last_slash
== NULL
) {
617 urlbuf
[urllen
++] = '/';
618 strncpy(&urlbuf
[urllen
], relUrl
, MAX_URL
- urllen
- 1);
621 size_t pathlen
= last_slash
- path
;
622 if (pathlen
> MAX_URL
- urllen
- 1) {
623 pathlen
= MAX_URL
- urllen
- 1;
625 strncpy(&urlbuf
[urllen
], path
, pathlen
);
627 if (urllen
+ 1 < MAX_URL
) {
628 strncpy(&urlbuf
[urllen
], relUrl
, MAX_URL
- urllen
- 1);
637 * matchDomainName() compares a hostname with a domainname according
638 * to the following rules:
641 * ------------- ------------- ------
642 * foo.com foo.com YES
643 * .foo.com foo.com YES
644 * x.foo.com foo.com NO
645 * foo.com .foo.com YES
646 * .foo.com .foo.com YES
647 * x.foo.com .foo.com YES
649 * We strip leading dots on hosts (but not domains!) so that
650 * ".foo.com" is always the same as "foo.com".
653 * 0 means the host matches the domain
654 * a positive value means the host is greater than the domain
655 * a negative value means the host is less than the domain
659 matchDomainName(const char *h
, const char *d
)
672 * Start at the ends of the two strings and work towards the
675 while (xtolower(h
[--hl
]) == xtolower(d
[--dl
])) {
676 if (hl
== 0 && dl
== 0) {
678 * We made it all the way to the beginning of both
679 * strings without finding any difference.
686 * The host string is shorter than the domain string.
687 * There is only one case when this can be a match.
688 * If the domain is just one character longer, and if
689 * that character is a leading '.' then we call it a
693 if (1 == dl
&& '.' == d
[0])
701 * The domain string is shorter than the host string.
702 * This is a match only if the first domain character
714 * We found different characters in the same position (from the end).
717 * If one of those characters is '.' then it's special. In order
718 * for splay tree sorting to work properly, "x-foo.com" must
719 * be greater than ".foo.com" even though '-' is less than '.'.
727 return (xtolower(h
[hl
]) - xtolower(d
[dl
]));
732 * return true if we can serve requests for this method.
735 urlCheckRequest(const HttpRequest
* r
)
738 /* protocol "independent" methods
740 * actually these methods are specific to HTTP:
741 * they are methods we receive on our HTTP port,
742 * and if we had an FTP listener would not be relevant
745 * So, we should delegate them to HTTP. The problem is that we
746 * do not have a default protocol from the client side of HTTP.
749 if (r
->method
== METHOD_CONNECT
)
752 if (r
->method
== METHOD_TRACE
)
755 if (r
->method
== METHOD_PURGE
)
758 /* does method match the protocol? */
759 switch (r
->protocol
) {
771 if (r
->method
== METHOD_PUT
)
779 if (r
->method
== METHOD_GET
)
781 else if (r
->method
== METHOD_HEAD
)
795 * Squid can't originate an SSL connection, so it should
796 * never receive an "https:" URL. It should always be
811 * Quick-n-dirty host extraction from a URL. Steps:
813 * Skip any '/' after the colon
814 * Copy up to SQUIDHOSTNAMELEN bytes to Host[]
815 * Look for an ending '/' or ':' and terminate
816 * Look for login info preceded by '@'
823 char * extract(char const *url
);
826 static char Host
[SQUIDHOSTNAMELEN
];
827 void init(char const *);
828 void findHostStart();
829 void trimTrailingChars();
831 char const *hostStart
;
836 urlHostname(const char *url
)
838 return URLHostName().extract(url
);
841 char URLHostName::Host
[SQUIDHOSTNAMELEN
];
844 URLHostName::init(char const *aUrl
)
851 URLHostName::findHostStart()
853 if (NULL
== (hostStart
= strchr(url
, ':')))
858 while (*hostStart
!= '\0' && *hostStart
== '/')
862 if (*hostStart
== ']')
869 URLHostName::trimTrailingChars()
873 if ((t
= strchr(Host
, '/')))
876 if ((t
= strrchr(Host
, ':')))
880 if ((t
= strchr(Host
, ']')))
887 URLHostName::trimAuth()
891 if ((t
= strrchr(Host
, '@'))) {
893 xmemmove(Host
, t
, strlen(t
) + 1);
898 URLHostName::extract(char const *aUrl
)
903 if (hostStart
== NULL
)
906 xstrncpy(Host
, hostStart
, SQUIDHOSTNAMELEN
);
915 URL::URL() : scheme()
918 URL::URL(URLScheme
const &aScheme
): scheme(aScheme
)