]>
git.ipfire.org Git - thirdparty/squid.git/blob - src/url.cc
3 * $Id: url.cc,v 1.124 2000/05/12 00:29:10 wessels Exp $
5 * DEBUG: section 23 URL Parsing
6 * AUTHOR: Duane Wessels
8 * SQUID Internet Object Cache http://squid.nlanr.net/Squid/
9 * ----------------------------------------------------------
11 * Squid is the result of efforts by numerous individuals from the
12 * Internet community. Development is led by Duane Wessels of the
13 * National Laboratory for Applied Network Research and funded by the
14 * National Science Foundation. Squid is Copyrighted (C) 1998 by
15 * the Regents of the University of California. Please see the
16 * COPYRIGHT file for full details. Squid incorporates software
17 * developed and/or copyrighted by other sources. Please see the
18 * CREDITS file for full details.
20 * This program is free software; you can redistribute it and/or modify
21 * it under the terms of the GNU General Public License as published by
22 * the Free Software Foundation; either version 2 of the License, or
23 * (at your option) any later version.
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
30 * You should have received a copy of the GNU General Public License
31 * along with this program; if not, write to the Free Software
32 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
38 const char *RequestMethodStr
[] =
60 const char *ProtocolStr
[] =
79 static request_t
*urnParse(method_t method
, char *urn
);
80 static const char *const valid_hostname_chars
=
81 #if ALLOW_HOSTNAME_UNDERSCORES
82 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
83 "abcdefghijklmnopqrstuvwxyz"
86 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
87 "abcdefghijklmnopqrstuvwxyz"
91 /* convert %xx in url string to a character
92 * If 'allocate' is nonzero, work on a newly allocated copy of the string; otherwise convert it in place. Return a pointer to the converted string */
95 url_convert_hex(char *org_url
, int allocate
)
97 static char code
[] = "00";
101 url
= allocate
? (char *) xstrdup(org_url
) : org_url
;
102 if ((int) strlen(url
) < 3 || !strchr(url
, '%'))
104 for (s
= t
= url
; *(s
+ 2); s
++) {
108 *t
++ = (char) strtol(code
, NULL
, 16);
122 debug(23, 5) ("urlInitialize: Initializing...\n");
123 assert(sizeof(ProtocolStr
) == (PROTO_MAX
+ 1) * sizeof(char *));
124 memset(&null_request_flags
, '\0', sizeof(null_request_flags
));
126 * These test that our matchDomainName() function works the
127 * way we expect it to.
129 assert(0 == matchDomainName("foo.com", "foo.com"));
130 assert(0 < matchDomainName(".foo.com", "foo.com"));
131 assert(0 == matchDomainName("foo.com", ".foo.com"));
132 assert(0 == matchDomainName(".foo.com", ".foo.com"));
133 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
134 assert(0 != matchDomainName("x.foo.com", "foo.com"));
135 assert(0 != matchDomainName("foo.com", "x.foo.com"));
136 assert(0 != matchDomainName("bar.com", "foo.com"));
137 assert(0 != matchDomainName(".bar.com", "foo.com"));
138 assert(0 != matchDomainName(".bar.com", ".foo.com"));
139 assert(0 != matchDomainName("bar.com", ".foo.com"));
140 assert(0 < matchDomainName("zzz.com", "foo.com"));
141 assert(0 > matchDomainName("aaa.com", "foo.com"));
142 assert(0 == matchDomainName("FOO.com", "foo.COM"));
143 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
144 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
149 urlParseMethod(const char *s
)
151 method_t method
= METHOD_NONE
;
152 for (method
++; method
< METHOD_ENUM_END
; method
++) {
153 if (0 == strcasecmp(s
, RequestMethodStr
[method
]))
161 urlParseProtocol(const char *s
)
163 /* test common stuff first */
164 if (strcasecmp(s
, "http") == 0)
166 if (strcasecmp(s
, "ftp") == 0)
168 if (strcasecmp(s
, "https") == 0)
170 if (strcasecmp(s
, "file") == 0)
172 if (strcasecmp(s
, "gopher") == 0)
174 if (strcasecmp(s
, "wais") == 0)
176 if (strcasecmp(s
, "cache_object") == 0)
177 return PROTO_CACHEOBJ
;
178 if (strcasecmp(s
, "urn") == 0)
180 if (strcasecmp(s
, "whois") == 0)
182 if (strcasecmp(s
, "internal") == 0)
183 return PROTO_INTERNAL
;
189 urlDefaultPort(protocol_t p
)
204 return CACHE_HTTP_PORT
;
213 urlParse(method_t method
, char *url
)
215 LOCAL_ARRAY(char, proto
, MAX_URL
);
216 LOCAL_ARRAY(char, login
, MAX_URL
);
217 LOCAL_ARRAY(char, host
, MAX_URL
);
218 LOCAL_ARRAY(char, urlpath
, MAX_URL
);
219 request_t
*request
= NULL
;
223 protocol_t protocol
= PROTO_NONE
;
225 proto
[0] = host
[0] = urlpath
[0] = login
[0] = '\0';
227 if ((l
= strlen(url
)) + Config
.appendDomainLen
> (MAX_URL
- 1)) {
228 /* terminate so it doesn't overflow other buffers */
229 *(url
+ (MAX_URL
>> 1)) = '\0';
230 debug(23, 1) ("urlParse: URL too large (%d bytes)\n", l
);
233 if (method
== METHOD_CONNECT
) {
235 if (sscanf(url
, "%[^:]:%d", host
, &port
) < 1)
237 } else if (!strncmp(url
, "urn:", 4)) {
238 return urnParse(method
, url
);
240 if (sscanf(url
, "%[^:]://%[^/]%[^\r\n]", proto
, host
, urlpath
) < 2)
242 protocol
= urlParseProtocol(proto
);
243 port
= urlDefaultPort(protocol
);
244 /* Is there any login information? */
245 if ((t
= strrchr(host
, '@'))) {
246 strcpy((char *) login
, (char *) host
);
247 t
= strrchr(login
, '@');
249 strcpy((char *) host
, t
+ 1);
251 if ((t
= strrchr(host
, ':'))) {
257 for (t
= host
; *t
; t
++)
259 if (strspn(host
, valid_hostname_chars
) != strlen(host
)) {
260 debug(23, 1) ("urlParse: Illegal character in hostname '%s'\n", host
);
263 /* remove trailing dots from hostnames */
264 while ((l
= strlen(host
)) > 0 && host
[--l
] == '.')
266 if (Config
.appendDomain
&& !strchr(host
, '.'))
267 strncat(host
, Config
.appendDomain
, SQUIDHOSTNAMELEN
);
269 debug(23, 3) ("urlParse: Invalid port == 0\n");
272 #ifdef HARDCODE_DENY_PORTS
273 /* These ports (7 echo, 9 discard, 19 chargen) are filtered in the default
274 * squid.conf, but maybe someone wants them hardcoded... */
275 if (port
== 7 || port
== 9 || port
== 19) {
276 debug(23, 0) ("urlParse: Deny access to port %d\n", port
);
280 if (stringHasWhitespace(urlpath
)) {
281 debug(23, 2) ("urlParse: URI has whitespace: {%s}\n", url
);
282 switch (Config
.uri_whitespace
) {
283 case URI_WHITESPACE_DENY
:
285 case URI_WHITESPACE_ALLOW
:
287 case URI_WHITESPACE_ENCODE
:
288 t
= rfc1738_escape_unescaped(urlpath
);
289 xstrncpy(urlpath
, t
, MAX_URL
);
291 case URI_WHITESPACE_CHOP
:
292 *(urlpath
+ strcspn(urlpath
, w_space
)) = '\0';
294 case URI_WHITESPACE_STRIP
:
305 request
= requestCreate(method
, protocol
, urlpath
);
306 xstrncpy(request
->host
, host
, SQUIDHOSTNAMELEN
);
307 xstrncpy(request
->login
, login
, MAX_LOGIN_SZ
);
308 request
->port
= (u_short
) port
;
313 urnParse(method_t method
, char *urn
)
315 debug(50, 5) ("urnParse: %s\n", urn
);
316 return requestCreate(method
, PROTO_URN
, urn
+ 4);
320 urlCanonical(request_t
* request
)
322 LOCAL_ARRAY(char, portbuf
, 32);
323 LOCAL_ARRAY(char, urlbuf
, MAX_URL
);
324 if (request
->canonical
)
325 return request
->canonical
;
326 if (request
->protocol
== PROTO_URN
) {
327 snprintf(urlbuf
, MAX_URL
, "urn:%s", strBuf(request
->urlpath
));
329 switch (request
->method
) {
331 snprintf(urlbuf
, MAX_URL
, "%s:%d", request
->host
, request
->port
);
335 if (request
->port
!= urlDefaultPort(request
->protocol
))
336 snprintf(portbuf
, 32, ":%d", request
->port
);
337 snprintf(urlbuf
, MAX_URL
, "%s://%s%s%s%s%s",
338 ProtocolStr
[request
->protocol
],
340 *request
->login
? "@" : null_string
,
343 strBuf(request
->urlpath
));
347 return (request
->canonical
= xstrdup(urlbuf
));
351 urlCanonicalClean(const request_t
* request
)
353 LOCAL_ARRAY(char, buf
, MAX_URL
);
354 LOCAL_ARRAY(char, portbuf
, 32);
355 LOCAL_ARRAY(char, loginbuf
, MAX_LOGIN_SZ
+ 1);
357 if (request
->protocol
== PROTO_URN
) {
358 snprintf(buf
, MAX_URL
, "urn:%s", strBuf(request
->urlpath
));
360 switch (request
->method
) {
362 snprintf(buf
, MAX_URL
, "%s:%d", request
->host
, request
->port
);
366 if (request
->port
!= urlDefaultPort(request
->protocol
))
367 snprintf(portbuf
, 32, ":%d", request
->port
);
369 if ((int) strlen(request
->login
) > 0) {
370 strcpy(loginbuf
, request
->login
);
371 if ((t
= strchr(loginbuf
, ':')))
373 strcat(loginbuf
, "@");
375 snprintf(buf
, MAX_URL
, "%s://%s%s%s%s",
376 ProtocolStr
[request
->protocol
],
380 strBuf(request
->urlpath
));
382 * strip arguments AFTER a question-mark
384 if (Config
.onoff
.strip_query_terms
)
385 if ((t
= strchr(buf
, '?')))
390 if (stringHasCntl(buf
))
391 xstrncpy(buf
, rfc1738_escape_unescaped(buf
), MAX_URL
);
396 * matchDomainName() compares a hostname with a domainname according
397 * to the following rules:
400 * ------------- ------------- ------
401 * foo.com foo.com YES
402 * .foo.com foo.com NO
403 * x.foo.com foo.com NO
404 * foo.com .foo.com YES
405 * .foo.com .foo.com YES
406 * x.foo.com .foo.com YES
409 * 0 means the host matches the domain
410 * a positive value means the host is greater than the domain
411 * a negative value means the host is less than the domain
415 matchDomainName(const char *h
, const char *d
)
422 * Start at the ends of the two strings and work towards the
425 while (xtolower(h
[--hl
]) == xtolower(d
[--dl
])) {
426 if (hl
== 0 && dl
== 0) {
428 * We made it all the way to the beginning of both
429 * strings without finding any difference.
435 * The host string is shorter than the domain string.
436 * There is only one case when this can be a match.
437 * If the domain is just one character longer, and if
438 * that character is a leading '.' then we call it a
441 if (1 == dl
&& '.' == d
[0])
448 * The domain string is shorter than the host string.
449 * This is a match only if the first domain character
459 * We found different characters in the same position (from the end).
461 return (xtolower(h
[hl
]) - xtolower(d
[dl
]));
465 urlCheckRequest(const request_t
* r
)
468 /* protocol "independent" methods */
469 if (r
->method
== METHOD_CONNECT
)
471 if (r
->method
== METHOD_TRACE
)
473 if (r
->method
== METHOD_PURGE
)
475 /* does method match the protocol? */
476 switch (r
->protocol
) {
483 if (r
->method
== METHOD_PUT
)
488 if (r
->method
== METHOD_GET
)
490 else if (r
->method
== METHOD_HEAD
)
495 * Squid can't originate an SSL connection, so it should
496 * never receive an "https:" URL. It should always be
507 * Quick-n-dirty host extraction from a URL. Steps:
509 * Skip any '/' after the colon
510 * Copy up to SQUIDHOSTNAMELEN bytes to host[]
511 * Look for an ending '/' or ':' and terminate
512 * Look for login info ending in '@' and remove it
515 urlHostname(const char *url
)
517 LOCAL_ARRAY(char, host
, SQUIDHOSTNAMELEN
);
520 if (NULL
== (t
= strchr(url
, ':')))
523 while (*t
!= '\0' && *t
== '/')
525 xstrncpy(host
, t
, SQUIDHOSTNAMELEN
);
526 if ((t
= strchr(host
, '/')))
528 if ((t
= strchr(host
, ':')))
530 if ((t
= strrchr(host
, '@'))) {
532 xmemmove(host
, t
, strlen(t
) + 1);