]>
Commit | Line | Data |
---|---|---|
30a4f2a8 | 1 | /* |
bbc27441 | 2 | * Copyright (C) 1996-2014 The Squid Software Foundation and contributors |
e25c139f | 3 | * |
bbc27441 AJ |
4 | * Squid software is distributed under GPLv2+ license and includes |
5 | * contributions from numerous individuals and organizations. | |
6 | * Please see the COPYING and CONTRIBUTORS files for details. | |
019dd986 | 7 | */ |
ed43818f | 8 | |
bbc27441 AJ |
9 | /* DEBUG: section 23 URL Parsing */ |
10 | ||
f7f3304a | 11 | #include "squid.h" |
582c2af2 | 12 | #include "globals.h" |
528b2c61 | 13 | #include "HttpRequest.h" |
1fa9b1a7 | 14 | #include "rfc1738.h" |
4d5904f7 | 15 | #include "SquidConfig.h" |
7a707cb5 | 16 | #include "SquidString.h" |
582c2af2 | 17 | #include "URL.h" |
090089c4 | 18 | |
// Forward declaration of the file-local helper that stores parsed URL
// components in an HttpRequest; it is defined further down in this file.
4d919a80 | 19 | static HttpRequest *urlParseFinish(const HttpRequestMethod& method, |
0c3d3f65 | 20 | const AnyP::ProtocolType protocol, |
4d919a80 AR |
21 | const char *const urlpath, |
22 | const char *const host, | |
92d6986d | 23 | const SBuf &login, |
4d919a80 AR |
24 | const int port, |
25 | HttpRequest *request); | |
// Forward declaration of the file-local URN parser used by urlParse().
9be14530 | 26 | static HttpRequest *urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request); |
// Characters accepted in a hostname when underscores are allowed:
// ASCII letters, digits, hyphen, dot, underscore, and the bracket and colon
// characters. urlParse() selects this table when
// Config.onoff.allow_underscore is set.
a78278e2 | 27 | static const char valid_hostname_chars_u[] = |
62e76326 | 28 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
29 | "abcdefghijklmnopqrstuvwxyz" | |
a78278e2 | 30 | "0123456789-._" |
cc192b50 | 31 | "[:]" |
a78278e2 | 32 | ; |
// Same table without the underscore; used by urlParse() otherwise.
33 | static const char valid_hostname_chars[] = | |
62e76326 | 34 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
35 | "abcdefghijklmnopqrstuvwxyz" | |
36 | "0123456789-." | |
cc192b50 | 37 | "[:]" |
62e76326 | 38 | ; |
090089c4 | 39 | |
2e260208 AJ |
40 | const SBuf & |
41 | URL::Asterisk() | |
42 | { | |
43 | static SBuf star("*"); | |
44 | return star; | |
45 | } | |
46 | ||
b8d8561b | 47 | void |
0673c0ba | 48 | urlInitialize(void) |
090089c4 | 49 | { |
bf8fe701 | 50 | debugs(23, 5, "urlInitialize: Initializing..."); |
985c86bc | 51 | /* this ensures that the number of protocol strings is the same as |
0c3d3f65 | 52 | * the enum slots allocated because the last enum is always 'MAX'. |
985c86bc | 53 | */ |
0c3d3f65 | 54 | assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0); |
9bc73deb | 55 | /* |
56 | * These test that our matchDomainName() function works the | |
57 | * way we expect it to. | |
58 | */ | |
59 | assert(0 == matchDomainName("foo.com", "foo.com")); | |
d20b1cd0 | 60 | assert(0 == matchDomainName(".foo.com", "foo.com")); |
9bc73deb | 61 | assert(0 == matchDomainName("foo.com", ".foo.com")); |
62 | assert(0 == matchDomainName(".foo.com", ".foo.com")); | |
63 | assert(0 == matchDomainName("x.foo.com", ".foo.com")); | |
64 | assert(0 != matchDomainName("x.foo.com", "foo.com")); | |
65 | assert(0 != matchDomainName("foo.com", "x.foo.com")); | |
66 | assert(0 != matchDomainName("bar.com", "foo.com")); | |
67 | assert(0 != matchDomainName(".bar.com", "foo.com")); | |
68 | assert(0 != matchDomainName(".bar.com", ".foo.com")); | |
69 | assert(0 != matchDomainName("bar.com", ".foo.com")); | |
70 | assert(0 < matchDomainName("zzz.com", "foo.com")); | |
71 | assert(0 > matchDomainName("aaa.com", "foo.com")); | |
72 | assert(0 == matchDomainName("FOO.com", "foo.COM")); | |
aca95add | 73 | assert(0 < matchDomainName("bfoo.com", "afoo.com")); |
74 | assert(0 > matchDomainName("afoo.com", "bfoo.com")); | |
d20b1cd0 | 75 | assert(0 < matchDomainName("x-foo.com", ".foo.com")); |
9bc73deb | 76 | /* more cases? */ |
090089c4 | 77 | } |
78 | ||
cc192b50 | 79 | /** |
d4a04ed5 | 80 | * urlParseProtocol() takes begin (b) and end (e) pointers, but for |
81 | * backwards compatibility, e defaults to NULL, in which case we | |
82 | * assume b is NULL-terminated. | |
83 | */ | |
0c3d3f65 | 84 | AnyP::ProtocolType |
d4a04ed5 | 85 | urlParseProtocol(const char *b, const char *e) |
92a6f4b1 | 86 | { |
d4a04ed5 | 87 | /* |
88 | * if e is NULL, b must be NULL terminated and we | |
89 | * make e point to the first whitespace character | |
90 | * after b. | |
91 | */ | |
92 | ||
93 | if (NULL == e) | |
94 | e = b + strcspn(b, ":"); | |
95 | ||
96 | int len = e - b; | |
97 | ||
fcd2d3ef | 98 | /* test common stuff first */ |
62e76326 | 99 | |
d4a04ed5 | 100 | if (strncasecmp(b, "http", len) == 0) |
0c3d3f65 | 101 | return AnyP::PROTO_HTTP; |
62e76326 | 102 | |
d4a04ed5 | 103 | if (strncasecmp(b, "ftp", len) == 0) |
0c3d3f65 | 104 | return AnyP::PROTO_FTP; |
62e76326 | 105 | |
d4a04ed5 | 106 | if (strncasecmp(b, "https", len) == 0) |
0c3d3f65 | 107 | return AnyP::PROTO_HTTPS; |
62e76326 | 108 | |
d4a04ed5 | 109 | if (strncasecmp(b, "file", len) == 0) |
0c3d3f65 | 110 | return AnyP::PROTO_FTP; |
62e76326 | 111 | |
330f829e AJ |
112 | if (strncasecmp(b, "coap", len) == 0) |
113 | return AnyP::PROTO_COAP; | |
114 | ||
115 | if (strncasecmp(b, "coaps", len) == 0) | |
116 | return AnyP::PROTO_COAPS; | |
117 | ||
d4a04ed5 | 118 | if (strncasecmp(b, "gopher", len) == 0) |
0c3d3f65 | 119 | return AnyP::PROTO_GOPHER; |
62e76326 | 120 | |
d4a04ed5 | 121 | if (strncasecmp(b, "wais", len) == 0) |
0c3d3f65 | 122 | return AnyP::PROTO_WAIS; |
62e76326 | 123 | |
d4a04ed5 | 124 | if (strncasecmp(b, "cache_object", len) == 0) |
39a19cb7 | 125 | return AnyP::PROTO_CACHE_OBJECT; |
62e76326 | 126 | |
d4a04ed5 | 127 | if (strncasecmp(b, "urn", len) == 0) |
0c3d3f65 | 128 | return AnyP::PROTO_URN; |
62e76326 | 129 | |
d4a04ed5 | 130 | if (strncasecmp(b, "whois", len) == 0) |
0c3d3f65 | 131 | return AnyP::PROTO_WHOIS; |
62e76326 | 132 | |
0c3d3f65 | 133 | return AnyP::PROTO_NONE; |
92a6f4b1 | 134 | } |
135 | ||
3fdadc70 | 136 | int |
0c3d3f65 | 137 | urlDefaultPort(AnyP::ProtocolType p) |
92a6f4b1 | 138 | { |
139 | switch (p) { | |
62e76326 | 140 | |
0c3d3f65 | 141 | case AnyP::PROTO_HTTP: |
62e76326 | 142 | return 80; |
143 | ||
0c3d3f65 | 144 | case AnyP::PROTO_HTTPS: |
62e76326 | 145 | return 443; |
146 | ||
0c3d3f65 | 147 | case AnyP::PROTO_FTP: |
62e76326 | 148 | return 21; |
149 | ||
330f829e AJ |
150 | case AnyP::PROTO_COAP: |
151 | case AnyP::PROTO_COAPS: | |
152 | // coaps:// default is TBA as of draft-ietf-core-coap-08. | |
153 | // Assuming IANA policy of allocating same port for base and TLS protocol versions will occur. | |
154 | return 5683; | |
155 | ||
0c3d3f65 | 156 | case AnyP::PROTO_GOPHER: |
62e76326 | 157 | return 70; |
158 | ||
0c3d3f65 | 159 | case AnyP::PROTO_WAIS: |
62e76326 | 160 | return 210; |
161 | ||
39a19cb7 | 162 | case AnyP::PROTO_CACHE_OBJECT: |
62e76326 | 163 | return CACHE_HTTP_PORT; |
164 | ||
0c3d3f65 | 165 | case AnyP::PROTO_WHOIS: |
62e76326 | 166 | return 43; |
167 | ||
92a6f4b1 | 168 | default: |
62e76326 | 169 | return 0; |
92a6f4b1 | 170 | } |
171 | } | |
7111c86a | 172 | |
d4a04ed5 | 173 | /* |
174 | * Parse a URI/URL. | |
175 | * | |
176 | * If the 'request' arg is non-NULL, put parsed values there instead | |
177 | * of allocating a new HttpRequest. | |
c21ad0f5 | 178 | * |
26ac0430 | 179 | * This abuses HttpRequest as a way of representing the parsed url |
c21ad0f5 | 180 | * and its components. |
181 | * method is used to switch parsers and to init the HttpRequest. | |
c2a7cefd | 182 | * If method is Http::METHOD_CONNECT, then rather than a URL a hostname:port is |
c21ad0f5 | 183 | * looked for. |
184 | * The url is non-const so that, if it is too long, we can NUL-terminate it in place.
d4a04ed5 | 185 | */ |
cc192b50 | 186 | |
187 | /* | |
188 | * This routine parses a URL. It is assumed that the URL is complete -
189 | * i.e., the end of the string is the end of the URL. Don't pass a partial
190 | * URL here as this routine doesn't have any way of knowing whether
191 | * it is partial or not (i.e., it handles the case of no trailing slash as
192 | * being "end of host with implied path of /").
193 | */ | |
// urlParse: split url into scheme, login, host, port and path, and hand the
// pieces to urlParseFinish().
//
// method  - selects the parser: CONNECT expects host:port, OPTIONS and TRACE
//           accept the asterisk form, a leading urn: prefix goes to
//           urnParse(), anything else must look like scheme://host...
// url     - NUL-terminated input; it is modified in place only when it is
//           too long (see the length check below).
// request - passed through to urlParseFinish() or urnParse().
//
// Returns the request produced by urlParseFinish() or urnParse(), or NULL
// when the URL is too long, malformed, or rejected by a configuration check.
190154cf | 194 | HttpRequest * |
60745f24 | 195 | urlParse(const HttpRequestMethod& method, char *url, HttpRequest *request) |
7111c86a | 196 | { |
// Scratch buffers for the individual URL components, each MAX_URL bytes.
// NOTE(review): LOCAL_ARRAY is a project macro defined outside this file;
// presumably it declares static storage - confirm.
f2052513 | 197 | LOCAL_ARRAY(char, proto, MAX_URL); |
198 | LOCAL_ARRAY(char, login, MAX_URL); | |
199 | LOCAL_ARRAY(char, host, MAX_URL); | |
200 | LOCAL_ARRAY(char, urlpath, MAX_URL); | |
7111c86a | 201 | char *t = NULL; |
7e3ce7b9 | 202 | char *q = NULL; |
7111c86a | 203 | int port; |
0c3d3f65 | 204 | AnyP::ProtocolType protocol = AnyP::PROTO_NONE; |
774cc2d8 | 205 | int l; |
cc192b50 | 206 | int i; |
207 | const char *src; | |
208 | char *dst; | |
// start every component as an empty string
983061ed | 209 | proto[0] = host[0] = urlpath[0] = login[0] = '\0'; |
7111c86a | 210 | |
// l is the input length. The URL plus the configured append-domain must fit
// in a MAX_URL buffer; otherwise the input is cut at MAX_URL/2 and rejected.
ba0fd1b6 | 211 | if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) { |
62e76326 | 212 | /* terminate so it doesn't overflow other buffers */ |
213 | *(url + (MAX_URL >> 1)) = '\0'; | |
e0236918 | 214 | debugs(23, DBG_IMPORTANT, "urlParse: URL too large (" << l << " bytes)"); |
62e76326 | 215 | return NULL; |
0a5b9b32 | 216 | } |
// CONNECT: the target is host:port. A bracketed address is tried first, then
// a plain host. port keeps CONNECT_PORT when the input has no port part.
c2a7cefd | 217 | if (method == Http::METHOD_CONNECT) { |
62e76326 | 218 | port = CONNECT_PORT; |
219 | ||
0f0affc7 | 220 | if (sscanf(url, "[%[^]]]:%d", host, &port) < 1) |
cc192b50 | 221 | if (sscanf(url, "%[^:]:%d", host, &port) < 1) |
222 | return NULL; | |
223 | ||
// OPTIONS and TRACE with the asterisk target: treated as HTTP on its default
// port, with an empty host and the url itself used as the path.
c2a7cefd | 224 | } else if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) && |
2e260208 | 225 | URL::Asterisk().cmp(url) == 0) { |
0c3d3f65 | 226 | protocol = AnyP::PROTO_HTTP; |
4d919a80 | 227 | port = urlDefaultPort(protocol); |
92d6986d | 228 | return urlParseFinish(method, protocol, url, host, SBuf(), port, request); |
// URNs (case-sensitive urn: prefix) are handled by urnParse()
23d92c64 | 229 | } else if (!strncmp(url, "urn:", 4)) { |
9be14530 | 230 | return urnParse(method, url, request); |
7111c86a | 231 | } else { |
cc192b50 | 232 | /* Parse the URL: */ |
233 | src = url; | |
234 | i = 0; | |
235 | /* Find first : - everything before is protocol */ | |
5db6bf73 | 236 | for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) { |
cc192b50 | 237 | *dst = *src; |
238 | } | |
// no colon anywhere in the input: not an absolute URL
239 | if (i >= l) | |
26ac0430 | 240 | return NULL; |
cc192b50 | 241 | *dst = '\0'; |
242 | ||
243 | /* Then its :// */ | |
5e245980 | 244 | if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/') |
62e76326 | 245 | return NULL; |
cc192b50 | 246 | i += 3; |
247 | src += 3; | |
62e76326 | 248 | |
cc192b50 | 249 | /* Then everything until first /; thats host (and port; which we'll look for here later) */ |
68338d14 F |
250 | // bug 1881: If we don't get a "/" then we imply it was there |
251 | // bug 3074: We could just be given a "?" or "#". These also imply "/" | |
b2ab59ad | 252 | // bug 3233: whitespace is also a hostname delimiter. |
5db6bf73 | 253 | for (dst = host; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) { |
cc192b50 | 254 | *dst = *src; |
255 | } | |
256 | ||
26ac0430 | 257 | /* |
cc192b50 | 258 | * We can't check for "i >= l" here because we could be at the end of the line |
259 | * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've | |
260 | * been -given- a valid URL and the path is just '/'. | |
261 | */ | |
// NOTE(review): the loop above stops while i is still below l or equal to it,
// so this test looks unreachable - confirm before relying on it.
262 | if (i > l) | |
263 | return NULL; | |
264 | *dst = '\0'; | |
265 | ||
68338d14 F |
266 | // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/' |
267 | if (*src == '?' || *src == '#' || *src == '\0') { | |
268 | urlpath[0] = '/'; | |
269 | dst = &urlpath[1]; | |
270 | } else { | |
271 | dst = urlpath; | |
272 | } | |
cc192b50 | 273 | /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */ |
5db6bf73 | 274 | for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) { |
cc192b50 | 275 | *dst = *src; |
276 | } | |
62e76326 | 277 | |
cc192b50 | 278 | /* We -could- be at the end of the buffer here */ |
279 | if (i > l) | |
280 | return NULL; | |
281 | /* If the URL path is empty we set it to be "/" */ | |
282 | if (dst == urlpath) { | |
5db6bf73 FC |
283 | *dst = '/'; |
284 | ++dst; | |
cc192b50 | 285 | } |
286 | *dst = '\0'; | |
287 | ||
// map the scheme text to a protocol and start from its default port
288 | protocol = urlParseProtocol(proto); | |
62e76326 | 289 | port = urlDefaultPort(protocol); |
290 | ||
cc192b50 | 291 | /* Is there any login information? (we should eventually parse it above) */ |
// Everything before the last @ in host becomes login; the rest stays in host.
810635e3 FC |
292 | t = strrchr(host, '@'); |
293 | if (t != NULL) { | |
0a84e4fb AJ |
294 | strncpy((char *) login, (char *) host, sizeof(login)-1); |
295 | login[sizeof(login)-1] = '\0'; | |
62e76326 | 296 | t = strrchr(login, '@'); |
297 | *t = 0; | |
0a84e4fb AJ |
298 | strncpy((char *) host, t + 1, sizeof(host)-1); |
299 | host[sizeof(host)-1] = '\0'; | |
62e76326 | 300 | } |
301 | ||
cc192b50 | 302 | /* Is there any host information? (we should eventually parse it above) */ |
26ac0430 | 303 | if (*host == '[') { |
cc192b50 | 304 | /* strip any IPA brackets. valid under IPv6. */ |
305 | dst = host; | |
cc192b50 | 306 | /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */ |
26ac0430 | 307 | src = host; |
5db6bf73 | 308 | ++src; |
cc192b50 | 309 | l = strlen(host); |
310 | i = 1; | |
// shift the bracketed text one byte to the left, over the opening bracket
5db6bf73 | 311 | for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) { |
cc192b50 | 312 | *dst = *src; |
313 | } | |
314 | ||
315 | /* we moved in-place, so truncate the actual hostname found */ | |
5db6bf73 FC |
316 | *dst = '\0'; |
317 | ++dst; | |
cc192b50 | 318 | |
319 | /* skip ahead to either start of port, or original EOS */ | |
5db6bf73 FC |
320 | while (*dst != '\0' && *dst != ':') |
321 | ++dst; | |
cc192b50 | 322 | t = dst; |
323 | } else { | |
// Unbracketed host: a colon counts as the port separator only when it is the
// single colon in host (last colon equals first colon).
324 | t = strrchr(host, ':'); | |
325 | ||
26ac0430 | 326 | if (t != strchr(host,':') ) { |
cc192b50 | 327 | /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */ |
328 | /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */ | |
329 | /* therefore we MUST accept the case where they are not bracketed at all. */ | |
330 | t = NULL; | |
331 | } | |
332 | } | |
62e76326 | 333 | |
b5acc277 | 334 | // Bug 3183 sanity check: If scheme is present, host must be too. |
ff8b6bcf | 335 | if (protocol != AnyP::PROTO_NONE && host[0] == '\0') { |
ac89842b | 336 | debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details."); |
b5acc277 AJ |
337 | return NULL; |
338 | } | |
339 | ||
// cut host at the separator and read the port with atoi(); the result is
// range-checked further down
cc192b50 | 340 | if (t && *t == ':') { |
26ac0430 | 341 | *t = '\0'; |
5db6bf73 | 342 | ++t; |
cc192b50 | 343 | port = atoi(t); |
62e76326 | 344 | } |
7111c86a | 345 | } |
62e76326 | 346 | |
// From here on the CONNECT branch and the scheme://host branch share code.
// Lower-case the host in place.
5db6bf73 | 347 | for (t = host; *t; ++t) |
62e76326 | 348 | *t = xtolower(*t); |
349 | ||
// Whitespace inside host is removed only when uri_whitespace is set to STRIP;
// with any other setting host is left untouched here.
30abd221 | 350 | if (stringHasWhitespace(host)) { |
62e76326 | 351 | if (URI_WHITESPACE_STRIP == Config.uri_whitespace) { |
352 | t = q = host; | |
62e76326 | 353 | while (*t) { |
5db6bf73 FC |
354 | if (!xisspace(*t)) { |
355 | *q = *t; | |
356 | ++q; | |
357 | } | |
358 | ++t; | |
62e76326 | 359 | } |
62e76326 | 360 | *q = '\0'; |
361 | } | |
d20b1cd0 | 362 | } |
62e76326 | 363 | |
cc192b50 | 364 | debugs(23, 3, "urlParse: Split URL '" << url << "' into proto='" << proto << "', host='" << host << "', port='" << port << "', path='" << urlpath << "'"); |
365 | ||
// Optional check: every host character must come from the applicable
// valid_hostname_chars table.
a78278e2 | 366 | if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) { |
e0236918 | 367 | debugs(23, DBG_IMPORTANT, "urlParse: Illegal character in hostname '" << host << "'"); |
62e76326 | 368 | return NULL; |
b3f3dd02 | 369 | } |
62e76326 | 370 | |
532e5dd4 AJ |
371 | /* For IPV6 addresses also check for a colon */ |
// a host with neither a dot nor a colon gets the configured domain appended
372 | if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) | |
cc192b50 | 373 | strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1); |
374 | ||
1c481e00 | 375 | /* remove trailing dots from hostnames */ |
79d39a72 | 376 | while ((l = strlen(host)) > 0 && host[--l] == '.') |
62e76326 | 377 | host[l] = '\0'; |
378 | ||
cc192b50 | 379 | /* reject duplicate or leading dots */ |
380 | if (strstr(host, "..") || *host == '.') { | |
e0236918 | 381 | debugs(23, DBG_IMPORTANT, "urlParse: Illegal hostname '" << host << "'"); |
cc192b50 | 382 | return NULL; |
383 | } | |
62e76326 | 384 | |
// only ports 1 through 65535 are accepted
3a1d4727 | 385 | if (port < 1 || port > 65535) { |
bf8fe701 | 386 | debugs(23, 3, "urlParse: Invalid port '" << port << "'"); |
62e76326 | 387 | return NULL; |
7111c86a | 388 | } |
62e76326 | 389 | |
32d002cb | 390 | #if HARDCODE_DENY_PORTS |
429fdbec | 391 | /* These ports are filtered in the default squid.conf, but |
392 | * maybe someone wants them hardcoded... */ | |
6d2eb13e | 393 | if (port == 7 || port == 9 || port == 19) { |
fa84c01d | 394 | debugs(23, DBG_CRITICAL, "urlParse: Deny access to port " << port); |
62e76326 | 395 | return NULL; |
429fdbec | 396 | } |
6ef12318 | 397 | #endif |
cc192b50 | 398 | |
// Whitespace in the path is handled according to Config.uri_whitespace:
// deny the URL, keep it, escape it, cut the path at the first whitespace,
// or (STRIP and any other value) remove the whitespace characters.
30abd221 | 399 | if (stringHasWhitespace(urlpath)) { |
bf8fe701 | 400 | debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}"); |
62e76326 | 401 | |
402 | switch (Config.uri_whitespace) { | |
403 | ||
404 | case URI_WHITESPACE_DENY: | |
405 | return NULL; | |
406 | ||
407 | case URI_WHITESPACE_ALLOW: | |
408 | break; | |
409 | ||
410 | case URI_WHITESPACE_ENCODE: | |
411 | t = rfc1738_escape_unescaped(urlpath); | |
412 | xstrncpy(urlpath, t, MAX_URL); | |
413 | break; | |
414 | ||
415 | case URI_WHITESPACE_CHOP: | |
416 | *(urlpath + strcspn(urlpath, w_space)) = '\0'; | |
417 | break; | |
418 | ||
419 | case URI_WHITESPACE_STRIP: | |
62e76326 | 420 | default: |
421 | t = q = urlpath; | |
62e76326 | 422 | while (*t) { |
5db6bf73 FC |
423 | if (!xisspace(*t)) { |
424 | *q = *t; | |
425 | ++q; | |
426 | } | |
427 | ++t; | |
62e76326 | 428 | } |
62e76326 | 429 | *q = '\0'; |
430 | } | |
d548ee64 | 431 | } |
62e76326 | 432 | |
// all checks passed: store the components in the request
92d6986d | 433 | return urlParseFinish(method, protocol, urlpath, host, SBuf(login), port, request); |
4d919a80 AR |
434 | } |
435 | ||
436 | /** | |
437 | * Update request with parsed URI data. If the request arg is | |
438 | * non-NULL, put parsed values there instead of allocating a new | |
439 | * HttpRequest. | |
440 | */ | |
441 | static HttpRequest * | |
442 | urlParseFinish(const HttpRequestMethod& method, | |
0c3d3f65 | 443 | const AnyP::ProtocolType protocol, |
4d919a80 AR |
444 | const char *const urlpath, |
445 | const char *const host, | |
92d6986d | 446 | const SBuf &login, |
4d919a80 AR |
447 | const int port, |
448 | HttpRequest *request) | |
449 | { | |
d4a04ed5 | 450 | if (NULL == request) |
5cafad19 | 451 | request = new HttpRequest(method, protocol, urlpath); |
d4a04ed5 | 452 | else { |
0e8aad88 | 453 | request->initHTTP(method, protocol, urlpath); |
9be14530 | 454 | safe_free(request->canonical); |
d4a04ed5 | 455 | } |
456 | ||
cc192b50 | 457 | request->SetHost(host); |
92d6986d | 458 | request->url.userInfo(login); |
f45dd259 | 459 | request->port = (unsigned short) port; |
7111c86a | 460 | return request; |
461 | } | |
462 | ||
190154cf | 463 | static HttpRequest * |
9be14530 | 464 | urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request) |
23d92c64 | 465 | { |
bf8fe701 | 466 | debugs(50, 5, "urnParse: " << urn); |
9be14530 AJ |
467 | if (request) { |
468 | request->initHTTP(method, AnyP::PROTO_URN, urn + 4); | |
469 | safe_free(request->canonical); | |
470 | return request; | |
471 | } | |
472 | ||
0c3d3f65 | 473 | return new HttpRequest(method, AnyP::PROTO_URN, urn + 4); |
23d92c64 | 474 | } |
475 | ||
4aba13ed | 476 | const char * |
190154cf | 477 | urlCanonical(HttpRequest * request) |
7111c86a | 478 | { |
95d659f0 | 479 | LOCAL_ARRAY(char, portbuf, 32); |
9b5d1d21 | 480 | LOCAL_ARRAY(char, urlbuf, MAX_URL); |
62e76326 | 481 | |
9b5d1d21 | 482 | if (request->canonical) |
62e76326 | 483 | return request->canonical; |
484 | ||
4e3f4dc7 | 485 | if (request->url.getScheme() == AnyP::PROTO_URN) { |
826a1fed | 486 | snprintf(urlbuf, MAX_URL, "urn:" SQUIDSTRINGPH, |
af6a12ee | 487 | SQUIDSTRINGPRINT(request->urlpath)); |
9b5d1d21 | 488 | } else { |
9e0dafa7 AJ |
489 | switch (request->method.id()) { |
490 | ||
491 | case Http::METHOD_CONNECT: | |
492 | snprintf(urlbuf, MAX_URL, "%s:%d", request->GetHost(), request->port); | |
493 | break; | |
494 | ||
e2849af8 A |
495 | default: { |
496 | portbuf[0] = '\0'; | |
497 | ||
498 | if (request->port != urlDefaultPort(request->url.getScheme())) | |
499 | snprintf(portbuf, 32, ":%d", request->port); | |
500 | ||
92d6986d | 501 | snprintf(urlbuf, MAX_URL, "%s://" SQUIDSBUFPH "%s%s%s" SQUIDSTRINGPH, |
e2849af8 | 502 | request->url.getScheme().c_str(), |
92d6986d AJ |
503 | SQUIDSBUFPRINT(request->url.userInfo()), |
504 | !request->url.userInfo().isEmpty() ? "@" : "", | |
e2849af8 A |
505 | request->GetHost(), |
506 | portbuf, | |
507 | SQUIDSTRINGPRINT(request->urlpath)); | |
508 | } | |
9e0dafa7 | 509 | } |
9b5d1d21 | 510 | } |
62e76326 | 511 | |
4aba13ed | 512 | return (request->canonical = xstrdup(urlbuf)); |
7111c86a | 513 | } |
30a4f2a8 | 514 | |
b3802bdc | 515 | /** \todo AYJ: Performance: This is an *almost* duplicate of urlCanonical. But elides the query-string. |
914b89a2 | 516 | * After copying it on in the first place! Would be less code to merge the two with a flag parameter. |
517 | * and never copy the query-string part in the first place | |
518 | */ | |
88738790 | 519 | char * |
190154cf | 520 | urlCanonicalClean(const HttpRequest * request) |
88738790 | 521 | { |
522 | LOCAL_ARRAY(char, buf, MAX_URL); | |
523 | LOCAL_ARRAY(char, portbuf, 32); | |
524 | char *t; | |
62e76326 | 525 | |
4e3f4dc7 | 526 | if (request->url.getScheme() == AnyP::PROTO_URN) { |
826a1fed | 527 | snprintf(buf, MAX_URL, "urn:" SQUIDSTRINGPH, |
af6a12ee | 528 | SQUIDSTRINGPRINT(request->urlpath)); |
d548ee64 | 529 | } else { |
9e0dafa7 | 530 | switch (request->method.id()) { |
62e76326 | 531 | |
9e0dafa7 | 532 | case Http::METHOD_CONNECT: |
f9d91107 | 533 | snprintf(buf, MAX_URL, "%s:%d", request->GetHost(), request->port); |
9e0dafa7 | 534 | break; |
62e76326 | 535 | |
e2849af8 A |
536 | default: { |
537 | portbuf[0] = '\0'; | |
62e76326 | 538 | |
e2849af8 A |
539 | if (request->port != urlDefaultPort(request->url.getScheme())) |
540 | snprintf(portbuf, 32, ":%d", request->port); | |
62e76326 | 541 | |
92d6986d | 542 | snprintf(buf, MAX_URL, "%s://" SQUIDSBUFPH "%s%s%s" SQUIDSTRINGPH, |
e2849af8 | 543 | request->url.getScheme().c_str(), |
92d6986d AJ |
544 | SQUIDSBUFPRINT(request->url.userInfo()), |
545 | (request->url.userInfo().isEmpty() ? "" : "@"), | |
e2849af8 A |
546 | request->GetHost(), |
547 | portbuf, | |
548 | SQUIDSTRINGPRINT(request->urlpath)); | |
549 | ||
550 | // strip arguments AFTER a question-mark | |
551 | if (Config.onoff.strip_query_terms) | |
552 | if ((t = strchr(buf, '?'))) | |
553 | *(++t) = '\0'; | |
554 | } | |
92d6986d | 555 | } // switch |
d548ee64 | 556 | } |
62e76326 | 557 | |
9bc73deb | 558 | if (stringHasCntl(buf)) |
62e76326 | 559 | xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL); |
560 | ||
88738790 | 561 | return buf; |
562 | } | |
563 | ||
b3802bdc AJ |
564 | /** |
565 | * Yet another alternative to urlCanonical. | |
c2a7cefd | 566 | * This one adds the https:// parts to Http::METHOD_CONNECT URL |
b3802bdc AJ |
567 | * for use in error page outputs. |
568 | * Luckily we can leverage the others instead of duplicating. | |
569 | */ | |
570 | const char * | |
571 | urlCanonicalFakeHttps(const HttpRequest * request) | |
572 | { | |
573 | LOCAL_ARRAY(char, buf, MAX_URL); | |
574 | ||
575 | // method CONNECT and port HTTPS | |
c2a7cefd | 576 | if (request->method == Http::METHOD_CONNECT && request->port == 443) { |
b3802bdc AJ |
577 | snprintf(buf, MAX_URL, "https://%s/*", request->GetHost()); |
578 | return buf; | |
579 | } | |
580 | ||
581 | // else do the normal complete canonical thing. | |
582 | return urlCanonicalClean(request); | |
583 | } | |
584 | ||
/*
 * Test if a URL is relative.
 *
 * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 * appear before a ':'. So the first of those two characters decides:
 * a ':' first means the URL has a scheme and is not relative.
 *
 * A NULL or empty url is reported as not relative. A url that contains
 * neither ':' nor '/' is reported as relative.
 */
bool
urlIsRelative(const char *url)
{
    if (url == NULL || *url == '\0')
        return false;

    // strcspn() stops at the first ':' or '/', or at the terminating NUL
    const char firstDelimiter = url[strcspn(url, ":/")];

    return firstDelimiter != ':';
}
610 | ||
611 | /* | |
71051277 | 612 | * Convert a relative URL to an absolute URL using the context of a given |
bf956b0a | 613 | * request. |
71051277 BR |
614 | * |
615 | * It is assumed that you have already ensured that the URL is relative. | |
616 | * | |
6e44cca8 BR |
617 | * If NULL is returned it is an indication that the method in use in the |
618 | * request does not distinguish between relative and absolute and you should | |
619 | * use the url unchanged. | |
0376a4c9 BR |
620 | * |
621 | * If non-NULL is returned, it is up to the caller to free the resulting | |
622 | * memory using safe_free(). | |
bf956b0a | 623 | */ |
6e44cca8 | 624 | char * |
bf956b0a | 625 | urlMakeAbsolute(const HttpRequest * req, const char *relUrl) |
3cbbd242 | 626 | { |
3cbbd242 | 627 | |
c2a7cefd | 628 | if (req->method.id() == Http::METHOD_CONNECT) { |
f3900427 | 629 | return (NULL); |
3cbbd242 | 630 | } |
26ac0430 | 631 | |
6e44cca8 | 632 | char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char)); |
26ac0430 | 633 | |
4e3f4dc7 | 634 | if (req->url.getScheme() == AnyP::PROTO_URN) { |
826a1fed | 635 | snprintf(urlbuf, MAX_URL, "urn:" SQUIDSTRINGPH, |
af6a12ee | 636 | SQUIDSTRINGPRINT(req->urlpath)); |
71051277 | 637 | return (urlbuf); |
3cbbd242 | 638 | } |
26ac0430 | 639 | |
6e44cca8 BR |
640 | size_t urllen; |
641 | ||
4e3f4dc7 | 642 | if (req->port != urlDefaultPort(req->url.getScheme())) { |
92d6986d | 643 | urllen = snprintf(urlbuf, MAX_URL, "%s://" SQUIDSBUFPH "%s%s:%d", |
4e3f4dc7 | 644 | req->url.getScheme().c_str(), |
92d6986d AJ |
645 | SQUIDSBUFPRINT(req->url.userInfo()), |
646 | !req->url.userInfo().isEmpty() ? "@" : "", | |
26ac0430 AJ |
647 | req->GetHost(), |
648 | req->port | |
649 | ); | |
6e44cca8 | 650 | } else { |
92d6986d | 651 | urllen = snprintf(urlbuf, MAX_URL, "%s://" SQUIDSBUFPH "%s%s", |
4e3f4dc7 | 652 | req->url.getScheme().c_str(), |
92d6986d AJ |
653 | SQUIDSBUFPRINT(req->url.userInfo()), |
654 | !req->url.userInfo().isEmpty() ? "@" : "", | |
26ac0430 AJ |
655 | req->GetHost() |
656 | ); | |
6e44cca8 BR |
657 | } |
658 | ||
659 | if (relUrl[0] == '/') { | |
660 | strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); | |
661 | } else { | |
b4f2886c | 662 | const char *path = req->urlpath.termedBuf(); |
6e44cca8 BR |
663 | const char *last_slash = strrchr(path, '/'); |
664 | ||
665 | if (last_slash == NULL) { | |
5db6bf73 FC |
666 | urlbuf[urllen] = '/'; |
667 | ++urllen; | |
6e44cca8 BR |
668 | strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); |
669 | } else { | |
5db6bf73 | 670 | ++last_slash; |
6e44cca8 BR |
671 | size_t pathlen = last_slash - path; |
672 | if (pathlen > MAX_URL - urllen - 1) { | |
673 | pathlen = MAX_URL - urllen - 1; | |
674 | } | |
675 | strncpy(&urlbuf[urllen], path, pathlen); | |
676 | urllen += pathlen; | |
677 | if (urllen + 1 < MAX_URL) { | |
678 | strncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); | |
679 | } | |
680 | } | |
681 | } | |
3cbbd242 | 682 | |
bc9ad11f | 683 | return (urlbuf); |
3cbbd242 | 684 | } |
685 | ||
9bc73deb | 686 | /* |
687 | * matchDomainName() compares a hostname with a domainname according | |
688 | * to the following rules: | |
26ac0430 | 689 | * |
9bc73deb | 690 | * HOST DOMAIN MATCH? |
691 | * ------------- ------------- ------ | |
692 | * foo.com foo.com YES | |
d20b1cd0 | 693 | * .foo.com foo.com YES |
9bc73deb | 694 | * x.foo.com foo.com NO |
695 | * foo.com .foo.com YES | |
696 | * .foo.com .foo.com YES | |
697 | * x.foo.com .foo.com YES | |
698 | * | |
d20b1cd0 | 699 | * We strip leading dots on hosts (but not domains!) so that |
700 | * ".foo.com" is is always the same as "foo.com". | |
701 | * | |
9bc73deb | 702 | * Return values: |
703 | * 0 means the host matches the domain | |
704 | * 1 means the host is greater than the domain | |
705 | * -1 means the host is less than the domain | |
706 | */ | |
707 | ||
b8d8561b | 708 | int |
9bc73deb | 709 | matchDomainName(const char *h, const char *d) |
30a4f2a8 | 710 | { |
9bc73deb | 711 | int dl; |
712 | int hl; | |
62e76326 | 713 | |
d20b1cd0 | 714 | while ('.' == *h) |
5db6bf73 | 715 | ++h; |
62e76326 | 716 | |
9bc73deb | 717 | hl = strlen(h); |
62e76326 | 718 | |
9bc73deb | 719 | dl = strlen(d); |
62e76326 | 720 | |
9bc73deb | 721 | /* |
722 | * Start at the ends of the two strings and work towards the | |
723 | * beginning. | |
724 | */ | |
725 | while (xtolower(h[--hl]) == xtolower(d[--dl])) { | |
62e76326 | 726 | if (hl == 0 && dl == 0) { |
727 | /* | |
728 | * We made it all the way to the beginning of both | |
729 | * strings without finding any difference. | |
730 | */ | |
731 | return 0; | |
732 | } | |
733 | ||
734 | if (0 == hl) { | |
735 | /* | |
736 | * The host string is shorter than the domain string. | |
737 | * There is only one case when this can be a match. | |
738 | * If the domain is just one character longer, and if | |
739 | * that character is a leading '.' then we call it a | |
740 | * match. | |
741 | */ | |
742 | ||
743 | if (1 == dl && '.' == d[0]) | |
744 | return 0; | |
745 | else | |
746 | return -1; | |
747 | } | |
748 | ||
749 | if (0 == dl) { | |
750 | /* | |
751 | * The domain string is shorter than the host string. | |
752 | * This is a match only if the first domain character | |
753 | * is a leading '.'. | |
754 | */ | |
755 | ||
756 | if ('.' == d[0]) | |
757 | return 0; | |
758 | else | |
759 | return 1; | |
760 | } | |
9bc73deb | 761 | } |
62e76326 | 762 | |
9bc73deb | 763 | /* |
764 | * We found different characters in the same position (from the end). | |
765 | */ | |
d20b1cd0 | 766 | /* |
767 | * If one of those character is '.' then its special. In order | |
768 | * for splay tree sorting to work properly, "x-foo.com" must | |
769 | * be greater than ".foo.com" even though '-' is less than '.'. | |
770 | */ | |
771 | if ('.' == d[dl]) | |
62e76326 | 772 | return 1; |
773 | ||
d20b1cd0 | 774 | if ('.' == h[hl]) |
62e76326 | 775 | return -1; |
776 | ||
9bc73deb | 777 | return (xtolower(h[hl]) - xtolower(d[dl])); |
30a4f2a8 | 778 | } |
a8f7d3ee | 779 | |
985c86bc | 780 | /* |
610ee341 | 781 | * return true if we can serve requests for this method. |
985c86bc | 782 | */ |
b8d8561b | 783 | int |
190154cf | 784 | urlCheckRequest(const HttpRequest * r) |
a8f7d3ee | 785 | { |
786 | int rc = 0; | |
610ee341 | 787 | /* protocol "independent" methods |
788 | * | |
789 | * actually these methods are specific to HTTP: | |
790 | * they are methods we recieve on our HTTP port, | |
791 | * and if we had a FTP listener would not be relevant | |
792 | * there. | |
793 | * | |
794 | * So, we should delegate them to HTTP. The problem is that we | |
795 | * do not have a default protocol from the client side of HTTP. | |
796 | */ | |
62e76326 | 797 | |
c2a7cefd | 798 | if (r->method == Http::METHOD_CONNECT) |
62e76326 | 799 | return 1; |
800 | ||
77ce6ba9 AR |
801 | // we support OPTIONS and TRACE directed at us (with a 501 reply, for now) |
802 | // we also support forwarding OPTIONS and TRACE, except for the *-URI ones | |
c2a7cefd | 803 | if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE) |
2e260208 | 804 | return (r->header.getInt64(HDR_MAX_FORWARDS) == 0 || URL::Asterisk().cmp(r->urlpath.rawBuf(), r->urlpath.size()) != 0); |
62e76326 | 805 | |
c2a7cefd | 806 | if (r->method == Http::METHOD_PURGE) |
62e76326 | 807 | return 1; |
808 | ||
99edd1c3 | 809 | /* does method match the protocol? */ |
4e3f4dc7 | 810 | switch (r->url.getScheme()) { |
62e76326 | 811 | |
0c3d3f65 | 812 | case AnyP::PROTO_URN: |
62e76326 | 813 | |
0c3d3f65 | 814 | case AnyP::PROTO_HTTP: |
62e76326 | 815 | |
39a19cb7 | 816 | case AnyP::PROTO_CACHE_OBJECT: |
62e76326 | 817 | rc = 1; |
818 | break; | |
819 | ||
0c3d3f65 | 820 | case AnyP::PROTO_FTP: |
62e76326 | 821 | |
c2a7cefd | 822 | if (r->method == Http::METHOD_PUT) |
62e76326 | 823 | rc = 1; |
824 | ||
0c3d3f65 | 825 | case AnyP::PROTO_GOPHER: |
62e76326 | 826 | |
0c3d3f65 | 827 | case AnyP::PROTO_WAIS: |
62e76326 | 828 | |
0c3d3f65 | 829 | case AnyP::PROTO_WHOIS: |
c2a7cefd | 830 | if (r->method == Http::METHOD_GET) |
62e76326 | 831 | rc = 1; |
c2a7cefd | 832 | else if (r->method == Http::METHOD_HEAD) |
62e76326 | 833 | rc = 1; |
834 | ||
835 | break; | |
836 | ||
0c3d3f65 | 837 | case AnyP::PROTO_HTTPS: |
cb4f4424 | 838 | #if USE_OPENSSL |
62e76326 | 839 | |
840 | rc = 1; | |
841 | ||
842 | break; | |
843 | ||
1f7c9178 | 844 | #else |
62e76326 | 845 | /* |
846 | * Squid can't originate an SSL connection, so it should | |
847 | * never receive an "https:" URL. It should always be | |
848 | * CONNECT instead. | |
849 | */ | |
850 | rc = 0; | |
851 | ||
1f7c9178 | 852 | #endif |
62e76326 | 853 | |
a8f7d3ee | 854 | default: |
62e76326 | 855 | break; |
a8f7d3ee | 856 | } |
62e76326 | 857 | |
a8f7d3ee | 858 | return rc; |
859 | } | |
9ce5e3e6 | 860 | |
861 | /* | |
862 | * Quick-n-dirty host extraction from a URL. Steps: | |
5999b776 | 863 | * Look for a colon |
864 | * Skip any '/' after the colon | |
865 | * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[] | |
866 | * Look for an ending '/' or ':' and terminate | |
867 | * Look for login info preceeded by '@' | |
9ce5e3e6 | 868 | */ |
77bfc324 | 869 | |
870 | class URLHostName | |
871 | { | |
872 | ||
873 | public: | |
874 | char * extract(char const *url); | |
875 | ||
876 | private: | |
877 | static char Host [SQUIDHOSTNAMELEN]; | |
878 | void init(char const *); | |
879 | void findHostStart(); | |
880 | void trimTrailingChars(); | |
881 | void trimAuth(); | |
882 | char const *hostStart; | |
883 | char const *url; | |
884 | }; | |
885 | ||
9ce5e3e6 | 886 | char * |
887 | urlHostname(const char *url) | |
888 | { | |
77bfc324 | 889 | return URLHostName().extract(url); |
890 | } | |
62e76326 | 891 | |
77bfc324 | 892 | char URLHostName::Host[SQUIDHOSTNAMELEN]; |
893 | ||
894 | void | |
895 | URLHostName::init(char const *aUrl) | |
896 | { | |
897 | Host[0] = '\0'; | |
aa1cafc4 | 898 | url = aUrl; |
77bfc324 | 899 | } |
62e76326 | 900 | |
77bfc324 | 901 | void |
902 | URLHostName::findHostStart() | |
903 | { | |
904 | if (NULL == (hostStart = strchr(url, ':'))) | |
905 | return; | |
62e76326 | 906 | |
77bfc324 | 907 | ++hostStart; |
62e76326 | 908 | |
77bfc324 | 909 | while (*hostStart != '\0' && *hostStart == '/') |
910 | ++hostStart; | |
cc192b50 | 911 | |
cc192b50 | 912 | if (*hostStart == ']') |
913 | ++hostStart; | |
77bfc324 | 914 | } |
62e76326 | 915 | |
77bfc324 | 916 | void |
917 | URLHostName::trimTrailingChars() | |
918 | { | |
919 | char *t; | |
920 | ||
921 | if ((t = strchr(Host, '/'))) | |
62e76326 | 922 | *t = '\0'; |
923 | ||
cc192b50 | 924 | if ((t = strrchr(Host, ':'))) |
62e76326 | 925 | *t = '\0'; |
cc192b50 | 926 | |
cc192b50 | 927 | if ((t = strchr(Host, ']'))) |
928 | *t = '\0'; | |
77bfc324 | 929 | } |
62e76326 | 930 | |
77bfc324 | 931 | void |
932 | URLHostName::trimAuth() | |
933 | { | |
934 | char *t; | |
935 | ||
936 | if ((t = strrchr(Host, '@'))) { | |
5db6bf73 | 937 | ++t; |
41d00cd3 | 938 | memmove(Host, t, strlen(t) + 1); |
9ce5e3e6 | 939 | } |
77bfc324 | 940 | } |
941 | ||
942 | char * | |
943 | URLHostName::extract(char const *aUrl) | |
944 | { | |
945 | init(aUrl); | |
946 | findHostStart(); | |
947 | ||
948 | if (hostStart == NULL) | |
949 | return NULL; | |
950 | ||
951 | xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN); | |
952 | ||
953 | trimTrailingChars(); | |
954 | ||
955 | trimAuth(); | |
62e76326 | 956 | |
77bfc324 | 957 | return Host; |
9ce5e3e6 | 958 | } |
f53969cc | 959 |