]>
Commit | Line | Data |
---|---|---|
30a4f2a8 | 1 | /* |
4ac4a490 | 2 | * Copyright (C) 1996-2017 The Squid Software Foundation and contributors |
e25c139f | 3 | * |
bbc27441 AJ |
4 | * Squid software is distributed under GPLv2+ license and includes |
5 | * contributions from numerous individuals and organizations. | |
6 | * Please see the COPYING and CONTRIBUTORS files for details. | |
019dd986 | 7 | */ |
ed43818f | 8 | |
bbc27441 AJ |
9 | /* DEBUG: section 23 URL Parsing */ |
10 | ||
f7f3304a | 11 | #include "squid.h" |
582c2af2 | 12 | #include "globals.h" |
528b2c61 | 13 | #include "HttpRequest.h" |
1fa9b1a7 | 14 | #include "rfc1738.h" |
4d5904f7 | 15 | #include "SquidConfig.h" |
7a707cb5 | 16 | #include "SquidString.h" |
582c2af2 | 17 | #include "URL.h" |
090089c4 | 18 | |
4d919a80 | 19 | static HttpRequest *urlParseFinish(const HttpRequestMethod& method, |
0c3d3f65 | 20 | const AnyP::ProtocolType protocol, |
d31d59d8 | 21 | const char *const protoStr, |
4d919a80 AR |
22 | const char *const urlpath, |
23 | const char *const host, | |
92d6986d | 24 | const SBuf &login, |
4d919a80 AR |
25 | const int port, |
26 | HttpRequest *request); | |
9be14530 | 27 | static HttpRequest *urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request); |
/// Characters accepted in a hostname when underscores are allowed
/// (letters, digits, '-', '.', '_' and the IPv6 literal characters "[:]").
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._"
    "[:]";

/// Characters accepted in a hostname when underscores are not allowed.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-."
    "[:]";
090089c4 | 40 | |
2e260208 AJ |
41 | const SBuf & |
42 | URL::Asterisk() | |
43 | { | |
44 | static SBuf star("*"); | |
45 | return star; | |
46 | } | |
47 | ||
51b5dcf5 AJ |
48 | const SBuf & |
49 | URL::SlashPath() | |
50 | { | |
51 | static SBuf slash("/"); | |
52 | return slash; | |
53 | } | |
54 | ||
5c51bffb AJ |
55 | void |
56 | URL::host(const char *src) | |
57 | { | |
58 | hostAddr_.setEmpty(); | |
59 | hostAddr_ = src; | |
60 | if (hostAddr_.isAnyAddr()) { | |
61 | xstrncpy(host_, src, sizeof(host_)); | |
62 | hostIsNumeric_ = false; | |
63 | } else { | |
64 | hostAddr_.toHostStr(host_, sizeof(host_)); | |
65 | debugs(23, 3, "given IP: " << hostAddr_); | |
66 | hostIsNumeric_ = 1; | |
67 | } | |
68 | touch(); | |
69 | } | |
70 | ||
51b5dcf5 AJ |
71 | const SBuf & |
72 | URL::path() const | |
73 | { | |
74 | // RFC 3986 section 3.3 says path can be empty (path-abempty). | |
75 | // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/" | |
76 | // at least when sending and using. We must still accept path-abempty as input. | |
77 | if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS)) | |
78 | return SlashPath(); | |
79 | ||
80 | return path_; | |
81 | } | |
82 | ||
/*
 * urlInitialize() runs start-up self-checks for this file: it asserts that
 * the protocol-name table ends with "MAX" and that matchDomainName()
 * returns the expected sign for a fixed list of host/domain pairs.
 * It has no effect other than the debugs() line unless an assert fails.
 */
b8d8561b | 83 | void |
0673c0ba | 84 | urlInitialize(void) |
090089c4 | 85 | { |
bf8fe701 | 86 | debugs(23, 5, "urlInitialize: Initializing..."); |
985c86bc | 87 | /* this ensures that the number of protocol strings is the same as |
0c3d3f65 | 88 | * the enum slots allocated because the last enum is always 'MAX'. |
985c86bc | 89 | */ |
0c3d3f65 | 90 | assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0); |
9bc73deb | 91 | /* |
92 | * These test that our matchDomainName() function works the | |
93 | * way we expect it to. | |
94 | */ | |
// Default flags: 0 means "match", non-zero gives an ordering for mismatches.
95 | assert(0 == matchDomainName("foo.com", "foo.com")); | |
d20b1cd0 | 96 | assert(0 == matchDomainName(".foo.com", "foo.com")); |
9bc73deb | 97 | assert(0 == matchDomainName("foo.com", ".foo.com")); |
98 | assert(0 == matchDomainName(".foo.com", ".foo.com")); | |
99 | assert(0 == matchDomainName("x.foo.com", ".foo.com")); | |
abbd7825 | 100 | assert(0 == matchDomainName("y.x.foo.com", ".foo.com")); |
9bc73deb | 101 | assert(0 != matchDomainName("x.foo.com", "foo.com")); |
102 | assert(0 != matchDomainName("foo.com", "x.foo.com")); | |
103 | assert(0 != matchDomainName("bar.com", "foo.com")); | |
104 | assert(0 != matchDomainName(".bar.com", "foo.com")); | |
105 | assert(0 != matchDomainName(".bar.com", ".foo.com")); | |
106 | assert(0 != matchDomainName("bar.com", ".foo.com")); | |
107 | assert(0 < matchDomainName("zzz.com", "foo.com")); | |
108 | assert(0 > matchDomainName("aaa.com", "foo.com")); | |
109 | assert(0 == matchDomainName("FOO.com", "foo.COM")); | |
aca95add | 110 | assert(0 < matchDomainName("bfoo.com", "afoo.com")); |
111 | assert(0 > matchDomainName("afoo.com", "bfoo.com")); | |
d20b1cd0 | 112 | assert(0 < matchDomainName("x-foo.com", ".foo.com")); |
abbd7825 CT |
113 | 
// mdnRejectSubsubDomains: only one extra label below the domain may match.
114 | assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
115 | assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
116 | assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
117 | assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
118 | ||
// mdnHonorWildcards: a leading "*." in the first argument matches any label.
119 | assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards)); | |
120 | assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards)); | |
121 | assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards)); | |
122 | assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards)); | |
123 | ||
9bc73deb | 124 | /* more cases? */ |
090089c4 | 125 | } |
126 | ||
cc192b50 | 127 | /** |
5a7fb80a AJ |
128 | * Parse the scheme name from string b, into protocol type. |
129 | * The string must be 0-terminated. | |
d4a04ed5 | 130 | */ |
0c3d3f65 | 131 | AnyP::ProtocolType |
5a7fb80a | 132 | urlParseProtocol(const char *b) |
92a6f4b1 | 133 | { |
5a7fb80a AJ |
134 | // make e point to the ':' character |
135 | const char *e = b + strcspn(b, ":"); | |
d4a04ed5 | 136 | int len = e - b; |
137 | ||
fcd2d3ef | 138 | /* test common stuff first */ |
62e76326 | 139 | |
d4a04ed5 | 140 | if (strncasecmp(b, "http", len) == 0) |
0c3d3f65 | 141 | return AnyP::PROTO_HTTP; |
62e76326 | 142 | |
d4a04ed5 | 143 | if (strncasecmp(b, "ftp", len) == 0) |
0c3d3f65 | 144 | return AnyP::PROTO_FTP; |
62e76326 | 145 | |
d4a04ed5 | 146 | if (strncasecmp(b, "https", len) == 0) |
0c3d3f65 | 147 | return AnyP::PROTO_HTTPS; |
62e76326 | 148 | |
d4a04ed5 | 149 | if (strncasecmp(b, "file", len) == 0) |
0c3d3f65 | 150 | return AnyP::PROTO_FTP; |
62e76326 | 151 | |
330f829e AJ |
152 | if (strncasecmp(b, "coap", len) == 0) |
153 | return AnyP::PROTO_COAP; | |
154 | ||
155 | if (strncasecmp(b, "coaps", len) == 0) | |
156 | return AnyP::PROTO_COAPS; | |
157 | ||
d4a04ed5 | 158 | if (strncasecmp(b, "gopher", len) == 0) |
0c3d3f65 | 159 | return AnyP::PROTO_GOPHER; |
62e76326 | 160 | |
d4a04ed5 | 161 | if (strncasecmp(b, "wais", len) == 0) |
0c3d3f65 | 162 | return AnyP::PROTO_WAIS; |
62e76326 | 163 | |
d4a04ed5 | 164 | if (strncasecmp(b, "cache_object", len) == 0) |
39a19cb7 | 165 | return AnyP::PROTO_CACHE_OBJECT; |
62e76326 | 166 | |
d4a04ed5 | 167 | if (strncasecmp(b, "urn", len) == 0) |
0c3d3f65 | 168 | return AnyP::PROTO_URN; |
62e76326 | 169 | |
d4a04ed5 | 170 | if (strncasecmp(b, "whois", len) == 0) |
0c3d3f65 | 171 | return AnyP::PROTO_WHOIS; |
62e76326 | 172 | |
d31d59d8 AJ |
173 | if (len > 0) |
174 | return AnyP::PROTO_UNKNOWN; | |
175 | ||
0c3d3f65 | 176 | return AnyP::PROTO_NONE; |
92a6f4b1 | 177 | } |
178 | ||
d4a04ed5 | 179 | /* |
180 | * Parse a URI/URL. | |
181 | * | |
182 | * If the 'request' arg is non-NULL, put parsed values there instead | |
183 | * of allocating a new HttpRequest. | |
c21ad0f5 | 184 | * |
26ac0430 | 185 | * This abuses HttpRequest as a way of representing the parsed url |
c21ad0f5 | 186 | * and its components. |
187 | * method is used to switch parsers and to init the HttpRequest. | |
c2a7cefd | 188 | * If method is Http::METHOD_CONNECT, then rather than a URL a hostname:port is |
c21ad0f5 | 189 | * looked for. |
190 | * The url is non-const so that if it is too long we can NUL-terminate it in place. |
d4a04ed5 | 191 | */ |
cc192b50 | 192 | |
193 | /* | |
194 | * This routine parses a URL. It is assumed that the URL is complete - | |
195 | * i.e., the end of the string is the end of the URL. Don't pass a partial | |
196 | * URL here as this routine doesn't have any way of knowing whether | |
197 | * it is partial or not (i.e., it handles the case of no trailing slash as | |
198 | * being "end of host with implied path of /"). | |
199 | */ | |
/*
 * urlParse() splits 'url' into scheme, login, host, port and path and passes
 * them to urlParseFinish(), which stores them in 'request' (or in a newly
 * allocated HttpRequest when 'request' is NULL). URIs starting with "urn:"
 * are delegated to urnParse() instead.
 *
 * Returns the resulting HttpRequest, or NULL when the URL is rejected:
 * it is too long, cannot be split into "scheme://host", has a scheme but no
 * host, fails the optional hostname character check, has a leading dot or
 * ".." in the host, has a port outside 1-65535, or has whitespace in the
 * path while Config.uri_whitespace is URI_WHITESPACE_DENY.
 *
 * The caller's 'url' buffer may be modified: an over-long URL is
 * NUL-terminated in place at offset MAX_URL/2 before NULL is returned.
 */
190154cf | 200 | HttpRequest * |
60745f24 | 201 | urlParse(const HttpRequestMethod& method, char *url, HttpRequest *request) |
7111c86a | 202 | { |
f2052513 | 203 | LOCAL_ARRAY(char, proto, MAX_URL); |
204 | LOCAL_ARRAY(char, login, MAX_URL); | |
205 | LOCAL_ARRAY(char, host, MAX_URL); | |
206 | LOCAL_ARRAY(char, urlpath, MAX_URL); | |
7111c86a | 207 | char *t = NULL; |
7e3ce7b9 | 208 | char *q = NULL; |
7111c86a | 209 | int port; |
0c3d3f65 | 210 | AnyP::ProtocolType protocol = AnyP::PROTO_NONE; |
774cc2d8 | 211 | int l; |
cc192b50 | 212 | int i; |
213 | const char *src; | |
214 | char *dst; | |
983061ed | 215 | proto[0] = host[0] = urlpath[0] = login[0] = '\0'; |
7111c86a | 216 | |
// Reject over-long input (including room for the appended domain) before
// anything is copied into the MAX_URL-sized work buffers above.
ba0fd1b6 | 217 | if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) { |
62e76326 | 218 | /* terminate so it doesn't overflow other buffers */ |
219 | *(url + (MAX_URL >> 1)) = '\0'; | |
e0236918 | 220 | debugs(23, DBG_IMPORTANT, "urlParse: URL too large (" << l << " bytes)"); |
62e76326 | 221 | return NULL; |
0a5b9b32 | 222 | } |
// Choose the parser by request method and by the form of the URI.
c2a7cefd | 223 | if (method == Http::METHOD_CONNECT) { |
62e76326 | 224 | port = CONNECT_PORT; |
225 | ||
// Try the bracketed "[address]:port" form first, then plain "host:port".
// The port is optional in both; CONNECT_PORT stays in effect without it.
0f0affc7 | 226 | if (sscanf(url, "[%[^]]]:%d", host, &port) < 1) |
cc192b50 | 227 | if (sscanf(url, "%[^:]:%d", host, &port) < 1) |
228 | return NULL; | |
229 | ||
c2a7cefd | 230 | } else if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) && |
2e260208 | 231 | URL::Asterisk().cmp(url) == 0) { |
0c3d3f65 | 232 | protocol = AnyP::PROTO_HTTP; |
d31d59d8 AJ |
233 | port = 80; // or the slow way ... AnyP::UriScheme(protocol,"http").defaultPort(); |
234 | return urlParseFinish(method, protocol, "http", url, host, SBuf(), port, request); | |
23d92c64 | 235 | } else if (!strncmp(url, "urn:", 4)) { |
9be14530 | 236 | return urnParse(method, url, request); |
7111c86a | 237 | } else { |
cc192b50 | 238 | /* Parse the URL: */ |
239 | src = url; | |
240 | i = 0; | |
241 | /* Find first : - everything before is protocol */ | |
5db6bf73 | 242 | for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) { |
cc192b50 | 243 | *dst = *src; |
244 | } | |
245 | if (i >= l) | |
26ac0430 | 246 | return NULL; |
cc192b50 | 247 | *dst = '\0'; |
248 | ||
249 | /* Then its :// */ | |
5e245980 | 250 | if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/') |
62e76326 | 251 | return NULL; |
cc192b50 | 252 | i += 3; |
253 | src += 3; | |
62e76326 | 254 | |
cc192b50 | 255 | /* Then everything until first /; thats host (and port; which we'll look for here later) */ |
68338d14 F |
256 | // bug 1881: If we don't get a "/" then we imply it was there |
257 | // bug 3074: We could just be given a "?" or "#". These also imply "/" | |
b2ab59ad | 258 | // bug 3233: whitespace is also a hostname delimiter. |
5db6bf73 | 259 | for (dst = host; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) { |
cc192b50 | 260 | *dst = *src; |
261 | } | |
262 | ||
26ac0430 | 263 | /* |
cc192b50 | 264 | * We can't check for "i >= l" here because we could be at the end of the line |
265 | * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've | |
266 | * been -given- a valid URL and the path is just '/'. | |
267 | */ | |
268 | if (i > l) | |
269 | return NULL; | |
270 | *dst = '\0'; | |
271 | ||
68338d14 F |
272 | // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/' |
273 | if (*src == '?' || *src == '#' || *src == '\0') { | |
274 | urlpath[0] = '/'; | |
275 | dst = &urlpath[1]; | |
276 | } else { | |
277 | dst = urlpath; | |
278 | } | |
cc192b50 | 279 | /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */ |
5db6bf73 | 280 | for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) { |
cc192b50 | 281 | *dst = *src; |
282 | } | |
62e76326 | 283 | |
cc192b50 | 284 | /* We -could- be at the end of the buffer here */ |
285 | if (i > l) | |
286 | return NULL; | |
287 | /* If the URL path is empty we set it to be "/" */ | |
288 | if (dst == urlpath) { | |
5db6bf73 FC |
289 | *dst = '/'; |
290 | ++dst; | |
cc192b50 | 291 | } |
292 | *dst = '\0'; | |
293 | ||
294 | protocol = urlParseProtocol(proto); | |
5c51bffb | 295 | port = AnyP::UriScheme(protocol).defaultPort(); |
62e76326 | 296 | |
cc192b50 | 297 | /* Is there any login information? (we should eventually parse it above) */ |
810635e3 FC |
298 | t = strrchr(host, '@'); |
299 | if (t != NULL) { | |
0a84e4fb AJ |
300 | strncpy((char *) login, (char *) host, sizeof(login)-1); |
301 | login[sizeof(login)-1] = '\0'; | |
62e76326 | 302 | t = strrchr(login, '@'); |
303 | *t = 0; | |
0a84e4fb AJ |
304 | strncpy((char *) host, t + 1, sizeof(host)-1); |
305 | host[sizeof(host)-1] = '\0'; | |
bcddfefb AJ |
306 | // Bug 4498: URL-unescape the login info after extraction |
307 | rfc1738_unescape(login); | |
62e76326 | 308 | } |
309 | ||
cc192b50 | 310 | /* Is there any host information? (we should eventually parse it above) */ |
26ac0430 | 311 | if (*host == '[') { |
cc192b50 | 312 | /* strip any IPA brackets. valid under IPv6. */ |
313 | dst = host; | |
cc192b50 | 314 | /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */ |
26ac0430 | 315 | src = host; |
5db6bf73 | 316 | ++src; |
cc192b50 | 317 | l = strlen(host); |
318 | i = 1; | |
5db6bf73 | 319 | for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) { |
cc192b50 | 320 | *dst = *src; |
321 | } | |
322 | ||
323 | /* we moved in-place, so truncate the actual hostname found */ | |
5db6bf73 FC |
324 | *dst = '\0'; |
325 | ++dst; | |
cc192b50 | 326 | |
327 | /* skip ahead to either start of port, or original EOS */ | |
5db6bf73 FC |
328 | while (*dst != '\0' && *dst != ':') |
329 | ++dst; | |
cc192b50 | 330 | t = dst; |
331 | } else { | |
332 | t = strrchr(host, ':'); | |
333 | ||
26ac0430 | 334 | if (t != strchr(host,':') ) { |
cc192b50 | 335 | /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */ |
336 | /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */ | |
337 | /* therefore we MUST accept the case where they are not bracketed at all. */ | |
338 | t = NULL; | |
339 | } | |
340 | } | |
62e76326 | 341 | |
b5acc277 | 342 | // Bug 3183 sanity check: If scheme is present, host must be too. |
ff8b6bcf | 343 | if (protocol != AnyP::PROTO_NONE && host[0] == '\0') { |
ac89842b | 344 | debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details."); |
b5acc277 AJ |
345 | return NULL; |
346 | } | |
347 | ||
cc192b50 | 348 | if (t && *t == ':') { |
26ac0430 | 349 | *t = '\0'; |
5db6bf73 | 350 | ++t; |
cc192b50 | 351 | port = atoi(t); |
62e76326 | 352 | } |
7111c86a | 353 | } |
62e76326 | 354 | |
// Common post-processing of host, port and urlpath for the CONNECT and
// scheme://host forms (the other forms have already returned above).
5db6bf73 | 355 | for (t = host; *t; ++t) |
62e76326 | 356 | *t = xtolower(*t); |
357 | ||
30abd221 | 358 | if (stringHasWhitespace(host)) { |
62e76326 | 359 | if (URI_WHITESPACE_STRIP == Config.uri_whitespace) { |
360 | t = q = host; | |
62e76326 | 361 | while (*t) { |
5db6bf73 FC |
362 | if (!xisspace(*t)) { |
363 | *q = *t; | |
364 | ++q; | |
365 | } | |
366 | ++t; | |
62e76326 | 367 | } |
62e76326 | 368 | *q = '\0'; |
369 | } | |
d20b1cd0 | 370 | } |
62e76326 | 371 | |
cc192b50 | 372 | debugs(23, 3, "urlParse: Split URL '" << url << "' into proto='" << proto << "', host='" << host << "', port='" << port << "', path='" << urlpath << "'"); |
373 | ||
a78278e2 | 374 | if (Config.onoff.check_hostnames && strspn(host, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(host)) { |
e0236918 | 375 | debugs(23, DBG_IMPORTANT, "urlParse: Illegal character in hostname '" << host << "'"); |
62e76326 | 376 | return NULL; |
b3f3dd02 | 377 | } |
62e76326 | 378 | |
532e5dd4 AJ |
379 | /* For IPV6 addresses also check for a colon */ |
380 | if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) | |
cc192b50 | 381 | strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(host) - 1); |
382 | ||
1c481e00 | 383 | /* remove trailing dots from hostnames */ |
79d39a72 | 384 | while ((l = strlen(host)) > 0 && host[--l] == '.') |
62e76326 | 385 | host[l] = '\0'; |
386 | ||
cc192b50 | 387 | /* reject duplicate or leading dots */ |
388 | if (strstr(host, "..") || *host == '.') { | |
e0236918 | 389 | debugs(23, DBG_IMPORTANT, "urlParse: Illegal hostname '" << host << "'"); |
cc192b50 | 390 | return NULL; |
391 | } | |
62e76326 | 392 | |
// atoi() above yields 0 when no digits follow the ':', so an empty or
// non-numeric port is rejected by this range check as well.
3a1d4727 | 393 | if (port < 1 || port > 65535) { |
bf8fe701 | 394 | debugs(23, 3, "urlParse: Invalid port '" << port << "'"); |
62e76326 | 395 | return NULL; |
7111c86a | 396 | } |
62e76326 | 397 | |
32d002cb | 398 | #if HARDCODE_DENY_PORTS |
429fdbec | 399 | /* These ports are filtered in the default squid.conf, but |
400 | * maybe someone wants them hardcoded... */ | |
6d2eb13e | 401 | if (port == 7 || port == 9 || port == 19) { |
fa84c01d | 402 | debugs(23, DBG_CRITICAL, "urlParse: Deny access to port " << port); |
62e76326 | 403 | return NULL; |
429fdbec | 404 | } |
6ef12318 | 405 | #endif |
cc192b50 | 406 | |
30abd221 | 407 | if (stringHasWhitespace(urlpath)) { |
bf8fe701 | 408 | debugs(23, 2, "urlParse: URI has whitespace: {" << url << "}"); |
62e76326 | 409 | |
410 | switch (Config.uri_whitespace) { | |
411 | ||
412 | case URI_WHITESPACE_DENY: | |
413 | return NULL; | |
414 | ||
415 | case URI_WHITESPACE_ALLOW: | |
416 | break; | |
417 | ||
418 | case URI_WHITESPACE_ENCODE: | |
419 | t = rfc1738_escape_unescaped(urlpath); | |
420 | xstrncpy(urlpath, t, MAX_URL); | |
421 | break; | |
422 | ||
423 | case URI_WHITESPACE_CHOP: | |
424 | *(urlpath + strcspn(urlpath, w_space)) = '\0'; | |
425 | break; | |
426 | ||
427 | case URI_WHITESPACE_STRIP: | |
62e76326 | 428 | default: |
429 | t = q = urlpath; | |
62e76326 | 430 | while (*t) { |
5db6bf73 FC |
431 | if (!xisspace(*t)) { |
432 | *q = *t; | |
433 | ++q; | |
434 | } | |
435 | ++t; | |
62e76326 | 436 | } |
62e76326 | 437 | *q = '\0'; |
438 | } | |
d548ee64 | 439 | } |
62e76326 | 440 | |
d31d59d8 | 441 | return urlParseFinish(method, protocol, proto, urlpath, host, SBuf(login), port, request); |
4d919a80 AR |
442 | } |
443 | ||
444 | /** | |
445 | * Update request with parsed URI data. If the request arg is | |
446 | * non-NULL, put parsed values there instead of allocating a new | |
447 | * HttpRequest. | |
448 | */ | |
449 | static HttpRequest * | |
450 | urlParseFinish(const HttpRequestMethod& method, | |
0c3d3f65 | 451 | const AnyP::ProtocolType protocol, |
d31d59d8 | 452 | const char *const protoStr, // for unknown protocols |
4d919a80 AR |
453 | const char *const urlpath, |
454 | const char *const host, | |
92d6986d | 455 | const SBuf &login, |
4d919a80 AR |
456 | const int port, |
457 | HttpRequest *request) | |
458 | { | |
d4a04ed5 | 459 | if (NULL == request) |
d31d59d8 | 460 | request = new HttpRequest(method, protocol, protoStr, urlpath); |
d4a04ed5 | 461 | else { |
d31d59d8 | 462 | request->initHTTP(method, protocol, protoStr, urlpath); |
d4a04ed5 | 463 | } |
464 | ||
851feda6 | 465 | request->url.host(host); |
92d6986d | 466 | request->url.userInfo(login); |
5c51bffb | 467 | request->url.port(port); |
7111c86a | 468 | return request; |
469 | } | |
470 | ||
190154cf | 471 | static HttpRequest * |
9be14530 | 472 | urnParse(const HttpRequestMethod& method, char *urn, HttpRequest *request) |
23d92c64 | 473 | { |
bf8fe701 | 474 | debugs(50, 5, "urnParse: " << urn); |
9be14530 | 475 | if (request) { |
d31d59d8 | 476 | request->initHTTP(method, AnyP::PROTO_URN, "urn", urn + 4); |
9be14530 AJ |
477 | return request; |
478 | } | |
479 | ||
d31d59d8 | 480 | return new HttpRequest(method, AnyP::PROTO_URN, "urn", urn + 4); |
23d92c64 | 481 | } |
482 | ||
5c51bffb AJ |
483 | void |
484 | URL::touch() | |
485 | { | |
c823e2da | 486 | absolute_.clear(); |
5c51bffb AJ |
487 | authorityHttp_.clear(); |
488 | authorityWithPort_.clear(); | |
489 | } | |
490 | ||
491 | SBuf & | |
492 | URL::authority(bool requirePort) const | |
493 | { | |
494 | if (authorityHttp_.isEmpty()) { | |
495 | ||
496 | // both formats contain Host/IP | |
497 | authorityWithPort_.append(host()); | |
498 | authorityHttp_ = authorityWithPort_; | |
499 | ||
500 | // authorityForm_ only has :port if it is non-default | |
501 | authorityWithPort_.appendf(":%u",port()); | |
502 | if (port() != getScheme().defaultPort()) | |
503 | authorityHttp_ = authorityWithPort_; | |
504 | } | |
505 | ||
506 | return requirePort ? authorityWithPort_ : authorityHttp_; | |
507 | } | |
508 | ||
c823e2da AJ |
509 | SBuf & |
510 | URL::absolute() const | |
511 | { | |
512 | if (absolute_.isEmpty()) { | |
513 | // TODO: most URL will be much shorter, avoid allocating this much | |
514 | absolute_.reserveCapacity(MAX_URL); | |
515 | ||
d31d59d8 AJ |
516 | absolute_.append(getScheme().image()); |
517 | absolute_.append(":",1); | |
c823e2da AJ |
518 | if (getScheme() != AnyP::PROTO_URN) { |
519 | absolute_.append("//", 2); | |
520 | const bool omitUserInfo = getScheme() == AnyP::PROTO_HTTP || | |
521 | getScheme() != AnyP::PROTO_HTTPS || | |
522 | userInfo().isEmpty(); | |
523 | if (!omitUserInfo) { | |
524 | absolute_.append(userInfo()); | |
525 | absolute_.append("@", 1); | |
526 | } | |
527 | absolute_.append(authority()); | |
528 | } | |
529 | absolute_.append(path()); | |
530 | } | |
531 | ||
532 | return absolute_; | |
533 | } | |
534 | ||
851feda6 | 535 | /** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string. |
914b89a2 | 536 | * After copying it on in the first place! Would be less code to merge the two with a flag parameter. |
537 | * and never copy the query-string part in the first place | |
538 | */ | |
88738790 | 539 | char * |
190154cf | 540 | urlCanonicalClean(const HttpRequest * request) |
88738790 | 541 | { |
542 | LOCAL_ARRAY(char, buf, MAX_URL); | |
62e76326 | 543 | |
851feda6 | 544 | snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(request->effectiveRequestUri())); |
c823e2da | 545 | buf[sizeof(buf)-1] = '\0'; |
62e76326 | 546 | |
c823e2da | 547 | // URN, CONNECT method, and non-stripped URIs can go straight out |
851feda6 | 548 | if (Config.onoff.strip_query_terms && !(request->method == Http::METHOD_CONNECT || request->url.getScheme() == AnyP::PROTO_URN)) { |
c823e2da AJ |
549 | // strip anything AFTER a question-mark |
550 | // leaving the '?' in place | |
551 | if (auto t = strchr(buf, '?')) { | |
552 | *(++t) = '\0'; | |
e2849af8 | 553 | } |
d548ee64 | 554 | } |
62e76326 | 555 | |
9bc73deb | 556 | if (stringHasCntl(buf)) |
62e76326 | 557 | xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL); |
558 | ||
88738790 | 559 | return buf; |
560 | } | |
561 | ||
b3802bdc AJ |
562 | /** |
563 | * Yet another alternative to urlCanonical. | |
c2a7cefd | 564 | * This one adds the https:// parts to Http::METHOD_CONNECT URL |
b3802bdc AJ |
565 | * for use in error page outputs. |
566 | * Luckily we can leverage the others instead of duplicating. | |
567 | */ | |
568 | const char * | |
569 | urlCanonicalFakeHttps(const HttpRequest * request) | |
570 | { | |
571 | LOCAL_ARRAY(char, buf, MAX_URL); | |
572 | ||
573 | // method CONNECT and port HTTPS | |
5c51bffb AJ |
574 | if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) { |
575 | snprintf(buf, MAX_URL, "https://%s/*", request->url.host()); | |
b3802bdc AJ |
576 | return buf; |
577 | } | |
578 | ||
579 | // else do the normal complete canonical thing. | |
580 | return urlCanonicalClean(request); | |
581 | } | |
582 | ||
/*
 * Test if a URL is relative.
 *
 * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 * appear before a ':'.
 *
 * Returns false for a NULL or empty url, and for a url whose first ':'
 * comes before any '/'. Returns true otherwise, which includes a url that
 * contains neither character.
 */
bool
urlIsRelative(const char *url)
{
    if (url == NULL || *url == '\0')
        return false;

    // the first ':' or '/' decides; with neither, this is the final '\0'
    const char firstDelimiter = url[strcspn(url, ":/")];
    return firstDelimiter != ':';
}
608 | ||
609 | /* | |
71051277 | 610 | * Convert a relative URL to an absolute URL using the context of a given |
bf956b0a | 611 | * request. |
71051277 BR |
612 | * |
613 | * It is assumed that you have already ensured that the URL is relative. | |
614 | * | |
6e44cca8 BR |
615 | * If NULL is returned it is an indication that the method in use in the |
616 | * request does not distinguish between relative and absolute and you should | |
617 | * use the url unchanged. | |
0376a4c9 BR |
618 | * |
619 | * If non-NULL is returned, it is up to the caller to free the resulting | |
620 | * memory using safe_free(). | |
bf956b0a | 621 | */ |
6e44cca8 | 622 | char * |
bf956b0a | 623 | urlMakeAbsolute(const HttpRequest * req, const char *relUrl) |
3cbbd242 | 624 | { |
3cbbd242 | 625 | |
c2a7cefd | 626 | if (req->method.id() == Http::METHOD_CONNECT) { |
f3900427 | 627 | return (NULL); |
3cbbd242 | 628 | } |
26ac0430 | 629 | |
6e44cca8 | 630 | char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char)); |
26ac0430 | 631 | |
4e3f4dc7 | 632 | if (req->url.getScheme() == AnyP::PROTO_URN) { |
c823e2da AJ |
633 | // XXX: this is what the original code did, but it seems to break the |
634 | // intended behaviour of this function. It returns the stored URN path, | |
635 | // not converting the given one into a URN... | |
636 | snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute())); | |
71051277 | 637 | return (urlbuf); |
3cbbd242 | 638 | } |
26ac0430 | 639 | |
5c51bffb | 640 | SBuf authorityForm = req->url.authority(); // host[:port] |
d31d59d8 AJ |
641 | const SBuf &scheme = req->url.getScheme().image(); |
642 | size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH, | |
643 | SQUIDSBUFPRINT(scheme), | |
5c51bffb AJ |
644 | SQUIDSBUFPRINT(req->url.userInfo()), |
645 | !req->url.userInfo().isEmpty() ? "@" : "", | |
646 | SQUIDSBUFPRINT(authorityForm)); | |
6e44cca8 | 647 | |
51b5dcf5 AJ |
648 | // if the first char is '/' assume its a relative path |
649 | // XXX: this breaks on scheme-relative URLs, | |
650 | // but we should not see those outside ESI, and rarely there. | |
c823e2da | 651 | // XXX: also breaks on any URL containing a '/' in the query-string portion |
6e44cca8 | 652 | if (relUrl[0] == '/') { |
51b5dcf5 | 653 | xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); |
6e44cca8 | 654 | } else { |
51b5dcf5 AJ |
655 | SBuf path = req->url.path(); |
656 | SBuf::size_type lastSlashPos = path.rfind('/'); | |
6e44cca8 | 657 | |
51b5dcf5 AJ |
658 | if (lastSlashPos == SBuf::npos) { |
659 | // replace the whole path with the given bit(s) | |
5db6bf73 FC |
660 | urlbuf[urllen] = '/'; |
661 | ++urllen; | |
51b5dcf5 | 662 | xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); |
6e44cca8 | 663 | } else { |
51b5dcf5 AJ |
664 | // replace only the last (file?) segment with the given bit(s) |
665 | ++lastSlashPos; | |
666 | if (lastSlashPos > MAX_URL - urllen - 1) { | |
667 | // XXX: crops bits in the middle of the combined URL. | |
668 | lastSlashPos = MAX_URL - urllen - 1; | |
6e44cca8 | 669 | } |
3f0e38d6 | 670 | SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos)); |
51b5dcf5 | 671 | urllen += lastSlashPos; |
6e44cca8 | 672 | if (urllen + 1 < MAX_URL) { |
51b5dcf5 | 673 | xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); |
6e44cca8 BR |
674 | } |
675 | } | |
676 | } | |
3cbbd242 | 677 | |
bc9ad11f | 678 | return (urlbuf); |
3cbbd242 | 679 | } |
680 | ||
b8d8561b | 681 | int |
abbd7825 | 682 | matchDomainName(const char *h, const char *d, uint flags) |
30a4f2a8 | 683 | { |
9bc73deb | 684 | int dl; |
685 | int hl; | |
62e76326 | 686 | |
abbd7825 | 687 | const bool hostIncludesSubdomains = (*h == '.'); |
d20b1cd0 | 688 | while ('.' == *h) |
5db6bf73 | 689 | ++h; |
62e76326 | 690 | |
9bc73deb | 691 | hl = strlen(h); |
62e76326 | 692 | |
abbd7825 CT |
693 | if (hl == 0) |
694 | return -1; | |
695 | ||
9bc73deb | 696 | dl = strlen(d); |
62e76326 | 697 | |
9bc73deb | 698 | /* |
699 | * Start at the ends of the two strings and work towards the | |
700 | * beginning. | |
701 | */ | |
702 | while (xtolower(h[--hl]) == xtolower(d[--dl])) { | |
62e76326 | 703 | if (hl == 0 && dl == 0) { |
704 | /* | |
705 | * We made it all the way to the beginning of both | |
706 | * strings without finding any difference. | |
707 | */ | |
708 | return 0; | |
709 | } | |
710 | ||
711 | if (0 == hl) { | |
712 | /* | |
713 | * The host string is shorter than the domain string. | |
714 | * There is only one case when this can be a match. | |
715 | * If the domain is just one character longer, and if | |
716 | * that character is a leading '.' then we call it a | |
717 | * match. | |
718 | */ | |
719 | ||
720 | if (1 == dl && '.' == d[0]) | |
721 | return 0; | |
722 | else | |
723 | return -1; | |
724 | } | |
725 | ||
726 | if (0 == dl) { | |
727 | /* | |
728 | * The domain string is shorter than the host string. | |
729 | * This is a match only if the first domain character | |
730 | * is a leading '.'. | |
731 | */ | |
732 | ||
abbd7825 CT |
733 | if ('.' == d[0]) { |
734 | if (flags & mdnRejectSubsubDomains) { | |
735 | // Check for sub-sub domain and reject | |
736 | while(--hl >= 0 && h[hl] != '.'); | |
737 | if (hl < 0) { | |
738 | // No sub-sub domain found, but reject if there is a | |
739 | // leading dot in given host string (which is removed | |
740 | // before the check is started). | |
741 | return hostIncludesSubdomains ? 1 : 0; | |
742 | } else | |
743 | return 1; // sub-sub domain, reject | |
744 | } else | |
745 | return 0; | |
746 | } else | |
62e76326 | 747 | return 1; |
748 | } | |
9bc73deb | 749 | } |
62e76326 | 750 | |
9bc73deb | 751 | /* |
752 | * We found different characters in the same position (from the end). | |
753 | */ | |
69f69080 CT |
754 | |
755 | // If the h has a form of "*.foo.com" and d has a form of "x.foo.com" | |
756 | // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x' | |
757 | // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'. | |
abbd7825 | 758 | if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.') |
69f69080 CT |
759 | return 0; |
760 | ||
d20b1cd0 | 761 | /* |
762 | * If one of those character is '.' then its special. In order | |
763 | * for splay tree sorting to work properly, "x-foo.com" must | |
764 | * be greater than ".foo.com" even though '-' is less than '.'. | |
765 | */ | |
766 | if ('.' == d[dl]) | |
62e76326 | 767 | return 1; |
768 | ||
d20b1cd0 | 769 | if ('.' == h[hl]) |
62e76326 | 770 | return -1; |
771 | ||
9bc73deb | 772 | return (xtolower(h[hl]) - xtolower(d[dl])); |
30a4f2a8 | 773 | } |
a8f7d3ee | 774 | |
985c86bc | 775 | /* |
610ee341 | 776 | * return true if we can serve requests for this method. |
985c86bc | 777 | */ |
b8d8561b | 778 | int |
190154cf | 779 | urlCheckRequest(const HttpRequest * r) |
a8f7d3ee | 780 | { |
781 | int rc = 0; | |
610ee341 | 782 | /* protocol "independent" methods |
783 | * | |
784 | * actually these methods are specific to HTTP: | |
785 | * they are methods we recieve on our HTTP port, | |
786 | * and if we had a FTP listener would not be relevant | |
787 | * there. | |
788 | * | |
789 | * So, we should delegate them to HTTP. The problem is that we | |
790 | * do not have a default protocol from the client side of HTTP. | |
791 | */ | |
62e76326 | 792 | |
c2a7cefd | 793 | if (r->method == Http::METHOD_CONNECT) |
62e76326 | 794 | return 1; |
795 | ||
77ce6ba9 AR |
796 | // we support OPTIONS and TRACE directed at us (with a 501 reply, for now) |
797 | // we also support forwarding OPTIONS and TRACE, except for the *-URI ones | |
c2a7cefd | 798 | if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE) |
789217a2 | 799 | return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != URL::Asterisk()); |
62e76326 | 800 | |
c2a7cefd | 801 | if (r->method == Http::METHOD_PURGE) |
62e76326 | 802 | return 1; |
803 | ||
99edd1c3 | 804 | /* does method match the protocol? */ |
4e3f4dc7 | 805 | switch (r->url.getScheme()) { |
62e76326 | 806 | |
0c3d3f65 | 807 | case AnyP::PROTO_URN: |
62e76326 | 808 | |
0c3d3f65 | 809 | case AnyP::PROTO_HTTP: |
62e76326 | 810 | |
39a19cb7 | 811 | case AnyP::PROTO_CACHE_OBJECT: |
62e76326 | 812 | rc = 1; |
813 | break; | |
814 | ||
0c3d3f65 | 815 | case AnyP::PROTO_FTP: |
62e76326 | 816 | |
c2a7cefd | 817 | if (r->method == Http::METHOD_PUT) |
62e76326 | 818 | rc = 1; |
819 | ||
0c3d3f65 | 820 | case AnyP::PROTO_GOPHER: |
62e76326 | 821 | |
0c3d3f65 | 822 | case AnyP::PROTO_WAIS: |
62e76326 | 823 | |
0c3d3f65 | 824 | case AnyP::PROTO_WHOIS: |
c2a7cefd | 825 | if (r->method == Http::METHOD_GET) |
62e76326 | 826 | rc = 1; |
c2a7cefd | 827 | else if (r->method == Http::METHOD_HEAD) |
62e76326 | 828 | rc = 1; |
829 | ||
830 | break; | |
831 | ||
0c3d3f65 | 832 | case AnyP::PROTO_HTTPS: |
cb4f4424 | 833 | #if USE_OPENSSL |
62e76326 | 834 | rc = 1; |
1f7c9178 | 835 | #else |
62e76326 | 836 | /* |
837 | * Squid can't originate an SSL connection, so it should | |
838 | * never receive an "https:" URL. It should always be | |
839 | * CONNECT instead. | |
840 | */ | |
841 | rc = 0; | |
1f7c9178 | 842 | #endif |
0166128b | 843 | break; |
62e76326 | 844 | |
a8f7d3ee | 845 | default: |
62e76326 | 846 | break; |
a8f7d3ee | 847 | } |
62e76326 | 848 | |
a8f7d3ee | 849 | return rc; |
850 | } | |
9ce5e3e6 | 851 | |
852 | /* | |
853 | * Quick-n-dirty host extraction from a URL. Steps: | |
5999b776 | 854 | * Look for a colon |
855 | * Skip any '/' after the colon | |
856 | * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[] | |
857 | * Look for an ending '/' or ':' and terminate | |
858 | * Look for login info preceeded by '@' | |
9ce5e3e6 | 859 | */ |
77bfc324 | 860 | |
861 | class URLHostName | |
862 | { | |
863 | ||
864 | public: | |
865 | char * extract(char const *url); | |
866 | ||
867 | private: | |
868 | static char Host [SQUIDHOSTNAMELEN]; | |
869 | void init(char const *); | |
870 | void findHostStart(); | |
871 | void trimTrailingChars(); | |
872 | void trimAuth(); | |
873 | char const *hostStart; | |
874 | char const *url; | |
875 | }; | |
876 | ||
9ce5e3e6 | 877 | char * |
878 | urlHostname(const char *url) | |
879 | { | |
77bfc324 | 880 | return URLHostName().extract(url); |
881 | } | |
62e76326 | 882 | |
77bfc324 | 883 | char URLHostName::Host[SQUIDHOSTNAMELEN]; |
884 | ||
885 | void | |
886 | URLHostName::init(char const *aUrl) | |
887 | { | |
888 | Host[0] = '\0'; | |
aa1cafc4 | 889 | url = aUrl; |
77bfc324 | 890 | } |
62e76326 | 891 | |
77bfc324 | 892 | void |
893 | URLHostName::findHostStart() | |
894 | { | |
895 | if (NULL == (hostStart = strchr(url, ':'))) | |
896 | return; | |
62e76326 | 897 | |
77bfc324 | 898 | ++hostStart; |
62e76326 | 899 | |
77bfc324 | 900 | while (*hostStart != '\0' && *hostStart == '/') |
901 | ++hostStart; | |
cc192b50 | 902 | |
cc192b50 | 903 | if (*hostStart == ']') |
904 | ++hostStart; | |
77bfc324 | 905 | } |
62e76326 | 906 | |
77bfc324 | 907 | void |
908 | URLHostName::trimTrailingChars() | |
909 | { | |
910 | char *t; | |
911 | ||
912 | if ((t = strchr(Host, '/'))) | |
62e76326 | 913 | *t = '\0'; |
914 | ||
cc192b50 | 915 | if ((t = strrchr(Host, ':'))) |
62e76326 | 916 | *t = '\0'; |
cc192b50 | 917 | |
cc192b50 | 918 | if ((t = strchr(Host, ']'))) |
919 | *t = '\0'; | |
77bfc324 | 920 | } |
62e76326 | 921 | |
77bfc324 | 922 | void |
923 | URLHostName::trimAuth() | |
924 | { | |
925 | char *t; | |
926 | ||
927 | if ((t = strrchr(Host, '@'))) { | |
5db6bf73 | 928 | ++t; |
41d00cd3 | 929 | memmove(Host, t, strlen(t) + 1); |
9ce5e3e6 | 930 | } |
77bfc324 | 931 | } |
932 | ||
933 | char * | |
934 | URLHostName::extract(char const *aUrl) | |
935 | { | |
936 | init(aUrl); | |
937 | findHostStart(); | |
938 | ||
939 | if (hostStart == NULL) | |
940 | return NULL; | |
941 | ||
942 | xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN); | |
943 | ||
944 | trimTrailingChars(); | |
945 | ||
946 | trimAuth(); | |
62e76326 | 947 | |
77bfc324 | 948 | return Host; |
9ce5e3e6 | 949 | } |
f53969cc | 950 | |
d59e4742 FC |
951 | URL::URL(AnyP::UriScheme const &aScheme) : |
952 | scheme_(aScheme), | |
953 | hostIsNumeric_(false), | |
954 | port_(0) | |
955 | { | |
956 | *host_=0; | |
957 | } | |
1a739503 | 958 |