]>
Commit | Line | Data |
---|---|---|
30a4f2a8 | 1 | /* |
77b1029d | 2 | * Copyright (C) 1996-2020 The Squid Software Foundation and contributors |
e25c139f | 3 | * |
bbc27441 AJ |
4 | * Squid software is distributed under GPLv2+ license and includes |
5 | * contributions from numerous individuals and organizations. | |
6 | * Please see the COPYING and CONTRIBUTORS files for details. | |
019dd986 | 7 | */ |
ed43818f | 8 | |
bbc27441 AJ |
9 | /* DEBUG: section 23 URL Parsing */ |
10 | ||
f7f3304a | 11 | #include "squid.h" |
c8ab5ec6 | 12 | #include "anyp/Uri.h" |
582c2af2 | 13 | #include "globals.h" |
528b2c61 | 14 | #include "HttpRequest.h" |
6c880a16 | 15 | #include "parser/Tokenizer.h" |
1fa9b1a7 | 16 | #include "rfc1738.h" |
4d5904f7 | 17 | #include "SquidConfig.h" |
7a707cb5 | 18 | #include "SquidString.h" |
090089c4 | 19 | |
/// Characters permitted in a hostname by the hostname check in
/// AnyP::Uri::parse(): ASCII letters, digits, '-', '.', and the "[:]"
/// characters that appear in bracketed IPv6 address literals.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-."
    "[:]"
    ;

/// Same set as valid_hostname_chars, with the underscore ('_') also allowed.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789-._"
    "[:]"
    ;
090089c4 | 32 | |
2e260208 | 33 | const SBuf & |
c8ab5ec6 | 34 | AnyP::Uri::Asterisk() |
2e260208 AJ |
35 | { |
36 | static SBuf star("*"); | |
37 | return star; | |
38 | } | |
39 | ||
51b5dcf5 | 40 | const SBuf & |
c8ab5ec6 | 41 | AnyP::Uri::SlashPath() |
51b5dcf5 AJ |
42 | { |
43 | static SBuf slash("/"); | |
44 | return slash; | |
45 | } | |
46 | ||
5c51bffb | 47 | void |
c8ab5ec6 | 48 | AnyP::Uri::host(const char *src) |
5c51bffb AJ |
49 | { |
50 | hostAddr_.setEmpty(); | |
51 | hostAddr_ = src; | |
52 | if (hostAddr_.isAnyAddr()) { | |
53 | xstrncpy(host_, src, sizeof(host_)); | |
54 | hostIsNumeric_ = false; | |
55 | } else { | |
56 | hostAddr_.toHostStr(host_, sizeof(host_)); | |
57 | debugs(23, 3, "given IP: " << hostAddr_); | |
58 | hostIsNumeric_ = 1; | |
59 | } | |
60 | touch(); | |
61 | } | |
62 | ||
9ce4a1eb CT |
63 | SBuf |
64 | AnyP::Uri::hostOrIp() const | |
65 | { | |
66 | static char ip[MAX_IPSTRLEN]; | |
67 | if (hostIsNumeric()) | |
68 | return SBuf(hostIP().toStr(ip, sizeof(ip))); | |
69 | else | |
70 | return SBuf(host()); | |
71 | } | |
72 | ||
51b5dcf5 | 73 | const SBuf & |
c8ab5ec6 | 74 | AnyP::Uri::path() const |
51b5dcf5 AJ |
75 | { |
76 | // RFC 3986 section 3.3 says path can be empty (path-abempty). | |
77 | // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/" | |
78 | // at least when sending and using. We must still accept path-abempty as input. | |
79 | if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS)) | |
80 | return SlashPath(); | |
81 | ||
82 | return path_; | |
83 | } | |
84 | ||
/**
 * Start-up self-test for this module.
 *
 * Performs no initialization of state; it only asserts that the protocol
 * name table ends with "MAX" and that matchDomainName() returns the
 * expected results for a fixed list of host/domain pairs.
 */
void
urlInitialize(void)
{
    debugs(23, 5, "urlInitialize: Initializing...");
    /* this ensures that the number of protocol strings is the same as
     * the enum slots allocated because the last enum is always 'MAX'.
     */
    assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
    /*
     * These test that our matchDomainName() function works the
     * way we expect it to.
     */
    // equal names match, with or without a leading dot on either side
    assert(0 == matchDomainName("foo.com", "foo.com"));
    assert(0 == matchDomainName(".foo.com", "foo.com"));
    assert(0 == matchDomainName("foo.com", ".foo.com"));
    assert(0 == matchDomainName(".foo.com", ".foo.com"));
    // a dotted domain also matches hosts below it
    assert(0 == matchDomainName("x.foo.com", ".foo.com"));
    assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
    // everything else is a mismatch
    assert(0 != matchDomainName("x.foo.com", "foo.com"));
    assert(0 != matchDomainName("foo.com", "x.foo.com"));
    assert(0 != matchDomainName("bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", "foo.com"));
    assert(0 != matchDomainName(".bar.com", ".foo.com"));
    assert(0 != matchDomainName("bar.com", ".foo.com"));
    // the sign of a mismatch gives a consistent ordering
    assert(0 < matchDomainName("zzz.com", "foo.com"));
    assert(0 > matchDomainName("aaa.com", "foo.com"));
    // comparison ignores letter case
    assert(0 == matchDomainName("FOO.com", "foo.COM"));
    assert(0 < matchDomainName("bfoo.com", "afoo.com"));
    assert(0 > matchDomainName("afoo.com", "bfoo.com"));
    // "x-foo.com" must sort after ".foo.com" although '-' < '.'
    assert(0 < matchDomainName("x-foo.com", ".foo.com"));

    // mdnRejectSubsubDomains: only the domain itself and its direct children match
    assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
    assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));

    // mdnHonorWildcards: a "*." prefix in the first argument acts as a wildcard
    assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
    assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
    assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
    assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));

    /* more cases? */
}
128 | ||
cc192b50 | 129 | /** |
6c880a16 AJ |
130 | * Extract the URI scheme and ':' delimiter from the given input buffer. |
131 | * | |
132 | * Schemes up to 16 characters are accepted. | |
133 | * | |
134 | * Governed by RFC 3986 section 3.1 | |
d4a04ed5 | 135 | */ |
6c880a16 AJ |
136 | static AnyP::UriScheme |
137 | uriParseScheme(Parser::Tokenizer &tok) | |
92a6f4b1 | 138 | { |
6c880a16 AJ |
139 | /* |
140 | * RFC 3986 section 3.1 paragraph 2: | |
141 | * | |
142 | * Scheme names consist of a sequence of characters beginning with a | |
143 | * letter and followed by any combination of letters, digits, plus | |
144 | * ("+"), period ("."), or hyphen ("-"). | |
091213e6 CT |
145 | * |
146 | * The underscore ("_") required to match "cache_object://" squid | |
147 | * special URI scheme. | |
6c880a16 | 148 | */ |
091213e6 CT |
149 | static const auto schemeChars = |
150 | #if USE_HTTP_VIOLATIONS | |
151 | CharacterSet("special", "_") + | |
152 | #endif | |
153 | CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT; | |
6c880a16 AJ |
154 | |
155 | SBuf str; | |
156 | if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) { | |
157 | const auto protocol = AnyP::UriScheme::FindProtocolType(str); | |
158 | if (protocol == AnyP::PROTO_UNKNOWN) | |
159 | return AnyP::UriScheme(protocol, str.c_str()); | |
160 | return AnyP::UriScheme(protocol, nullptr); | |
161 | } | |
d31d59d8 | 162 | |
6c880a16 | 163 | throw TextException("invalid URI scheme", Here()); |
92a6f4b1 | 164 | } |
165 | ||
38aa10ef AJ |
166 | /** |
167 | * Appends configured append_domain to hostname, assuming | |
168 | * the given buffer is at least SQUIDHOSTNAMELEN bytes long, | |
169 | * and that the host FQDN is not a 'dotless' TLD. | |
170 | * | |
171 | * \returns false if and only if there is not enough space to append | |
172 | */ | |
173 | bool | |
174 | urlAppendDomain(char *host) | |
175 | { | |
176 | /* For IPv4 addresses check for a dot */ | |
177 | /* For IPv6 addresses also check for a colon */ | |
178 | if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) { | |
179 | const uint64_t dlen = strlen(host); | |
180 | const uint64_t want = dlen + Config.appendDomainLen; | |
181 | if (want > SQUIDHOSTNAMELEN - 1) { | |
182 | debugs(23, 2, "URL domain too large (" << dlen << " bytes)"); | |
183 | return false; | |
184 | } | |
185 | strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1); | |
186 | } | |
187 | return true; | |
188 | } | |
189 | ||
/*
 * Parse a URI/URL.
 *
 * It is assumed that the URL is complete -
 * ie, the end of the string is the end of the URL. Don't pass a partial
 * URL here as this routine doesn't have any way of knowing whether
 * it is partial or not (ie, it handles the case of no trailing slash as
 * being "end of host with implied path of /").
 *
 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
 * then rather than a URL a hostname:port is looked for.
 *
 * Returns true and stores scheme, path, host, user-info and port in this
 * object on success. Returns false on any parsing failure, including
 * exceptions thrown by the helpers called here, which are caught and logged.
 */
bool
AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
{
    try {

        // NOTE(review): LOCAL_ARRAY presumably declares function-static
        // scratch buffers -- confirm against its definition.
        LOCAL_ARRAY(char, login, MAX_URL);
        LOCAL_ARRAY(char, foundHost, MAX_URL);
        LOCAL_ARRAY(char, urlpath, MAX_URL);
        char *t = NULL;
        char *q = NULL;
        int foundPort;
        int l;
        int i;
        const char *src;
        char *dst;
        foundHost[0] = urlpath[0] = login[0] = '\0';

        // l is the length of the whole raw URL. Rejecting anything that could
        // not fit in MAX_URL (even after append_domain) up front is what keeps
        // the unbounded copies into the scratch buffers below within bounds.
        if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
            return false;
        }

        // asterisk-form request target: "OPTIONS *" or "TRACE *"
        if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
                Asterisk().cmp(rawUrl) == 0) {
            // XXX: these methods might also occur in HTTPS traffic. Handle this better.
            setScheme(AnyP::PROTO_HTTP, nullptr);
            port(getScheme().defaultPort());
            path(Asterisk());
            return true;
        }

        Parser::Tokenizer tok(rawUrl);
        AnyP::UriScheme scheme;

        if (method == Http::METHOD_CONNECT) {
            /*
             * RFC 7230 section 5.3.3:  authority-form = authority
             *  "excluding any userinfo and its "@" delimiter"
             *
             * RFC 3986 section 3.2:    authority = [ userinfo "@" ] host [ ":" port ]
             *
             * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
             */
            foundPort = 443;

            // XXX: use tokenizer
            auto B = tok.buf();
            const char *url = B.c_str();

            // try the bracketed "[ipv6]:port" form first, then plain "host:port";
            // a result of 1 means only the host was found and the port stays 443
            if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
                if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
                    return false;

        } else {

            scheme = uriParseScheme(tok);

            if (scheme == AnyP::PROTO_NONE)
                return false; // invalid scheme

            if (scheme == AnyP::PROTO_URN) {
                parseUrn(tok); // throws on any error
                return true;
            }

            // URLs then have "//"
            static const SBuf doubleSlash("//");
            if (!tok.skip(doubleSlash))
                return false;

            auto B = tok.remaining();
            const char *url = B.c_str();

            /* Parse the URL: */
            // NOTE: i counts characters consumed from url, which starts after
            // "scheme://", while l is still the length of the whole raw URL;
            // the '\0' tests are what actually stop the loops below.
            src = url;
            i = 0;

            /* Then everything until first /; that's host (and port; which we'll look for here later) */
            // bug 1881: If we don't get a "/" then we imply it was there
            // bug 3074: We could just be given a "?" or "#". These also imply "/"
            // bug 3233: whitespace is also a hostname delimiter.
            for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
                *dst = *src;
            }

            /*
             * We can't check for "i >= l" here because we could be at the end of the line
             * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
             * been -given- a valid URL and the path is just '/'.
             */
            // NOTE(review): the loop above stops as soon as i reaches l, so
            // this condition looks unreachable.
            if (i > l)
                return false;
            *dst = '\0';

            // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
            if (*src == '?' || *src == '#' || *src == '\0') {
                urlpath[0] = '/';
                dst = &urlpath[1];
            } else {
                dst = urlpath;
            }
            /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
            for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
                *dst = *src;
            }

            /* We -could- be at the end of the buffer here */
            if (i > l)
                return false;
            /* If the URL path is empty we set it to be "/" */
            if (dst == urlpath) {
                *dst = '/';
                ++dst;
            }
            *dst = '\0';

            foundPort = scheme.defaultPort(); // may be reset later

            /* Is there any login information? (we should eventually parse it above) */
            // split at the LAST '@': text before it becomes login, text after it the host
            t = strrchr(foundHost, '@');
            if (t != NULL) {
                strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
                login[sizeof(login)-1] = '\0';
                t = strrchr(login, '@');
                *t = 0;
                strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
                foundHost[sizeof(foundHost)-1] = '\0';
                // Bug 4498: URL-unescape the login info after extraction
                rfc1738_unescape(login);
            }

            /* Is there any host information? (we should eventually parse it above) */
            if (*foundHost == '[') {
                /* strip any IPA brackets. valid under IPv6. */
                dst = foundHost;
                /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
                src = foundHost;
                ++src;
                // NOTE: l is reused here; from now on it no longer holds the raw URL length
                l = strlen(foundHost);
                i = 1;
                // shift the bracket contents one position left, over the '['
                for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
                    *dst = *src;
                }

                /* we moved in-place, so truncate the actual hostname found */
                *dst = '\0';
                ++dst;

                /* skip ahead to either start of port, or original EOS */
                while (*dst != '\0' && *dst != ':')
                    ++dst;
                t = dst;
            } else {
                t = strrchr(foundHost, ':');

                // more than one ':' and no brackets: treat it all as a host, not host:port
                if (t != strchr(foundHost,':') ) {
                    /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
                    /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
                    /* therefore we MUST accept the case where they are not bracketed at all. */
                    t = NULL;
                }
            }

            // Bug 3183 sanity check: If scheme is present, host must be too.
            if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
                debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
                return false;
            }

            // cut the host at the ':' and read the port that follows;
            // the resulting value is range-checked further below
            if (t && *t == ':') {
                *t = '\0';
                ++t;
                foundPort = atoi(t);
            }
        }

        // host names are stored in lower case
        for (t = foundHost; *t; ++t)
            *t = xtolower(*t);

        // whitespace in the host is removed only under URI_WHITESPACE_STRIP
        if (stringHasWhitespace(foundHost)) {
            if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
                t = q = foundHost;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");

        // optional character whitelist check, with or without '_'
        if (Config.onoff.check_hostnames &&
                strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
            return false;
        }

        if (!urlAppendDomain(foundHost))
            return false;

        /* remove trailing dots from hostnames */
        while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
            foundHost[l] = '\0';

        /* reject duplicate or leading dots */
        if (strstr(foundHost, "..") || *foundHost == '.') {
            debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
            return false;
        }

        if (foundPort < 1 || foundPort > 65535) {
            debugs(23, 3, "Invalid port '" << foundPort << "'");
            return false;
        }

#if HARDCODE_DENY_PORTS
        /* These ports are filtered in the default squid.conf, but
         * maybe someone wants them hardcoded... */
        if (foundPort == 7 || foundPort == 9 || foundPort == 19) {
            debugs(23, DBG_CRITICAL, MYNAME << "Deny access to port " << foundPort);
            return false;
        }
#endif

        // apply the configured uri_whitespace policy to the path
        if (stringHasWhitespace(urlpath)) {
            debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");

            switch (Config.uri_whitespace) {

            case URI_WHITESPACE_DENY:
                return false;

            case URI_WHITESPACE_ALLOW:
                break;

            case URI_WHITESPACE_ENCODE:
                t = rfc1738_escape_unescaped(urlpath);
                xstrncpy(urlpath, t, MAX_URL);
                break;

            case URI_WHITESPACE_CHOP:
                // truncate at the first whitespace character
                *(urlpath + strcspn(urlpath, w_space)) = '\0';
                break;

            case URI_WHITESPACE_STRIP:
            default:
                // compact the path in place, dropping every whitespace character
                t = q = urlpath;
                while (*t) {
                    if (!xisspace(*t)) {
                        *q = *t;
                        ++q;
                    }
                    ++t;
                }
                *q = '\0';
            }
        }

        // commit the parsed components to this object
        setScheme(scheme);
        path(urlpath);
        host(foundHost);
        userInfo(SBuf(login));
        port(foundPort);
        return true;

    } catch (...) {
        debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
        return false;
    }
}
476 | ||
6c880a16 AJ |
477 | /** |
478 | * Governed by RFC 8141 section 2: | |
479 | * | |
480 | * assigned-name = "urn" ":" NID ":" NSS | |
481 | * NID = (alphanum) 0*30(ldh) (alphanum) | |
482 | * ldh = alphanum / "-" | |
483 | * NSS = pchar *(pchar / "/") | |
484 | * | |
485 | * RFC 3986 Appendix D.2 defines (as deprecated): | |
486 | * | |
487 | * alphanum = ALPHA / DIGIT | |
488 | * | |
489 | * Notice that NID is exactly 2-32 characters in length. | |
490 | */ | |
db59367a | 491 | void |
6c880a16 | 492 | AnyP::Uri::parseUrn(Parser::Tokenizer &tok) |
23d92c64 | 493 | { |
6c880a16 AJ |
494 | static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT; |
495 | static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum"); | |
496 | SBuf nid; | |
497 | if (!tok.prefix(nid, nidChars, 32)) | |
498 | throw TextException("NID not found", Here()); | |
499 | ||
500 | if (!tok.skip(':')) | |
501 | throw TextException("NID too long or missing ':' delimiter", Here()); | |
502 | ||
503 | if (nid.length() < 2) | |
504 | throw TextException("NID too short", Here()); | |
505 | ||
506 | if (!alphanum[*nid.begin()]) | |
507 | throw TextException("NID prefix is not alphanumeric", Here()); | |
508 | ||
509 | if (!alphanum[*nid.rbegin()]) | |
510 | throw TextException("NID suffix is not alphanumeric", Here()); | |
511 | ||
512 | setScheme(AnyP::PROTO_URN, nullptr); | |
513 | host(nid.c_str()); | |
514 | // TODO validate path characters | |
515 | path(tok.remaining()); | |
516 | debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length())); | |
23d92c64 | 517 | } |
518 | ||
5c51bffb | 519 | void |
c8ab5ec6 | 520 | AnyP::Uri::touch() |
5c51bffb | 521 | { |
c823e2da | 522 | absolute_.clear(); |
5c51bffb AJ |
523 | authorityHttp_.clear(); |
524 | authorityWithPort_.clear(); | |
525 | } | |
526 | ||
527 | SBuf & | |
c8ab5ec6 | 528 | AnyP::Uri::authority(bool requirePort) const |
5c51bffb AJ |
529 | { |
530 | if (authorityHttp_.isEmpty()) { | |
531 | ||
532 | // both formats contain Host/IP | |
533 | authorityWithPort_.append(host()); | |
534 | authorityHttp_ = authorityWithPort_; | |
535 | ||
536 | // authorityForm_ only has :port if it is non-default | |
537 | authorityWithPort_.appendf(":%u",port()); | |
538 | if (port() != getScheme().defaultPort()) | |
539 | authorityHttp_ = authorityWithPort_; | |
540 | } | |
541 | ||
542 | return requirePort ? authorityWithPort_ : authorityHttp_; | |
543 | } | |
544 | ||
c823e2da | 545 | SBuf & |
c8ab5ec6 | 546 | AnyP::Uri::absolute() const |
c823e2da AJ |
547 | { |
548 | if (absolute_.isEmpty()) { | |
549 | // TODO: most URL will be much shorter, avoid allocating this much | |
550 | absolute_.reserveCapacity(MAX_URL); | |
551 | ||
d31d59d8 AJ |
552 | absolute_.append(getScheme().image()); |
553 | absolute_.append(":",1); | |
c823e2da AJ |
554 | if (getScheme() != AnyP::PROTO_URN) { |
555 | absolute_.append("//", 2); | |
0d0f5161 AJ |
556 | const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP || |
557 | getScheme() == AnyP::PROTO_UNKNOWN; | |
558 | ||
559 | if (allowUserInfo && !userInfo().isEmpty()) { | |
c823e2da AJ |
560 | absolute_.append(userInfo()); |
561 | absolute_.append("@", 1); | |
562 | } | |
563 | absolute_.append(authority()); | |
6c880a16 AJ |
564 | } else { |
565 | absolute_.append(host()); | |
566 | absolute_.append(":", 1); | |
c823e2da AJ |
567 | } |
568 | absolute_.append(path()); | |
569 | } | |
570 | ||
571 | return absolute_; | |
572 | } | |
573 | ||
851feda6 | 574 | /** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string. |
914b89a2 | 575 | * After copying it on in the first place! Would be less code to merge the two with a flag parameter. |
576 | * and never copy the query-string part in the first place | |
577 | */ | |
88738790 | 578 | char * |
bec110e4 | 579 | urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme) |
88738790 | 580 | { |
581 | LOCAL_ARRAY(char, buf, MAX_URL); | |
62e76326 | 582 | |
bec110e4 | 583 | snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url)); |
c823e2da | 584 | buf[sizeof(buf)-1] = '\0'; |
62e76326 | 585 | |
c823e2da | 586 | // URN, CONNECT method, and non-stripped URIs can go straight out |
bec110e4 | 587 | if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) { |
c823e2da AJ |
588 | // strip anything AFTER a question-mark |
589 | // leaving the '?' in place | |
590 | if (auto t = strchr(buf, '?')) { | |
591 | *(++t) = '\0'; | |
e2849af8 | 592 | } |
d548ee64 | 593 | } |
62e76326 | 594 | |
9bc73deb | 595 | if (stringHasCntl(buf)) |
62e76326 | 596 | xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL); |
597 | ||
88738790 | 598 | return buf; |
599 | } | |
600 | ||
b3802bdc AJ |
601 | /** |
602 | * Yet another alternative to urlCanonical. | |
c2a7cefd | 603 | * This one adds the https:// parts to Http::METHOD_CONNECT URL |
b3802bdc AJ |
604 | * for use in error page outputs. |
605 | * Luckily we can leverage the others instead of duplicating. | |
606 | */ | |
607 | const char * | |
608 | urlCanonicalFakeHttps(const HttpRequest * request) | |
609 | { | |
610 | LOCAL_ARRAY(char, buf, MAX_URL); | |
611 | ||
612 | // method CONNECT and port HTTPS | |
5c51bffb AJ |
613 | if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) { |
614 | snprintf(buf, MAX_URL, "https://%s/*", request->url.host()); | |
b3802bdc AJ |
615 | return buf; |
616 | } | |
617 | ||
618 | // else do the normal complete canonical thing. | |
bec110e4 | 619 | return request->canonicalCleanUrl(); |
b3802bdc AJ |
620 | } |
621 | ||
/*
 * Test if a URL is relative.
 *
 * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 * appear before a ':'.
 *
 * A null or empty url is not relative. Otherwise the url is relative unless
 * a ':' is found before any '/'.
 */
bool
urlIsRelative(const char *url)
{
    if (!url || !*url)
        return false;

    // stop at the first ':' or '/', or at the end of the string
    const char *cursor = url;
    while (*cursor != '\0' && *cursor != ':' && *cursor != '/')
        ++cursor;

    return *cursor != ':';
}
647 | ||
648 | /* | |
71051277 | 649 | * Convert a relative URL to an absolute URL using the context of a given |
bf956b0a | 650 | * request. |
71051277 BR |
651 | * |
652 | * It is assumed that you have already ensured that the URL is relative. | |
653 | * | |
6e44cca8 BR |
654 | * If NULL is returned it is an indication that the method in use in the |
655 | * request does not distinguish between relative and absolute and you should | |
656 | * use the url unchanged. | |
0376a4c9 BR |
657 | * |
658 | * If non-NULL is returned, it is up to the caller to free the resulting | |
659 | * memory using safe_free(). | |
bf956b0a | 660 | */ |
6e44cca8 | 661 | char * |
bf956b0a | 662 | urlMakeAbsolute(const HttpRequest * req, const char *relUrl) |
3cbbd242 | 663 | { |
3cbbd242 | 664 | |
c2a7cefd | 665 | if (req->method.id() == Http::METHOD_CONNECT) { |
f3900427 | 666 | return (NULL); |
3cbbd242 | 667 | } |
26ac0430 | 668 | |
6e44cca8 | 669 | char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char)); |
26ac0430 | 670 | |
4e3f4dc7 | 671 | if (req->url.getScheme() == AnyP::PROTO_URN) { |
c823e2da AJ |
672 | // XXX: this is what the original code did, but it seems to break the |
673 | // intended behaviour of this function. It returns the stored URN path, | |
674 | // not converting the given one into a URN... | |
675 | snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute())); | |
71051277 | 676 | return (urlbuf); |
3cbbd242 | 677 | } |
26ac0430 | 678 | |
5c51bffb | 679 | SBuf authorityForm = req->url.authority(); // host[:port] |
d31d59d8 AJ |
680 | const SBuf &scheme = req->url.getScheme().image(); |
681 | size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH, | |
682 | SQUIDSBUFPRINT(scheme), | |
5c51bffb AJ |
683 | SQUIDSBUFPRINT(req->url.userInfo()), |
684 | !req->url.userInfo().isEmpty() ? "@" : "", | |
685 | SQUIDSBUFPRINT(authorityForm)); | |
6e44cca8 | 686 | |
51b5dcf5 AJ |
687 | // if the first char is '/' assume its a relative path |
688 | // XXX: this breaks on scheme-relative URLs, | |
689 | // but we should not see those outside ESI, and rarely there. | |
c823e2da | 690 | // XXX: also breaks on any URL containing a '/' in the query-string portion |
6e44cca8 | 691 | if (relUrl[0] == '/') { |
51b5dcf5 | 692 | xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); |
6e44cca8 | 693 | } else { |
51b5dcf5 AJ |
694 | SBuf path = req->url.path(); |
695 | SBuf::size_type lastSlashPos = path.rfind('/'); | |
6e44cca8 | 696 | |
51b5dcf5 AJ |
697 | if (lastSlashPos == SBuf::npos) { |
698 | // replace the whole path with the given bit(s) | |
5db6bf73 FC |
699 | urlbuf[urllen] = '/'; |
700 | ++urllen; | |
51b5dcf5 | 701 | xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); |
6e44cca8 | 702 | } else { |
51b5dcf5 AJ |
703 | // replace only the last (file?) segment with the given bit(s) |
704 | ++lastSlashPos; | |
705 | if (lastSlashPos > MAX_URL - urllen - 1) { | |
706 | // XXX: crops bits in the middle of the combined URL. | |
707 | lastSlashPos = MAX_URL - urllen - 1; | |
6e44cca8 | 708 | } |
3f0e38d6 | 709 | SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos)); |
51b5dcf5 | 710 | urllen += lastSlashPos; |
6e44cca8 | 711 | if (urllen + 1 < MAX_URL) { |
51b5dcf5 | 712 | xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); |
6e44cca8 BR |
713 | } |
714 | } | |
715 | } | |
3cbbd242 | 716 | |
bc9ad11f | 717 | return (urlbuf); |
3cbbd242 | 718 | } |
719 | ||
b8d8561b | 720 | int |
6c1219b9 | 721 | matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags) |
30a4f2a8 | 722 | { |
9bc73deb | 723 | int dl; |
724 | int hl; | |
62e76326 | 725 | |
abbd7825 | 726 | const bool hostIncludesSubdomains = (*h == '.'); |
d20b1cd0 | 727 | while ('.' == *h) |
5db6bf73 | 728 | ++h; |
62e76326 | 729 | |
9bc73deb | 730 | hl = strlen(h); |
62e76326 | 731 | |
abbd7825 CT |
732 | if (hl == 0) |
733 | return -1; | |
734 | ||
9bc73deb | 735 | dl = strlen(d); |
62e76326 | 736 | |
9bc73deb | 737 | /* |
738 | * Start at the ends of the two strings and work towards the | |
739 | * beginning. | |
740 | */ | |
741 | while (xtolower(h[--hl]) == xtolower(d[--dl])) { | |
62e76326 | 742 | if (hl == 0 && dl == 0) { |
743 | /* | |
744 | * We made it all the way to the beginning of both | |
745 | * strings without finding any difference. | |
746 | */ | |
747 | return 0; | |
748 | } | |
749 | ||
750 | if (0 == hl) { | |
751 | /* | |
752 | * The host string is shorter than the domain string. | |
753 | * There is only one case when this can be a match. | |
754 | * If the domain is just one character longer, and if | |
755 | * that character is a leading '.' then we call it a | |
756 | * match. | |
757 | */ | |
758 | ||
759 | if (1 == dl && '.' == d[0]) | |
760 | return 0; | |
761 | else | |
762 | return -1; | |
763 | } | |
764 | ||
765 | if (0 == dl) { | |
766 | /* | |
767 | * The domain string is shorter than the host string. | |
768 | * This is a match only if the first domain character | |
769 | * is a leading '.'. | |
770 | */ | |
771 | ||
abbd7825 CT |
772 | if ('.' == d[0]) { |
773 | if (flags & mdnRejectSubsubDomains) { | |
774 | // Check for sub-sub domain and reject | |
775 | while(--hl >= 0 && h[hl] != '.'); | |
776 | if (hl < 0) { | |
777 | // No sub-sub domain found, but reject if there is a | |
778 | // leading dot in given host string (which is removed | |
779 | // before the check is started). | |
780 | return hostIncludesSubdomains ? 1 : 0; | |
781 | } else | |
782 | return 1; // sub-sub domain, reject | |
783 | } else | |
784 | return 0; | |
785 | } else | |
62e76326 | 786 | return 1; |
787 | } | |
9bc73deb | 788 | } |
62e76326 | 789 | |
9bc73deb | 790 | /* |
791 | * We found different characters in the same position (from the end). | |
792 | */ | |
69f69080 CT |
793 | |
794 | // If the h has a form of "*.foo.com" and d has a form of "x.foo.com" | |
795 | // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x' | |
796 | // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'. | |
abbd7825 | 797 | if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.') |
69f69080 CT |
798 | return 0; |
799 | ||
d20b1cd0 | 800 | /* |
801 | * If one of those character is '.' then its special. In order | |
802 | * for splay tree sorting to work properly, "x-foo.com" must | |
803 | * be greater than ".foo.com" even though '-' is less than '.'. | |
804 | */ | |
805 | if ('.' == d[dl]) | |
62e76326 | 806 | return 1; |
807 | ||
d20b1cd0 | 808 | if ('.' == h[hl]) |
62e76326 | 809 | return -1; |
810 | ||
9bc73deb | 811 | return (xtolower(h[hl]) - xtolower(d[dl])); |
30a4f2a8 | 812 | } |
a8f7d3ee | 813 | |
985c86bc | 814 | /* |
610ee341 | 815 | * return true if we can serve requests for this method. |
985c86bc | 816 | */ |
b8d8561b | 817 | int |
190154cf | 818 | urlCheckRequest(const HttpRequest * r) |
a8f7d3ee | 819 | { |
820 | int rc = 0; | |
610ee341 | 821 | /* protocol "independent" methods |
822 | * | |
823 | * actually these methods are specific to HTTP: | |
2f8abb64 | 824 | * they are methods we receive on our HTTP port, |
610ee341 | 825 | * and if we had a FTP listener would not be relevant |
826 | * there. | |
827 | * | |
828 | * So, we should delegate them to HTTP. The problem is that we | |
829 | * do not have a default protocol from the client side of HTTP. | |
830 | */ | |
62e76326 | 831 | |
c2a7cefd | 832 | if (r->method == Http::METHOD_CONNECT) |
62e76326 | 833 | return 1; |
834 | ||
77ce6ba9 AR |
835 | // we support OPTIONS and TRACE directed at us (with a 501 reply, for now) |
836 | // we also support forwarding OPTIONS and TRACE, except for the *-URI ones | |
c2a7cefd | 837 | if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE) |
c8ab5ec6 | 838 | return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk()); |
62e76326 | 839 | |
c2a7cefd | 840 | if (r->method == Http::METHOD_PURGE) |
62e76326 | 841 | return 1; |
842 | ||
99edd1c3 | 843 | /* does method match the protocol? */ |
4e3f4dc7 | 844 | switch (r->url.getScheme()) { |
62e76326 | 845 | |
0c3d3f65 | 846 | case AnyP::PROTO_URN: |
62e76326 | 847 | |
0c3d3f65 | 848 | case AnyP::PROTO_HTTP: |
62e76326 | 849 | |
39a19cb7 | 850 | case AnyP::PROTO_CACHE_OBJECT: |
62e76326 | 851 | rc = 1; |
852 | break; | |
853 | ||
0c3d3f65 | 854 | case AnyP::PROTO_FTP: |
62e76326 | 855 | |
c2a7cefd | 856 | if (r->method == Http::METHOD_PUT) |
62e76326 | 857 | rc = 1; |
858 | ||
0c3d3f65 | 859 | case AnyP::PROTO_GOPHER: |
62e76326 | 860 | |
0c3d3f65 | 861 | case AnyP::PROTO_WAIS: |
62e76326 | 862 | |
0c3d3f65 | 863 | case AnyP::PROTO_WHOIS: |
c2a7cefd | 864 | if (r->method == Http::METHOD_GET) |
62e76326 | 865 | rc = 1; |
c2a7cefd | 866 | else if (r->method == Http::METHOD_HEAD) |
62e76326 | 867 | rc = 1; |
868 | ||
869 | break; | |
870 | ||
0c3d3f65 | 871 | case AnyP::PROTO_HTTPS: |
cb4f4424 | 872 | #if USE_OPENSSL |
62e76326 | 873 | rc = 1; |
418293da AJ |
874 | #elif USE_GNUTLS |
875 | rc = 1; | |
1f7c9178 | 876 | #else |
62e76326 | 877 | /* |
878 | * Squid can't originate an SSL connection, so it should | |
879 | * never receive an "https:" URL. It should always be | |
880 | * CONNECT instead. | |
881 | */ | |
882 | rc = 0; | |
1f7c9178 | 883 | #endif |
0166128b | 884 | break; |
62e76326 | 885 | |
a8f7d3ee | 886 | default: |
62e76326 | 887 | break; |
a8f7d3ee | 888 | } |
62e76326 | 889 | |
a8f7d3ee | 890 | return rc; |
891 | } | |
9ce5e3e6 | 892 | |
c8ab5ec6 | 893 | AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) : |
d59e4742 FC |
894 | scheme_(aScheme), |
895 | hostIsNumeric_(false), | |
896 | port_(0) | |
897 | { | |
898 | *host_=0; | |
899 | } | |
1a739503 | 900 | |
bec110e4 EB |
901 | // TODO: fix code duplication with AnyP::Uri::parse() |
902 | char * | |
903 | AnyP::Uri::cleanup(const char *uri) | |
904 | { | |
905 | int flags = 0; | |
906 | char *cleanedUri = nullptr; | |
907 | switch (Config.uri_whitespace) { | |
908 | case URI_WHITESPACE_ALLOW: | |
909 | flags |= RFC1738_ESCAPE_NOSPACE; | |
279e60ef | 910 | // fall through to next case |
bec110e4 EB |
911 | case URI_WHITESPACE_ENCODE: |
912 | flags |= RFC1738_ESCAPE_UNESCAPED; | |
913 | cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL); | |
914 | break; | |
915 | ||
916 | case URI_WHITESPACE_CHOP: { | |
917 | flags |= RFC1738_ESCAPE_UNESCAPED; | |
918 | const auto pos = strcspn(uri, w_space); | |
919 | char *choppedUri = nullptr; | |
920 | if (pos < strlen(uri)) | |
921 | choppedUri = xstrndup(uri, pos + 1); | |
922 | cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri, flags), MAX_URL); | |
923 | cleanedUri[pos] = '\0'; | |
924 | xfree(choppedUri); | |
925 | } | |
926 | break; | |
927 | ||
928 | case URI_WHITESPACE_DENY: | |
929 | case URI_WHITESPACE_STRIP: | |
930 | default: { | |
931 | // TODO: avoid duplication with urlParse() | |
932 | const char *t; | |
933 | char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1)); | |
934 | char *q = tmp_uri; | |
935 | t = uri; | |
936 | while (*t) { | |
937 | if (!xisspace(*t)) { | |
938 | *q = *t; | |
939 | ++q; | |
940 | } | |
941 | ++t; | |
942 | } | |
943 | *q = '\0'; | |
944 | cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL); | |
945 | xfree(tmp_uri); | |
946 | } | |
947 | break; | |
948 | } | |
949 | ||
950 | assert(cleanedUri); | |
951 | return cleanedUri; | |
952 | } | |
279e60ef | 953 |