]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (C) 1996-2025 The Squid Software Foundation and contributors | |
3 | * | |
4 | * Squid software is distributed under GPLv2+ license and includes | |
5 | * contributions from numerous individuals and organizations. | |
6 | * Please see the COPYING and CONTRIBUTORS files for details. | |
7 | */ | |
8 | ||
9 | /* DEBUG: section 23 URL Parsing */ | |
10 | ||
11 | #include "squid.h" | |
12 | #include "anyp/Host.h" | |
13 | #include "anyp/Uri.h" | |
14 | #include "base/Raw.h" | |
15 | #include "globals.h" | |
16 | #include "HttpRequest.h" | |
17 | #include "parser/Tokenizer.h" | |
18 | #include "rfc1738.h" | |
19 | #include "SquidConfig.h" | |
20 | #include "SquidMath.h" | |
21 | ||
/// Characters accepted in a hostname when underscores are allowed.
/// AnyP::Uri::parse() checks hostnames against this set with strspn().
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._"
    "[:]";

/// Characters accepted in a hostname when underscores are not allowed:
/// the same set as valid_hostname_chars_u, minus the underscore.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-."
    "[:]";
34 | ||
35 | /// Characters which are valid within a URI userinfo section | |
36 | static const CharacterSet & | |
37 | UserInfoChars() | |
38 | { | |
39 | /* | |
40 | * RFC 3986 section 3.2.1 | |
41 | * | |
42 | * userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) | |
43 | * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" | |
44 | * pct-encoded = "%" HEXDIG HEXDIG | |
45 | * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" | |
46 | */ | |
47 | static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") + | |
48 | CharacterSet::ALPHA + | |
49 | CharacterSet::DIGIT; | |
50 | return userInfoValid; | |
51 | } | |
52 | ||
53 | /** | |
54 | * Governed by RFC 3986 section 2.1 | |
55 | */ | |
56 | SBuf | |
57 | AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore) | |
58 | { | |
59 | if (buf.isEmpty()) | |
60 | return buf; | |
61 | ||
62 | Parser::Tokenizer tk(buf); | |
63 | SBuf goodSection; | |
64 | // optimization for the arguably common "no encoding necessary" case | |
65 | if (tk.prefix(goodSection, ignore) && tk.atEnd()) | |
66 | return buf; | |
67 | ||
68 | SBuf output; | |
69 | output.reserveSpace(buf.length() * 3); // worst case: encode all chars | |
70 | output.append(goodSection); // may be empty | |
71 | ||
72 | while (!tk.atEnd()) { | |
73 | // TODO: Add Tokenizer::parseOne(void). | |
74 | const auto ch = tk.remaining()[0]; | |
75 | output.appendf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(ch))); // TODO: Optimize using a table | |
76 | (void)tk.skip(ch); | |
77 | ||
78 | if (tk.prefix(goodSection, ignore)) | |
79 | output.append(goodSection); | |
80 | } | |
81 | ||
82 | return output; | |
83 | } | |
84 | ||
85 | SBuf | |
86 | AnyP::Uri::Decode(const SBuf &buf) | |
87 | { | |
88 | SBuf output; | |
89 | Parser::Tokenizer tok(buf); | |
90 | while (!tok.atEnd()) { | |
91 | SBuf token; | |
92 | static const auto unencodedChars = CharacterSet("percent", "%").complement("unencoded"); | |
93 | if (tok.prefix(token, unencodedChars)) | |
94 | output.append(token); | |
95 | ||
96 | // we are either at '%' or at end of input | |
97 | if (tok.skip('%')) { | |
98 | int64_t hex1 = 0, hex2 = 0; | |
99 | if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1)) | |
100 | output.append(static_cast<char>((hex1 << 4) | hex2)); | |
101 | else | |
102 | throw TextException("invalid pct-encoded triplet", Here()); | |
103 | } | |
104 | } | |
105 | return output; | |
106 | } | |
107 | ||
108 | const SBuf & | |
109 | AnyP::Uri::Asterisk() | |
110 | { | |
111 | static SBuf star("*"); | |
112 | return star; | |
113 | } | |
114 | ||
115 | const SBuf & | |
116 | AnyP::Uri::SlashPath() | |
117 | { | |
118 | static SBuf slash("/"); | |
119 | return slash; | |
120 | } | |
121 | ||
122 | void | |
123 | AnyP::Uri::host(const char *src) | |
124 | { | |
125 | hostAddr_.fromHost(src); | |
126 | if (hostAddr_.isAnyAddr()) { | |
127 | xstrncpy(host_, src, sizeof(host_)); | |
128 | hostIsNumeric_ = false; | |
129 | } else { | |
130 | hostAddr_.toHostStr(host_, sizeof(host_)); | |
131 | debugs(23, 3, "given IP: " << hostAddr_); | |
132 | hostIsNumeric_ = 1; | |
133 | } | |
134 | touch(); | |
135 | } | |
136 | ||
137 | // TODO: Replace with ToSBuf(parsedHost()) or similar. | |
138 | SBuf | |
139 | AnyP::Uri::hostOrIp() const | |
140 | { | |
141 | if (hostIsNumeric()) { | |
142 | static char ip[MAX_IPSTRLEN]; | |
143 | const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip)); | |
144 | return SBuf(ip, hostStrLen); | |
145 | } else | |
146 | return SBuf(host()); | |
147 | } | |
148 | ||
149 | std::optional<AnyP::Host> | |
150 | AnyP::Uri::parsedHost() const | |
151 | { | |
152 | if (hostIsNumeric()) | |
153 | return Host::ParseIp(hostIP()); | |
154 | ||
155 | // XXX: Interpret host subcomponent as reg-name representing a DNS name. It | |
156 | // may actually be, for example, a URN namespace ID (NID; see RFC 8141), but | |
157 | // current Squid APIs do not support adequate representation of those cases. | |
158 | const SBuf regName(host()); | |
159 | ||
160 | if (regName.find('%') != SBuf::npos) { | |
161 | debugs(23, 3, "rejecting percent-encoded reg-name: " << regName); | |
162 | return std::nullopt; // TODO: Decode() instead | |
163 | } | |
164 | ||
165 | return Host::ParseSimpleDomainName(regName); | |
166 | } | |
167 | ||
168 | const SBuf & | |
169 | AnyP::Uri::path() const | |
170 | { | |
171 | // RFC 3986 section 3.3 says path can be empty (path-abempty). | |
172 | // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/" | |
173 | // at least when sending and using. We must still accept path-abempty as input. | |
174 | if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS)) | |
175 | return SlashPath(); | |
176 | ||
177 | return path_; | |
178 | } | |
179 | ||
180 | void | |
181 | urlInitialize(void) | |
182 | { | |
183 | debugs(23, 5, "urlInitialize: Initializing..."); | |
184 | /* this ensures that the number of protocol strings is the same as | |
185 | * the enum slots allocated because the last enum is always 'MAX'. | |
186 | */ | |
187 | assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0); | |
188 | /* | |
189 | * These test that our matchDomainName() function works the | |
190 | * way we expect it to. | |
191 | */ | |
192 | assert(0 == matchDomainName("foo.com", "foo.com")); | |
193 | assert(0 == matchDomainName(".foo.com", "foo.com")); | |
194 | assert(0 == matchDomainName("foo.com", ".foo.com")); | |
195 | assert(0 == matchDomainName(".foo.com", ".foo.com")); | |
196 | assert(0 == matchDomainName("x.foo.com", ".foo.com")); | |
197 | assert(0 == matchDomainName("y.x.foo.com", ".foo.com")); | |
198 | assert(0 != matchDomainName("x.foo.com", "foo.com")); | |
199 | assert(0 != matchDomainName("foo.com", "x.foo.com")); | |
200 | assert(0 != matchDomainName("bar.com", "foo.com")); | |
201 | assert(0 != matchDomainName(".bar.com", "foo.com")); | |
202 | assert(0 != matchDomainName(".bar.com", ".foo.com")); | |
203 | assert(0 != matchDomainName("bar.com", ".foo.com")); | |
204 | assert(0 < matchDomainName("zzz.com", "foo.com")); | |
205 | assert(0 > matchDomainName("aaa.com", "foo.com")); | |
206 | assert(0 == matchDomainName("FOO.com", "foo.COM")); | |
207 | assert(0 < matchDomainName("bfoo.com", "afoo.com")); | |
208 | assert(0 > matchDomainName("afoo.com", "bfoo.com")); | |
209 | assert(0 < matchDomainName("x-foo.com", ".foo.com")); | |
210 | ||
211 | assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
212 | assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
213 | assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
214 | assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
215 | ||
216 | assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards)); | |
217 | assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards)); | |
218 | assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards)); | |
219 | assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards)); | |
220 | ||
221 | assert(0 != matchDomainName("foo.com", "")); | |
222 | assert(0 != matchDomainName("foo.com", "", mdnHonorWildcards)); | |
223 | assert(0 != matchDomainName("foo.com", "", mdnRejectSubsubDomains)); | |
224 | ||
225 | /* more cases? */ | |
226 | } | |
227 | ||
228 | /** | |
229 | * Extract the URI scheme and ':' delimiter from the given input buffer. | |
230 | * | |
231 | * Schemes up to 16 characters are accepted. | |
232 | * | |
233 | * Governed by RFC 3986 section 3.1 | |
234 | */ | |
235 | static AnyP::UriScheme | |
236 | uriParseScheme(Parser::Tokenizer &tok) | |
237 | { | |
238 | /* | |
239 | * RFC 3986 section 3.1 paragraph 2: | |
240 | * | |
241 | * Scheme names consist of a sequence of characters beginning with a | |
242 | * letter and followed by any combination of letters, digits, plus | |
243 | * ("+"), period ("."), or hyphen ("-"). | |
244 | */ | |
245 | static const auto schemeChars = CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT; | |
246 | ||
247 | SBuf str; | |
248 | if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) { | |
249 | const auto protocol = AnyP::UriScheme::FindProtocolType(str); | |
250 | if (protocol == AnyP::PROTO_UNKNOWN) | |
251 | return AnyP::UriScheme(protocol, str.c_str()); | |
252 | return AnyP::UriScheme(protocol, nullptr); | |
253 | } | |
254 | ||
255 | throw TextException("invalid URI scheme", Here()); | |
256 | } | |
257 | ||
258 | /** | |
259 | * Appends configured append_domain to hostname, assuming | |
260 | * the given buffer is at least SQUIDHOSTNAMELEN bytes long, | |
261 | * and that the host FQDN is not a 'dotless' TLD. | |
262 | * | |
263 | * \returns false if and only if there is not enough space to append | |
264 | */ | |
265 | bool | |
266 | urlAppendDomain(char *host) | |
267 | { | |
268 | /* For IPv4 addresses check for a dot */ | |
269 | /* For IPv6 addresses also check for a colon */ | |
270 | if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) { | |
271 | const uint64_t dlen = strlen(host); | |
272 | const uint64_t want = dlen + Config.appendDomainLen; | |
273 | if (want > SQUIDHOSTNAMELEN - 1) { | |
274 | debugs(23, 2, "URL domain too large (" << dlen << " bytes)"); | |
275 | return false; | |
276 | } | |
277 | strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1); | |
278 | } | |
279 | return true; | |
280 | } | |
281 | ||
282 | /* | |
283 | * Parse a URI/URL. | |
284 | * | |
285 | * It is assumed that the URL is complete - | |
286 | * ie, the end of the string is the end of the URL. Don't pass a partial | |
287 | * URL here as this routine doesn't have any way of knowing whether | |
288 | * it is partial or not (ie, it handles the case of no trailing slash as | |
289 | * being "end of host with implied path of /". | |
290 | * | |
291 | * method is used to switch parsers. If method is Http::METHOD_CONNECT, | |
292 | * then rather than a URL a hostname:port is looked for. | |
293 | */ | |
294 | bool | |
295 | AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl) | |
296 | { | |
297 | try { | |
298 | ||
299 | LOCAL_ARRAY(char, login, MAX_URL); | |
300 | LOCAL_ARRAY(char, foundHost, MAX_URL); | |
301 | LOCAL_ARRAY(char, urlpath, MAX_URL); | |
302 | char *t = nullptr; | |
303 | char *q = nullptr; | |
304 | int foundPort; | |
305 | int l; | |
306 | int i; | |
307 | const char *src; | |
308 | char *dst; | |
309 | foundHost[0] = urlpath[0] = login[0] = '\0'; | |
310 | ||
311 | if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) { | |
312 | debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)"); | |
313 | return false; | |
314 | } | |
315 | ||
316 | if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) && | |
317 | Asterisk().cmp(rawUrl) == 0) { | |
318 | // XXX: these methods might also occur in HTTPS traffic. Handle this better. | |
319 | setScheme(AnyP::PROTO_HTTP, nullptr); | |
320 | port(getScheme().defaultPort()); | |
321 | path(Asterisk()); | |
322 | return true; | |
323 | } | |
324 | ||
325 | Parser::Tokenizer tok(rawUrl); | |
326 | AnyP::UriScheme scheme; | |
327 | ||
328 | if (method == Http::METHOD_CONNECT) { | |
329 | // For CONNECTs, RFC 9110 Section 9.3.6 requires "only the host and | |
330 | // port number of the tunnel destination, separated by a colon". | |
331 | ||
332 | const auto rawHost = parseHost(tok); | |
333 | Assure(rawHost.length() < sizeof(foundHost)); | |
334 | SBufToCstring(foundHost, rawHost); | |
335 | ||
336 | if (!tok.skip(':')) | |
337 | throw TextException("missing required :port in CONNECT target", Here()); | |
338 | foundPort = parsePort(tok); | |
339 | ||
340 | if (!tok.remaining().isEmpty()) | |
341 | throw TextException("garbage after host:port in CONNECT target", Here()); | |
342 | } else { | |
343 | ||
344 | scheme = uriParseScheme(tok); | |
345 | ||
346 | if (scheme == AnyP::PROTO_NONE) | |
347 | return false; // invalid scheme | |
348 | ||
349 | if (scheme == AnyP::PROTO_URN) { | |
350 | parseUrn(tok); // throws on any error | |
351 | return true; | |
352 | } | |
353 | ||
354 | // URLs then have "//" | |
355 | static const SBuf doubleSlash("//"); | |
356 | if (!tok.skip(doubleSlash)) | |
357 | return false; | |
358 | ||
359 | auto B = tok.remaining(); | |
360 | const char *url = B.c_str(); | |
361 | ||
362 | /* Parse the URL: */ | |
363 | src = url; | |
364 | i = 0; | |
365 | ||
366 | /* Then everything until first /; that's host (and port; which we'll look for here later) */ | |
367 | // bug 1881: If we don't get a "/" then we imply it was there | |
368 | // bug 3074: We could just be given a "?" or "#". These also imply "/" | |
369 | // bug 3233: whitespace is also a hostname delimiter. | |
370 | for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) { | |
371 | *dst = *src; | |
372 | } | |
373 | ||
374 | /* | |
375 | * We can't check for "i >= l" here because we could be at the end of the line | |
376 | * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've | |
377 | * been -given- a valid URL and the path is just '/'. | |
378 | */ | |
379 | if (i > l) | |
380 | return false; | |
381 | *dst = '\0'; | |
382 | ||
383 | // We are looking at path-abempty. | |
384 | if (*src != '/') { | |
385 | // path-empty, including the end of the `src` c-string cases | |
386 | urlpath[0] = '/'; | |
387 | dst = &urlpath[1]; | |
388 | } else { | |
389 | dst = urlpath; | |
390 | } | |
391 | /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */ | |
392 | for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) { | |
393 | *dst = *src; | |
394 | } | |
395 | ||
396 | /* We -could- be at the end of the buffer here */ | |
397 | if (i > l) | |
398 | return false; | |
399 | *dst = '\0'; | |
400 | ||
401 | // If the parsed scheme has no (known) default port, and there is no | |
402 | // explicit port, then we will reject the zero port during foundPort | |
403 | // validation, often resulting in a misleading 400/ERR_INVALID_URL. | |
404 | // TODO: Remove this hack when switching to Tokenizer-based parsing. | |
405 | foundPort = scheme.defaultPort().value_or(0); // may be reset later | |
406 | ||
407 | /* Is there any login information? (we should eventually parse it above) */ | |
408 | t = strrchr(foundHost, '@'); | |
409 | if (t != nullptr) { | |
410 | strncpy((char *) login, (char *) foundHost, sizeof(login)-1); | |
411 | login[sizeof(login)-1] = '\0'; | |
412 | t = strrchr(login, '@'); | |
413 | *t = 0; | |
414 | strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1); | |
415 | foundHost[sizeof(foundHost)-1] = '\0'; | |
416 | // Bug 4498: URL-unescape the login info after extraction | |
417 | rfc1738_unescape(login); | |
418 | } | |
419 | ||
420 | /* Is there any host information? (we should eventually parse it above) */ | |
421 | if (*foundHost == '[') { | |
422 | /* strip any IPA brackets. valid under IPv6. */ | |
423 | dst = foundHost; | |
424 | /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */ | |
425 | src = foundHost; | |
426 | ++src; | |
427 | l = strlen(foundHost); | |
428 | i = 1; | |
429 | for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) { | |
430 | *dst = *src; | |
431 | } | |
432 | ||
433 | /* we moved in-place, so truncate the actual hostname found */ | |
434 | *dst = '\0'; | |
435 | ++dst; | |
436 | ||
437 | /* skip ahead to either start of port, or original EOS */ | |
438 | while (*dst != '\0' && *dst != ':') | |
439 | ++dst; | |
440 | t = dst; | |
441 | } else { | |
442 | t = strrchr(foundHost, ':'); | |
443 | ||
444 | if (t != strchr(foundHost,':') ) { | |
445 | /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */ | |
446 | /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */ | |
447 | /* therefore we MUST accept the case where they are not bracketed at all. */ | |
448 | t = nullptr; | |
449 | } | |
450 | } | |
451 | ||
452 | // Bug 3183 sanity check: If scheme is present, host must be too. | |
453 | if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') { | |
454 | debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details."); | |
455 | return false; | |
456 | } | |
457 | ||
458 | if (t && *t == ':') { | |
459 | *t = '\0'; | |
460 | ++t; | |
461 | foundPort = atoi(t); | |
462 | } | |
463 | } | |
464 | ||
465 | for (t = foundHost; *t; ++t) | |
466 | *t = xtolower(*t); | |
467 | ||
468 | if (stringHasWhitespace(foundHost)) { | |
469 | if (URI_WHITESPACE_STRIP == Config.uri_whitespace) { | |
470 | t = q = foundHost; | |
471 | while (*t) { | |
472 | if (!xisspace(*t)) { | |
473 | *q = *t; | |
474 | ++q; | |
475 | } | |
476 | ++t; | |
477 | } | |
478 | *q = '\0'; | |
479 | } | |
480 | } | |
481 | ||
482 | debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'"); | |
483 | ||
484 | if (Config.onoff.check_hostnames && | |
485 | strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) { | |
486 | debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'"); | |
487 | return false; | |
488 | } | |
489 | ||
490 | if (!urlAppendDomain(foundHost)) | |
491 | return false; | |
492 | ||
493 | /* remove trailing dots from hostnames */ | |
494 | while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.') | |
495 | foundHost[l] = '\0'; | |
496 | ||
497 | /* reject duplicate or leading dots */ | |
498 | if (strstr(foundHost, "..") || *foundHost == '.') { | |
499 | debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'"); | |
500 | return false; | |
501 | } | |
502 | ||
503 | if (foundPort < 1 || foundPort > 65535) { | |
504 | debugs(23, 3, "Invalid port '" << foundPort << "'"); | |
505 | return false; | |
506 | } | |
507 | ||
508 | if (stringHasWhitespace(urlpath)) { | |
509 | debugs(23, 2, "URI has whitespace: {" << rawUrl << "}"); | |
510 | ||
511 | switch (Config.uri_whitespace) { | |
512 | ||
513 | case URI_WHITESPACE_DENY: | |
514 | return false; | |
515 | ||
516 | case URI_WHITESPACE_ALLOW: | |
517 | break; | |
518 | ||
519 | case URI_WHITESPACE_ENCODE: | |
520 | t = rfc1738_escape_unescaped(urlpath); | |
521 | xstrncpy(urlpath, t, MAX_URL); | |
522 | break; | |
523 | ||
524 | case URI_WHITESPACE_CHOP: | |
525 | *(urlpath + strcspn(urlpath, w_space)) = '\0'; | |
526 | break; | |
527 | ||
528 | case URI_WHITESPACE_STRIP: | |
529 | default: | |
530 | t = q = urlpath; | |
531 | while (*t) { | |
532 | if (!xisspace(*t)) { | |
533 | *q = *t; | |
534 | ++q; | |
535 | } | |
536 | ++t; | |
537 | } | |
538 | *q = '\0'; | |
539 | } | |
540 | } | |
541 | ||
542 | setScheme(scheme); | |
543 | path(urlpath); | |
544 | host(foundHost); | |
545 | userInfo(SBuf(login)); | |
546 | port(foundPort); | |
547 | return true; | |
548 | ||
549 | } catch (...) { | |
550 | debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length())); | |
551 | return false; | |
552 | } | |
553 | } | |
554 | ||
555 | /** | |
556 | * Governed by RFC 8141 section 2: | |
557 | * | |
558 | * assigned-name = "urn" ":" NID ":" NSS | |
559 | * NID = (alphanum) 0*30(ldh) (alphanum) | |
560 | * ldh = alphanum / "-" | |
561 | * NSS = pchar *(pchar / "/") | |
562 | * | |
563 | * RFC 3986 Appendix D.2 defines (as deprecated): | |
564 | * | |
565 | * alphanum = ALPHA / DIGIT | |
566 | * | |
567 | * Notice that NID is exactly 2-32 characters in length. | |
568 | */ | |
569 | void | |
570 | AnyP::Uri::parseUrn(Parser::Tokenizer &tok) | |
571 | { | |
572 | static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT; | |
573 | static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum"); | |
574 | SBuf nid; | |
575 | if (!tok.prefix(nid, nidChars, 32)) | |
576 | throw TextException("NID not found", Here()); | |
577 | ||
578 | if (!tok.skip(':')) | |
579 | throw TextException("NID too long or missing ':' delimiter", Here()); | |
580 | ||
581 | if (nid.length() < 2) | |
582 | throw TextException("NID too short", Here()); | |
583 | ||
584 | if (!alphanum[*nid.begin()]) | |
585 | throw TextException("NID prefix is not alphanumeric", Here()); | |
586 | ||
587 | if (!alphanum[*nid.rbegin()]) | |
588 | throw TextException("NID suffix is not alphanumeric", Here()); | |
589 | ||
590 | setScheme(AnyP::PROTO_URN, nullptr); | |
591 | host(nid.c_str()); | |
592 | // TODO validate path characters | |
593 | path(tok.remaining()); | |
594 | debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length())); | |
595 | } | |
596 | ||
597 | /// Extracts and returns a (suspected but only partially validated) uri-host | |
598 | /// IPv6address, IPv4address, or reg-name component. This function uses (and | |
599 | /// quotes) RFC 3986, Section 3.2.2 syntax rules. | |
600 | SBuf | |
601 | AnyP::Uri::parseHost(Parser::Tokenizer &tok) const | |
602 | { | |
603 | // host = IP-literal / IPv4address / reg-name | |
604 | ||
605 | // XXX: CharacterSets below reject uri-host values containing whitespace | |
606 | // (e.g., "10.0.0. 1"). That is not a bug, but the uri_whitespace directive | |
607 | // can be interpreted as if it applies to uri-host and this code. TODO: Fix | |
608 | // uri_whitespace and the code using it to exclude uri-host (and URI scheme, | |
609 | // port, etc.) from that directive scope. | |
610 | ||
611 | // IP-literal = "[" ( IPv6address / IPvFuture ) "]" | |
612 | if (tok.skip('[')) { | |
613 | // Add "." because IPv6address in RFC 3986 includes ls32, which includes | |
614 | // IPv4address: ls32 = ( h16 ":" h16 ) / IPv4address | |
615 | // This set rejects IPvFuture that needs a "v" character. | |
616 | static const CharacterSet IPv6chars = ( | |
617 | CharacterSet::HEXDIG + CharacterSet("colon", ":") + CharacterSet("period", ".")).rename("IPv6"); | |
618 | SBuf ipv6ish; | |
619 | if (!tok.prefix(ipv6ish, IPv6chars)) | |
620 | throw TextException("malformed or unsupported bracketed IP address in uri-host", Here()); | |
621 | ||
622 | if (!tok.skip(']')) | |
623 | throw TextException("IPv6 address is missing a closing bracket in uri-host", Here()); | |
624 | ||
625 | // This rejects bracketed IPv4address and domain names because they lack ":". | |
626 | if (ipv6ish.find(':') == SBuf::npos) | |
627 | throw TextException("bracketed IPv6 address is missing a colon in uri-host", Here()); | |
628 | ||
629 | // This rejects bracketed non-IP addresses that our caller would have | |
630 | // otherwise mistaken for a domain name (e.g., '[127.0.0:1]'). | |
631 | Ip::Address ipv6check; | |
632 | if (!ipv6check.fromHost(ipv6ish.c_str())) | |
633 | throw TextException("malformed bracketed IPv6 address in uri-host", Here()); | |
634 | ||
635 | return ipv6ish; | |
636 | } | |
637 | ||
638 | // no brackets implies we are looking at IPv4address or reg-name | |
639 | ||
640 | // XXX: This code does not detect/reject some bad host values (e.g. "!#$%&" | |
641 | // and "1.2.3.4.5"). TODO: Add more checks here, after migrating the | |
642 | // non-CONNECT uri-host parsing code to use us. | |
643 | ||
644 | SBuf otherHost; // IPv4address-ish or reg-name-ish; | |
645 | // ":" is not in TCHAR so we will stop before any port specification | |
646 | if (tok.prefix(otherHost, CharacterSet::TCHAR)) | |
647 | return otherHost; | |
648 | ||
649 | throw TextException("malformed IPv4 address or host name in uri-host", Here()); | |
650 | } | |
651 | ||
652 | /// Extracts and returns an RFC 3986 URI authority port value (with additional | |
653 | /// restrictions). The RFC defines port as a possibly empty sequence of decimal | |
654 | /// digits. We reject certain ports (that are syntactically valid from the RFC | |
655 | /// point of view) because we are worried that Squid and other traffic handlers | |
656 | /// may dangerously mishandle unusual (and virtually always bogus) port numbers. | |
657 | /// Rejected ports cannot be successfully used by Squid itself. | |
658 | int | |
659 | AnyP::Uri::parsePort(Parser::Tokenizer &tok) const | |
660 | { | |
661 | if (tok.skip('0')) | |
662 | throw TextException("zero or zero-prefixed port", Here()); | |
663 | ||
664 | int64_t rawPort = 0; | |
665 | if (!tok.int64(rawPort, 10, false)) // port = *DIGIT | |
666 | throw TextException("malformed or missing port", Here()); | |
667 | ||
668 | Assure(rawPort > 0); | |
669 | constexpr KnownPort portMax = 65535; // TODO: Make this a class-scope constant and REuse it. | |
670 | constexpr auto portStorageMax = std::numeric_limits<Port::value_type>::max(); | |
671 | static_assert(!Less(portStorageMax, portMax), "Port type can represent the maximum valid port number"); | |
672 | if (Less(portMax, rawPort)) | |
673 | throw TextException("huge port", Here()); | |
674 | ||
675 | // TODO: Return KnownPort after migrating the non-CONNECT uri-host parsing | |
676 | // code to use us (so that foundPort "int" disappears or starts using Port). | |
677 | return NaturalCast<int>(rawPort); | |
678 | } | |
679 | ||
680 | void | |
681 | AnyP::Uri::touch() | |
682 | { | |
683 | absolute_.clear(); | |
684 | authorityHttp_.clear(); | |
685 | authorityWithPort_.clear(); | |
686 | } | |
687 | ||
688 | SBuf & | |
689 | AnyP::Uri::authority(bool requirePort) const | |
690 | { | |
691 | if (authorityHttp_.isEmpty()) { | |
692 | ||
693 | // both formats contain Host/IP | |
694 | authorityWithPort_.append(host()); | |
695 | authorityHttp_ = authorityWithPort_; | |
696 | ||
697 | if (port().has_value()) { | |
698 | authorityWithPort_.appendf(":%hu", *port()); | |
699 | // authorityHttp_ only has :port for known non-default ports | |
700 | if (port() != getScheme().defaultPort()) | |
701 | authorityHttp_ = authorityWithPort_; | |
702 | } | |
703 | // else XXX: We made authorityWithPort_ that does not have a port. | |
704 | // TODO: Audit callers and refuse to give out broken authorityWithPort_. | |
705 | } | |
706 | ||
707 | return requirePort ? authorityWithPort_ : authorityHttp_; | |
708 | } | |
709 | ||
710 | SBuf & | |
711 | AnyP::Uri::absolute() const | |
712 | { | |
713 | if (absolute_.isEmpty()) { | |
714 | // TODO: most URL will be much shorter, avoid allocating this much | |
715 | absolute_.reserveCapacity(MAX_URL); | |
716 | ||
717 | absolute_.append(getScheme().image()); | |
718 | absolute_.append(":",1); | |
719 | if (getScheme() != AnyP::PROTO_URN) { | |
720 | absolute_.append("//", 2); | |
721 | const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP || | |
722 | getScheme() == AnyP::PROTO_UNKNOWN; | |
723 | ||
724 | if (allowUserInfo && !userInfo().isEmpty()) { | |
725 | static const CharacterSet uiChars = CharacterSet(UserInfoChars()) | |
726 | .remove('%') | |
727 | .rename("userinfo-reserved"); | |
728 | absolute_.append(Encode(userInfo(), uiChars)); | |
729 | absolute_.append("@", 1); | |
730 | } | |
731 | absolute_.append(authority()); | |
732 | } else { | |
733 | absolute_.append(host()); | |
734 | absolute_.append(":", 1); | |
735 | } | |
736 | absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed. | |
737 | } | |
738 | ||
739 | return absolute_; | |
740 | } | |
741 | ||
742 | /* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string. | |
743 | * After copying it on in the first place! Would be less code to merge the two with a flag parameter. | |
744 | * and never copy the query-string part in the first place | |
745 | */ | |
746 | char * | |
747 | urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme) | |
748 | { | |
749 | LOCAL_ARRAY(char, buf, MAX_URL); | |
750 | ||
751 | snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url)); | |
752 | buf[sizeof(buf)-1] = '\0'; | |
753 | ||
754 | // URN, CONNECT method, and non-stripped URIs can go straight out | |
755 | if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) { | |
756 | // strip anything AFTER a question-mark | |
757 | // leaving the '?' in place | |
758 | if (auto t = strchr(buf, '?')) { | |
759 | *(++t) = '\0'; | |
760 | } | |
761 | } | |
762 | ||
763 | if (stringHasCntl(buf)) | |
764 | xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL); | |
765 | ||
766 | return buf; | |
767 | } | |
768 | ||
769 | /** | |
770 | * Yet another alternative to urlCanonical. | |
771 | * This one adds the https:// parts to Http::METHOD_CONNECT URL | |
772 | * for use in error page outputs. | |
773 | * Luckily we can leverage the others instead of duplicating. | |
774 | */ | |
775 | const char * | |
776 | urlCanonicalFakeHttps(const HttpRequest * request) | |
777 | { | |
778 | LOCAL_ARRAY(char, buf, MAX_URL); | |
779 | ||
780 | // method CONNECT and port HTTPS | |
781 | if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) { | |
782 | snprintf(buf, MAX_URL, "https://%s/*", request->url.host()); | |
783 | return buf; | |
784 | } | |
785 | ||
786 | // else do the normal complete canonical thing. | |
787 | return request->canonicalCleanUrl(); | |
788 | } | |
789 | ||
/**
 * Test if a URL is a relative reference.
 *
 * Governed by RFC 3986 section 4.2
 *
 *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-part = "//" authority path-abempty
 *                / path-absolute
 *                / path-noscheme
 *                / path-empty
 *
 * \param url a c-string to test; may be nil
 * \returns false for a nil url and for a url with a colon in its first
 *          path segment; true otherwise
 */
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    /*
     * RFC 3986 path rules:
     *
     *  path          = path-abempty    ; begins with "/" or is empty
     *                / path-absolute   ; begins with "/" but not "//"
     *                / path-noscheme   ; begins with a non-colon segment
     *                / path-rootless   ; begins with a segment
     *                / path-empty      ; zero characters
     */

    // Walk the first segment. Reaching its end without seeing a colon means
    // that there is no scheme. An immediate end covers path-empty; an
    // immediate slash covers network-path and path-absolute references.
    for (const char *cursor = url; ; ++cursor) {
        switch (*cursor) {

        case ':':
            return false; // colon is forbidden in first segment

        case '\0':
        case '/':
        case '?':
        case '#':
            return true; // the first segment ended without a colon

        default:
            break; // an ordinary segment character; keep scanning
        }
    }
}
834 | ||
835 | void | |
836 | AnyP::Uri::addRelativePath(const char *relUrl) | |
837 | { | |
838 | // URN cannot be merged | |
839 | if (getScheme() == AnyP::PROTO_URN) | |
840 | return; | |
841 | ||
842 | // TODO: Handle . and .. segment normalization | |
843 | ||
844 | const auto lastSlashPos = path_.rfind('/'); | |
845 | // TODO: To optimize and simplify, add and use SBuf::replace(). | |
846 | const auto relUrlLength = strlen(relUrl); | |
847 | if (lastSlashPos == SBuf::npos) { | |
848 | // start replacing the whole path | |
849 | path_.reserveCapacity(1 + relUrlLength); | |
850 | path_.assign("/", 1); | |
851 | } else { | |
852 | // start replacing just the last segment | |
853 | path_.reserveCapacity(lastSlashPos + 1 + relUrlLength); | |
854 | path_.chop(0, lastSlashPos+1); | |
855 | } | |
856 | path_.append(relUrl, relUrlLength); | |
857 | } | |
858 | ||
859 | int | |
860 | matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags) | |
861 | { | |
862 | int dl; | |
863 | int hl; | |
864 | ||
865 | const bool hostIncludesSubdomains = (*h == '.'); | |
866 | while ('.' == *h) | |
867 | ++h; | |
868 | ||
869 | hl = strlen(h); | |
870 | ||
871 | if (hl == 0) | |
872 | return -1; | |
873 | ||
874 | dl = strlen(d); | |
875 | if (dl == 0) | |
876 | return 1; | |
877 | ||
878 | /* | |
879 | * Start at the ends of the two strings and work towards the | |
880 | * beginning. | |
881 | */ | |
882 | while (xtolower(h[--hl]) == xtolower(d[--dl])) { | |
883 | if (hl == 0 && dl == 0) { | |
884 | /* | |
885 | * We made it all the way to the beginning of both | |
886 | * strings without finding any difference. | |
887 | */ | |
888 | return 0; | |
889 | } | |
890 | ||
891 | if (0 == hl) { | |
892 | /* | |
893 | * The host string is shorter than the domain string. | |
894 | * There is only one case when this can be a match. | |
895 | * If the domain is just one character longer, and if | |
896 | * that character is a leading '.' then we call it a | |
897 | * match. | |
898 | */ | |
899 | ||
900 | if (1 == dl && '.' == d[0]) | |
901 | return 0; | |
902 | else | |
903 | return -1; | |
904 | } | |
905 | ||
906 | if (0 == dl) { | |
907 | /* | |
908 | * The domain string is shorter than the host string. | |
909 | * This is a match only if the first domain character | |
910 | * is a leading '.'. | |
911 | */ | |
912 | ||
913 | if ('.' == d[0]) { | |
914 | if (flags & mdnRejectSubsubDomains) { | |
915 | // Check for sub-sub domain and reject | |
916 | while(--hl >= 0 && h[hl] != '.'); | |
917 | if (hl < 0) { | |
918 | // No sub-sub domain found, but reject if there is a | |
919 | // leading dot in given host string (which is removed | |
920 | // before the check is started). | |
921 | return hostIncludesSubdomains ? 1 : 0; | |
922 | } else | |
923 | return 1; // sub-sub domain, reject | |
924 | } else | |
925 | return 0; | |
926 | } else | |
927 | return 1; | |
928 | } | |
929 | } | |
930 | ||
931 | /* | |
932 | * We found different characters in the same position (from the end). | |
933 | */ | |
934 | ||
935 | // If the h has a form of "*.foo.com" and d has a form of "x.foo.com" | |
936 | // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x' | |
937 | // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'. | |
938 | if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.') | |
939 | return 0; | |
940 | ||
941 | /* | |
942 | * If one of those character is '.' then its special. In order | |
943 | * for splay tree sorting to work properly, "x-foo.com" must | |
944 | * be greater than ".foo.com" even though '-' is less than '.'. | |
945 | */ | |
946 | if ('.' == d[dl]) | |
947 | return 1; | |
948 | ||
949 | if ('.' == h[hl]) | |
950 | return -1; | |
951 | ||
952 | return (xtolower(h[hl]) - xtolower(d[dl])); | |
953 | } | |
954 | ||
955 | /* | |
956 | * return true if we can serve requests for this method. | |
957 | */ | |
958 | bool | |
959 | urlCheckRequest(const HttpRequest * r) | |
960 | { | |
961 | /* protocol "independent" methods | |
962 | * | |
963 | * actually these methods are specific to HTTP: | |
964 | * they are methods we receive on our HTTP port, | |
965 | * and if we had a FTP listener would not be relevant | |
966 | * there. | |
967 | * | |
968 | * So, we should delegate them to HTTP. The problem is that we | |
969 | * do not have a default protocol from the client side of HTTP. | |
970 | */ | |
971 | ||
972 | if (r->method == Http::METHOD_CONNECT) | |
973 | return true; | |
974 | ||
975 | // we support OPTIONS and TRACE directed at us (with a 501 reply, for now) | |
976 | // we also support forwarding OPTIONS and TRACE, except for the *-URI ones | |
977 | if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE) | |
978 | return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk()); | |
979 | ||
980 | if (r->method == Http::METHOD_PURGE) | |
981 | return true; | |
982 | ||
983 | /* does method match the protocol? */ | |
984 | switch (r->url.getScheme()) { | |
985 | ||
986 | case AnyP::PROTO_URN: | |
987 | case AnyP::PROTO_HTTP: | |
988 | return true; | |
989 | ||
990 | case AnyP::PROTO_FTP: | |
991 | if (r->method == Http::METHOD_PUT || | |
992 | r->method == Http::METHOD_GET || | |
993 | r->method == Http::METHOD_HEAD ) | |
994 | return true; | |
995 | return false; | |
996 | ||
997 | case AnyP::PROTO_WAIS: | |
998 | case AnyP::PROTO_WHOIS: | |
999 | if (r->method == Http::METHOD_GET || | |
1000 | r->method == Http::METHOD_HEAD) | |
1001 | return true; | |
1002 | return false; | |
1003 | ||
1004 | case AnyP::PROTO_HTTPS: | |
1005 | #if USE_OPENSSL || HAVE_LIBGNUTLS | |
1006 | return true; | |
1007 | #else | |
1008 | /* | |
1009 | * Squid can't originate an SSL connection, so it should | |
1010 | * never receive an "https:" URL. It should always be | |
1011 | * CONNECT instead. | |
1012 | */ | |
1013 | return false; | |
1014 | #endif | |
1015 | ||
1016 | default: | |
1017 | return false; | |
1018 | } | |
1019 | ||
1020 | /* notreached */ | |
1021 | return false; | |
1022 | } | |
1023 | ||
1024 | AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) : | |
1025 | scheme_(aScheme), | |
1026 | hostIsNumeric_(false) | |
1027 | { | |
1028 | *host_=0; | |
1029 | } | |
1030 | ||
1031 | // TODO: fix code duplication with AnyP::Uri::parse() | |
1032 | char * | |
1033 | AnyP::Uri::cleanup(const char *uri) | |
1034 | { | |
1035 | char *cleanedUri = nullptr; | |
1036 | switch (Config.uri_whitespace) { | |
1037 | case URI_WHITESPACE_ALLOW: { | |
1038 | const auto flags = RFC1738_ESCAPE_NOSPACE | RFC1738_ESCAPE_UNESCAPED; | |
1039 | cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL); | |
1040 | break; | |
1041 | } | |
1042 | ||
1043 | case URI_WHITESPACE_ENCODE: | |
1044 | cleanedUri = xstrndup(rfc1738_do_escape(uri, RFC1738_ESCAPE_UNESCAPED), MAX_URL); | |
1045 | break; | |
1046 | ||
1047 | case URI_WHITESPACE_CHOP: { | |
1048 | const auto pos = strcspn(uri, w_space); | |
1049 | char *choppedUri = nullptr; | |
1050 | if (pos < strlen(uri)) | |
1051 | choppedUri = xstrndup(uri, pos + 1); | |
1052 | cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri, | |
1053 | RFC1738_ESCAPE_UNESCAPED), MAX_URL); | |
1054 | cleanedUri[pos] = '\0'; | |
1055 | xfree(choppedUri); | |
1056 | break; | |
1057 | } | |
1058 | ||
1059 | case URI_WHITESPACE_DENY: | |
1060 | case URI_WHITESPACE_STRIP: | |
1061 | default: { | |
1062 | // TODO: avoid duplication with urlParse() | |
1063 | const char *t; | |
1064 | char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1)); | |
1065 | char *q = tmp_uri; | |
1066 | t = uri; | |
1067 | while (*t) { | |
1068 | if (!xisspace(*t)) { | |
1069 | *q = *t; | |
1070 | ++q; | |
1071 | } | |
1072 | ++t; | |
1073 | } | |
1074 | *q = '\0'; | |
1075 | cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL); | |
1076 | xfree(tmp_uri); | |
1077 | break; | |
1078 | } | |
1079 | } | |
1080 | ||
1081 | assert(cleanedUri); | |
1082 | return cleanedUri; | |
1083 | } | |
1084 |