]>
Commit | Line | Data |
---|---|---|
30a4f2a8 | 1 | /* |
1f7b830e | 2 | * Copyright (C) 1996-2025 The Squid Software Foundation and contributors |
e25c139f | 3 | * |
bbc27441 AJ |
4 | * Squid software is distributed under GPLv2+ license and includes |
5 | * contributions from numerous individuals and organizations. | |
6 | * Please see the COPYING and CONTRIBUTORS files for details. | |
019dd986 | 7 | */ |
ed43818f | 8 | |
bbc27441 AJ |
9 | /* DEBUG: section 23 URL Parsing */ |
10 | ||
f7f3304a | 11 | #include "squid.h" |
22b2a7a0 | 12 | #include "anyp/Host.h" |
c8ab5ec6 | 13 | #include "anyp/Uri.h" |
675b8408 | 14 | #include "base/Raw.h" |
582c2af2 | 15 | #include "globals.h" |
528b2c61 | 16 | #include "HttpRequest.h" |
6c880a16 | 17 | #include "parser/Tokenizer.h" |
1fa9b1a7 | 18 | #include "rfc1738.h" |
4d5904f7 | 19 | #include "SquidConfig.h" |
963ff143 | 20 | #include "SquidMath.h" |
090089c4 | 21 | |
/// Characters accepted in a hostname when underscores are allowed:
/// ASCII letters, decimal digits, '-', '.', '_' and the "[:]" characters.
/// AnyP::Uri::parse() selects this set when Config.onoff.allow_underscore is set.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "0123456789-._[:]";

/// Characters accepted in a hostname when underscores are not allowed:
/// the same as valid_hostname_chars_u minus '_'.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "0123456789-.[:]";
090089c4 | 34 | |
614bd511 AJ |
35 | /// Characters which are valid within a URI userinfo section |
36 | static const CharacterSet & | |
37 | UserInfoChars() | |
38 | { | |
39 | /* | |
40 | * RFC 3986 section 3.2.1 | |
41 | * | |
42 | * userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) | |
43 | * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" | |
44 | * pct-encoded = "%" HEXDIG HEXDIG | |
45 | * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" | |
46 | */ | |
47 | static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") + | |
48 | CharacterSet::ALPHA + | |
49 | CharacterSet::DIGIT; | |
50 | return userInfoValid; | |
51 | } | |
52 | ||
53 | /** | |
54 | * Governed by RFC 3986 section 2.1 | |
55 | */ | |
56 | SBuf | |
57 | AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore) | |
58 | { | |
59 | if (buf.isEmpty()) | |
60 | return buf; | |
61 | ||
62 | Parser::Tokenizer tk(buf); | |
63 | SBuf goodSection; | |
64 | // optimization for the arguably common "no encoding necessary" case | |
65 | if (tk.prefix(goodSection, ignore) && tk.atEnd()) | |
66 | return buf; | |
67 | ||
68 | SBuf output; | |
69 | output.reserveSpace(buf.length() * 3); // worst case: encode all chars | |
70 | output.append(goodSection); // may be empty | |
71 | ||
72 | while (!tk.atEnd()) { | |
73 | // TODO: Add Tokenizer::parseOne(void). | |
74 | const auto ch = tk.remaining()[0]; | |
65d21317 | 75 | output.appendf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(ch))); // TODO: Optimize using a table |
614bd511 AJ |
76 | (void)tk.skip(ch); |
77 | ||
78 | if (tk.prefix(goodSection, ignore)) | |
79 | output.append(goodSection); | |
80 | } | |
81 | ||
82 | return output; | |
83 | } | |
84 | ||
26256f28 FC |
85 | SBuf |
86 | AnyP::Uri::Decode(const SBuf &buf) | |
87 | { | |
88 | SBuf output; | |
89 | Parser::Tokenizer tok(buf); | |
90 | while (!tok.atEnd()) { | |
91 | SBuf token; | |
92 | static const auto unencodedChars = CharacterSet("percent", "%").complement("unencoded"); | |
93 | if (tok.prefix(token, unencodedChars)) | |
94 | output.append(token); | |
95 | ||
96 | // we are either at '%' or at end of input | |
97 | if (tok.skip('%')) { | |
98 | int64_t hex1 = 0, hex2 = 0; | |
99 | if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1)) | |
100 | output.append(static_cast<char>((hex1 << 4) | hex2)); | |
101 | else | |
102 | throw TextException("invalid pct-encoded triplet", Here()); | |
103 | } | |
104 | } | |
105 | return output; | |
106 | } | |
107 | ||
2e260208 | 108 | const SBuf & |
c8ab5ec6 | 109 | AnyP::Uri::Asterisk() |
2e260208 AJ |
110 | { |
111 | static SBuf star("*"); | |
112 | return star; | |
113 | } | |
114 | ||
51b5dcf5 | 115 | const SBuf & |
c8ab5ec6 | 116 | AnyP::Uri::SlashPath() |
51b5dcf5 AJ |
117 | { |
118 | static SBuf slash("/"); | |
119 | return slash; | |
120 | } | |
121 | ||
5c51bffb | 122 | void |
c8ab5ec6 | 123 | AnyP::Uri::host(const char *src) |
5c51bffb | 124 | { |
1560ae82 | 125 | hostAddr_.fromHost(src); |
5c51bffb AJ |
126 | if (hostAddr_.isAnyAddr()) { |
127 | xstrncpy(host_, src, sizeof(host_)); | |
128 | hostIsNumeric_ = false; | |
129 | } else { | |
130 | hostAddr_.toHostStr(host_, sizeof(host_)); | |
131 | debugs(23, 3, "given IP: " << hostAddr_); | |
132 | hostIsNumeric_ = 1; | |
133 | } | |
134 | touch(); | |
135 | } | |
136 | ||
22b2a7a0 | 137 | // TODO: Replace with ToSBuf(parsedHost()) or similar. |
9ce4a1eb CT |
138 | SBuf |
139 | AnyP::Uri::hostOrIp() const | |
140 | { | |
1560ae82 O |
141 | if (hostIsNumeric()) { |
142 | static char ip[MAX_IPSTRLEN]; | |
143 | const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip)); | |
144 | return SBuf(ip, hostStrLen); | |
145 | } else | |
9ce4a1eb CT |
146 | return SBuf(host()); |
147 | } | |
148 | ||
22b2a7a0 TW |
149 | std::optional<AnyP::Host> |
150 | AnyP::Uri::parsedHost() const | |
151 | { | |
152 | if (hostIsNumeric()) | |
153 | return Host::ParseIp(hostIP()); | |
154 | ||
155 | // XXX: Interpret host subcomponent as reg-name representing a DNS name. It | |
156 | // may actually be, for example, a URN namespace ID (NID; see RFC 8141), but | |
157 | // current Squid APIs do not support adequate representation of those cases. | |
158 | const SBuf regName(host()); | |
159 | ||
160 | if (regName.find('%') != SBuf::npos) { | |
161 | debugs(23, 3, "rejecting percent-encoded reg-name: " << regName); | |
162 | return std::nullopt; // TODO: Decode() instead | |
163 | } | |
164 | ||
165 | return Host::ParseSimpleDomainName(regName); | |
166 | } | |
167 | ||
51b5dcf5 | 168 | const SBuf & |
c8ab5ec6 | 169 | AnyP::Uri::path() const |
51b5dcf5 AJ |
170 | { |
171 | // RFC 3986 section 3.3 says path can be empty (path-abempty). | |
172 | // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/" | |
173 | // at least when sending and using. We must still accept path-abempty as input. | |
174 | if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS)) | |
175 | return SlashPath(); | |
176 | ||
177 | return path_; | |
178 | } | |
179 | ||
b8d8561b | 180 | void |
0673c0ba | 181 | urlInitialize(void) |
090089c4 | 182 | { |
bf8fe701 | 183 | debugs(23, 5, "urlInitialize: Initializing..."); |
985c86bc | 184 | /* this ensures that the number of protocol strings is the same as |
0c3d3f65 | 185 | * the enum slots allocated because the last enum is always 'MAX'. |
985c86bc | 186 | */ |
0c3d3f65 | 187 | assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0); |
9bc73deb | 188 | /* |
189 | * These test that our matchDomainName() function works the | |
190 | * way we expect it to. | |
191 | */ | |
192 | assert(0 == matchDomainName("foo.com", "foo.com")); | |
d20b1cd0 | 193 | assert(0 == matchDomainName(".foo.com", "foo.com")); |
9bc73deb | 194 | assert(0 == matchDomainName("foo.com", ".foo.com")); |
195 | assert(0 == matchDomainName(".foo.com", ".foo.com")); | |
196 | assert(0 == matchDomainName("x.foo.com", ".foo.com")); | |
abbd7825 | 197 | assert(0 == matchDomainName("y.x.foo.com", ".foo.com")); |
9bc73deb | 198 | assert(0 != matchDomainName("x.foo.com", "foo.com")); |
199 | assert(0 != matchDomainName("foo.com", "x.foo.com")); | |
200 | assert(0 != matchDomainName("bar.com", "foo.com")); | |
201 | assert(0 != matchDomainName(".bar.com", "foo.com")); | |
202 | assert(0 != matchDomainName(".bar.com", ".foo.com")); | |
203 | assert(0 != matchDomainName("bar.com", ".foo.com")); | |
204 | assert(0 < matchDomainName("zzz.com", "foo.com")); | |
205 | assert(0 > matchDomainName("aaa.com", "foo.com")); | |
206 | assert(0 == matchDomainName("FOO.com", "foo.COM")); | |
aca95add | 207 | assert(0 < matchDomainName("bfoo.com", "afoo.com")); |
208 | assert(0 > matchDomainName("afoo.com", "bfoo.com")); | |
d20b1cd0 | 209 | assert(0 < matchDomainName("x-foo.com", ".foo.com")); |
abbd7825 CT |
210 | |
211 | assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
212 | assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
213 | assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
214 | assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
215 | ||
216 | assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards)); | |
217 | assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards)); | |
218 | assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards)); | |
219 | assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards)); | |
220 | ||
b70f8649 AW |
221 | assert(0 != matchDomainName("foo.com", "")); |
222 | assert(0 != matchDomainName("foo.com", "", mdnHonorWildcards)); | |
223 | assert(0 != matchDomainName("foo.com", "", mdnRejectSubsubDomains)); | |
224 | ||
9bc73deb | 225 | /* more cases? */ |
090089c4 | 226 | } |
227 | ||
cc192b50 | 228 | /** |
6c880a16 AJ |
229 | * Extract the URI scheme and ':' delimiter from the given input buffer. |
230 | * | |
231 | * Schemes up to 16 characters are accepted. | |
232 | * | |
233 | * Governed by RFC 3986 section 3.1 | |
d4a04ed5 | 234 | */ |
6c880a16 AJ |
235 | static AnyP::UriScheme |
236 | uriParseScheme(Parser::Tokenizer &tok) | |
92a6f4b1 | 237 | { |
6c880a16 AJ |
238 | /* |
239 | * RFC 3986 section 3.1 paragraph 2: | |
240 | * | |
241 | * Scheme names consist of a sequence of characters beginning with a | |
242 | * letter and followed by any combination of letters, digits, plus | |
243 | * ("+"), period ("."), or hyphen ("-"). | |
244 | */ | |
7902bd5b | 245 | static const auto schemeChars = CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT; |
6c880a16 AJ |
246 | |
247 | SBuf str; | |
248 | if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) { | |
249 | const auto protocol = AnyP::UriScheme::FindProtocolType(str); | |
250 | if (protocol == AnyP::PROTO_UNKNOWN) | |
251 | return AnyP::UriScheme(protocol, str.c_str()); | |
252 | return AnyP::UriScheme(protocol, nullptr); | |
253 | } | |
d31d59d8 | 254 | |
6c880a16 | 255 | throw TextException("invalid URI scheme", Here()); |
92a6f4b1 | 256 | } |
257 | ||
38aa10ef AJ |
258 | /** |
259 | * Appends configured append_domain to hostname, assuming | |
260 | * the given buffer is at least SQUIDHOSTNAMELEN bytes long, | |
261 | * and that the host FQDN is not a 'dotless' TLD. | |
262 | * | |
263 | * \returns false if and only if there is not enough space to append | |
264 | */ | |
265 | bool | |
266 | urlAppendDomain(char *host) | |
267 | { | |
268 | /* For IPv4 addresses check for a dot */ | |
269 | /* For IPv6 addresses also check for a colon */ | |
270 | if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) { | |
271 | const uint64_t dlen = strlen(host); | |
272 | const uint64_t want = dlen + Config.appendDomainLen; | |
273 | if (want > SQUIDHOSTNAMELEN - 1) { | |
274 | debugs(23, 2, "URL domain too large (" << dlen << " bytes)"); | |
275 | return false; | |
276 | } | |
277 | strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1); | |
278 | } | |
279 | return true; | |
280 | } | |
281 | ||
d4a04ed5 | 282 | /* |
283 | * Parse a URI/URL. | |
284 | * | |
6c880a16 | 285 | * It is assumed that the URL is complete - |
cc192b50 | 286 | * ie, the end of the string is the end of the URL. Don't pass a partial |
287 | * URL here as this routine doesn't have any way of knowing whether | |
6c880a16 | 288 | * it is partial or not (ie, it handles the case of no trailing slash as |
cc192b50 | 289 | * being "end of host with implied path of /". |
6c880a16 AJ |
290 | * |
291 | * method is used to switch parsers. If method is Http::METHOD_CONNECT, | |
292 | * then rather than a URL a hostname:port is looked for. | |
cc192b50 | 293 | */ |
// Splits rawUrl into scheme, userinfo, host, port and path and stores them in
// this object via setScheme()/path()/host()/userInfo()/port().
// Returns true on success. Returns false when a check below rejects the input
// or when any exception is thrown while parsing (see the catch at the end).
9157915c | 294 | bool |
6c880a16 | 295 | AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl) |
7111c86a | 296 | { |
6c880a16 AJ |
297 | try { |
298 | ||
// Scratch C-string buffers of MAX_URL bytes each, declared via LOCAL_ARRAY.
77b1029d | 299 | LOCAL_ARRAY(char, login, MAX_URL); |
300 | LOCAL_ARRAY(char, foundHost, MAX_URL); | |
301 | LOCAL_ARRAY(char, urlpath, MAX_URL); | |
aee3523a AR |
302 | char *t = nullptr; |
303 | char *q = nullptr; | |
77b1029d | 304 | int foundPort; |
305 | int l; | |
306 | int i; | |
307 | const char *src; | |
308 | char *dst; | |
309 | foundHost[0] = urlpath[0] = login[0] = '\0'; | |
310 | ||
// Reject input that, together with a possible append_domain suffix, cannot fit in MAX_URL.
311 | if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) { | |
312 | debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)"); | |
313 | return false; | |
314 | } | |
6c880a16 | 315 | |
// A lone "*" target is accepted for OPTIONS and TRACE only.
77b1029d | 316 | if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) && |
317 | Asterisk().cmp(rawUrl) == 0) { | |
318 | // XXX: these methods might also occur in HTTPS traffic. Handle this better. | |
319 | setScheme(AnyP::PROTO_HTTP, nullptr); | |
320 | port(getScheme().defaultPort()); | |
321 | path(Asterisk()); | |
322 | return true; | |
323 | } | |
6c880a16 | 324 | |
77b1029d | 325 | Parser::Tokenizer tok(rawUrl); |
326 | AnyP::UriScheme scheme; | |
cc192b50 | 327 | |
77b1029d | 328 | if (method == Http::METHOD_CONNECT) { |
963ff143 AR |
329 | // For CONNECTs, RFC 9110 Section 9.3.6 requires "only the host and |
330 | // port number of the tunnel destination, separated by a colon". | |
6c880a16 | 331 | |
963ff143 AR |
332 | const auto rawHost = parseHost(tok); |
333 | Assure(rawHost.length() < sizeof(foundHost)); | |
334 | SBufToCstring(foundHost, rawHost); | |
6c880a16 | 335 | |
963ff143 AR |
336 | if (!tok.skip(':')) |
337 | throw TextException("missing required :port in CONNECT target", Here()); | |
338 | foundPort = parsePort(tok); | |
6c880a16 | 339 | |
963ff143 AR |
340 | if (!tok.remaining().isEmpty()) |
341 | throw TextException("garbage after host:port in CONNECT target", Here()); | |
77b1029d | 342 | } else { |
cc192b50 | 343 | |
77b1029d | 344 | scheme = uriParseScheme(tok); |
6c880a16 | 345 | |
77b1029d | 346 | if (scheme == AnyP::PROTO_NONE) |
347 | return false; // invalid scheme | |
6c880a16 | 348 | |
77b1029d | 349 | if (scheme == AnyP::PROTO_URN) { |
350 | parseUrn(tok); // throws on any error | |
351 | return true; | |
352 | } | |
62e76326 | 353 | |
77b1029d | 354 | // URLs then have "//" |
355 | static const SBuf doubleSlash("//"); | |
356 | if (!tok.skip(doubleSlash)) | |
357 | return false; | |
cc192b50 | 358 | |
// From here on the remainder is scanned as a C string; src walks it while i counts copied characters.
77b1029d | 359 | auto B = tok.remaining(); |
360 | const char *url = B.c_str(); | |
cc192b50 | 361 | |
77b1029d | 362 | /* Parse the URL: */ |
363 | src = url; | |
364 | i = 0; | |
62e76326 | 365 | |
2f8abb64 | 366 | /* Then everything until first /; that's host (and port; which we'll look for here later) */ |
77b1029d | 367 | // bug 1881: If we don't get a "/" then we imply it was there |
368 | // bug 3074: We could just be given a "?" or "#". These also imply "/" | |
369 | // bug 3233: whitespace is also a hostname delimiter. | |
370 | for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) { | |
371 | *dst = *src; | |
372 | } | |
373 | ||
374 | /* | |
375 | * We can't check for "i >= l" here because we could be at the end of the line | |
376 | * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've | |
377 | * been -given- a valid URL and the path is just '/'. | |
378 | */ | |
379 | if (i > l) | |
380 | return false; | |
381 | *dst = '\0'; | |
62e76326 | 382 | |
dfd81859 AJ |
383 | // We are looking at path-abempty. |
384 | if (*src != '/') { | |
385 | // path-empty, including the end of the `src` c-string cases | |
77b1029d | 386 | urlpath[0] = '/'; |
387 | dst = &urlpath[1]; | |
388 | } else { | |
389 | dst = urlpath; | |
390 | } | |
2f8abb64 | 391 | /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */ |
77b1029d | 392 | for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) { |
cc192b50 | 393 | *dst = *src; |
394 | } | |
395 | ||
77b1029d | 396 | /* We -could- be at the end of the buffer here */ |
397 | if (i > l) | |
398 | return false; | |
5db6bf73 | 399 | *dst = '\0'; |
cc192b50 | 400 | |
380b09ae AR |
401 | // If the parsed scheme has no (known) default port, and there is no |
402 | // explicit port, then we will reject the zero port during foundPort | |
403 | // validation, often resulting in a misleading 400/ERR_INVALID_URL. | |
404 | // TODO: Remove this hack when switching to Tokenizer-based parsing. | |
405 | foundPort = scheme.defaultPort().value_or(0); // may be reset later | |
77b1029d | 406 | |
407 | /* Is there any login information? (we should eventually parse it above) */ | |
// The last '@' splits foundHost: text before it becomes login, text after it stays as the host.
408 | t = strrchr(foundHost, '@'); | |
aee3523a | 409 | if (t != nullptr) { |
77b1029d | 410 | strncpy((char *) login, (char *) foundHost, sizeof(login)-1); |
411 | login[sizeof(login)-1] = '\0'; | |
412 | t = strrchr(login, '@'); | |
413 | *t = 0; | |
414 | strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1); | |
415 | foundHost[sizeof(foundHost)-1] = '\0'; | |
416 | // Bug 4498: URL-unescape the login info after extraction | |
417 | rfc1738_unescape(login); | |
418 | } | |
419 | ||
420 | /* Is there any host information? (we should eventually parse it above) */ | |
421 | if (*foundHost == '[') { | |
422 | /* strip any IPA brackets. valid under IPv6. */ | |
423 | dst = foundHost; | |
424 | /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */ | |
425 | src = foundHost; | |
426 | ++src; | |
427 | l = strlen(foundHost); | |
428 | i = 1; | |
429 | for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) { | |
430 | *dst = *src; | |
431 | } | |
432 | ||
433 | /* we moved in-place, so truncate the actual hostname found */ | |
434 | *dst = '\0'; | |
5db6bf73 | 435 | ++dst; |
cc192b50 | 436 | |
77b1029d | 437 | /* skip ahead to either start of port, or original EOS */ |
438 | while (*dst != '\0' && *dst != ':') | |
439 | ++dst; | |
440 | t = dst; | |
441 | } else { | |
// Unbracketed host: a port delimiter is recognized only when the host contains exactly one ':'.
442 | t = strrchr(foundHost, ':'); | |
443 | ||
444 | if (t != strchr(foundHost,':') ) { | |
445 | /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */ | |
446 | /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */ | |
447 | /* therefore we MUST accept the case where they are not bracketed at all. */ | |
aee3523a | 448 | t = nullptr; |
77b1029d | 449 | } |
cc192b50 | 450 | } |
62e76326 | 451 | |
77b1029d | 452 | // Bug 3183 sanity check: If scheme is present, host must be too. |
453 | if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') { | |
454 | debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details."); | |
455 | return false; | |
456 | } | |
b5acc277 | 457 | |
// Cut the host at the ':' and read the explicit port with atoi(); the range check happens further below.
77b1029d | 458 | if (t && *t == ':') { |
459 | *t = '\0'; | |
460 | ++t; | |
461 | foundPort = atoi(t); | |
462 | } | |
62e76326 | 463 | } |
62e76326 | 464 | |
// Lowercase the host in place.
77b1029d | 465 | for (t = foundHost; *t; ++t) |
466 | *t = xtolower(*t); | |
467 | ||
// Whitespace inside the host is removed only under the URI_WHITESPACE_STRIP policy.
468 | if (stringHasWhitespace(foundHost)) { | |
469 | if (URI_WHITESPACE_STRIP == Config.uri_whitespace) { | |
470 | t = q = foundHost; | |
471 | while (*t) { | |
472 | if (!xisspace(*t)) { | |
473 | *q = *t; | |
474 | ++q; | |
475 | } | |
476 | ++t; | |
5db6bf73 | 477 | } |
77b1029d | 478 | *q = '\0'; |
62e76326 | 479 | } |
62e76326 | 480 | } |
62e76326 | 481 | |
77b1029d | 482 | debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'"); |
cc192b50 | 483 | |
// Optional character whitelist check; the underscore-permitting set is used when allow_underscore is on.
77b1029d | 484 | if (Config.onoff.check_hostnames && |
485 | strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) { | |
486 | debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'"); | |
487 | return false; | |
488 | } | |
62e76326 | 489 | |
77b1029d | 490 | if (!urlAppendDomain(foundHost)) |
491 | return false; | |
cc192b50 | 492 | |
77b1029d | 493 | /* remove trailing dots from hostnames */ |
494 | while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.') | |
495 | foundHost[l] = '\0'; | |
62e76326 | 496 | |
77b1029d | 497 | /* reject duplicate or leading dots */ |
498 | if (strstr(foundHost, "..") || *foundHost == '.') { | |
499 | debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'"); | |
500 | return false; | |
501 | } | |
62e76326 | 502 | |
// Only ports 1..65535 are accepted.
77b1029d | 503 | if (foundPort < 1 || foundPort > 65535) { |
504 | debugs(23, 3, "Invalid port '" << foundPort << "'"); | |
505 | return false; | |
506 | } | |
62e76326 | 507 | |
// Whitespace in the path is handled according to Config.uri_whitespace.
77b1029d | 508 | if (stringHasWhitespace(urlpath)) { |
509 | debugs(23, 2, "URI has whitespace: {" << rawUrl << "}"); | |
62e76326 | 510 | |
77b1029d | 511 | switch (Config.uri_whitespace) { |
62e76326 | 512 | |
77b1029d | 513 | case URI_WHITESPACE_DENY: |
514 | return false; | |
62e76326 | 515 | |
77b1029d | 516 | case URI_WHITESPACE_ALLOW: |
517 | break; | |
518 | ||
519 | case URI_WHITESPACE_ENCODE: | |
520 | t = rfc1738_escape_unescaped(urlpath); | |
521 | xstrncpy(urlpath, t, MAX_URL); | |
522 | break; | |
523 | ||
524 | case URI_WHITESPACE_CHOP: | |
525 | *(urlpath + strcspn(urlpath, w_space)) = '\0'; | |
526 | break; | |
527 | ||
528 | case URI_WHITESPACE_STRIP: | |
529 | default: | |
530 | t = q = urlpath; | |
531 | while (*t) { | |
532 | if (!xisspace(*t)) { | |
533 | *q = *t; | |
534 | ++q; | |
535 | } | |
536 | ++t; | |
5db6bf73 | 537 | } |
77b1029d | 538 | *q = '\0'; |
62e76326 | 539 | } |
62e76326 | 540 | } |
62e76326 | 541 | |
// All checks passed: store the parsed components in this object.
77b1029d | 542 | setScheme(scheme); |
543 | path(urlpath); | |
544 | host(foundHost); | |
545 | userInfo(SBuf(login)); | |
546 | port(foundPort); | |
547 | return true; | |
6c880a16 AJ |
548 | |
549 | } catch (...) { | |
550 | debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length())); | |
551 | return false; | |
552 | } | |
7111c86a | 553 | } |
554 | ||
6c880a16 AJ |
555 | /** |
556 | * Governed by RFC 8141 section 2: | |
557 | * | |
558 | * assigned-name = "urn" ":" NID ":" NSS | |
559 | * NID = (alphanum) 0*30(ldh) (alphanum) | |
560 | * ldh = alphanum / "-" | |
561 | * NSS = pchar *(pchar / "/") | |
562 | * | |
563 | * RFC 3986 Appendix D.2 defines (as deprecated): | |
564 | * | |
565 | * alphanum = ALPHA / DIGIT | |
566 | * | |
567 | * Notice that NID is exactly 2-32 characters in length. | |
568 | */ | |
db59367a | 569 | void |
6c880a16 | 570 | AnyP::Uri::parseUrn(Parser::Tokenizer &tok) |
23d92c64 | 571 | { |
6c880a16 AJ |
572 | static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT; |
573 | static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum"); | |
574 | SBuf nid; | |
575 | if (!tok.prefix(nid, nidChars, 32)) | |
576 | throw TextException("NID not found", Here()); | |
577 | ||
578 | if (!tok.skip(':')) | |
579 | throw TextException("NID too long or missing ':' delimiter", Here()); | |
580 | ||
581 | if (nid.length() < 2) | |
582 | throw TextException("NID too short", Here()); | |
583 | ||
584 | if (!alphanum[*nid.begin()]) | |
585 | throw TextException("NID prefix is not alphanumeric", Here()); | |
586 | ||
587 | if (!alphanum[*nid.rbegin()]) | |
588 | throw TextException("NID suffix is not alphanumeric", Here()); | |
589 | ||
590 | setScheme(AnyP::PROTO_URN, nullptr); | |
591 | host(nid.c_str()); | |
592 | // TODO validate path characters | |
593 | path(tok.remaining()); | |
594 | debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length())); | |
23d92c64 | 595 | } |
596 | ||
963ff143 AR |
597 | /// Extracts and returns a (suspected but only partially validated) uri-host |
598 | /// IPv6address, IPv4address, or reg-name component. This function uses (and | |
599 | /// quotes) RFC 3986, Section 3.2.2 syntax rules. | |
600 | SBuf | |
601 | AnyP::Uri::parseHost(Parser::Tokenizer &tok) const | |
602 | { | |
603 | // host = IP-literal / IPv4address / reg-name | |
604 | ||
605 | // XXX: CharacterSets below reject uri-host values containing whitespace | |
606 | // (e.g., "10.0.0. 1"). That is not a bug, but the uri_whitespace directive | |
607 | // can be interpreted as if it applies to uri-host and this code. TODO: Fix | |
608 | // uri_whitespace and the code using it to exclude uri-host (and URI scheme, | |
609 | // port, etc.) from that directive scope. | |
610 | ||
611 | // IP-literal = "[" ( IPv6address / IPvFuture ) "]" | |
612 | if (tok.skip('[')) { | |
613 | // Add "." because IPv6address in RFC 3986 includes ls32, which includes | |
614 | // IPv4address: ls32 = ( h16 ":" h16 ) / IPv4address | |
615 | // This set rejects IPvFuture that needs a "v" character. | |
616 | static const CharacterSet IPv6chars = ( | |
617 | CharacterSet::HEXDIG + CharacterSet("colon", ":") + CharacterSet("period", ".")).rename("IPv6"); | |
618 | SBuf ipv6ish; | |
619 | if (!tok.prefix(ipv6ish, IPv6chars)) | |
620 | throw TextException("malformed or unsupported bracketed IP address in uri-host", Here()); | |
621 | ||
622 | if (!tok.skip(']')) | |
623 | throw TextException("IPv6 address is missing a closing bracket in uri-host", Here()); | |
624 | ||
625 | // This rejects bracketed IPv4address and domain names because they lack ":". | |
626 | if (ipv6ish.find(':') == SBuf::npos) | |
627 | throw TextException("bracketed IPv6 address is missing a colon in uri-host", Here()); | |
628 | ||
629 | // This rejects bracketed non-IP addresses that our caller would have | |
630 | // otherwise mistaken for a domain name (e.g., '[127.0.0:1]'). | |
631 | Ip::Address ipv6check; | |
632 | if (!ipv6check.fromHost(ipv6ish.c_str())) | |
633 | throw TextException("malformed bracketed IPv6 address in uri-host", Here()); | |
634 | ||
635 | return ipv6ish; | |
636 | } | |
637 | ||
638 | // no brackets implies we are looking at IPv4address or reg-name | |
639 | ||
640 | // XXX: This code does not detect/reject some bad host values (e.g. "!#$%&" | |
641 | // and "1.2.3.4.5"). TODO: Add more checks here, after migrating the | |
642 | // non-CONNECT uri-host parsing code to use us. | |
643 | ||
644 | SBuf otherHost; // IPv4address-ish or reg-name-ish; | |
645 | // ":" is not in TCHAR so we will stop before any port specification | |
646 | if (tok.prefix(otherHost, CharacterSet::TCHAR)) | |
647 | return otherHost; | |
648 | ||
649 | throw TextException("malformed IPv4 address or host name in uri-host", Here()); | |
650 | } | |
651 | ||
652 | /// Extracts and returns an RFC 3986 URI authority port value (with additional | |
653 | /// restrictions). The RFC defines port as a possibly empty sequence of decimal | |
654 | /// digits. We reject certain ports (that are syntactically valid from the RFC | |
655 | /// point of view) because we are worried that Squid and other traffic handlers | |
656 | /// may dangerously mishandle unusual (and virtually always bogus) port numbers. | |
657 | /// Rejected ports cannot be successfully used by Squid itself. | |
658 | int | |
659 | AnyP::Uri::parsePort(Parser::Tokenizer &tok) const | |
660 | { | |
661 | if (tok.skip('0')) | |
662 | throw TextException("zero or zero-prefixed port", Here()); | |
663 | ||
664 | int64_t rawPort = 0; | |
665 | if (!tok.int64(rawPort, 10, false)) // port = *DIGIT | |
666 | throw TextException("malformed or missing port", Here()); | |
667 | ||
668 | Assure(rawPort > 0); | |
669 | constexpr KnownPort portMax = 65535; // TODO: Make this a class-scope constant and REuse it. | |
670 | constexpr auto portStorageMax = std::numeric_limits<Port::value_type>::max(); | |
671 | static_assert(!Less(portStorageMax, portMax), "Port type can represent the maximum valid port number"); | |
672 | if (Less(portMax, rawPort)) | |
673 | throw TextException("huge port", Here()); | |
674 | ||
675 | // TODO: Return KnownPort after migrating the non-CONNECT uri-host parsing | |
676 | // code to use us (so that foundPort "int" disappears or starts using Port). | |
677 | return NaturalCast<int>(rawPort); | |
678 | } | |
679 | ||
5c51bffb | 680 | void |
c8ab5ec6 | 681 | AnyP::Uri::touch() |
5c51bffb | 682 | { |
c823e2da | 683 | absolute_.clear(); |
5c51bffb AJ |
684 | authorityHttp_.clear(); |
685 | authorityWithPort_.clear(); | |
686 | } | |
687 | ||
688 | SBuf & | |
c8ab5ec6 | 689 | AnyP::Uri::authority(bool requirePort) const |
5c51bffb AJ |
690 | { |
691 | if (authorityHttp_.isEmpty()) { | |
692 | ||
693 | // both formats contain Host/IP | |
694 | authorityWithPort_.append(host()); | |
695 | authorityHttp_ = authorityWithPort_; | |
696 | ||
380b09ae AR |
697 | if (port().has_value()) { |
698 | authorityWithPort_.appendf(":%hu", *port()); | |
699 | // authorityHttp_ only has :port for known non-default ports | |
700 | if (port() != getScheme().defaultPort()) | |
701 | authorityHttp_ = authorityWithPort_; | |
702 | } | |
703 | // else XXX: We made authorityWithPort_ that does not have a port. | |
704 | // TODO: Audit callers and refuse to give out broken authorityWithPort_. | |
5c51bffb AJ |
705 | } |
706 | ||
707 | return requirePort ? authorityWithPort_ : authorityHttp_; | |
708 | } | |
709 | ||
c823e2da | 710 | SBuf & |
c8ab5ec6 | 711 | AnyP::Uri::absolute() const |
c823e2da AJ |
712 | { |
713 | if (absolute_.isEmpty()) { | |
714 | // TODO: most URL will be much shorter, avoid allocating this much | |
715 | absolute_.reserveCapacity(MAX_URL); | |
716 | ||
d31d59d8 AJ |
717 | absolute_.append(getScheme().image()); |
718 | absolute_.append(":",1); | |
c823e2da AJ |
719 | if (getScheme() != AnyP::PROTO_URN) { |
720 | absolute_.append("//", 2); | |
0d0f5161 AJ |
721 | const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP || |
722 | getScheme() == AnyP::PROTO_UNKNOWN; | |
723 | ||
724 | if (allowUserInfo && !userInfo().isEmpty()) { | |
614bd511 AJ |
725 | static const CharacterSet uiChars = CharacterSet(UserInfoChars()) |
726 | .remove('%') | |
727 | .rename("userinfo-reserved"); | |
728 | absolute_.append(Encode(userInfo(), uiChars)); | |
c823e2da AJ |
729 | absolute_.append("@", 1); |
730 | } | |
731 | absolute_.append(authority()); | |
6c880a16 AJ |
732 | } else { |
733 | absolute_.append(host()); | |
734 | absolute_.append(":", 1); | |
c823e2da | 735 | } |
614bd511 | 736 | absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed. |
c823e2da AJ |
737 | } |
738 | ||
739 | return absolute_; | |
740 | } | |
741 | ||
9837567d | 742 | /* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string. |
914b89a2 | 743 | * After copying it on in the first place! Would be less code to merge the two with a flag parameter. |
744 | * and never copy the query-string part in the first place | |
745 | */ | |
88738790 | 746 | char * |
bec110e4 | 747 | urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme) |
88738790 | 748 | { |
749 | LOCAL_ARRAY(char, buf, MAX_URL); | |
62e76326 | 750 | |
bec110e4 | 751 | snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url)); |
c823e2da | 752 | buf[sizeof(buf)-1] = '\0'; |
62e76326 | 753 | |
c823e2da | 754 | // URN, CONNECT method, and non-stripped URIs can go straight out |
bec110e4 | 755 | if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) { |
c823e2da AJ |
756 | // strip anything AFTER a question-mark |
757 | // leaving the '?' in place | |
758 | if (auto t = strchr(buf, '?')) { | |
759 | *(++t) = '\0'; | |
e2849af8 | 760 | } |
d548ee64 | 761 | } |
62e76326 | 762 | |
9bc73deb | 763 | if (stringHasCntl(buf)) |
62e76326 | 764 | xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL); |
765 | ||
88738790 | 766 | return buf; |
767 | } | |
768 | ||
b3802bdc AJ |
769 | /** |
770 | * Yet another alternative to urlCanonical. | |
c2a7cefd | 771 | * This one adds the https:// parts to Http::METHOD_CONNECT URL |
b3802bdc AJ |
772 | * for use in error page outputs. |
773 | * Luckily we can leverage the others instead of duplicating. | |
774 | */ | |
775 | const char * | |
776 | urlCanonicalFakeHttps(const HttpRequest * request) | |
777 | { | |
778 | LOCAL_ARRAY(char, buf, MAX_URL); | |
779 | ||
780 | // method CONNECT and port HTTPS | |
5c51bffb AJ |
781 | if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) { |
782 | snprintf(buf, MAX_URL, "https://%s/*", request->url.host()); | |
b3802bdc AJ |
783 | return buf; |
784 | } | |
785 | ||
786 | // else do the normal complete canonical thing. | |
bec110e4 | 787 | return request->canonicalCleanUrl(); |
b3802bdc AJ |
788 | } |
789 | ||
/**
 * Test if a URL is a relative reference.
 *
 * Governed by RFC 3986 section 4.2
 *
 *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-part = "//" authority path-abempty
 *                / path-absolute
 *                / path-noscheme
 *                / path-empty
 *
 * The path alternatives are defined in RFC 3986 section 3.3:
 *
 *  path          = path-abempty    ; begins with "/" or is empty
 *                / path-absolute   ; begins with "/" but not "//"
 *                / path-noscheme   ; begins with a non-colon segment
 *                / path-rootless   ; begins with a segment
 *                / path-empty      ; zero characters
 *
 * \param url  NUL-terminated URL text; may be nil
 * \retval false  url is nil, or a ':' appears before the first '/', '?', or '#'
 * \retval true   otherwise (including the empty string)
 */
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    // Inspect the first segment only: it ends at the first '/', '?', '#', or
    // at the end of the string. An empty first segment covers path-empty,
    // the network-path reference (a.k.a. 'scheme-relative URI'), and
    // path-absolute (a.k.a. 'absolute-path reference').
    for (const auto *cursor = url; ; ++cursor) {
        switch (*cursor) {
        case ':':
            return false; // colon is forbidden in first segment

        case '\0':
        case '/':
        case '?':
        case '#':
            return true; // first segment ended without a colon

        default:
            break; // ordinary segment character; keep scanning
        }
    }
}
26ac0430 | 834 | |
614bd511 AJ |
835 | void |
836 | AnyP::Uri::addRelativePath(const char *relUrl) | |
837 | { | |
838 | // URN cannot be merged | |
839 | if (getScheme() == AnyP::PROTO_URN) | |
840 | return; | |
841 | ||
842 | // TODO: Handle . and .. segment normalization | |
843 | ||
844 | const auto lastSlashPos = path_.rfind('/'); | |
845 | // TODO: To optimize and simplify, add and use SBuf::replace(). | |
846 | const auto relUrlLength = strlen(relUrl); | |
847 | if (lastSlashPos == SBuf::npos) { | |
848 | // start replacing the whole path | |
849 | path_.reserveCapacity(1 + relUrlLength); | |
850 | path_.assign("/", 1); | |
6e44cca8 | 851 | } else { |
614bd511 AJ |
852 | // start replacing just the last segment |
853 | path_.reserveCapacity(lastSlashPos + 1 + relUrlLength); | |
854 | path_.chop(0, lastSlashPos+1); | |
6e44cca8 | 855 | } |
614bd511 | 856 | path_.append(relUrl, relUrlLength); |
3cbbd242 | 857 | } |
858 | ||
b8d8561b | 859 | int |
6c1219b9 | 860 | matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags) |
30a4f2a8 | 861 | { |
9bc73deb | 862 | int dl; |
863 | int hl; | |
62e76326 | 864 | |
abbd7825 | 865 | const bool hostIncludesSubdomains = (*h == '.'); |
d20b1cd0 | 866 | while ('.' == *h) |
5db6bf73 | 867 | ++h; |
62e76326 | 868 | |
9bc73deb | 869 | hl = strlen(h); |
62e76326 | 870 | |
abbd7825 CT |
871 | if (hl == 0) |
872 | return -1; | |
873 | ||
9bc73deb | 874 | dl = strlen(d); |
b70f8649 AW |
875 | if (dl == 0) |
876 | return 1; | |
62e76326 | 877 | |
9bc73deb | 878 | /* |
879 | * Start at the ends of the two strings and work towards the | |
880 | * beginning. | |
881 | */ | |
882 | while (xtolower(h[--hl]) == xtolower(d[--dl])) { | |
62e76326 | 883 | if (hl == 0 && dl == 0) { |
884 | /* | |
885 | * We made it all the way to the beginning of both | |
886 | * strings without finding any difference. | |
887 | */ | |
888 | return 0; | |
889 | } | |
890 | ||
891 | if (0 == hl) { | |
892 | /* | |
893 | * The host string is shorter than the domain string. | |
894 | * There is only one case when this can be a match. | |
895 | * If the domain is just one character longer, and if | |
896 | * that character is a leading '.' then we call it a | |
897 | * match. | |
898 | */ | |
899 | ||
900 | if (1 == dl && '.' == d[0]) | |
901 | return 0; | |
902 | else | |
903 | return -1; | |
904 | } | |
905 | ||
906 | if (0 == dl) { | |
907 | /* | |
908 | * The domain string is shorter than the host string. | |
909 | * This is a match only if the first domain character | |
910 | * is a leading '.'. | |
911 | */ | |
912 | ||
abbd7825 CT |
913 | if ('.' == d[0]) { |
914 | if (flags & mdnRejectSubsubDomains) { | |
915 | // Check for sub-sub domain and reject | |
916 | while(--hl >= 0 && h[hl] != '.'); | |
917 | if (hl < 0) { | |
918 | // No sub-sub domain found, but reject if there is a | |
919 | // leading dot in given host string (which is removed | |
920 | // before the check is started). | |
921 | return hostIncludesSubdomains ? 1 : 0; | |
922 | } else | |
923 | return 1; // sub-sub domain, reject | |
924 | } else | |
925 | return 0; | |
926 | } else | |
62e76326 | 927 | return 1; |
928 | } | |
9bc73deb | 929 | } |
62e76326 | 930 | |
9bc73deb | 931 | /* |
932 | * We found different characters in the same position (from the end). | |
933 | */ | |
69f69080 CT |
934 | |
935 | // If the h has a form of "*.foo.com" and d has a form of "x.foo.com" | |
936 | // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x' | |
937 | // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'. | |
abbd7825 | 938 | if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.') |
69f69080 CT |
939 | return 0; |
940 | ||
d20b1cd0 | 941 | /* |
942 | * If one of those character is '.' then its special. In order | |
943 | * for splay tree sorting to work properly, "x-foo.com" must | |
944 | * be greater than ".foo.com" even though '-' is less than '.'. | |
945 | */ | |
946 | if ('.' == d[dl]) | |
62e76326 | 947 | return 1; |
948 | ||
d20b1cd0 | 949 | if ('.' == h[hl]) |
62e76326 | 950 | return -1; |
951 | ||
9bc73deb | 952 | return (xtolower(h[hl]) - xtolower(d[dl])); |
30a4f2a8 | 953 | } |
a8f7d3ee | 954 | |
985c86bc | 955 | /* |
610ee341 | 956 | * return true if we can serve requests for this method. |
985c86bc | 957 | */ |
8b082ed9 | 958 | bool |
190154cf | 959 | urlCheckRequest(const HttpRequest * r) |
a8f7d3ee | 960 | { |
610ee341 | 961 | /* protocol "independent" methods |
962 | * | |
963 | * actually these methods are specific to HTTP: | |
2f8abb64 | 964 | * they are methods we receive on our HTTP port, |
610ee341 | 965 | * and if we had a FTP listener would not be relevant |
966 | * there. | |
967 | * | |
968 | * So, we should delegate them to HTTP. The problem is that we | |
969 | * do not have a default protocol from the client side of HTTP. | |
970 | */ | |
62e76326 | 971 | |
c2a7cefd | 972 | if (r->method == Http::METHOD_CONNECT) |
8b082ed9 | 973 | return true; |
62e76326 | 974 | |
77ce6ba9 AR |
975 | // we support OPTIONS and TRACE directed at us (with a 501 reply, for now) |
976 | // we also support forwarding OPTIONS and TRACE, except for the *-URI ones | |
c2a7cefd | 977 | if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE) |
c8ab5ec6 | 978 | return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk()); |
62e76326 | 979 | |
c2a7cefd | 980 | if (r->method == Http::METHOD_PURGE) |
8b082ed9 | 981 | return true; |
62e76326 | 982 | |
99edd1c3 | 983 | /* does method match the protocol? */ |
4e3f4dc7 | 984 | switch (r->url.getScheme()) { |
62e76326 | 985 | |
0c3d3f65 | 986 | case AnyP::PROTO_URN: |
0c3d3f65 | 987 | case AnyP::PROTO_HTTP: |
8b082ed9 | 988 | return true; |
62e76326 | 989 | |
0c3d3f65 | 990 | case AnyP::PROTO_FTP: |
8b082ed9 FC |
991 | if (r->method == Http::METHOD_PUT || |
992 | r->method == Http::METHOD_GET || | |
993 | r->method == Http::METHOD_HEAD ) | |
994 | return true; | |
995 | return false; | |
62e76326 | 996 | |
0c3d3f65 | 997 | case AnyP::PROTO_WAIS: |
0c3d3f65 | 998 | case AnyP::PROTO_WHOIS: |
8b082ed9 FC |
999 | if (r->method == Http::METHOD_GET || |
1000 | r->method == Http::METHOD_HEAD) | |
1001 | return true; | |
1002 | return false; | |
62e76326 | 1003 | |
0c3d3f65 | 1004 | case AnyP::PROTO_HTTPS: |
c813943d | 1005 | #if USE_OPENSSL || HAVE_LIBGNUTLS |
8b082ed9 | 1006 | return true; |
1f7c9178 | 1007 | #else |
62e76326 | 1008 | /* |
8b082ed9 FC |
1009 | * Squid can't originate an SSL connection, so it should |
1010 | * never receive an "https:" URL. It should always be | |
1011 | * CONNECT instead. | |
1012 | */ | |
1013 | return false; | |
1f7c9178 | 1014 | #endif |
62e76326 | 1015 | |
a8f7d3ee | 1016 | default: |
8b082ed9 | 1017 | return false; |
a8f7d3ee | 1018 | } |
62e76326 | 1019 | |
8b082ed9 FC |
1020 | /* notreached */ |
1021 | return false; | |
a8f7d3ee | 1022 | } |
9ce5e3e6 | 1023 | |
c8ab5ec6 | 1024 | AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) : |
d59e4742 | 1025 | scheme_(aScheme), |
380b09ae | 1026 | hostIsNumeric_(false) |
d59e4742 FC |
1027 | { |
1028 | *host_=0; | |
1029 | } | |
1a739503 | 1030 | |
bec110e4 EB |
1031 | // TODO: fix code duplication with AnyP::Uri::parse() |
1032 | char * | |
1033 | AnyP::Uri::cleanup(const char *uri) | |
1034 | { | |
bec110e4 EB |
1035 | char *cleanedUri = nullptr; |
1036 | switch (Config.uri_whitespace) { | |
8b082ed9 FC |
1037 | case URI_WHITESPACE_ALLOW: { |
1038 | const auto flags = RFC1738_ESCAPE_NOSPACE | RFC1738_ESCAPE_UNESCAPED; | |
bec110e4 EB |
1039 | cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL); |
1040 | break; | |
8b082ed9 FC |
1041 | } |
1042 | ||
1043 | case URI_WHITESPACE_ENCODE: | |
1044 | cleanedUri = xstrndup(rfc1738_do_escape(uri, RFC1738_ESCAPE_UNESCAPED), MAX_URL); | |
1045 | break; | |
bec110e4 EB |
1046 | |
1047 | case URI_WHITESPACE_CHOP: { | |
bec110e4 EB |
1048 | const auto pos = strcspn(uri, w_space); |
1049 | char *choppedUri = nullptr; | |
1050 | if (pos < strlen(uri)) | |
1051 | choppedUri = xstrndup(uri, pos + 1); | |
8b082ed9 FC |
1052 | cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri, |
1053 | RFC1738_ESCAPE_UNESCAPED), MAX_URL); | |
bec110e4 EB |
1054 | cleanedUri[pos] = '\0'; |
1055 | xfree(choppedUri); | |
8b082ed9 | 1056 | break; |
bec110e4 | 1057 | } |
bec110e4 EB |
1058 | |
1059 | case URI_WHITESPACE_DENY: | |
1060 | case URI_WHITESPACE_STRIP: | |
1061 | default: { | |
1062 | // TODO: avoid duplication with urlParse() | |
1063 | const char *t; | |
1064 | char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1)); | |
1065 | char *q = tmp_uri; | |
1066 | t = uri; | |
1067 | while (*t) { | |
1068 | if (!xisspace(*t)) { | |
1069 | *q = *t; | |
1070 | ++q; | |
1071 | } | |
1072 | ++t; | |
1073 | } | |
1074 | *q = '\0'; | |
1075 | cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL); | |
1076 | xfree(tmp_uri); | |
8b082ed9 | 1077 | break; |
bec110e4 | 1078 | } |
bec110e4 EB |
1079 | } |
1080 | ||
1081 | assert(cleanedUri); | |
1082 | return cleanedUri; | |
1083 | } | |
279e60ef | 1084 |