]>
Commit | Line | Data |
---|---|---|
30a4f2a8 | 1 | /* |
5b74111a | 2 | * Copyright (C) 1996-2018 The Squid Software Foundation and contributors |
e25c139f | 3 | * |
bbc27441 AJ |
4 | * Squid software is distributed under GPLv2+ license and includes |
5 | * contributions from numerous individuals and organizations. | |
6 | * Please see the COPYING and CONTRIBUTORS files for details. | |
019dd986 | 7 | */ |
ed43818f | 8 | |
bbc27441 AJ |
9 | /* DEBUG: section 23 URL Parsing */ |
10 | ||
f7f3304a | 11 | #include "squid.h" |
c8ab5ec6 | 12 | #include "anyp/Uri.h" |
582c2af2 | 13 | #include "globals.h" |
528b2c61 | 14 | #include "HttpRequest.h" |
1fa9b1a7 | 15 | #include "rfc1738.h" |
4d5904f7 | 16 | #include "SquidConfig.h" |
7a707cb5 | 17 | #include "SquidString.h" |
090089c4 | 18 | |
// Characters accepted in a host name when underscores are permitted
// (selected when Config.onoff.allow_underscore is set). The bracket and
// colon characters are accepted too; presumably for IP-literal syntax.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._"
    "[:]";

// Same set as valid_hostname_chars_u, but without the underscore.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-."
    "[:]";
090089c4 | 31 | |
2e260208 | 32 | const SBuf & |
c8ab5ec6 | 33 | AnyP::Uri::Asterisk() |
2e260208 AJ |
34 | { |
35 | static SBuf star("*"); | |
36 | return star; | |
37 | } | |
38 | ||
51b5dcf5 | 39 | const SBuf & |
c8ab5ec6 | 40 | AnyP::Uri::SlashPath() |
51b5dcf5 AJ |
41 | { |
42 | static SBuf slash("/"); | |
43 | return slash; | |
44 | } | |
45 | ||
5c51bffb | 46 | void |
c8ab5ec6 | 47 | AnyP::Uri::host(const char *src) |
5c51bffb AJ |
48 | { |
49 | hostAddr_.setEmpty(); | |
50 | hostAddr_ = src; | |
51 | if (hostAddr_.isAnyAddr()) { | |
52 | xstrncpy(host_, src, sizeof(host_)); | |
53 | hostIsNumeric_ = false; | |
54 | } else { | |
55 | hostAddr_.toHostStr(host_, sizeof(host_)); | |
56 | debugs(23, 3, "given IP: " << hostAddr_); | |
57 | hostIsNumeric_ = 1; | |
58 | } | |
59 | touch(); | |
60 | } | |
61 | ||
51b5dcf5 | 62 | const SBuf & |
c8ab5ec6 | 63 | AnyP::Uri::path() const |
51b5dcf5 AJ |
64 | { |
65 | // RFC 3986 section 3.3 says path can be empty (path-abempty). | |
66 | // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/" | |
67 | // at least when sending and using. We must still accept path-abempty as input. | |
68 | if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS)) | |
69 | return SlashPath(); | |
70 | ||
71 | return path_; | |
72 | } | |
73 | ||
b8d8561b | 74 | void |
0673c0ba | 75 | urlInitialize(void) |
090089c4 | 76 | { |
bf8fe701 | 77 | debugs(23, 5, "urlInitialize: Initializing..."); |
985c86bc | 78 | /* this ensures that the number of protocol strings is the same as |
0c3d3f65 | 79 | * the enum slots allocated because the last enum is always 'MAX'. |
985c86bc | 80 | */ |
0c3d3f65 | 81 | assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0); |
9bc73deb | 82 | /* |
83 | * These test that our matchDomainName() function works the | |
84 | * way we expect it to. | |
85 | */ | |
86 | assert(0 == matchDomainName("foo.com", "foo.com")); | |
d20b1cd0 | 87 | assert(0 == matchDomainName(".foo.com", "foo.com")); |
9bc73deb | 88 | assert(0 == matchDomainName("foo.com", ".foo.com")); |
89 | assert(0 == matchDomainName(".foo.com", ".foo.com")); | |
90 | assert(0 == matchDomainName("x.foo.com", ".foo.com")); | |
abbd7825 | 91 | assert(0 == matchDomainName("y.x.foo.com", ".foo.com")); |
9bc73deb | 92 | assert(0 != matchDomainName("x.foo.com", "foo.com")); |
93 | assert(0 != matchDomainName("foo.com", "x.foo.com")); | |
94 | assert(0 != matchDomainName("bar.com", "foo.com")); | |
95 | assert(0 != matchDomainName(".bar.com", "foo.com")); | |
96 | assert(0 != matchDomainName(".bar.com", ".foo.com")); | |
97 | assert(0 != matchDomainName("bar.com", ".foo.com")); | |
98 | assert(0 < matchDomainName("zzz.com", "foo.com")); | |
99 | assert(0 > matchDomainName("aaa.com", "foo.com")); | |
100 | assert(0 == matchDomainName("FOO.com", "foo.COM")); | |
aca95add | 101 | assert(0 < matchDomainName("bfoo.com", "afoo.com")); |
102 | assert(0 > matchDomainName("afoo.com", "bfoo.com")); | |
d20b1cd0 | 103 | assert(0 < matchDomainName("x-foo.com", ".foo.com")); |
abbd7825 CT |
104 | |
105 | assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
106 | assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
107 | assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
108 | assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains)); | |
109 | ||
110 | assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards)); | |
111 | assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards)); | |
112 | assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards)); | |
113 | assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards)); | |
114 | ||
9bc73deb | 115 | /* more cases? */ |
090089c4 | 116 | } |
117 | ||
cc192b50 | 118 | /** |
5a7fb80a AJ |
119 | * Parse the scheme name from string b, into protocol type. |
120 | * The string must be 0-terminated. | |
d4a04ed5 | 121 | */ |
0c3d3f65 | 122 | AnyP::ProtocolType |
5a7fb80a | 123 | urlParseProtocol(const char *b) |
92a6f4b1 | 124 | { |
5a7fb80a AJ |
125 | // make e point to the ':' character |
126 | const char *e = b + strcspn(b, ":"); | |
d4a04ed5 | 127 | int len = e - b; |
128 | ||
fcd2d3ef | 129 | /* test common stuff first */ |
62e76326 | 130 | |
d4a04ed5 | 131 | if (strncasecmp(b, "http", len) == 0) |
0c3d3f65 | 132 | return AnyP::PROTO_HTTP; |
62e76326 | 133 | |
d4a04ed5 | 134 | if (strncasecmp(b, "ftp", len) == 0) |
0c3d3f65 | 135 | return AnyP::PROTO_FTP; |
62e76326 | 136 | |
d4a04ed5 | 137 | if (strncasecmp(b, "https", len) == 0) |
0c3d3f65 | 138 | return AnyP::PROTO_HTTPS; |
62e76326 | 139 | |
d4a04ed5 | 140 | if (strncasecmp(b, "file", len) == 0) |
0c3d3f65 | 141 | return AnyP::PROTO_FTP; |
62e76326 | 142 | |
330f829e AJ |
143 | if (strncasecmp(b, "coap", len) == 0) |
144 | return AnyP::PROTO_COAP; | |
145 | ||
146 | if (strncasecmp(b, "coaps", len) == 0) | |
147 | return AnyP::PROTO_COAPS; | |
148 | ||
d4a04ed5 | 149 | if (strncasecmp(b, "gopher", len) == 0) |
0c3d3f65 | 150 | return AnyP::PROTO_GOPHER; |
62e76326 | 151 | |
d4a04ed5 | 152 | if (strncasecmp(b, "wais", len) == 0) |
0c3d3f65 | 153 | return AnyP::PROTO_WAIS; |
62e76326 | 154 | |
d4a04ed5 | 155 | if (strncasecmp(b, "cache_object", len) == 0) |
39a19cb7 | 156 | return AnyP::PROTO_CACHE_OBJECT; |
62e76326 | 157 | |
d4a04ed5 | 158 | if (strncasecmp(b, "urn", len) == 0) |
0c3d3f65 | 159 | return AnyP::PROTO_URN; |
62e76326 | 160 | |
d4a04ed5 | 161 | if (strncasecmp(b, "whois", len) == 0) |
0c3d3f65 | 162 | return AnyP::PROTO_WHOIS; |
62e76326 | 163 | |
d31d59d8 AJ |
164 | if (len > 0) |
165 | return AnyP::PROTO_UNKNOWN; | |
166 | ||
0c3d3f65 | 167 | return AnyP::PROTO_NONE; |
92a6f4b1 | 168 | } |
169 | ||
d4a04ed5 | 170 | /* |
171 | * Parse a URI/URL. | |
172 | * | |
9157915c | 173 | * Stores parsed values in the `request` argument. |
c21ad0f5 | 174 | * |
26ac0430 | 175 | * This abuses HttpRequest as a way of representing the parsed url |
c21ad0f5 | 176 | * and its components. |
177 | * method is used to switch parsers and to init the HttpRequest. | |
c2a7cefd | 178 | * If method is Http::METHOD_CONNECT, then rather than a URL a hostname:port is |
c21ad0f5 | 179 | * looked for. |
180 | * The url is non const so that if its too long we can NULL-terminate it in place. | |
d4a04ed5 | 181 | */ |
cc192b50 | 182 | |
183 | /* | |
184 | * This routine parses a URL. Its assumed that the URL is complete - | |
185 | * ie, the end of the string is the end of the URL. Don't pass a partial | |
186 | * URL here as this routine doesn't have any way of knowing whether | |
187 | * its partial or not (ie, it handles the case of no trailing slash as | |
188 | * being "end of host with implied path of /". | |
189 | */ | |
9157915c | 190 | bool |
c8ab5ec6 | 191 | AnyP::Uri::parse(const HttpRequestMethod& method, const char *url) |
7111c86a | 192 | { |
f2052513 | 193 | LOCAL_ARRAY(char, proto, MAX_URL); |
194 | LOCAL_ARRAY(char, login, MAX_URL); | |
91489e45 | 195 | LOCAL_ARRAY(char, foundHost, MAX_URL); |
f2052513 | 196 | LOCAL_ARRAY(char, urlpath, MAX_URL); |
7111c86a | 197 | char *t = NULL; |
7e3ce7b9 | 198 | char *q = NULL; |
91489e45 | 199 | int foundPort; |
0c3d3f65 | 200 | AnyP::ProtocolType protocol = AnyP::PROTO_NONE; |
774cc2d8 | 201 | int l; |
cc192b50 | 202 | int i; |
203 | const char *src; | |
204 | char *dst; | |
91489e45 | 205 | proto[0] = foundHost[0] = urlpath[0] = login[0] = '\0'; |
7111c86a | 206 | |
ba0fd1b6 | 207 | if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) { |
91489e45 | 208 | debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)"); |
a0924f71 | 209 | return false; |
0a5b9b32 | 210 | } |
c2a7cefd | 211 | if (method == Http::METHOD_CONNECT) { |
91489e45 AJ |
212 | /* |
213 | * RFC 7230 section 5.3.3: authority-form = authority | |
214 | * "excluding any userinfo and its "@" delimiter" | |
215 | * | |
216 | * RFC 3986 section 3.2: authority = [ userinfo "@" ] host [ ":" port ] | |
217 | * | |
218 | * As an HTTP(S) proxy we assume HTTPS (443) if no port provided. | |
219 | */ | |
220 | foundPort = 443; | |
62e76326 | 221 | |
91489e45 AJ |
222 | if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1) |
223 | if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1) | |
a0924f71 | 224 | return false; |
cc192b50 | 225 | |
c2a7cefd | 226 | } else if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) && |
c8ab5ec6 | 227 | AnyP::Uri::Asterisk().cmp(url) == 0) { |
91489e45 | 228 | parseFinish(AnyP::PROTO_HTTP, nullptr, url, foundHost, SBuf(), 80 /* HTTP default port */); |
db59367a AJ |
229 | return true; |
230 | } else if (strncmp(url, "urn:", 4) == 0) { | |
231 | debugs(23, 3, "Split URI '" << url << "' into proto='urn', path='" << (url+4) << "'"); | |
232 | debugs(50, 5, "urn=" << (url+4)); | |
91489e45 AJ |
233 | setScheme(AnyP::PROTO_URN, nullptr); |
234 | path(url + 4); | |
db59367a | 235 | return true; |
7111c86a | 236 | } else { |
cc192b50 | 237 | /* Parse the URL: */ |
238 | src = url; | |
239 | i = 0; | |
240 | /* Find first : - everything before is protocol */ | |
5db6bf73 | 241 | for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) { |
cc192b50 | 242 | *dst = *src; |
243 | } | |
244 | if (i >= l) | |
a0924f71 | 245 | return false; |
cc192b50 | 246 | *dst = '\0'; |
247 | ||
248 | /* Then its :// */ | |
5e245980 | 249 | if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/') |
a0924f71 | 250 | return false; |
cc192b50 | 251 | i += 3; |
252 | src += 3; | |
62e76326 | 253 | |
cc192b50 | 254 | /* Then everything until first /; thats host (and port; which we'll look for here later) */ |
68338d14 F |
255 | // bug 1881: If we don't get a "/" then we imply it was there |
256 | // bug 3074: We could just be given a "?" or "#". These also imply "/" | |
b2ab59ad | 257 | // bug 3233: whitespace is also a hostname delimiter. |
91489e45 | 258 | for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) { |
cc192b50 | 259 | *dst = *src; |
260 | } | |
261 | ||
26ac0430 | 262 | /* |
cc192b50 | 263 | * We can't check for "i >= l" here because we could be at the end of the line |
264 | * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've | |
265 | * been -given- a valid URL and the path is just '/'. | |
266 | */ | |
267 | if (i > l) | |
a0924f71 | 268 | return false; |
cc192b50 | 269 | *dst = '\0'; |
270 | ||
68338d14 F |
271 | // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/' |
272 | if (*src == '?' || *src == '#' || *src == '\0') { | |
273 | urlpath[0] = '/'; | |
274 | dst = &urlpath[1]; | |
275 | } else { | |
276 | dst = urlpath; | |
277 | } | |
cc192b50 | 278 | /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */ |
5db6bf73 | 279 | for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) { |
cc192b50 | 280 | *dst = *src; |
281 | } | |
62e76326 | 282 | |
cc192b50 | 283 | /* We -could- be at the end of the buffer here */ |
284 | if (i > l) | |
a0924f71 | 285 | return false; |
cc192b50 | 286 | /* If the URL path is empty we set it to be "/" */ |
287 | if (dst == urlpath) { | |
5db6bf73 FC |
288 | *dst = '/'; |
289 | ++dst; | |
cc192b50 | 290 | } |
291 | *dst = '\0'; | |
292 | ||
293 | protocol = urlParseProtocol(proto); | |
91489e45 | 294 | foundPort = AnyP::UriScheme(protocol).defaultPort(); |
62e76326 | 295 | |
cc192b50 | 296 | /* Is there any login information? (we should eventually parse it above) */ |
91489e45 | 297 | t = strrchr(foundHost, '@'); |
810635e3 | 298 | if (t != NULL) { |
91489e45 | 299 | strncpy((char *) login, (char *) foundHost, sizeof(login)-1); |
0a84e4fb | 300 | login[sizeof(login)-1] = '\0'; |
62e76326 | 301 | t = strrchr(login, '@'); |
302 | *t = 0; | |
91489e45 AJ |
303 | strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1); |
304 | foundHost[sizeof(foundHost)-1] = '\0'; | |
bcddfefb AJ |
305 | // Bug 4498: URL-unescape the login info after extraction |
306 | rfc1738_unescape(login); | |
62e76326 | 307 | } |
308 | ||
cc192b50 | 309 | /* Is there any host information? (we should eventually parse it above) */ |
91489e45 | 310 | if (*foundHost == '[') { |
cc192b50 | 311 | /* strip any IPA brackets. valid under IPv6. */ |
91489e45 | 312 | dst = foundHost; |
cc192b50 | 313 | /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */ |
91489e45 | 314 | src = foundHost; |
5db6bf73 | 315 | ++src; |
91489e45 | 316 | l = strlen(foundHost); |
cc192b50 | 317 | i = 1; |
5db6bf73 | 318 | for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) { |
cc192b50 | 319 | *dst = *src; |
320 | } | |
321 | ||
322 | /* we moved in-place, so truncate the actual hostname found */ | |
5db6bf73 FC |
323 | *dst = '\0'; |
324 | ++dst; | |
cc192b50 | 325 | |
326 | /* skip ahead to either start of port, or original EOS */ | |
5db6bf73 FC |
327 | while (*dst != '\0' && *dst != ':') |
328 | ++dst; | |
cc192b50 | 329 | t = dst; |
330 | } else { | |
91489e45 | 331 | t = strrchr(foundHost, ':'); |
cc192b50 | 332 | |
91489e45 | 333 | if (t != strchr(foundHost,':') ) { |
cc192b50 | 334 | /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */ |
335 | /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */ | |
336 | /* therefore we MUST accept the case where they are not bracketed at all. */ | |
337 | t = NULL; | |
338 | } | |
339 | } | |
62e76326 | 340 | |
b5acc277 | 341 | // Bug 3183 sanity check: If scheme is present, host must be too. |
91489e45 | 342 | if (protocol != AnyP::PROTO_NONE && foundHost[0] == '\0') { |
ac89842b | 343 | debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details."); |
a0924f71 | 344 | return false; |
b5acc277 AJ |
345 | } |
346 | ||
cc192b50 | 347 | if (t && *t == ':') { |
26ac0430 | 348 | *t = '\0'; |
5db6bf73 | 349 | ++t; |
91489e45 | 350 | foundPort = atoi(t); |
62e76326 | 351 | } |
7111c86a | 352 | } |
62e76326 | 353 | |
91489e45 | 354 | for (t = foundHost; *t; ++t) |
62e76326 | 355 | *t = xtolower(*t); |
356 | ||
91489e45 | 357 | if (stringHasWhitespace(foundHost)) { |
62e76326 | 358 | if (URI_WHITESPACE_STRIP == Config.uri_whitespace) { |
91489e45 | 359 | t = q = foundHost; |
62e76326 | 360 | while (*t) { |
5db6bf73 FC |
361 | if (!xisspace(*t)) { |
362 | *q = *t; | |
363 | ++q; | |
364 | } | |
365 | ++t; | |
62e76326 | 366 | } |
62e76326 | 367 | *q = '\0'; |
368 | } | |
d20b1cd0 | 369 | } |
62e76326 | 370 | |
91489e45 | 371 | debugs(23, 3, "Split URL '" << url << "' into proto='" << proto << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'"); |
cc192b50 | 372 | |
91489e45 AJ |
373 | if (Config.onoff.check_hostnames && |
374 | strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) { | |
375 | debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'"); | |
a0924f71 | 376 | return false; |
b3f3dd02 | 377 | } |
62e76326 | 378 | |
532e5dd4 | 379 | /* For IPV6 addresses also check for a colon */ |
91489e45 AJ |
380 | if (Config.appendDomain && !strchr(foundHost, '.') && !strchr(foundHost, ':')) |
381 | strncat(foundHost, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(foundHost) - 1); | |
cc192b50 | 382 | |
1c481e00 | 383 | /* remove trailing dots from hostnames */ |
91489e45 AJ |
384 | while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.') |
385 | foundHost[l] = '\0'; | |
62e76326 | 386 | |
cc192b50 | 387 | /* reject duplicate or leading dots */ |
91489e45 AJ |
388 | if (strstr(foundHost, "..") || *foundHost == '.') { |
389 | debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'"); | |
a0924f71 | 390 | return false; |
cc192b50 | 391 | } |
62e76326 | 392 | |
91489e45 AJ |
393 | if (foundPort < 1 || foundPort > 65535) { |
394 | debugs(23, 3, "Invalid port '" << foundPort << "'"); | |
a0924f71 | 395 | return false; |
7111c86a | 396 | } |
62e76326 | 397 | |
32d002cb | 398 | #if HARDCODE_DENY_PORTS |
429fdbec | 399 | /* These ports are filtered in the default squid.conf, but |
400 | * maybe someone wants them hardcoded... */ | |
91489e45 AJ |
401 | if (foundPort == 7 || foundPort == 9 || foundPort == 19) { |
402 | debugs(23, DBG_CRITICAL, MYNAME << "Deny access to port " << foundPort); | |
a0924f71 | 403 | return false; |
429fdbec | 404 | } |
6ef12318 | 405 | #endif |
cc192b50 | 406 | |
30abd221 | 407 | if (stringHasWhitespace(urlpath)) { |
91489e45 | 408 | debugs(23, 2, "URI has whitespace: {" << url << "}"); |
62e76326 | 409 | |
410 | switch (Config.uri_whitespace) { | |
411 | ||
412 | case URI_WHITESPACE_DENY: | |
a0924f71 | 413 | return false; |
62e76326 | 414 | |
415 | case URI_WHITESPACE_ALLOW: | |
416 | break; | |
417 | ||
418 | case URI_WHITESPACE_ENCODE: | |
419 | t = rfc1738_escape_unescaped(urlpath); | |
420 | xstrncpy(urlpath, t, MAX_URL); | |
421 | break; | |
422 | ||
423 | case URI_WHITESPACE_CHOP: | |
424 | *(urlpath + strcspn(urlpath, w_space)) = '\0'; | |
425 | break; | |
426 | ||
427 | case URI_WHITESPACE_STRIP: | |
62e76326 | 428 | default: |
429 | t = q = urlpath; | |
62e76326 | 430 | while (*t) { |
5db6bf73 FC |
431 | if (!xisspace(*t)) { |
432 | *q = *t; | |
433 | ++q; | |
434 | } | |
435 | ++t; | |
62e76326 | 436 | } |
62e76326 | 437 | *q = '\0'; |
438 | } | |
d548ee64 | 439 | } |
62e76326 | 440 | |
91489e45 | 441 | parseFinish(protocol, proto, urlpath, foundHost, SBuf(login), foundPort); |
9157915c | 442 | return true; |
7111c86a | 443 | } |
444 | ||
db59367a AJ |
445 | /// Update the URL object with parsed URI data. |
446 | void | |
c8ab5ec6 | 447 | AnyP::Uri::parseFinish(const AnyP::ProtocolType protocol, |
e69ca1f1 | 448 | const char *const protoStr, // for unknown protocols |
449 | const char *const aUrlPath, | |
450 | const char *const aHost, | |
451 | const SBuf &aLogin, | |
452 | const int aPort) | |
23d92c64 | 453 | { |
db59367a AJ |
454 | setScheme(protocol, protoStr); |
455 | path(aUrlPath); | |
456 | host(aHost); | |
457 | userInfo(aLogin); | |
458 | port(aPort); | |
23d92c64 | 459 | } |
460 | ||
5c51bffb | 461 | void |
c8ab5ec6 | 462 | AnyP::Uri::touch() |
5c51bffb | 463 | { |
c823e2da | 464 | absolute_.clear(); |
5c51bffb AJ |
465 | authorityHttp_.clear(); |
466 | authorityWithPort_.clear(); | |
467 | } | |
468 | ||
469 | SBuf & | |
c8ab5ec6 | 470 | AnyP::Uri::authority(bool requirePort) const |
5c51bffb AJ |
471 | { |
472 | if (authorityHttp_.isEmpty()) { | |
473 | ||
474 | // both formats contain Host/IP | |
475 | authorityWithPort_.append(host()); | |
476 | authorityHttp_ = authorityWithPort_; | |
477 | ||
478 | // authorityForm_ only has :port if it is non-default | |
479 | authorityWithPort_.appendf(":%u",port()); | |
480 | if (port() != getScheme().defaultPort()) | |
481 | authorityHttp_ = authorityWithPort_; | |
482 | } | |
483 | ||
484 | return requirePort ? authorityWithPort_ : authorityHttp_; | |
485 | } | |
486 | ||
c823e2da | 487 | SBuf & |
c8ab5ec6 | 488 | AnyP::Uri::absolute() const |
c823e2da AJ |
489 | { |
490 | if (absolute_.isEmpty()) { | |
491 | // TODO: most URL will be much shorter, avoid allocating this much | |
492 | absolute_.reserveCapacity(MAX_URL); | |
493 | ||
d31d59d8 AJ |
494 | absolute_.append(getScheme().image()); |
495 | absolute_.append(":",1); | |
c823e2da AJ |
496 | if (getScheme() != AnyP::PROTO_URN) { |
497 | absolute_.append("//", 2); | |
498 | const bool omitUserInfo = getScheme() == AnyP::PROTO_HTTP || | |
499 | getScheme() != AnyP::PROTO_HTTPS || | |
500 | userInfo().isEmpty(); | |
501 | if (!omitUserInfo) { | |
502 | absolute_.append(userInfo()); | |
503 | absolute_.append("@", 1); | |
504 | } | |
505 | absolute_.append(authority()); | |
506 | } | |
507 | absolute_.append(path()); | |
508 | } | |
509 | ||
510 | return absolute_; | |
511 | } | |
512 | ||
851feda6 | 513 | /** \todo AYJ: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string. |
914b89a2 | 514 | * After copying it on in the first place! Would be less code to merge the two with a flag parameter. |
515 | * and never copy the query-string part in the first place | |
516 | */ | |
88738790 | 517 | char * |
190154cf | 518 | urlCanonicalClean(const HttpRequest * request) |
88738790 | 519 | { |
520 | LOCAL_ARRAY(char, buf, MAX_URL); | |
62e76326 | 521 | |
851feda6 | 522 | snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(request->effectiveRequestUri())); |
c823e2da | 523 | buf[sizeof(buf)-1] = '\0'; |
62e76326 | 524 | |
c823e2da | 525 | // URN, CONNECT method, and non-stripped URIs can go straight out |
851feda6 | 526 | if (Config.onoff.strip_query_terms && !(request->method == Http::METHOD_CONNECT || request->url.getScheme() == AnyP::PROTO_URN)) { |
c823e2da AJ |
527 | // strip anything AFTER a question-mark |
528 | // leaving the '?' in place | |
529 | if (auto t = strchr(buf, '?')) { | |
530 | *(++t) = '\0'; | |
e2849af8 | 531 | } |
d548ee64 | 532 | } |
62e76326 | 533 | |
9bc73deb | 534 | if (stringHasCntl(buf)) |
62e76326 | 535 | xstrncpy(buf, rfc1738_escape_unescaped(buf), MAX_URL); |
536 | ||
88738790 | 537 | return buf; |
538 | } | |
539 | ||
b3802bdc AJ |
540 | /** |
541 | * Yet another alternative to urlCanonical. | |
c2a7cefd | 542 | * This one adds the https:// parts to Http::METHOD_CONNECT URL |
b3802bdc AJ |
543 | * for use in error page outputs. |
544 | * Luckily we can leverage the others instead of duplicating. | |
545 | */ | |
546 | const char * | |
547 | urlCanonicalFakeHttps(const HttpRequest * request) | |
548 | { | |
549 | LOCAL_ARRAY(char, buf, MAX_URL); | |
550 | ||
551 | // method CONNECT and port HTTPS | |
5c51bffb AJ |
552 | if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) { |
553 | snprintf(buf, MAX_URL, "https://%s/*", request->url.host()); | |
b3802bdc AJ |
554 | return buf; |
555 | } | |
556 | ||
557 | // else do the normal complete canonical thing. | |
558 | return urlCanonicalClean(request); | |
559 | } | |
560 | ||
/*
 * Test if a URL is relative.
 *
 * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
 * appear before a ':'.
 *
 * A NULL or empty url is not relative. Otherwise the url is relative unless
 * a ':' is found before any '/'.
 */
bool
urlIsRelative(const char *url)
{
    if (!url || *url == '\0')
        return false;

    // stop at the first ':' or '/', or at the terminator when there is neither
    const char *firstDelimiter = url + strcspn(url, ":/");

    return *firstDelimiter != ':';
}
586 | ||
587 | /* | |
71051277 | 588 | * Convert a relative URL to an absolute URL using the context of a given |
bf956b0a | 589 | * request. |
71051277 BR |
590 | * |
591 | * It is assumed that you have already ensured that the URL is relative. | |
592 | * | |
6e44cca8 BR |
593 | * If NULL is returned it is an indication that the method in use in the |
594 | * request does not distinguish between relative and absolute and you should | |
595 | * use the url unchanged. | |
0376a4c9 BR |
596 | * |
597 | * If non-NULL is returned, it is up to the caller to free the resulting | |
598 | * memory using safe_free(). | |
bf956b0a | 599 | */ |
6e44cca8 | 600 | char * |
bf956b0a | 601 | urlMakeAbsolute(const HttpRequest * req, const char *relUrl) |
3cbbd242 | 602 | { |
3cbbd242 | 603 | |
c2a7cefd | 604 | if (req->method.id() == Http::METHOD_CONNECT) { |
f3900427 | 605 | return (NULL); |
3cbbd242 | 606 | } |
26ac0430 | 607 | |
6e44cca8 | 608 | char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char)); |
26ac0430 | 609 | |
4e3f4dc7 | 610 | if (req->url.getScheme() == AnyP::PROTO_URN) { |
c823e2da AJ |
611 | // XXX: this is what the original code did, but it seems to break the |
612 | // intended behaviour of this function. It returns the stored URN path, | |
613 | // not converting the given one into a URN... | |
614 | snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute())); | |
71051277 | 615 | return (urlbuf); |
3cbbd242 | 616 | } |
26ac0430 | 617 | |
5c51bffb | 618 | SBuf authorityForm = req->url.authority(); // host[:port] |
d31d59d8 AJ |
619 | const SBuf &scheme = req->url.getScheme().image(); |
620 | size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH, | |
621 | SQUIDSBUFPRINT(scheme), | |
5c51bffb AJ |
622 | SQUIDSBUFPRINT(req->url.userInfo()), |
623 | !req->url.userInfo().isEmpty() ? "@" : "", | |
624 | SQUIDSBUFPRINT(authorityForm)); | |
6e44cca8 | 625 | |
51b5dcf5 AJ |
626 | // if the first char is '/' assume its a relative path |
627 | // XXX: this breaks on scheme-relative URLs, | |
628 | // but we should not see those outside ESI, and rarely there. | |
c823e2da | 629 | // XXX: also breaks on any URL containing a '/' in the query-string portion |
6e44cca8 | 630 | if (relUrl[0] == '/') { |
51b5dcf5 | 631 | xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); |
6e44cca8 | 632 | } else { |
51b5dcf5 AJ |
633 | SBuf path = req->url.path(); |
634 | SBuf::size_type lastSlashPos = path.rfind('/'); | |
6e44cca8 | 635 | |
51b5dcf5 AJ |
636 | if (lastSlashPos == SBuf::npos) { |
637 | // replace the whole path with the given bit(s) | |
5db6bf73 FC |
638 | urlbuf[urllen] = '/'; |
639 | ++urllen; | |
51b5dcf5 | 640 | xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); |
6e44cca8 | 641 | } else { |
51b5dcf5 AJ |
642 | // replace only the last (file?) segment with the given bit(s) |
643 | ++lastSlashPos; | |
644 | if (lastSlashPos > MAX_URL - urllen - 1) { | |
645 | // XXX: crops bits in the middle of the combined URL. | |
646 | lastSlashPos = MAX_URL - urllen - 1; | |
6e44cca8 | 647 | } |
3f0e38d6 | 648 | SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos)); |
51b5dcf5 | 649 | urllen += lastSlashPos; |
6e44cca8 | 650 | if (urllen + 1 < MAX_URL) { |
51b5dcf5 | 651 | xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); |
6e44cca8 BR |
652 | } |
653 | } | |
654 | } | |
3cbbd242 | 655 | |
bc9ad11f | 656 | return (urlbuf); |
3cbbd242 | 657 | } |
658 | ||
b8d8561b | 659 | int |
abbd7825 | 660 | matchDomainName(const char *h, const char *d, uint flags) |
30a4f2a8 | 661 | { |
9bc73deb | 662 | int dl; |
663 | int hl; | |
62e76326 | 664 | |
abbd7825 | 665 | const bool hostIncludesSubdomains = (*h == '.'); |
d20b1cd0 | 666 | while ('.' == *h) |
5db6bf73 | 667 | ++h; |
62e76326 | 668 | |
9bc73deb | 669 | hl = strlen(h); |
62e76326 | 670 | |
abbd7825 CT |
671 | if (hl == 0) |
672 | return -1; | |
673 | ||
9bc73deb | 674 | dl = strlen(d); |
62e76326 | 675 | |
9bc73deb | 676 | /* |
677 | * Start at the ends of the two strings and work towards the | |
678 | * beginning. | |
679 | */ | |
680 | while (xtolower(h[--hl]) == xtolower(d[--dl])) { | |
62e76326 | 681 | if (hl == 0 && dl == 0) { |
682 | /* | |
683 | * We made it all the way to the beginning of both | |
684 | * strings without finding any difference. | |
685 | */ | |
686 | return 0; | |
687 | } | |
688 | ||
689 | if (0 == hl) { | |
690 | /* | |
691 | * The host string is shorter than the domain string. | |
692 | * There is only one case when this can be a match. | |
693 | * If the domain is just one character longer, and if | |
694 | * that character is a leading '.' then we call it a | |
695 | * match. | |
696 | */ | |
697 | ||
698 | if (1 == dl && '.' == d[0]) | |
699 | return 0; | |
700 | else | |
701 | return -1; | |
702 | } | |
703 | ||
704 | if (0 == dl) { | |
705 | /* | |
706 | * The domain string is shorter than the host string. | |
707 | * This is a match only if the first domain character | |
708 | * is a leading '.'. | |
709 | */ | |
710 | ||
abbd7825 CT |
711 | if ('.' == d[0]) { |
712 | if (flags & mdnRejectSubsubDomains) { | |
713 | // Check for sub-sub domain and reject | |
714 | while(--hl >= 0 && h[hl] != '.'); | |
715 | if (hl < 0) { | |
716 | // No sub-sub domain found, but reject if there is a | |
717 | // leading dot in given host string (which is removed | |
718 | // before the check is started). | |
719 | return hostIncludesSubdomains ? 1 : 0; | |
720 | } else | |
721 | return 1; // sub-sub domain, reject | |
722 | } else | |
723 | return 0; | |
724 | } else | |
62e76326 | 725 | return 1; |
726 | } | |
9bc73deb | 727 | } |
62e76326 | 728 | |
9bc73deb | 729 | /* |
730 | * We found different characters in the same position (from the end). | |
731 | */ | |
69f69080 CT |
732 | |
733 | // If the h has a form of "*.foo.com" and d has a form of "x.foo.com" | |
734 | // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x' | |
735 | // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'. | |
abbd7825 | 736 | if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.') |
69f69080 CT |
737 | return 0; |
738 | ||
d20b1cd0 | 739 | /* |
740 | * If one of those character is '.' then its special. In order | |
741 | * for splay tree sorting to work properly, "x-foo.com" must | |
742 | * be greater than ".foo.com" even though '-' is less than '.'. | |
743 | */ | |
744 | if ('.' == d[dl]) | |
62e76326 | 745 | return 1; |
746 | ||
d20b1cd0 | 747 | if ('.' == h[hl]) |
62e76326 | 748 | return -1; |
749 | ||
9bc73deb | 750 | return (xtolower(h[hl]) - xtolower(d[dl])); |
30a4f2a8 | 751 | } |
a8f7d3ee | 752 | |
985c86bc | 753 | /* |
610ee341 | 754 | * return true if we can serve requests for this method. |
985c86bc | 755 | */ |
b8d8561b | 756 | int |
190154cf | 757 | urlCheckRequest(const HttpRequest * r) |
a8f7d3ee | 758 | { |
759 | int rc = 0; | |
610ee341 | 760 | /* protocol "independent" methods |
761 | * | |
762 | * actually these methods are specific to HTTP: | |
763 | * they are methods we recieve on our HTTP port, | |
764 | * and if we had a FTP listener would not be relevant | |
765 | * there. | |
766 | * | |
767 | * So, we should delegate them to HTTP. The problem is that we | |
768 | * do not have a default protocol from the client side of HTTP. | |
769 | */ | |
62e76326 | 770 | |
c2a7cefd | 771 | if (r->method == Http::METHOD_CONNECT) |
62e76326 | 772 | return 1; |
773 | ||
77ce6ba9 AR |
774 | // we support OPTIONS and TRACE directed at us (with a 501 reply, for now) |
775 | // we also support forwarding OPTIONS and TRACE, except for the *-URI ones | |
c2a7cefd | 776 | if (r->method == Http::METHOD_OPTIONS || r->method == Http::METHOD_TRACE) |
c8ab5ec6 | 777 | return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk()); |
62e76326 | 778 | |
c2a7cefd | 779 | if (r->method == Http::METHOD_PURGE) |
62e76326 | 780 | return 1; |
781 | ||
99edd1c3 | 782 | /* does method match the protocol? */ |
4e3f4dc7 | 783 | switch (r->url.getScheme()) { |
62e76326 | 784 | |
0c3d3f65 | 785 | case AnyP::PROTO_URN: |
62e76326 | 786 | |
0c3d3f65 | 787 | case AnyP::PROTO_HTTP: |
62e76326 | 788 | |
39a19cb7 | 789 | case AnyP::PROTO_CACHE_OBJECT: |
62e76326 | 790 | rc = 1; |
791 | break; | |
792 | ||
0c3d3f65 | 793 | case AnyP::PROTO_FTP: |
62e76326 | 794 | |
c2a7cefd | 795 | if (r->method == Http::METHOD_PUT) |
62e76326 | 796 | rc = 1; |
797 | ||
0c3d3f65 | 798 | case AnyP::PROTO_GOPHER: |
62e76326 | 799 | |
0c3d3f65 | 800 | case AnyP::PROTO_WAIS: |
62e76326 | 801 | |
0c3d3f65 | 802 | case AnyP::PROTO_WHOIS: |
c2a7cefd | 803 | if (r->method == Http::METHOD_GET) |
62e76326 | 804 | rc = 1; |
c2a7cefd | 805 | else if (r->method == Http::METHOD_HEAD) |
62e76326 | 806 | rc = 1; |
807 | ||
808 | break; | |
809 | ||
0c3d3f65 | 810 | case AnyP::PROTO_HTTPS: |
cb4f4424 | 811 | #if USE_OPENSSL |
62e76326 | 812 | rc = 1; |
418293da AJ |
813 | #elif USE_GNUTLS |
814 | rc = 1; | |
1f7c9178 | 815 | #else |
62e76326 | 816 | /* |
817 | * Squid can't originate an SSL connection, so it should | |
818 | * never receive an "https:" URL. It should always be | |
819 | * CONNECT instead. | |
820 | */ | |
821 | rc = 0; | |
1f7c9178 | 822 | #endif |
0166128b | 823 | break; |
62e76326 | 824 | |
a8f7d3ee | 825 | default: |
62e76326 | 826 | break; |
a8f7d3ee | 827 | } |
62e76326 | 828 | |
a8f7d3ee | 829 | return rc; |
830 | } | |
9ce5e3e6 | 831 | |
832 | /* | |
833 | * Quick-n-dirty host extraction from a URL. Steps: | |
5999b776 | 834 | * Look for a colon |
835 | * Skip any '/' after the colon | |
836 | * Copy up to SQUIDHOSTNAMELEN bytes to Host[] | |
837 | * Look for an ending '/' or ':' and terminate | |
838 | * Look for login info preceded by '@' | |
9ce5e3e6 | 839 | */ |
77bfc324 | 840 | |
841 | class URLHostName | |
842 | { | |
843 | ||
844 | public: | |
845 | char * extract(char const *url); | |
846 | ||
847 | private: | |
848 | static char Host [SQUIDHOSTNAMELEN]; | |
849 | void init(char const *); | |
850 | void findHostStart(); | |
851 | void trimTrailingChars(); | |
852 | void trimAuth(); | |
853 | char const *hostStart; | |
854 | char const *url; | |
855 | }; | |
856 | ||
9ce5e3e6 | 857 | char * |
858 | urlHostname(const char *url) | |
859 | { | |
77bfc324 | 860 | return URLHostName().extract(url); |
861 | } | |
62e76326 | 862 | |
77bfc324 | 863 | char URLHostName::Host[SQUIDHOSTNAMELEN]; |
864 | ||
865 | void | |
866 | URLHostName::init(char const *aUrl) | |
867 | { | |
868 | Host[0] = '\0'; | |
aa1cafc4 | 869 | url = aUrl; |
77bfc324 | 870 | } |
62e76326 | 871 | |
77bfc324 | 872 | void |
873 | URLHostName::findHostStart() | |
874 | { | |
875 | if (NULL == (hostStart = strchr(url, ':'))) | |
876 | return; | |
62e76326 | 877 | |
77bfc324 | 878 | ++hostStart; |
62e76326 | 879 | |
77bfc324 | 880 | while (*hostStart != '\0' && *hostStart == '/') |
881 | ++hostStart; | |
cc192b50 | 882 | |
cc192b50 | 883 | if (*hostStart == ']') |
884 | ++hostStart; | |
77bfc324 | 885 | } |
62e76326 | 886 | |
77bfc324 | 887 | void |
888 | URLHostName::trimTrailingChars() | |
889 | { | |
890 | char *t; | |
891 | ||
892 | if ((t = strchr(Host, '/'))) | |
62e76326 | 893 | *t = '\0'; |
894 | ||
cc192b50 | 895 | if ((t = strrchr(Host, ':'))) |
62e76326 | 896 | *t = '\0'; |
cc192b50 | 897 | |
cc192b50 | 898 | if ((t = strchr(Host, ']'))) |
899 | *t = '\0'; | |
77bfc324 | 900 | } |
62e76326 | 901 | |
77bfc324 | 902 | void |
903 | URLHostName::trimAuth() | |
904 | { | |
905 | char *t; | |
906 | ||
907 | if ((t = strrchr(Host, '@'))) { | |
5db6bf73 | 908 | ++t; |
41d00cd3 | 909 | memmove(Host, t, strlen(t) + 1); |
9ce5e3e6 | 910 | } |
77bfc324 | 911 | } |
912 | ||
913 | char * | |
914 | URLHostName::extract(char const *aUrl) | |
915 | { | |
916 | init(aUrl); | |
917 | findHostStart(); | |
918 | ||
919 | if (hostStart == NULL) | |
920 | return NULL; | |
921 | ||
922 | xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN); | |
923 | ||
924 | trimTrailingChars(); | |
925 | ||
926 | trimAuth(); | |
62e76326 | 927 | |
77bfc324 | 928 | return Host; |
9ce5e3e6 | 929 | } |
f53969cc | 930 | |
c8ab5ec6 | 931 | AnyP::Uri::Uri(AnyP::UriScheme const &aScheme) : |
d59e4742 FC |
932 | scheme_(aScheme), |
933 | hostIsNumeric_(false), | |
934 | port_(0) | |
935 | { | |
936 | *host_=0; | |
937 | } | |
1a739503 | 938 |