Squid Web Cache master
Loading...
Searching...
No Matches
Uri.cc
Go to the documentation of this file.
1/*
2 * Copyright (C) 1996-2025 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9/* DEBUG: section 23 URL Parsing */
10
11#include "squid.h"
12#include "anyp/Host.h"
13#include "anyp/Uri.h"
14#include "base/Raw.h"
15#include "globals.h"
16#include "HttpRequest.h"
17#include "parser/Tokenizer.h"
18#include "rfc1738.h"
19#include "SquidConfig.h"
20#include "SquidMath.h"
21
/// Characters accepted in a URI host when Config.onoff.allow_underscore is set:
/// ASCII letters, digits, '-', '.', '_', plus the '[', ':' and ']' characters
/// (presumably kept for bracketed/colon-separated IP literals -- confirm).
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._"
    "[:]";

/// Same as valid_hostname_chars_u, but without the underscore.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-."
    "[:]";
34
36static const CharacterSet &
38{
39 /*
40 * RFC 3986 section 3.2.1
41 *
42 * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
43 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
44 * pct-encoded = "%" HEXDIG HEXDIG
45 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
46 */
47 static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
50 return userInfoValid;
51}
52
54static const CharacterSet &
56{
57 /*
58 * RFC 3986 section 3.3
59 *
60 * path = path-abempty ; begins with "/" or is empty
61 * ...
62 * path-abempty = *( "/" segment )
63 * segment = *pchar
64 * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
65 */
66 static const auto pathValid = CharacterSet("path", "/:@-._~%!$&'()*+,;=") +
69 return pathValid;
70}
71
75SBuf
76AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
77{
78 if (buf.isEmpty())
79 return buf;
80
81 Parser::Tokenizer tk(buf);
82 SBuf goodSection;
83 // optimization for the arguably common "no encoding necessary" case
84 if (tk.prefix(goodSection, ignore) && tk.atEnd())
85 return buf;
86
87 SBuf output;
88 output.reserveSpace(buf.length() * 3); // worst case: encode all chars
89 output.append(goodSection); // may be empty
90
91 while (!tk.atEnd()) {
92 // TODO: Add Tokenizer::parseOne(void).
93 const auto ch = tk.remaining()[0];
94 output.appendf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(ch))); // TODO: Optimize using a table
95 (void)tk.skip(ch);
96
97 if (tk.prefix(goodSection, ignore))
98 output.append(goodSection);
99 }
100
101 return output;
102}
103
104std::optional<SBuf>
106{
107 SBuf output;
109 while (!tok.atEnd()) {
110 SBuf token;
111 static const auto unencodedChars = CharacterSet("percent", "%").complement("unencoded");
112 if (tok.prefix(token, unencodedChars))
113 output.append(token);
114
115 // we are either at '%' or at end of input
116 if (tok.skip('%')) {
117 const auto rawBytesAfterPercent = tok.remaining();
118 int64_t hex1 = 0, hex2 = 0;
119 if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1)) {
120 output.append(static_cast<char>((hex1 << 4) | hex2));
121 } else {
122 // see TestUri::testEncoding() for invalid pct-encoding sequence examples
123 debugs(23, 3, "invalid pct-encoding sequence starting at %" << rawBytesAfterPercent);
124 return std::nullopt;
125 }
126 }
127 }
128 return output;
129}
130
131SBuf
133{
134 if (const auto decoded = Decode(input))
135 return *decoded;
136 return input;
137}
138
139const SBuf &
141{
142 static SBuf star("*");
143 return star;
144}
145
146const SBuf &
148{
149 static SBuf slash("/");
150 return slash;
151}
152
153void
154AnyP::Uri::host(const char *src)
155{
156 hostAddr_.fromHost(src);
157 if (hostAddr_.isAnyAddr()) {
158 xstrncpy(host_, src, sizeof(host_));
159 hostIsNumeric_ = false;
160 } else {
161 hostAddr_.toHostStr(host_, sizeof(host_));
162 debugs(23, 3, "given IP: " << hostAddr_);
163 hostIsNumeric_ = 1;
164 }
165 touch();
166}
167
168// TODO: Replace with ToSBuf(parsedHost()) or similar.
169SBuf
171{
172 if (hostIsNumeric()) {
173 static char ip[MAX_IPSTRLEN];
174 const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip));
175 return SBuf(ip, hostStrLen);
176 } else
177 return SBuf(host());
178}
179
180std::optional<AnyP::Host>
182{
183 if (hostIsNumeric())
184 return Host::ParseIp(hostIP());
185
186 // XXX: Interpret host subcomponent as reg-name representing a DNS name. It
187 // may actually be, for example, a URN namespace ID (NID; see RFC 8141), but
188 // current Squid APIs do not support adequate representation of those cases.
189 const SBuf regName(host());
190
191 if (regName.find('%') != SBuf::npos) {
192 debugs(23, 3, "rejecting percent-encoded reg-name: " << regName);
193 return std::nullopt; // TODO: Decode() instead
194 }
195
196 return Host::ParseSimpleDomainName(regName);
197}
198
199const SBuf &
201{
202 // RFC 3986 section 3.3 says path can be empty (path-abempty).
203 // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
204 // at least when sending and using. We must still accept path-abempty as input.
205 if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
206 return SlashPath();
207
208 return path_;
209}
210
211void
213{
214 debugs(23, 5, "urlInitialize: Initializing...");
215 /* this ensures that the number of protocol strings is the same as
216 * the enum slots allocated because the last enum is always 'MAX'.
217 */
218 assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
219 /*
220 * These test that our matchDomainName() function works the
221 * way we expect it to.
222 */
223 assert(0 == matchDomainName("foo.com", "foo.com"));
224 assert(0 == matchDomainName(".foo.com", "foo.com"));
225 assert(0 == matchDomainName("foo.com", ".foo.com"));
226 assert(0 == matchDomainName(".foo.com", ".foo.com"));
227 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
228 assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
229 assert(0 != matchDomainName("x.foo.com", "foo.com"));
230 assert(0 != matchDomainName("foo.com", "x.foo.com"));
231 assert(0 != matchDomainName("bar.com", "foo.com"));
232 assert(0 != matchDomainName(".bar.com", "foo.com"));
233 assert(0 != matchDomainName(".bar.com", ".foo.com"));
234 assert(0 != matchDomainName("bar.com", ".foo.com"));
235 assert(0 < matchDomainName("zzz.com", "foo.com"));
236 assert(0 > matchDomainName("aaa.com", "foo.com"));
237 assert(0 == matchDomainName("FOO.com", "foo.COM"));
238 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
239 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
240 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
241
242 assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
243 assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
244 assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
245 assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
246
247 assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
248 assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
249 assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
250 assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
251
252 assert(0 != matchDomainName("foo.com", ""));
253 assert(0 != matchDomainName("foo.com", "", mdnHonorWildcards));
254 assert(0 != matchDomainName("foo.com", "", mdnRejectSubsubDomains));
255
256 /* more cases? */
257}
258
266static AnyP::UriScheme
268{
269 /*
270 * RFC 3986 section 3.1 paragraph 2:
271 *
272 * Scheme names consist of a sequence of characters beginning with a
273 * letter and followed by any combination of letters, digits, plus
274 * ("+"), period ("."), or hyphen ("-").
275 */
276 static const auto schemeChars = CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
277
278 SBuf str;
279 if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
280 const auto protocol = AnyP::UriScheme::FindProtocolType(str);
281 if (protocol == AnyP::PROTO_UNKNOWN)
282 return AnyP::UriScheme(protocol, str.c_str());
283 return AnyP::UriScheme(protocol, nullptr);
284 }
285
286 throw TextException("invalid URI scheme", Here());
287}
288
296bool
298{
299 /* For IPv4 addresses check for a dot */
300 /* For IPv6 addresses also check for a colon */
301 if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
302 const uint64_t dlen = strlen(host);
303 const uint64_t want = dlen + Config.appendDomainLen;
304 if (want > SQUIDHOSTNAMELEN - 1) {
305 debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
306 return false;
307 }
308 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
309 }
310 return true;
311}
312
313/*
314 * Parse a URI/URL.
315 *
316 * It is assumed that the URL is complete -
317 * ie, the end of the string is the end of the URL. Don't pass a partial
318 * URL here as this routine doesn't have any way of knowing whether
319 * it is partial or not (ie, it handles the case of no trailing slash as
320 * being "end of host with implied path of /".
321 *
322 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
323 * then rather than a URL a hostname:port is looked for.
324 */
325bool
326AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
327{
328 try {
329
330 LOCAL_ARRAY(char, login, MAX_URL);
331 LOCAL_ARRAY(char, foundHost, MAX_URL);
332 LOCAL_ARRAY(char, urlpath, MAX_URL);
333 char *t = nullptr;
334 char *q = nullptr;
335 int foundPort;
336 int l;
337 int i;
338 const char *src;
339 char *dst;
340 foundHost[0] = urlpath[0] = login[0] = '\0';
341
342 if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
343 debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
344 return false;
345 }
346
347 if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
348 Asterisk().cmp(rawUrl) == 0) {
349 // XXX: these methods might also occur in HTTPS traffic. Handle this better.
350 setScheme(AnyP::PROTO_HTTP, nullptr);
351 port(getScheme().defaultPort());
352 path(Asterisk());
353 return true;
354 }
355
356 Parser::Tokenizer tok(rawUrl);
357 AnyP::UriScheme scheme;
358
359 if (method == Http::METHOD_CONNECT) {
360 // For CONNECTs, RFC 9110 Section 9.3.6 requires "only the host and
361 // port number of the tunnel destination, separated by a colon".
362
363 const auto rawHost = parseHost(tok);
364 Assure(rawHost.length() < sizeof(foundHost));
365 SBufToCstring(foundHost, rawHost);
366
367 if (!tok.skip(':'))
368 throw TextException("missing required :port in CONNECT target", Here());
369 foundPort = parsePort(tok);
370
371 if (!tok.remaining().isEmpty())
372 throw TextException("garbage after host:port in CONNECT target", Here());
373 } else {
374
375 scheme = uriParseScheme(tok);
376
377 if (scheme == AnyP::PROTO_NONE)
378 return false; // invalid scheme
379
380 if (scheme == AnyP::PROTO_URN) {
381 parseUrn(tok); // throws on any error
382 return true;
383 }
384
385 // URLs then have "//"
386 static const SBuf doubleSlash("//");
387 if (!tok.skip(doubleSlash))
388 return false;
389
390 auto B = tok.remaining();
391 const char *url = B.c_str();
392
393 /* Parse the URL: */
394 src = url;
395 i = 0;
396
397 /* Then everything until first /; that's host (and port; which we'll look for here later) */
398 // bug 1881: If we don't get a "/" then we imply it was there
399 // bug 3074: We could just be given a "?" or "#". These also imply "/"
400 // bug 3233: whitespace is also a hostname delimiter.
401 for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
402 *dst = *src;
403 }
404
405 /*
406 * We can't check for "i >= l" here because we could be at the end of the line
407 * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
408 * been -given- a valid URL and the path is just '/'.
409 */
410 if (i > l)
411 return false;
412 *dst = '\0';
413
414 // We are looking at path-abempty.
415 if (*src != '/') {
416 // path-empty, including the end of the `src` c-string cases
417 urlpath[0] = '/';
418 dst = &urlpath[1];
419 } else {
420 dst = urlpath;
421 }
422 /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
423 for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
424 *dst = *src;
425 }
426
427 /* We -could- be at the end of the buffer here */
428 if (i > l)
429 return false;
430 *dst = '\0';
431
432 // If the parsed scheme has no (known) default port, and there is no
433 // explicit port, then we will reject the zero port during foundPort
434 // validation, often resulting in a misleading 400/ERR_INVALID_URL.
435 // TODO: Remove this hack when switching to Tokenizer-based parsing.
436 foundPort = scheme.defaultPort().value_or(0); // may be reset later
437
438 /* Is there any login information? (we should eventually parse it above) */
439 t = strrchr(foundHost, '@');
440 if (t != nullptr) {
441 strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
442 login[sizeof(login)-1] = '\0';
443 t = strrchr(login, '@');
444 *t = 0;
445 strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
446 foundHost[sizeof(foundHost)-1] = '\0';
447 // Bug 4498: URL-unescape the login info after extraction
448 rfc1738_unescape(login);
449 }
450
451 /* Is there any host information? (we should eventually parse it above) */
452 if (*foundHost == '[') {
453 /* strip any IPA brackets. valid under IPv6. */
454 dst = foundHost;
455 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
456 src = foundHost;
457 ++src;
458 l = strlen(foundHost);
459 i = 1;
460 for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
461 *dst = *src;
462 }
463
464 /* we moved in-place, so truncate the actual hostname found */
465 *dst = '\0';
466 ++dst;
467
468 /* skip ahead to either start of port, or original EOS */
469 while (*dst != '\0' && *dst != ':')
470 ++dst;
471 t = dst;
472 } else {
473 t = strrchr(foundHost, ':');
474
475 if (t != strchr(foundHost,':') ) {
476 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
477 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
478 /* therefore we MUST accept the case where they are not bracketed at all. */
479 t = nullptr;
480 }
481 }
482
483 // Bug 3183 sanity check: If scheme is present, host must be too.
484 if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
485 debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
486 return false;
487 }
488
489 if (t && *t == ':') {
490 *t = '\0';
491 ++t;
492 foundPort = atoi(t);
493 }
494 }
495
496 for (t = foundHost; *t; ++t)
497 *t = xtolower(*t);
498
499 if (stringHasWhitespace(foundHost)) {
501 t = q = foundHost;
502 while (*t) {
503 if (!xisspace(*t)) {
504 *q = *t;
505 ++q;
506 }
507 ++t;
508 }
509 *q = '\0';
510 }
511 }
512
513 debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
514
516 strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
517 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
518 return false;
519 }
520
521 if (!urlAppendDomain(foundHost))
522 return false;
523
524 /* remove trailing dots from hostnames */
525 while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
526 foundHost[l] = '\0';
527
528 /* reject duplicate or leading dots */
529 if (strstr(foundHost, "..") || *foundHost == '.') {
530 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
531 return false;
532 }
533
534 if (foundPort < 1 || foundPort > 65535) {
535 debugs(23, 3, "Invalid port '" << foundPort << "'");
536 return false;
537 }
538
539 if (stringHasWhitespace(urlpath)) {
540 debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");
541
542 switch (Config.uri_whitespace) {
543
545 return false;
546
548 break;
549
551 t = rfc1738_escape_unescaped(urlpath);
552 xstrncpy(urlpath, t, MAX_URL);
553 break;
554
556 *(urlpath + strcspn(urlpath, w_space)) = '\0';
557 break;
558
560 default:
561 t = q = urlpath;
562 while (*t) {
563 if (!xisspace(*t)) {
564 *q = *t;
565 ++q;
566 }
567 ++t;
568 }
569 *q = '\0';
570 }
571 }
572
573 setScheme(scheme);
574 path(urlpath);
575 host(foundHost);
576 userInfo(SBuf(login));
577 port(foundPort);
578 return true;
579
580 } catch (...) {
581 debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
582 return false;
583 }
584}
585
600void
602{
603 static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
604 static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
605 SBuf nid;
606 if (!tok.prefix(nid, nidChars, 32))
607 throw TextException("NID not found", Here());
608
609 if (!tok.skip(':'))
610 throw TextException("NID too long or missing ':' delimiter", Here());
611
612 if (nid.length() < 2)
613 throw TextException("NID too short", Here());
614
615 if (!alphanum[*nid.begin()])
616 throw TextException("NID prefix is not alphanumeric", Here());
617
618 if (!alphanum[*nid.rbegin()])
619 throw TextException("NID suffix is not alphanumeric", Here());
620
621 setScheme(AnyP::PROTO_URN, nullptr);
622 host(nid.c_str());
623 // TODO validate path characters
624 path(tok.remaining());
625 debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
626}
627
631SBuf
633{
634 // host = IP-literal / IPv4address / reg-name
635
636 // XXX: CharacterSets below reject uri-host values containing whitespace
637 // (e.g., "10.0.0. 1"). That is not a bug, but the uri_whitespace directive
638 // can be interpreted as if it applies to uri-host and this code. TODO: Fix
639 // uri_whitespace and the code using it to exclude uri-host (and URI scheme,
640 // port, etc.) from that directive scope.
641
642 // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
643 if (tok.skip('[')) {
644 // Add "." because IPv6address in RFC 3986 includes ls32, which includes
645 // IPv4address: ls32 = ( h16 ":" h16 ) / IPv4address
646 // This set rejects IPvFuture that needs a "v" character.
647 static const CharacterSet IPv6chars = (
648 CharacterSet::HEXDIG + CharacterSet("colon", ":") + CharacterSet("period", ".")).rename("IPv6");
649 SBuf ipv6ish;
650 if (!tok.prefix(ipv6ish, IPv6chars))
651 throw TextException("malformed or unsupported bracketed IP address in uri-host", Here());
652
653 if (!tok.skip(']'))
654 throw TextException("IPv6 address is missing a closing bracket in uri-host", Here());
655
656 // This rejects bracketed IPv4address and domain names because they lack ":".
657 if (ipv6ish.find(':') == SBuf::npos)
658 throw TextException("bracketed IPv6 address is missing a colon in uri-host", Here());
659
660 // This rejects bracketed non-IP addresses that our caller would have
661 // otherwise mistaken for a domain name (e.g., '[127.0.0:1]').
662 Ip::Address ipv6check;
663 if (!ipv6check.fromHost(ipv6ish.c_str()))
664 throw TextException("malformed bracketed IPv6 address in uri-host", Here());
665
666 return ipv6ish;
667 }
668
669 // no brackets implies we are looking at IPv4address or reg-name
670
671 // XXX: This code does not detect/reject some bad host values (e.g. `!#$%&`).
672 // TODO: Add more checks here, after migrating the
673 // non-CONNECT uri-host parsing code to use us.
674
675 SBuf otherHost; // IPv4address-ish or reg-name-ish
676 // ":" is not in TCHAR so we will stop before any port specification
677 if (tok.prefix(otherHost, CharacterSet::TCHAR))
678 return otherHost;
679
680 throw TextException("malformed IPv4 address or host name in uri-host", Here());
681}
682
689int
691{
692 if (tok.skip('0'))
693 throw TextException("zero or zero-prefixed port", Here());
694
695 int64_t rawPort = 0;
696 if (!tok.int64(rawPort, 10, false)) // port = *DIGIT
697 throw TextException("malformed or missing port", Here());
698
699 Assure(rawPort > 0);
700 constexpr KnownPort portMax = 65535; // TODO: Make this a class-scope constant and REuse it.
701 constexpr auto portStorageMax = std::numeric_limits<Port::value_type>::max();
702 static_assert(!Less(portStorageMax, portMax), "Port type can represent the maximum valid port number");
703 if (Less(portMax, rawPort))
704 throw TextException("huge port", Here());
705
706 // TODO: Return KnownPort after migrating the non-CONNECT uri-host parsing
707 // code to use us (so that foundPort "int" disappears or starts using Port).
708 return NaturalCast<int>(rawPort);
709}
710
711void
713{
714 absolute_.clear();
715 authorityHttp_.clear();
716 authorityWithPort_.clear();
717 absolutePath_.clear();
718}
719
720SBuf &
721AnyP::Uri::authority(bool requirePort) const
722{
723 if (authorityHttp_.isEmpty()) {
724
725 // both formats contain Host/IP
726 authorityWithPort_.append(host());
727 authorityHttp_ = authorityWithPort_;
728
729 if (port().has_value()) {
730 authorityWithPort_.appendf(":%hu", *port());
731 // authorityHttp_ only has :port for known non-default ports
732 if (port() != getScheme().defaultPort())
733 authorityHttp_ = authorityWithPort_;
734 }
735 // else XXX: We made authorityWithPort_ that does not have a port.
736 // TODO: Audit callers and refuse to give out broken authorityWithPort_.
737 }
738
739 return requirePort ? authorityWithPort_ : authorityHttp_;
740}
741
742SBuf &
744{
745 if (absolute_.isEmpty()) {
746 // TODO: most URL will be much shorter, avoid allocating this much
747 absolute_.reserveCapacity(MAX_URL);
748
749 absolute_.append(getScheme().image());
750 absolute_.append(":",1);
751 if (getScheme() != AnyP::PROTO_URN) {
752 absolute_.append("//", 2);
753 const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
754 getScheme() == AnyP::PROTO_UNKNOWN;
755
756 if (allowUserInfo && !userInfo().isEmpty()) {
757 static const CharacterSet uiChars = CharacterSet(UserInfoChars())
758 .remove('%')
759 .rename("userinfo-reserved");
760 absolute_.append(Encode(userInfo(), uiChars));
761 absolute_.append("@", 1);
762 }
763 absolute_.append(authority());
764 } else {
765 absolute_.append(host());
766 absolute_.append(":", 1);
767 }
768 absolute_.append(absolutePath());
769 }
770
771 return absolute_;
772}
773
774SBuf &
776{
777 if (absolutePath_.isEmpty()) {
778 // TODO: Encode each URI subcomponent in path_ as needed.
779 absolutePath_ = Encode(path(), PathChars());
780 }
781
782 return absolutePath_;
783}
784
785/* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
786 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
787 * and never copy the query-string part in the first place
788 */
789char *
791{
792 LOCAL_ARRAY(char, buf, MAX_URL);
793
794 snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
795 buf[sizeof(buf)-1] = '\0';
796
797 // URN, CONNECT method, and non-stripped URIs can go straight out
798 if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
799 // strip anything AFTER a question-mark
800 // leaving the '?' in place
801 if (auto t = strchr(buf, '?')) {
802 *(++t) = '\0';
803 }
804 }
805
806 if (stringHasCntl(buf))
808
809 return buf;
810}
811
818const char *
820{
821 LOCAL_ARRAY(char, buf, MAX_URL);
822
823 // method CONNECT and port HTTPS
824 if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
825 snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
826 return buf;
827 }
828
829 // else do the normal complete canonical thing.
830 return request->canonicalCleanUrl();
831}
832
/// Whether the given c-string looks like a relative reference rather than a
/// URI that starts with a scheme.
/// \param url  the reference to classify; may be nullptr
/// \retval false  url is nullptr, or a ':' appears before the first '/',
///                '?' or '#' (or before the end of the string)
/// \retval true   url is empty, starts with '/', or has no ':' in its first
///                segment
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    /*
     * RFC 3986 section 3.3 path forms (see also section 4.2):
     *
     * path = path-abempty  ; begins with "/" or is empty
     *      / path-absolute ; begins with "/" but not "//"
     *      / path-noscheme ; begins with a non-colon segment
     *      / path-rootless ; begins with a segment
     *      / path-empty    ; zero characters
     */

    // path-empty, or a leading slash: network-path reference (a.k.a.
    // 'scheme-relative URI') or path-absolute (a.k.a. 'absolute-path reference')
    if (url[0] == '\0' || url[0] == '/')
        return true;

    // Find the end of the first segment or the first colon, whichever comes
    // first. A colon found there is forbidden in the first segment of a
    // relative reference.
    const auto stopPos = strcspn(url, ":/?#");
    const bool colonInFirstSegment = (url[stopPos] == ':');

    // without such a colon: path-noscheme, path-abempty, path-rootless
    return !colonInFirstSegment;
}
877
878void
879AnyP::Uri::addRelativePath(const char *relUrl)
880{
881 // URN cannot be merged
882 if (getScheme() == AnyP::PROTO_URN)
883 return;
884
885 // TODO: Handle . and .. segment normalization
886
887 const auto lastSlashPos = path_.rfind('/');
888 // TODO: To optimize and simplify, add and use SBuf::replace().
889 const auto relUrlLength = strlen(relUrl);
890 if (lastSlashPos == SBuf::npos) {
891 // start replacing the whole path
892 path_.reserveCapacity(1 + relUrlLength);
893 path_.assign("/", 1);
894 } else {
895 // start replacing just the last segment
896 path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
897 path_.chop(0, lastSlashPos+1);
898 }
899 path_.append(relUrl, relUrlLength);
900}
901
902int
903matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
904{
905 int dl;
906 int hl;
907
908 const bool hostIncludesSubdomains = (*h == '.');
909 while ('.' == *h)
910 ++h;
911
912 hl = strlen(h);
913
914 if (hl == 0)
915 return -1;
916
917 dl = strlen(d);
918 if (dl == 0)
919 return 1;
920
921 /*
922 * Start at the ends of the two strings and work towards the
923 * beginning.
924 */
925 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
926 if (hl == 0 && dl == 0) {
927 /*
928 * We made it all the way to the beginning of both
929 * strings without finding any difference.
930 */
931 return 0;
932 }
933
934 if (0 == hl) {
935 /*
936 * The host string is shorter than the domain string.
937 * There is only one case when this can be a match.
938 * If the domain is just one character longer, and if
939 * that character is a leading '.' then we call it a
940 * match.
941 */
942
943 if (1 == dl && '.' == d[0])
944 return 0;
945 else
946 return -1;
947 }
948
949 if (0 == dl) {
950 /*
951 * The domain string is shorter than the host string.
952 * This is a match only if the first domain character
953 * is a leading '.'.
954 */
955
956 if ('.' == d[0]) {
957 if (flags & mdnRejectSubsubDomains) {
958 // Check for sub-sub domain and reject
959 while(--hl >= 0 && h[hl] != '.');
960 if (hl < 0) {
961 // No sub-sub domain found, but reject if there is a
962 // leading dot in given host string (which is removed
963 // before the check is started).
964 return hostIncludesSubdomains ? 1 : 0;
965 } else
966 return 1; // sub-sub domain, reject
967 } else
968 return 0;
969 } else
970 return 1;
971 }
972 }
973
974 /*
975 * We found different characters in the same position (from the end).
976 */
977
978 // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
979 // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
980 // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
981 if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
982 return 0;
983
984 /*
985 * If one of those character is '.' then its special. In order
986 * for splay tree sorting to work properly, "x-foo.com" must
987 * be greater than ".foo.com" even though '-' is less than '.'.
988 */
989 if ('.' == d[dl])
990 return 1;
991
992 if ('.' == h[hl])
993 return -1;
994
995 return (xtolower(h[hl]) - xtolower(d[dl]));
996}
997
998/*
999 * return true if we can serve requests for this method.
1000 */
1001bool
1003{
1004 /* protocol "independent" methods
1005 *
1006 * actually these methods are specific to HTTP:
1007 * they are methods we receive on our HTTP port,
1008 * and if we had a FTP listener would not be relevant
1009 * there.
1010 *
1011 * So, we should delegate them to HTTP. The problem is that we
1012 * do not have a default protocol from the client side of HTTP.
1013 */
1014
1015 if (r->method == Http::METHOD_CONNECT)
1016 return true;
1017
1018 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
1019 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
1022
1023 if (r->method == Http::METHOD_PURGE)
1024 return true;
1025
1026 /* does method match the protocol? */
1027 switch (r->url.getScheme()) {
1028
1029 case AnyP::PROTO_URN:
1030 case AnyP::PROTO_HTTP:
1031 return true;
1032
1033 case AnyP::PROTO_FTP:
1034 if (r->method == Http::METHOD_PUT ||
1035 r->method == Http::METHOD_GET ||
1037 return true;
1038 return false;
1039
1040 case AnyP::PROTO_WAIS:
1041 case AnyP::PROTO_WHOIS:
1042 if (r->method == Http::METHOD_GET ||
1044 return true;
1045 return false;
1046
1047 case AnyP::PROTO_HTTPS:
1048#if USE_OPENSSL || HAVE_LIBGNUTLS
1049 return true;
1050#else
1051 /*
1052 * Squid can't originate an SSL connection, so it should
1053 * never receive an "https:" URL. It should always be
1054 * CONNECT instead.
1055 */
1056 return false;
1057#endif
1058
1059 default:
1060 return false;
1061 }
1062
1063 /* notreached */
1064 return false;
1065}
1066
1068 scheme_(aScheme),
1069 hostIsNumeric_(false)
1070{
1071 *host_=0;
1072}
1073
1074// TODO: fix code duplication with AnyP::Uri::parse()
1075char *
1076AnyP::Uri::cleanup(const char *uri)
1077{
1078 char *cleanedUri = nullptr;
1079 switch (Config.uri_whitespace) {
1080 case URI_WHITESPACE_ALLOW: {
1082 cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
1083 break;
1084 }
1085
1088 break;
1089
1090 case URI_WHITESPACE_CHOP: {
1091 const auto pos = strcspn(uri, w_space);
1092 char *choppedUri = nullptr;
1093 if (pos < strlen(uri))
1094 choppedUri = xstrndup(uri, pos + 1);
1095 cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri,
1097 cleanedUri[pos] = '\0';
1098 xfree(choppedUri);
1099 break;
1100 }
1101
1104 default: {
1105 // TODO: avoid duplication with urlParse()
1106 const char *t;
1107 char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
1108 char *q = tmp_uri;
1109 t = uri;
1110 while (*t) {
1111 if (!xisspace(*t)) {
1112 *q = *t;
1113 ++q;
1114 }
1115 ++t;
1116 }
1117 *q = '\0';
1118 cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
1119 xfree(tmp_uri);
1120 break;
1121 }
1122 }
1123
1124 assert(cleanedUri);
1125 return cleanedUri;
1126}
1127
#define Assure(condition)
Definition Assure.h:35
#define Here()
source code location of the caller
Definition Here.h:15
#define SQUIDSBUFPH
Definition SBuf.h:31
void SBufToCstring(char *d, const SBuf &s)
Definition SBuf.h:756
#define SQUIDSBUFPRINT(s)
Definition SBuf.h:32
class SquidConfig Config
constexpr bool Less(const A a, const B b)
whether integer a is less than integer b, with correct overflow handling
Definition SquidMath.h:48
int stringHasWhitespace(const char *)
Definition String.cc:294
int stringHasCntl(const char *)
Definition String.cc:301
std::ostream & CurrentException(std::ostream &os)
prints active (i.e., thrown but not yet handled) exception
bool urlCheckRequest(const HttpRequest *r)
Definition Uri.cc:1002
static const char valid_hostname_chars[]
Definition Uri.cc:28
static const char valid_hostname_chars_u[]
Definition Uri.cc:22
bool urlIsRelative(const char *url)
Definition Uri.cc:846
void urlInitialize(void)
Definition Uri.cc:212
static const CharacterSet & PathChars()
Characters which are valid within a URI path section.
Definition Uri.cc:55
int matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
Definition Uri.cc:903
char * urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
Definition Uri.cc:790
static AnyP::UriScheme uriParseScheme(Parser::Tokenizer &tok)
Definition Uri.cc:267
static const CharacterSet & UserInfoChars()
Characters which are valid within a URI userinfo section.
Definition Uri.cc:37
const char * urlCanonicalFakeHttps(const HttpRequest *request)
Definition Uri.cc:819
bool urlAppendDomain(char *host)
apply append_domain config to the given hostname
Definition Uri.cc:297
MatchDomainNameFlags
Definition Uri.h:238
@ mdnRejectSubsubDomains
Definition Uri.h:241
@ mdnHonorWildcards
Definition Uri.h:240
#define assert(EX)
Definition assert.h:17
static std::optional< Host > ParseIp(const Ip::Address &)
converts an already parsed IP address to a Host object
Definition Host.cc:15
static std::optional< Host > ParseSimpleDomainName(const SBuf &)
Definition Host.cc:49
static AnyP::ProtocolType FindProtocolType(const SBuf &)
Definition UriScheme.cc:52
Port defaultPort() const
Definition UriScheme.cc:71
SBuf image() const
Definition UriScheme.h:57
static const SBuf & SlashPath()
the static '/' default URL-path
Definition Uri.cc:147
SBuf parseHost(Parser::Tokenizer &) const
Definition Uri.cc:632
void parseUrn(Parser::Tokenizer &)
Definition Uri.cc:601
AnyP::UriScheme const & getScheme() const
Definition Uri.h:58
void touch()
clear the cached URI display forms
Definition Uri.cc:712
SBuf & authority(bool requirePort=false) const
Definition Uri.cc:721
static SBuf DecodeOrDupe(const SBuf &input)
Definition Uri.cc:132
void path(const char *p)
Definition Uri.h:96
char host_[SQUIDHOSTNAMELEN]
string representation of the URI authority name or IP
Definition Uri.h:188
const char * host(void) const
Definition Uri.h:76
SBuf & absolutePath() const
RFC 3986 section 4.2 relative reference called 'absolute-path'.
Definition Uri.cc:775
Uri()
Definition Uri.h:36
static std::optional< SBuf > Decode(const SBuf &)
Definition Uri.cc:105
std::optional< Host > parsedHost() const
Definition Uri.cc:181
static char * cleanup(const char *uri)
Definition Uri.cc:1076
void addRelativePath(const char *relUrl)
Definition Uri.cc:879
int parsePort(Parser::Tokenizer &) const
Definition Uri.cc:690
SBuf & absolute() const
Definition Uri.cc:743
static const SBuf & Asterisk()
the static '*' pseudo-URI
Definition Uri.cc:140
void port(const Port p)
reset authority port subcomponent
Definition Uri.h:90
const SBuf & path() const
Definition Uri.cc:200
void host(const char *src)
Definition Uri.cc:154
bool parse(const HttpRequestMethod &, const SBuf &url)
Definition Uri.cc:326
SBuf hostOrIp() const
Definition Uri.cc:170
static SBuf Encode(const SBuf &, const CharacterSet &expected)
Definition Uri.cc:76
optimized set of C chars, with quick membership test and merge support
CharacterSet complement(const char *complementLabel=nullptr) const
static const CharacterSet TCHAR
CharacterSet & rename(const char *label)
change name; handy in const declarations that use operators
static const CharacterSet DIGIT
static const CharacterSet ALPHA
static const CharacterSet HEXDIG
CharacterSet & remove(const unsigned char c)
remove a given character from the character set
int64_t getInt64(Http::HdrType id) const
HttpRequestMethod method
char * canonicalCleanUrl() const
AnyP::Uri url
the request URI
HttpHeader header
Definition Message.h:74
bool fromHost(const char *hostWithoutPort)
Definition Address.cc:910
bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition Tokenizer.cc:79
const SBuf & remaining() const
the remaining unprocessed section of buffer
Definition Tokenizer.h:44
bool atEnd() const
whether the end of the buffer has been reached
Definition Tokenizer.h:41
bool skip(const SBuf &tokenToSkip)
Definition Tokenizer.cc:189
Definition Raw.h:21
Definition SBuf.h:94
const char * rawContent() const
Definition SBuf.cc:509
static const size_type npos
Definition SBuf.h:100
char at(size_type pos) const
Definition SBuf.h:253
const char * c_str()
Definition SBuf.cc:516
void reserveCapacity(size_type minCapacity)
Definition SBuf.cc:105
size_type length() const
Returns the number of bytes stored in SBuf.
Definition SBuf.h:419
SBuf & appendf(const char *fmt,...) PRINTF_FORMAT_ARG2
Definition SBuf.cc:229
size_type find(char c, size_type startPos=0) const
Definition SBuf.cc:584
bool isEmpty() const
Definition SBuf.h:435
const_iterator begin() const
Definition SBuf.h:587
SBuf & append(const SBuf &S)
Definition SBuf.cc:185
const_reverse_iterator rbegin() const
Definition SBuf.h:595
void reserveSpace(size_type minSpace)
Definition SBuf.h:444
size_t appendDomainLen
int strip_query_terms
char * appendDomain
struct SquidConfig::@90 onoff
int check_hostnames
int allow_underscore
an std::runtime_error with thrower location info
#define w_space
#define MYNAME
Definition Stream.h:219
#define DBG_IMPORTANT
Definition Stream.h:38
#define debugs(SECTION, LEVEL, CONTENT)
Definition Stream.h:192
#define URI_WHITESPACE_CHOP
Definition defines.h:127
#define URI_WHITESPACE_STRIP
Definition defines.h:124
#define URI_WHITESPACE_DENY
Definition defines.h:128
#define URI_WHITESPACE_ALLOW
Definition defines.h:125
#define URI_WHITESPACE_ENCODE
Definition defines.h:126
#define MAX_URL
Definition defines.h:76
static int port
#define MAX_IPSTRLEN
Length of buffer that needs to be allocated to hold a null-terminated IP-string.
Definition forward.h:25
const char * ProtocolType_str[]
uint16_t KnownPort
validated/supported port number; these values are never zero
Definition UriScheme.h:23
@ PROTO_NONE
@ PROTO_HTTPS
@ PROTO_UNKNOWN
@ PROTO_HTTP
@ PROTO_FTP
@ PROTO_WHOIS
@ PROTO_MAX
@ PROTO_URN
@ PROTO_WAIS
@ METHOD_TRACE
Definition MethodType.h:30
@ METHOD_PUT
Definition MethodType.h:27
@ METHOD_OPTIONS
Definition MethodType.h:31
@ METHOD_CONNECT
Definition MethodType.h:29
@ METHOD_GET
Definition MethodType.h:25
@ METHOD_PURGE
Definition MethodType.h:92
@ METHOD_HEAD
Definition MethodType.h:28
#define xfree
#define xmalloc
#define RFC1738_ESCAPE_NOSPACE
Definition rfc1738.h:22
char * rfc1738_do_escape(const char *url, int flags)
Definition rfc1738.c:56
#define RFC1738_ESCAPE_UNESCAPED
Definition rfc1738.h:25
#define rfc1738_escape_unescaped(x)
Definition rfc1738.h:59
void rfc1738_unescape(char *url)
Definition rfc1738.c:146
#define SQUIDHOSTNAMELEN
Definition rfc2181.h:30
#define LOCAL_ARRAY(type, name, size)
Definition squid.h:62
Definition parse.c:160
#define xisspace(x)
Definition xis.h:15
#define xtolower(x)
Definition xis.h:17
char * xstrncpy(char *dst, const char *src, size_t n)
Definition xstring.cc:37
char * xstrndup(const char *s, size_t n)
Definition xstring.cc:56