From: Sergey Kirpa <44341362+Sergey-Kirpa@users.noreply.github.com>
Date: Mon, 23 Dec 2019 08:01:21 +0000 (+0000)
Subject: Smarter auth_param utf8 handling, including CP1251 support (#480)
X-Git-Tag: SQUID_5_0_1~8
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7e851a3e58d6e132ae5ad558bc5e111ce0e90c00;p=thirdparty%2Fsquid.git

Smarter auth_param utf8 handling, including CP1251 support (#480)

When forwarding authentication credentials to authentication helpers:

* With auth_param utf8 parameter: Squid assumed that the received
  credentials are encoded with Latin-1 (and re-encoded them with UTF-8).
  This assumption is wrong for Internet Explorer running with CP1251
  regional settings. Now Squid uses HTTP Accept-Language request header
  to guess the received credentials encoding (Latin-1, CP1251, or UTF-8)
  and converts the first two encodings into UTF-8.

* Without auth_param utf8 parameter: No changes. Squid sends credentials
  in their original encoding, only applying RFC 1738 escaping on top.

Chrome and Firefox should not be affected because they always use UTF-8
encoding when sending authentication credentials.
---

diff --git a/lib/Makefile.am b/lib/Makefile.am
index 5f7699faf7..6c4c15cc2b 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -57,7 +57,6 @@ EXTRA_libmiscutil_la_SOURCES = \
 
 libmiscencoding_la_SOURCES = \
 	base64.c \
-	charset.c \
 	html_quote.c \
 	md5.c \
 	rfc1738.c \
diff --git a/lib/charset.c b/lib/charset.c
deleted file mode 100644
index 0882762823..0000000000
--- a/lib/charset.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (C) 1996-2019 The Squid Software Foundation and contributors
- *
- * Squid software is distributed under GPLv2+ license and includes
- * contributions from numerous individuals and organizations.
- * Please see the COPYING and CONTRIBUTORS files for details.
- */
-
-#include "squid.h"
-#include "charset.h"
-
-/** Convert ISO-LATIN-1 to UTF-8 */
-char *
-latin1_to_utf8(char *out, size_t size, const char *in)
-{
-    unsigned char *p = (unsigned char *)out;
-    for (; *in && size > 2; in++) {
-        unsigned char ch = (unsigned char)*in;
-        if (ch < 0x80) {
-            *p++ = ch;
-            size--;
-        } else {
-            *p++ = (ch >> 6) | 0xc0;
-            size--;
-            *p++ = (ch & 0x3f) | 0x80;
-            size--;
-        }
-    }
-    *p = '\0';
-    if (*in)
-        return NULL;
-    return out;
-}
-
diff --git a/src/auth/Makefile.am b/src/auth/Makefile.am
index e47477b21d..551047f14a 100644
--- a/src/auth/Makefile.am
+++ b/src/auth/Makefile.am
@@ -37,6 +37,8 @@ libauth_la_SOURCES = \
 	SchemesConfig.cc \
 	State.h \
 	State.cc \
+	toUtf.h \
+	toUtf.cc \
 	User.h \
 	User.cc \
 	UserRequest.h \
diff --git a/src/auth/SchemeConfig.cc b/src/auth/SchemeConfig.cc
index a69d7265b1..ae4fb9dc7b 100644
--- a/src/auth/SchemeConfig.cc
+++ b/src/auth/SchemeConfig.cc
@@ -17,6 +17,7 @@
 #include "ConfigParser.h"
 #include "Debug.h"
 #include "format/Format.h"
+#include "errorpage.h"
 #include "globals.h"
 #include "Store.h"
 #include "wordlist.h"
@@ -51,7 +52,7 @@ Auth::SchemeConfig::CreateAuthUser(const char *proxy_auth, AccessLogEntry::Point
         config->keyExtras->assemble(rmb, al, 0);
     }
 
-    return config->decode(proxy_auth, rmb.hasContent() ? rmb.content() : NULL);
+    return config->decode(proxy_auth, al->request, rmb.hasContent() ? rmb.content() : nullptr);
 }
 
 Auth::SchemeConfig *
@@ -178,3 +179,39 @@ Auth::SchemeConfig::done()
     keyExtrasLine.clean();
 }
 
+bool
+Auth::SchemeConfig::isCP1251EncodingAllowed(const HttpRequest *request)
+{
+    String hdr;
+    // without a request or an Accept-Language header, nothing suggests CP1251
+    if (!request || !request->header.getList(Http::HdrType::ACCEPT_LANGUAGE, &hdr))
+        return false;
+
+    char lang[256];
+    size_t pos = 0; // current parsing position in header string
+
+    while (strHdrAcptLangGetItem(hdr, lang, 256, pos)) {
+        /* wildcard uses the configured default language */
+        if (lang[0] == '*' && lang[1] == '\0')
+            return false;
+
+        if ((strncmp(lang, "ru", 2) == 0 // Russian
+                || strncmp(lang, "uk", 2) == 0 // Ukrainian
+                || strncmp(lang, "be", 2) == 0 // Belarusian
+                || strncmp(lang, "bg", 2) == 0 // Bulgarian
+                || strncmp(lang, "sr", 2) == 0)) { // Serbian
+            if (lang[2] == '-') {
+                // a "latn" script subtag, alone or followed by more subtags
+                // (e.g., sr-latn-rs), selects the Latin alphabet: not Cyrillic
+                if (strncmp(lang + 3, "latn", 4) == 0 && (lang[7] == '\0' || lang[7] == '-'))
+                    return false;
+            } else if (xisalpha(lang[2])) {
+                return false; // a longer code that merely starts with the same letters
+            }
+            return true; // the two-letter code alone or with other subtags
+        }
+    }
+
+    return false; // none of the listed languages matched
+}
+
diff --git a/src/auth/SchemeConfig.h b/src/auth/SchemeConfig.h
index 2ce91f7b1d..07c1cd2969 100644
--- a/src/auth/SchemeConfig.h
+++ b/src/auth/SchemeConfig.h
@@ -75,7 +75,7 @@ public:
      \param proxy_auth  Login Pattern to parse.
      \retval *      Details needed to authenticate.
      */
-    virtual UserRequest::Pointer decode(char const *proxy_auth, const char *requestRealm) = 0;
+    virtual UserRequest::Pointer decode(char const *proxy_auth, const HttpRequest *request, const char *requestRealm) = 0;
 
     /**
      * squid is finished with this config, release any unneeded resources.
@@ -133,6 +133,15 @@ public:
     int utf8 = 0; ///< wheter to accept UTF-8 characterset instead of ASCII. default: off
 
 protected:
+    /**
+     * Parses the Accept-Language header of the given request and returns
+     * whether CP1251 encoding is allowed (false if there is no request).
+     *
+     * CP1251 (aka Windows-1251) is an 8-bit character encoding, designed
+     * to cover languages that use the Cyrillic script.
+     */
+    bool isCP1251EncodingAllowed(const HttpRequest *request);
+
     /// RFC 7235 section 2.2 - Protection Space (Realm)
     SBuf realm;
 };
diff --git a/src/auth/basic/Config.cc b/src/auth/basic/Config.cc
index 20e73c623c..2fcc67c766 100644
--- a/src/auth/basic/Config.cc
+++ b/src/auth/basic/Config.cc
@@ -20,14 +20,15 @@
 #include "auth/CredentialsCache.h"
 #include "auth/Gadgets.h"
 #include "auth/State.h"
+#include "auth/toUtf.h"
 #include "base64.h"
 #include "cache_cf.h"
-#include "charset.h"
 #include "helper.h"
 #include "HttpHeaderTools.h"
 #include "HttpReply.h"
 #include "mgr/Registration.h"
 #include "rfc1738.h"
+#include "sbuf/SBuf.h"
 #include "SquidTime.h"
 #include "Store.h"
 #include "util.h"
@@ -76,8 +77,13 @@ void
 Auth::Basic::Config::fixHeader(Auth::UserRequest::Pointer, HttpReply *rep, Http::HdrType hdrType, HttpRequest *)
 {
     if (authenticateProgram) {
-        debugs(29, 9, "Sending type:" << hdrType << " header: 'Basic realm=\"" << realm << "\"'");
-        httpHeaderPutStrf(&rep->header, hdrType, "Basic realm=\"" SQUIDSBUFPH "\"", SQUIDSBUFPRINT(realm));
+        if (utf8) {
+            debugs(29, 9, "Sending type:" << hdrType << " header: 'Basic realm=\"" << realm << "\", charset=\"UTF-8\"'");
+            httpHeaderPutStrf(&rep->header, hdrType, "Basic realm=\"" SQUIDSBUFPH "\", charset=\"UTF-8\"", SQUIDSBUFPRINT(realm));
+        } else {
+            debugs(29, 9, "Sending type:" << hdrType << " header: 'Basic realm=\"" << realm << "\"'");
+            httpHeaderPutStrf(&rep->header, hdrType, "Basic realm=\"" SQUIDSBUFPH "\"", SQUIDSBUFPRINT(realm));
+        }
     }
 }
 
@@ -149,7 +155,7 @@ authenticateBasicStats(StoreEntry * sentry)
 }
 
 char *
-Auth::Basic::Config::decodeCleartext(const char *httpAuthHeader)
+Auth::Basic::Config::decodeCleartext(const char *httpAuthHeader, const HttpRequest *request)
 {
     const char *proxy_auth = httpAuthHeader;
 
@@ -176,6 +182,13 @@ Auth::Basic::Config::decodeCleartext(const char *httpAuthHeader)
     if (base64_decode_update(&ctx, &dstLen, reinterpret_cast<uint8_t*>(cleartext), srcLen, eek) && base64_decode_final(&ctx)) {
         cleartext[dstLen] = '\0';
 
+        if (utf8 && !isValidUtf8String(cleartext, cleartext + dstLen)) {
+            auto str = isCP1251EncodingAllowed(request) ?
+                       Cp1251ToUtf8(cleartext) : Latin1ToUtf8(cleartext);
+            safe_free(cleartext);
+            cleartext = xstrdup(str.c_str());
+        }
+
         /*
          * Don't allow NL or CR in the credentials.
          * Oezguer Kesim <oec@codeblau.de>
@@ -203,13 +216,13 @@ Auth::Basic::Config::decodeCleartext(const char *httpAuthHeader)
  * descriptive message to the user.
  */
 Auth::UserRequest::Pointer
-Auth::Basic::Config::decode(char const *proxy_auth, const char *aRequestRealm)
+Auth::Basic::Config::decode(char const *proxy_auth, const HttpRequest *request, const char *aRequestRealm)
 {
     Auth::UserRequest::Pointer auth_user_request = dynamic_cast<Auth::UserRequest*>(new Auth::Basic::UserRequest);
     /* decode the username */
 
     // retrieve the cleartext (in a dynamically allocated char*)
-    char *cleartext = decodeCleartext(proxy_auth);
+    const auto cleartext = decodeCleartext(proxy_auth, request);
 
     // empty header? no auth details produced...
     if (!cleartext)
diff --git a/src/auth/basic/Config.h b/src/auth/basic/Config.h
index ec94233355..932308cc05 100644
--- a/src/auth/basic/Config.h
+++ b/src/auth/basic/Config.h
@@ -28,7 +28,7 @@ public:
     Config();
     virtual bool active() const;
     virtual bool configured() const;
-    virtual Auth::UserRequest::Pointer decode(char const *proxy_auth, const char *requestRealm);
+    virtual Auth::UserRequest::Pointer decode(char const *proxy_auth, const HttpRequest *request, const char *requestRealm);
     virtual void done();
     virtual void rotateHelpers();
     virtual bool dump(StoreEntry *, const char *, Auth::SchemeConfig *) const;
@@ -44,7 +44,7 @@ public:
     int casesensitive;
 
 private:
-    char * decodeCleartext(const char *httpAuthHeader);
+    char * decodeCleartext(const char *httpAuthHeader, const HttpRequest *request);
 };
 
 } // namespace Basic
diff --git a/src/auth/basic/UserRequest.cc b/src/auth/basic/UserRequest.cc
index 2b3fa53d78..c247cdf995 100644
--- a/src/auth/basic/UserRequest.cc
+++ b/src/auth/basic/UserRequest.cc
@@ -12,7 +12,6 @@
 #include "auth/basic/UserRequest.h"
 #include "auth/QueueNode.h"
 #include "auth/State.h"
-#include "charset.h"
 #include "Debug.h"
 #include "format/Format.h"
 #include "helper.h"
@@ -130,15 +129,10 @@ Auth::Basic::UserRequest::startHelperLookup(HttpRequest *request, AccessLogEntry
     char buf[HELPER_INPUT_BUFFER];
     static char usern[HELPER_INPUT_BUFFER];
     static char pass[HELPER_INPUT_BUFFER];
-    if (static_cast<Auth::Basic::Config*>(user()->config)->utf8) {
-        latin1_to_utf8(usern, sizeof(usern), user()->username());
-        latin1_to_utf8(pass, sizeof(pass), basic_auth->passwd);
-        xstrncpy(usern, rfc1738_escape(usern), sizeof(usern));
-        xstrncpy(pass, rfc1738_escape(pass), sizeof(pass));
-    } else {
-        xstrncpy(usern, rfc1738_escape(user()->username()), sizeof(usern));
-        xstrncpy(pass, rfc1738_escape(basic_auth->passwd), sizeof(pass));
-    }
+
+    xstrncpy(usern, rfc1738_escape(user()->username()), sizeof(usern));
+    xstrncpy(pass, rfc1738_escape(basic_auth->passwd), sizeof(pass));
+
     int sz = 0;
     if (const char *keyExtras = helperRequestKeyExtras(request, al))
         sz = snprintf(buf, sizeof(buf), "%s %s %s\n", usern, pass, keyExtras);
diff --git a/src/auth/digest/Config.cc b/src/auth/digest/Config.cc
index 5fd5a95b16..2f15becdfb 100644
--- a/src/auth/digest/Config.cc
+++ b/src/auth/digest/Config.cc
@@ -20,6 +20,7 @@
 #include "auth/digest/UserRequest.h"
 #include "auth/Gadgets.h"
 #include "auth/State.h"
+#include "auth/toUtf.h"
 #include "base/LookupTable.h"
 #include "cache_cf.h"
 #include "event.h"
@@ -31,6 +32,7 @@
 #include "mgr/Registration.h"
 #include "rfc2617.h"
 #include "sbuf/SBuf.h"
+#include "sbuf/StringConvert.h"
 #include "SquidTime.h"
 #include "Store.h"
 #include "StrList.h"
@@ -733,7 +735,7 @@ authDigestLogUsername(char *username, Auth::UserRequest::Pointer auth_user_reque
  * Auth_user structure.
  */
 Auth::UserRequest::Pointer
-Auth::Digest::Config::decode(char const *proxy_auth, const char *aRequestRealm)
+Auth::Digest::Config::decode(char const *proxy_auth, const HttpRequest *request, const char *aRequestRealm)
 {
     const char *item;
     const char *p;
@@ -810,8 +812,14 @@ Auth::Digest::Config::decode(char const *proxy_auth, const char *aRequestRealm)
         switch (t) {
         case DIGEST_USERNAME:
             safe_free(username);
-            if (value.size() != 0)
+            if (value.size() != 0) {
+                const auto v = value.termedBuf();
+                if (utf8 && !isValidUtf8String(v, v + value.size())) {
+                    auto str = isCP1251EncodingAllowed(request) ? Cp1251ToUtf8(v) : Latin1ToUtf8(v);
+                    value = SBufToString(str);
+                }
                 username = xstrndup(value.rawBuf(), value.size() + 1);
+            }
             debugs(29, 9, "Found Username '" << username << "'");
             break;
 
diff --git a/src/auth/digest/Config.h b/src/auth/digest/Config.h
index 886dbc682e..51d6754dd3 100644
--- a/src/auth/digest/Config.h
+++ b/src/auth/digest/Config.h
@@ -77,7 +77,7 @@ public:
     Config();
     virtual bool active() const;
     virtual bool configured() const;
-    virtual Auth::UserRequest::Pointer decode(char const *proxy_auth, const char *requestRealm);
+    virtual Auth::UserRequest::Pointer decode(char const *proxy_auth, const HttpRequest *request, const char *requestRealm);
     virtual void done();
     virtual void rotateHelpers();
     virtual bool dump(StoreEntry *, const char *, Auth::SchemeConfig *) const;
diff --git a/src/auth/digest/UserRequest.cc b/src/auth/digest/UserRequest.cc
index 9caa27b4b1..0b2c8e904a 100644
--- a/src/auth/digest/UserRequest.cc
+++ b/src/auth/digest/UserRequest.cc
@@ -12,7 +12,6 @@
 #include "auth/digest/User.h"
 #include "auth/digest/UserRequest.h"
 #include "auth/State.h"
-#include "charset.h"
 #include "format/Format.h"
 #include "helper.h"
 #include "helper/Reply.h"
@@ -298,19 +297,10 @@ Auth::Digest::UserRequest::startHelperLookup(HttpRequest *request, AccessLogEntr
     }
 
     const char *keyExtras = helperRequestKeyExtras(request, al);
-    if (static_cast<Auth::Digest::Config*>(Auth::SchemeConfig::Find("digest"))->utf8) {
-        char userstr[1024];
-        latin1_to_utf8(userstr, sizeof(userstr), user()->username());
-        if (keyExtras)
-            snprintf(buf, 8192, "\"%s\":\"%s\" %s\n", userstr, realm, keyExtras);
-        else
-            snprintf(buf, 8192, "\"%s\":\"%s\"\n", userstr, realm);
-    } else {
-        if (keyExtras)
-            snprintf(buf, 8192, "\"%s\":\"%s\" %s\n", user()->username(), realm, keyExtras);
-        else
-            snprintf(buf, 8192, "\"%s\":\"%s\"\n", user()->username(), realm);
-    }
+    if (keyExtras)
+        snprintf(buf, 8192, "\"%s\":\"%s\" %s\n", user()->username(), realm, keyExtras);
+    else
+        snprintf(buf, 8192, "\"%s\":\"%s\"\n", user()->username(), realm);
 
     helperSubmit(digestauthenticators, buf, Auth::Digest::UserRequest::HandleReply,
                  new Auth::StateData(this, handler, data));
diff --git a/src/auth/negotiate/Config.cc b/src/auth/negotiate/Config.cc
index 7debffb7bf..1b60369096 100644
--- a/src/auth/negotiate/Config.cc
+++ b/src/auth/negotiate/Config.cc
@@ -214,7 +214,7 @@ authenticateNegotiateStats(StoreEntry * sentry)
  * Auth_user structure.
  */
 Auth::UserRequest::Pointer
-Auth::Negotiate::Config::decode(char const *proxy_auth, const char *aRequestRealm)
+Auth::Negotiate::Config::decode(char const *proxy_auth, const HttpRequest *, const char *aRequestRealm)
 {
     Auth::Negotiate::User *newUser = new Auth::Negotiate::User(Auth::SchemeConfig::Find("negotiate"), aRequestRealm);
     Auth::UserRequest *auth_user_request = new Auth::Negotiate::UserRequest();
diff --git a/src/auth/negotiate/Config.h b/src/auth/negotiate/Config.h
index 0b27b8ff74..d2f9722ae4 100644
--- a/src/auth/negotiate/Config.h
+++ b/src/auth/negotiate/Config.h
@@ -27,7 +27,7 @@ class Config : public Auth::SchemeConfig
 public:
     virtual bool active() const;
     virtual bool configured() const;
-    virtual Auth::UserRequest::Pointer decode(char const *proxy_auth, const char *requestRealm);
+    virtual Auth::UserRequest::Pointer decode(char const *proxy_auth, const HttpRequest *request, const char *requestRealm);
     virtual void done();
     virtual void rotateHelpers();
     virtual void fixHeader(Auth::UserRequest::Pointer, HttpReply *, Http::HdrType, HttpRequest *);
diff --git a/src/auth/ntlm/Config.cc b/src/auth/ntlm/Config.cc
index a5703c750c..4865397286 100644
--- a/src/auth/ntlm/Config.cc
+++ b/src/auth/ntlm/Config.cc
@@ -205,7 +205,7 @@ authenticateNTLMStats(StoreEntry * sentry)
  * Auth_user structure.
  */
 Auth::UserRequest::Pointer
-Auth::Ntlm::Config::decode(char const *proxy_auth, const char *aRequestRealm)
+Auth::Ntlm::Config::decode(char const *proxy_auth, const HttpRequest *, const char *aRequestRealm)
 {
     Auth::Ntlm::User *newUser = new Auth::Ntlm::User(Auth::SchemeConfig::Find("ntlm"), aRequestRealm);
     Auth::UserRequest::Pointer auth_user_request = new Auth::Ntlm::UserRequest();
diff --git a/src/auth/ntlm/Config.h b/src/auth/ntlm/Config.h
index e234f49b86..183d7c6922 100644
--- a/src/auth/ntlm/Config.h
+++ b/src/auth/ntlm/Config.h
@@ -30,7 +30,7 @@ class Config : public Auth::SchemeConfig
 public:
     virtual bool active() const;
     virtual bool configured() const;
-    virtual Auth::UserRequest::Pointer decode(char const *proxy_auth, const char *requestRealm);
+    virtual Auth::UserRequest::Pointer decode(char const *proxy_auth, const HttpRequest *request, const char *requestRealm);
     virtual void done();
     virtual void rotateHelpers();
     virtual void fixHeader(Auth::UserRequest::Pointer, HttpReply *, Http::HdrType, HttpRequest *);
diff --git a/src/auth/toUtf.cc b/src/auth/toUtf.cc
new file mode 100644
index 0000000000..0b5a5dc647
--- /dev/null
+++ b/src/auth/toUtf.cc
@@ -0,0 +1,175 @@
+/*
+ * Copyright (C) 1996-2019 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#include "squid.h"
+#include "sbuf/SBuf.h"
+#include "auth/toUtf.h"
+
+SBuf
+Latin1ToUtf8(const char *in)
+{
+    SBuf utf8;
+    if (!in)
+        return utf8; // no input yields an empty result
+
+    // walk the input up to its terminating NUL, one Latin-1 byte at a time
+    for (auto src = in; *src; ++src) {
+        const auto byte = static_cast<unsigned char>(*src);
+        if (byte < 0x80) {
+            utf8.append(byte); // ASCII bytes are copied as is
+            continue;
+        }
+        // all other bytes become a two-byte sequence
+        utf8.append(static_cast<char>(0xc0 | (byte >> 6)));
+        utf8.append(static_cast<char>(0x80 | (byte & 0x3f)));
+    }
+    return utf8;
+}
+
+SBuf
+Cp1251ToUtf8(const char *in)
+{
+    static const unsigned char firstByteMark[] = { 0x00, 0x00, 0xC0, 0xE0 }; // first-byte prefix, indexed by sequence length
+    static const unsigned unicodevalues[] = { // Unicode code points of CP1251 bytes 0x80..0xBF
+        0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
+        0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
+        0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+        0xFFFD, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F, // byte 0x98 becomes 0xFFFD
+        0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
+        0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
+        0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
+        0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457
+    };
+    SBuf result;
+
+    if (!in)
+        return result; // null input yields an empty result
+    // convert one input byte per iteration, stopping at the terminating NUL
+    for (; *in; in++) {
+        const auto ch = static_cast<unsigned char>(*in);
+        unsigned u = 0;
+        size_t bytesToWrite = 0;
+        char sequence[4] = {0, 0, 0, 0};
+        // step 1: map the CP1251 byte to its Unicode code point
+        if (ch < 0x80)
+            u = ch;
+        else if (ch >= 0xC0 && ch <= 0xFF) // maps to 0x0410..0x044F
+            u = 0x0350 + ch;
+        else
+            u = unicodevalues[ch - 0x80];
+        // step 2: pick the UTF-8 sequence length; no mapped value needs more than 3 bytes
+        if (u < 0x80)
+            bytesToWrite = 1;
+        else if (u < 0x800)
+            bytesToWrite = 2;
+        else
+            bytesToWrite = 3;
+        // step 3: fill the sequence from its last byte to its first, 6 bits at a time
+        switch (bytesToWrite) {
+        case 3:
+            sequence[2] = static_cast<char>(u & 0x3f) | 0x80;
+            u >>= 6;
+        // fall through
+        case 2:
+            sequence[1] = static_cast<char>(u & 0x3f) | 0x80;
+            u >>= 6;
+        // fall through
+        case 1:
+            sequence[0] = static_cast<char>(u)        | firstByteMark[bytesToWrite];
+            // last case: the remaining bits and the length prefix form the first byte
+        }
+        result.append(sequence, bytesToWrite);
+    }
+    return result;
+}
+
+/**
+ * \param b0 the first byte of a UTF-8 code point
+ * \returns the number of bytes in the code point that b0 starts
+ * \retval 0 b0 cannot start a code point
+ */
+static inline size_t
+utf8CodePointLength(const char b0)
+{
+    const auto lead = static_cast<unsigned char>(b0);
+    if (lead < 0x80)
+        return 1; // 0xxxxxxx
+    if (lead < 0xC0)
+        return 0; // 10xxxxxx: a continuation byte
+    if (lead < 0xE0)
+        return 2; // 110xxxxx
+    if (lead < 0xF0)
+        return 3; // 1110xxxx
+    if (lead < 0xF8)
+        return 4; // 11110xxx
+    return 0; // 11111xxx
+}
+
+/**
+ * Checks whether the given bytes form exactly one well-formed UTF-8 code point.
+ * \param source the first byte of the candidate code point
+ * \param length the byte count announced by the first byte; values outside
+ * the 1..4 range are rejected because UTF-8 code points use at most 4 bytes
+ */
+static bool
+isValidUtf8CodePoint(const unsigned char* source, const size_t length)
+{
+    unsigned char a;
+    const unsigned char* srcptr = source + length; // bytes are checked last-to-first
+    switch (length) {
+    default:
+        return false;
+    // Everything else falls through when "true"...
+    case 4:
+        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; // fourth byte
+    case 3:
+        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; // third byte
+    case 2:
+        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; // second byte
+        // some first bytes narrow the valid range of the second byte further
+        switch (*source) {
+        // no fall-through in this inner switch
+        case 0xE0:
+            if (a < 0xA0) return false; // overlong 3-byte form
+            break;
+        case 0xED:
+            if (a > 0x9F) return false; // UTF-16 surrogates U+D800..U+DFFF
+            break;
+        case 0xF0:
+            if (a < 0x90) return false; // overlong 4-byte form
+            break;
+        case 0xF4:
+            if (a > 0x8F) return false; // beyond U+10FFFF
+            break;
+        default:
+            // the generic 0x80..0xBF check above suffices
+            break;
+        }
+
+    case 1:
+        if (*source >= 0x80 && *source < 0xC2) return false; // continuation byte or overlong 2-byte form
+    }
+    if (*source > 0xF4) // such first bytes would encode values beyond U+10FFFF
+        return false;
+    return true;
+}
+
+/// \returns whether [source, sourceEnd) holds only complete, valid UTF-8 code points
+bool
+isValidUtf8String(const char *source, const char *sourceEnd) {
+    for (auto pos = source; pos < sourceEnd;) {
+        const auto codePointSize = utf8CodePointLength(*pos);
+        if (pos + codePointSize > sourceEnd)
+            return false; // the last code point is truncated
+        if (!isValidUtf8CodePoint(reinterpret_cast<const unsigned char*>(pos), codePointSize))
+            return false;
+        pos += codePointSize;
+    }
+    return true; // an empty range is valid
+}
+
diff --git a/include/charset.h b/src/auth/toUtf.h
similarity index 53%
rename from include/charset.h
rename to src/auth/toUtf.h
index 92e1b0d5be..d902e087bc 100644
--- a/include/charset.h
+++ b/src/auth/toUtf.h
@@ -9,13 +9,16 @@
 #ifndef _SQUID_CHARSET_H
 #define _SQUID_CHARSET_H
 
-#ifdef __cplusplus
-extern "C"
-#else
-extern
-#endif
+#include "sbuf/forward.h"
 
-char *latin1_to_utf8(char *out, size_t size, const char *in);
+/// converts a NUL-terminated ISO-LATIN-1 string to UTF-8 (nullptr yields an empty SBuf)
+SBuf Latin1ToUtf8(const char *in);
+
+/// converts a NUL-terminated CP1251 string to UTF-8 (nullptr yields an empty SBuf)
+SBuf Cp1251ToUtf8(const char *in);
+
+/// returns whether [source, sourceEnd) is a valid (or empty) sequence of UTF-8 code points
+bool isValidUtf8String(const char *source, const char *sourceEnd);
 
 #endif /* _SQUID_CHARSET_H */
 
diff --git a/src/cf.data.pre b/src/cf.data.pre
index 1b3191962b..f69f67198c 100644
--- a/src/cf.data.pre
+++ b/src/cf.data.pre
@@ -673,12 +673,22 @@ DOC_START
 		For Basic and Digest this parameter is ignored.
 
 	"utf8" on|off
-		HTTP uses iso-latin-1 as character set, while some
-		authentication backends such as LDAP expects UTF-8. If this is
-		set to on Squid will translate the HTTP iso-latin-1 charset to
-		UTF-8 before sending the username and password to the helper.
-
-		For NTLM and Negotiate this parameter is ignored.
+		Useful for sending credentials to authentication backends that
+		expect UTF-8 encoding (e.g., LDAP).
+
+		When this option is enabled, Squid forwards valid UTF-8
+		credentials unchanged. Otherwise, Squid uses the HTTP
+		Accept-Language request header to guess their encoding
+		(CP1251 or ISO-Latin-1) and converts them into UTF-8.
+
+		When this option is disabled and by default, Squid sends
+		credentials in their original (i.e. received) encoding.
+
+		This parameter is only honored for Basic and Digest schemes.
+		For Basic, the entire username:password credentials are
+		checked and, if necessary, re-encoded. For Digest -- just the
+		username component. For NTLM and Negotiate schemes, this
+		parameter is ignored.
 
 IF HAVE_AUTH_MODULE_BASIC
 	=== Basic authentication parameters ===