]>
git.ipfire.org Git - thirdparty/squid.git/blob - src/auth/toUtf.cc
2 * Copyright (C) 1996-2022 The Squid Software Foundation and contributors
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
10 #include "auth/toUtf.h"
11 #include "sbuf/SBuf.h"
16 Latin1ToUtf8(const char *in
)
24 const auto ch
= static_cast<unsigned char>(*in
);
29 result
.append(static_cast<char>((ch
>> 6) | 0xc0));
30 result
.append(static_cast<char>((ch
& 0x3f) | 0x80));
37 Cp1251ToUtf8(const char *in
)
39 static const unsigned char firstByteMark
[] = { 0x00, 0x00, 0xC0, 0xE0 };
40 static const unsigned unicodevalues
[] = {
41 0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
42 0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
43 0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
44 0xFFFD, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
45 0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
46 0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
47 0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
48 0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457
56 const auto ch
= static_cast<unsigned char>(*in
);
58 size_t bytesToWrite
= 0;
59 char sequence
[4] = {0, 0, 0, 0};
61 static_assert(std::numeric_limits
<unsigned char>::max() == 0xFFu
,
62 "we require char to be exactly 8 bits");
65 else if (ch
>= 0xC0) // 0x0410..0x044F
68 u
= unicodevalues
[ch
- 0x80];
77 switch (bytesToWrite
) {
79 sequence
[2] = static_cast<char>(u
& 0x3f) | 0x80;
83 sequence
[1] = static_cast<char>(u
& 0x3f) | 0x80;
87 sequence
[0] = static_cast<char>(u
) | firstByteMark
[bytesToWrite
];
89 result
.append(sequence
, bytesToWrite
);
95 * \returns the length of a UTF-8 code point that starts at the given byte
96 * \retval 0 indicates an invalid code point
98 * \param b0 the first byte of a UTF-8 code point
101 utf8CodePointLength(const char b0
)
103 if ((b0
& 0x80) == 0)
105 if ((b0
& 0xC0) != 0xC0)
106 return 0; // invalid code point
107 if ((b0
& 0xE0) == 0xC0)
109 if ((b0
& 0xF0) == 0xE0)
111 if ((b0
& 0xF8) == 0xF0)
113 return 0; // invalid code point
117 * Utility routine to tell whether a sequence of bytes is valid UTF-8.
118 * This must be called with the length pre-determined by the first byte.
119 * If presented with a length > 4, this returns false. The Unicode
120 * definition of UTF-8 goes up to 4-byte code points.
123 isValidUtf8CodePoint(const unsigned char* source
, const size_t length
)
126 const unsigned char* srcptr
= source
+ length
;
130 // Everything else falls through when "true"...
132 if ((a
= (*--srcptr
)) < 0x80 || a
> 0xBF) return false;
133 /* [[fallthrough]] */
135 if ((a
= (*--srcptr
)) < 0x80 || a
> 0xBF) return false;
136 /* [[fallthrough]] */
138 if ((a
= (*--srcptr
)) > 0xBF) return false;
141 // no fall-through in this inner switch
143 if (a
< 0xA0) return false;
146 if (a
> 0x9F) return false;
149 if (a
< 0x90) return false;
152 if (a
> 0x8F) return false;
155 if (a
< 0x80) return false;
158 /* [[fallthrough]] */
161 if (*source
>= 0x80 && *source
< 0xC2) return false;
169 * \returns whether the given input is a valid (or empty) sequence of UTF-8 code points
172 isValidUtf8String(const char *source
, const char *sourceEnd
) {
173 while (source
< sourceEnd
) {
174 const auto length
= utf8CodePointLength(*source
);
175 if (source
+ length
> sourceEnd
|| !isValidUtf8CodePoint(reinterpret_cast<const unsigned char*>(source
), length
))
179 return true; // including zero-length input