]>
git.ipfire.org Git - thirdparty/squid.git/blob - src/auth/toUtf.cc
2 * Copyright (C) 1996-2019 The Squid Software Foundation and contributors
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
10 #include "sbuf/SBuf.h"
11 #include "auth/toUtf.h"
14 Latin1ToUtf8(const char *in
)
22 const auto ch
= static_cast<unsigned char>(*in
);
27 result
.append(static_cast<char>((ch
>> 6) | 0xc0));
28 result
.append(static_cast<char>((ch
& 0x3f) | 0x80));
35 Cp1251ToUtf8(const char *in
)
37 static const unsigned char firstByteMark
[] = { 0x00, 0x00, 0xC0, 0xE0 };
38 static const unsigned unicodevalues
[] = {
39 0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
40 0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
41 0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
42 0xFFFD, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
43 0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
44 0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
45 0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
46 0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457
54 const auto ch
= static_cast<unsigned char>(*in
);
56 size_t bytesToWrite
= 0;
57 char sequence
[4] = {0, 0, 0, 0};
61 else if (ch
>= 0xC0 && ch
<= 0xFF) // 0x0410..0x044F
64 u
= unicodevalues
[ch
- 0x80];
73 switch (bytesToWrite
) {
75 sequence
[2] = static_cast<char>(u
& 0x3f) | 0x80;
79 sequence
[1] = static_cast<char>(u
& 0x3f) | 0x80;
83 sequence
[0] = static_cast<char>(u
) | firstByteMark
[bytesToWrite
];
86 result
.append(sequence
, bytesToWrite
);
92 * \returns the length of a UTF-8 code point that starts at the given byte
93 * \retval 0 indicates an invalid code point
95 * \param b0 the first byte of a UTF-8 code point
98 utf8CodePointLength(const char b0
)
100 if ((b0
& 0x80) == 0)
102 if ((b0
& 0xC0) != 0xC0)
103 return 0; // invalid code point
104 if ((b0
& 0xE0) == 0xC0)
106 if ((b0
& 0xF0) == 0xE0)
108 if ((b0
& 0xF8) == 0xF0)
110 return 0; // invalid code point
114 * Utility routine to tell whether a sequence of bytes is valid UTF-8.
115 * This must be called with the length pre-determined by the first byte.
116 * If presented with a length > 4, this returns false. The Unicode
117 * definition of UTF-8 goes up to 4-byte code points.
120 isValidUtf8CodePoint(const unsigned char* source
, const size_t length
)
123 const unsigned char* srcptr
= source
+ length
;
127 // Everything else falls through when "true"...
129 if ((a
= (*--srcptr
)) < 0x80 || a
> 0xBF) return false;
131 if ((a
= (*--srcptr
)) < 0x80 || a
> 0xBF) return false;
133 if ((a
= (*--srcptr
)) > 0xBF) return false;
136 // no fall-through in this inner switch
138 if (a
< 0xA0) return false;
141 if (a
> 0x9F) return false;
144 if (a
< 0x90) return false;
147 if (a
> 0x8F) return false;
150 if (a
< 0x80) return false;
155 if (*source
>= 0x80 && *source
< 0xC2) return false;
163 * \returns whether the given input is a valid (or empty) sequence of UTF-8 code points
166 isValidUtf8String(const char *source
, const char *sourceEnd
) {
167 while (source
< sourceEnd
) {
168 const auto length
= utf8CodePointLength(*source
);
169 if (source
+ length
> sourceEnd
|| !isValidUtf8CodePoint(reinterpret_cast<const unsigned char*>(source
), length
))
173 return true; // including zero-length input