]> git.ipfire.org Git - thirdparty/squid.git/blob - src/auth/toUtf.cc
Source Format Enforcement (#963)
[thirdparty/squid.git] / src / auth / toUtf.cc
1 /*
2 * Copyright (C) 1996-2022 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9 #include "squid.h"
10 #include "auth/toUtf.h"
11 #include "sbuf/SBuf.h"
12
13 #include <limits>
14
15 SBuf
16 Latin1ToUtf8(const char *in)
17 {
18 SBuf result;
19
20 if (!in)
21 return result;
22
23 for (; *in; in++) {
24 const auto ch = static_cast<unsigned char>(*in);
25
26 if (ch < 0x80) {
27 result.append(ch);
28 } else {
29 result.append(static_cast<char>((ch >> 6) | 0xc0));
30 result.append(static_cast<char>((ch & 0x3f) | 0x80));
31 }
32 }
33 return result;
34 }
35
36 SBuf
37 Cp1251ToUtf8(const char *in)
38 {
39 static const unsigned char firstByteMark[] = { 0x00, 0x00, 0xC0, 0xE0 };
40 static const unsigned unicodevalues[] = {
41 0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
42 0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
43 0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
44 0xFFFD, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
45 0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
46 0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
47 0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
48 0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457
49 };
50 SBuf result;
51
52 if (!in)
53 return result;
54
55 for (; *in; in++) {
56 const auto ch = static_cast<unsigned char>(*in);
57 unsigned u = 0;
58 size_t bytesToWrite = 0;
59 char sequence[4] = {0, 0, 0, 0};
60
61 static_assert(std::numeric_limits<unsigned char>::max() == 0xFFu,
62 "we require char to be exactly 8 bits");
63 if (ch < 0x80)
64 u = ch;
65 else if (ch >= 0xC0) // 0x0410..0x044F
66 u = 0x0350 + ch;
67 else
68 u = unicodevalues[ch - 0x80];
69
70 if (u < 0x80)
71 bytesToWrite = 1;
72 else if (u < 0x800)
73 bytesToWrite = 2;
74 else
75 bytesToWrite = 3;
76
77 switch (bytesToWrite) {
78 case 3:
79 sequence[2] = static_cast<char>(u & 0x3f) | 0x80;
80 u >>= 6;
81 /* [[fallthrough]] */
82 case 2:
83 sequence[1] = static_cast<char>(u & 0x3f) | 0x80;
84 u >>= 6;
85 /* [[fallthrough]] */
86 case 1:
87 sequence[0] = static_cast<char>(u) | firstByteMark[bytesToWrite];
88 }
89 result.append(sequence, bytesToWrite);
90 }
91 return result;
92 }
93
/**
 * Classifies the first byte of a UTF-8 code point by its high bits.
 *
 * \param b0 the first byte of a UTF-8 code point
 * \returns the number of bytes (1 to 4) in the code point starting at b0
 * \retval 0 b0 cannot start a code point
 */
static inline size_t
utf8CodePointLength(const char b0)
{
    // work on the unsigned value so that the checks do not depend on char signedness
    const auto lead = static_cast<unsigned char>(b0);

    if (lead < 0x80)
        return 1; // 0xxxxxxx
    if (lead < 0xC0)
        return 0; // 10xxxxxx: invalid code point
    if (lead < 0xE0)
        return 2; // 110xxxxx
    if (lead < 0xF0)
        return 3; // 1110xxxx
    if (lead < 0xF8)
        return 4; // 11110xxx
    return 0; // 11111xxx: invalid code point
}
115
/**
 * Utility routine to tell whether a sequence of bytes is valid UTF-8.
 * This must be called with the length pre-determined by the first byte;
 * the function does not verify that the length matches the first byte.
 * If presented with a length of 0 or a length > 4, this returns false.
 * The Unicode definition of UTF-8 goes up to 4-byte code points.
 *
 * \param source the first byte of the candidate code point
 * \param length the number of bytes in the candidate code point
 * \returns whether source[0..length-1] is a well-formed UTF-8 code point
 */
static bool
isValidUtf8CodePoint(const unsigned char* source, const size_t length)
{
    if (length < 1 || length > 4)
        return false;

    // Every byte after the first one must be a continuation byte (10xxxxxx).
    // This check applies to all lead bytes, including 0xED and 0xF4, so that
    // sequences such as ED 41 80 are rejected.
    for (size_t pos = 1; pos < length; ++pos) {
        const unsigned char trail = source[pos];
        if (trail < 0x80 || trail > 0xBF)
            return false;
    }

    const unsigned char lead = source[0];

    if (length >= 2) {
        // some lead bytes further restrict the range of the second byte
        const unsigned char second = source[1];
        switch (lead) {
        case 0xE0: // smaller values would be overlong 3-byte encodings
            if (second < 0xA0)
                return false;
            break;
        case 0xED: // larger values would encode surrogates U+D800..U+DFFF
            if (second > 0x9F)
                return false;
            break;
        case 0xF0: // smaller values would be overlong 4-byte encodings
            if (second < 0x90)
                return false;
            break;
        case 0xF4: // larger values would exceed U+10FFFF
            if (second > 0x8F)
                return false;
            break;
        default:
            break;
        }
    }

    // 0x80..0xBF are continuation bytes; 0xC0 and 0xC1 only start overlong encodings
    if (lead >= 0x80 && lead < 0xC2)
        return false;

    // lead bytes above 0xF4 would encode values beyond U+10FFFF
    return lead <= 0xF4;
}
167
168 /**
169 * \returns whether the given input is a valid (or empty) sequence of UTF-8 code points
170 */
171 bool
172 isValidUtf8String(const char *source, const char *sourceEnd) {
173 while (source < sourceEnd) {
174 const auto length = utf8CodePointLength(*source);
175 if (source + length > sourceEnd || !isValidUtf8CodePoint(reinterpret_cast<const unsigned char*>(source), length))
176 return false;
177 source += length;
178 }
179 return true; // including zero-length input
180 }
181