]> git.ipfire.org Git - thirdparty/squid.git/blob - src/auth/toUtf.cc
Smarter auth_param utf8 handling, including CP1251 support (#480)
[thirdparty/squid.git] / src / auth / toUtf.cc
1 /*
2 * Copyright (C) 1996-2019 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9 #include "squid.h"
10 #include "sbuf/SBuf.h"
11 #include "auth/toUtf.h"
12
13 SBuf
14 Latin1ToUtf8(const char *in)
15 {
16 SBuf result;
17
18 if (!in)
19 return result;
20
21 for (; *in; in++) {
22 const auto ch = static_cast<unsigned char>(*in);
23
24 if (ch < 0x80) {
25 result.append(ch);
26 } else {
27 result.append(static_cast<char>((ch >> 6) | 0xc0));
28 result.append(static_cast<char>((ch & 0x3f) | 0x80));
29 }
30 }
31 return result;
32 }
33
34 SBuf
35 Cp1251ToUtf8(const char *in)
36 {
37 static const unsigned char firstByteMark[] = { 0x00, 0x00, 0xC0, 0xE0 };
38 static const unsigned unicodevalues[] = {
39 0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
40 0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
41 0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
42 0xFFFD, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
43 0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
44 0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
45 0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
46 0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457
47 };
48 SBuf result;
49
50 if (!in)
51 return result;
52
53 for (; *in; in++) {
54 const auto ch = static_cast<unsigned char>(*in);
55 unsigned u = 0;
56 size_t bytesToWrite = 0;
57 char sequence[4] = {0, 0, 0, 0};
58
59 if (ch < 0x80)
60 u = ch;
61 else if (ch >= 0xC0 && ch <= 0xFF) // 0x0410..0x044F
62 u = 0x0350 + ch;
63 else
64 u = unicodevalues[ch - 0x80];
65
66 if (u < 0x80)
67 bytesToWrite = 1;
68 else if (u < 0x800)
69 bytesToWrite = 2;
70 else
71 bytesToWrite = 3;
72
73 switch (bytesToWrite) {
74 case 3:
75 sequence[2] = static_cast<char>(u & 0x3f) | 0x80;
76 u >>= 6;
77 // fall through
78 case 2:
79 sequence[1] = static_cast<char>(u & 0x3f) | 0x80;
80 u >>= 6;
81 // fall through
82 case 1:
83 sequence[0] = static_cast<char>(u) | firstByteMark[bytesToWrite];
84 // fall through
85 }
86 result.append(sequence, bytesToWrite);
87 }
88 return result;
89 }
90
/**
 * Classifies a byte as the start of a UTF-8 code point.
 *
 * \param b0 the first byte of a UTF-8 code point
 * \returns the number of bytes (1 to 4) in the code point announced by b0
 * \retval 0 b0 cannot start a code point (continuation byte or 0xF8 and above)
 */
static inline size_t
utf8CodePointLength(const char b0)
{
    // Work on the unsigned value so that plain range comparisons are
    // meaningful regardless of whether char is signed on this platform.
    const auto lead = static_cast<unsigned char>(b0);

    if (lead < 0x80)
        return 1; // 0xxxxxxx
    if (lead < 0xC0)
        return 0; // 10xxxxxx: a continuation byte, not a lead byte
    if (lead < 0xE0)
        return 2; // 110xxxxx
    if (lead < 0xF0)
        return 3; // 1110xxxx
    if (lead < 0xF8)
        return 4; // 11110xxx
    return 0; // 11111xxx: invalid code point
}
112
/**
 * Utility routine to tell whether a sequence of bytes is valid UTF-8.
 * This must be called with the length pre-determined by the first byte;
 * the function does not verify that the length matches the first byte.
 * If presented with a length of 0 or a length > 4, this returns false.
 * The Unicode definition of UTF-8 goes up to 4-byte code points.
 *
 * \param source the first byte of the candidate code point
 * \param length the number of bytes to examine, starting at source
 * \returns whether those bytes pass the UTF-8 well-formedness checks
 */
static bool
isValidUtf8CodePoint(const unsigned char* source, const size_t length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // The third and fourth bytes, when present, must be continuation bytes
    for (size_t pos = length - 1; pos >= 2; --pos) {
        if (source[pos] < 0x80 || source[pos] > 0xBF)
            return false;
    }

    if (length >= 2) {
        const unsigned char second = source[1];

        // The second byte must be a continuation byte for every lead byte.
        // Checking only the upper bound here would let sequences such as
        // ED 41 80 or F4 20 80 80 through, because the 0xED and 0xF4 cases
        // below only tighten the upper limit.
        if (second < 0x80 || second > 0xBF)
            return false;

        switch (lead) {
        case 0xE0: // reject overlong 3-byte forms
            if (second < 0xA0) return false;
            break;
        case 0xED: // reject UTF-16 surrogates (U+D800..U+DFFF)
            if (second > 0x9F) return false;
            break;
        case 0xF0: // reject overlong 4-byte forms
            if (second < 0x90) return false;
            break;
        case 0xF4: // reject code points above U+10FFFF
            if (second > 0x8F) return false;
            break;
        default:
            break;
        }
    }

    // 0x80..0xBF are continuation bytes and 0xC0..0xC1 would start an
    // overlong 2-byte form; none of them may lead a code point
    if (lead >= 0x80 && lead < 0xC2)
        return false;

    // lead bytes above 0xF4 would encode code points above U+10FFFF
    return lead <= 0xF4;
}
161
162 /**
163 * \returns whether the given input is a valid (or empty) sequence of UTF-8 code points
164 */
165 bool
166 isValidUtf8String(const char *source, const char *sourceEnd) {
167 while (source < sourceEnd) {
168 const auto length = utf8CodePointLength(*source);
169 if (source + length > sourceEnd || !isValidUtf8CodePoint(reinterpret_cast<const unsigned char*>(source), length))
170 return false;
171 source += length;
172 }
173 return true; // including zero-length input
174 }
175