src/auth/toUtf.cc

   1 /*
   2  * Copyright (C) 1996-2022 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 #include "squid.h"
  10 #include "auth/toUtf.h"
  11 #include "sbuf/SBuf.h"
  12
  13 #include <limits>
  14
  15 SBuf
  16 Latin1ToUtf8(const char *in)
  17 {
  18     SBuf result;
  19
  20     if (!in)
  21         return result;
  22
  23     for (; *in; in++) {
  24         const auto ch = static_cast<unsigned char>(*in);
  25
  26         if (ch < 0x80) {
  27             result.append(ch);
  28         } else {
  29             result.append(static_cast<char>((ch >> 6) | 0xc0));
  30             result.append(static_cast<char>((ch & 0x3f) | 0x80));
  31         }
  32     }
  33     return result;
  34 }
  35
  36 SBuf
  37 Cp1251ToUtf8(const char *in)
  38 {
  39     static const unsigned char firstByteMark[] = { 0x00, 0x00, 0xC0, 0xE0 };
  40     static const unsigned unicodevalues[] = {
  41         0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
  42         0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
  43         0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
  44         0xFFFD, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
  45         0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
  46         0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
  47         0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
  48         0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457
  49     };
  50     SBuf result;
  51
  52     if (!in)
  53         return result;
  54
  55     for (; *in; in++) {
  56         const auto ch = static_cast<unsigned char>(*in);
  57         unsigned u = 0;
  58         size_t bytesToWrite = 0;
  59         char sequence[4] = {0, 0, 0, 0};
  60
  61         static_assert(std::numeric_limits<unsigned char>::max() == 0xFFu,
  62                       "we require char to be exactly 8 bits");
  63         if (ch < 0x80)
  64             u = ch;
  65         else if (ch >= 0xC0) // 0x0410..0x044F
  66             u = 0x0350 + ch;
  67         else
  68             u = unicodevalues[ch - 0x80];
  69
  70         if (u < 0x80)
  71             bytesToWrite = 1;
  72         else if (u < 0x800)
  73             bytesToWrite = 2;
  74         else
  75             bytesToWrite = 3;
  76
  77         switch (bytesToWrite) {
  78         case 3:
  79             sequence[2] = static_cast<char>(u & 0x3f) | 0x80;
  80             u >>= 6;
  81         /* [[fallthrough]] */
  82         case 2:
  83             sequence[1] = static_cast<char>(u & 0x3f) | 0x80;
  84             u >>= 6;
  85         /* [[fallthrough]] */
  86         case 1:
  87             sequence[0] = static_cast<char>(u)        | firstByteMark[bytesToWrite];
  88         }
  89         result.append(sequence, bytesToWrite);
  90     }
  91     return result;
  92 }
  93
  94 /**
  95  * \returns the length of a UTF-8 code point that starts at the given byte
  96  * \retval 0 indicates an invalid code point
  97  *
  98  * \param b0 the first byte of a UTF-8 code point
  99  */
 100 static inline size_t
 101 utf8CodePointLength(const char b0)
 102 {
 103     if ((b0 & 0x80) == 0)
 104         return 1;
 105     if ((b0 & 0xC0) != 0xC0)
 106         return 0; // invalid code point
 107     if ((b0 & 0xE0) == 0xC0)
 108         return 2;
 109     if ((b0 & 0xF0) == 0xE0)
 110         return 3;
 111     if ((b0 & 0xF8) == 0xF0)
 112         return 4;
 113     return 0; // invalid code point
 114 }
 115
 116 /**
 117  * Utility routine to tell whether a sequence of bytes is valid UTF-8.
 118  * This must be called with the length pre-determined by the first byte.
 119  * If presented with a length > 4, this returns false.  The Unicode
 120  * definition of UTF-8 goes up to 4-byte code points.
 121  */
 122 static bool
 123 isValidUtf8CodePoint(const unsigned char* source, const size_t length)
 124 {
 125     unsigned char a;
 126     const unsigned char* srcptr = source + length;
 127     switch (length) {
 128     default:
 129         return false;
 130     // Everything else falls through when "true"...
 131     case 4:
 132         if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 133     /* [[fallthrough]] */
 134     case 3:
 135         if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 136     /* [[fallthrough]] */
 137     case 2:
 138         if ((a = (*--srcptr)) > 0xBF) return false;
 139
 140         switch (*source) {
 141         // no fall-through in this inner switch
 142         case 0xE0:
 143             if (a < 0xA0) return false;
 144             break;
 145         case 0xED:
 146             if (a > 0x9F) return false;
 147             break;
 148         case 0xF0:
 149             if (a < 0x90) return false;
 150             break;
 151         case 0xF4:
 152             if (a > 0x8F) return false;
 153             break;
 154         default:
 155             if (a < 0x80) return false;
 156             break;
 157         }
 158     /* [[fallthrough]] */
 159
 160     case 1:
 161         if (*source >= 0x80 && *source < 0xC2) return false;
 162     }
 163     if (*source > 0xF4)
 164         return false;
 165     return true;
 166 }
 167
 168 /**
 169  * \returns whether the given input is a valid (or empty) sequence of UTF-8 code points
 170  */
 171 bool
 172 isValidUtf8String(const char *source, const char *sourceEnd) {
 173     while (source < sourceEnd) {
 174         const auto length = utf8CodePointLength(*source);
 175         if (source + length > sourceEnd || !isValidUtf8CodePoint(reinterpret_cast<const unsigned char*>(source), length))
 176             return false;
 177         source += length;
 178     }
 179     return true; // including zero-length input
 180 }
 181