gcc/rust/lex/rust-input-source.h

   1 // Copyright (C) 2020-2023 Free Software Foundation, Inc.
   2
   3 // This file is part of GCC.
   4
   5 // GCC is free software; you can redistribute it and/or modify it under
   6 // the terms of the GNU General Public License as published by the Free
   7 // Software Foundation; either version 3, or (at your option) any later
   8 // version.
   9
  10 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13 // for more details.
  14
  15 // You should have received a copy of the GNU General Public License
  16 // along with GCC; see the file COPYING3.  If not see
  17 // <http://www.gnu.org/licenses/>.
  18
  19 #ifndef RUST_INPUT_SOURCE_H
  20 #define RUST_INPUT_SOURCE_H
  21
  22 #include "rust-codepoint.h"
  23 #include "optional.h"
  24
  25 namespace Rust {
  26
  27 constexpr uint8_t UTF8_BOM1 = 0xEF; // UTF-8 byte order mark, byte 1 of 3
  28 constexpr uint8_t UTF8_BOM2 = 0xBB; // UTF-8 byte order mark, byte 2 of 3
  29 constexpr uint8_t UTF8_BOM3 = 0xBF; // UTF-8 byte order mark, byte 3 of 3
  30
  31 // Input source wrapper thing.
  32 class InputSource
  33 {
  34 private:
  35   // position of current character
  36   unsigned int pos;
  37   std::vector<Codepoint> chars;
  38   bool is_valid_utf8;
  39
  40   // Overload operator () to return next char from input stream.
  41   virtual int next_byte () = 0;
  42
  43   Codepoint next_codepoint ()
  44   {
  45     uint32_t input = next_byte ();
  46
  47     if ((int32_t) input == EOF)
  48       return Codepoint::eof ();
  49     else if (input <= MAX_ASCII_CODEPOINT)
  50       {
  51         // ascii -- 1 byte
  52         return {input};
  53       }
  54     else if ((input & 0xC0) == 0x80)
  55       {
  56         // invalid (continuation; can't be first char)
  57         return {CODEPOINT_INVALID};
  58       }
  59     else if ((input & 0xE0) == 0xC0)
  60       {
  61         // 2 bytes
  62         uint8_t input2 = next_byte ();
  63         if ((input2 & 0xC0) != 0x80)
  64           return {CODEPOINT_INVALID};
  65
  66         uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
  67         return output;
  68       }
  69     else if ((input & 0xF0) == 0xE0)
  70       {
  71         // 3 bytes or UTF-8 BOM
  72         uint8_t input2 = next_byte ();
  73         // If the second byte is equal to 0xBB then the input is no longer a
  74         // valid UTF-8 char. Then, we check if the third byte makes up a UTF
  75         // BOM.
  76         if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
  77           {
  78             uint8_t input3 = next_byte ();
  79             if (input3 == UTF8_BOM3)
  80               // found BOM
  81               return next_codepoint ();
  82             else
  83               return {CODEPOINT_INVALID};
  84           }
  85
  86         if ((input2 & 0xC0) != 0x80)
  87           return {CODEPOINT_INVALID};
  88
  89         uint8_t input3 = next_byte ();
  90
  91         if ((input3 & 0xC0) != 0x80)
  92           return {CODEPOINT_INVALID};
  93
  94         uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
  95                           | ((input3 & 0x3F) << 0);
  96         return {output};
  97       }
  98     else if ((input & 0xF8) == 0xF0)
  99       {
 100         // 4 bytes
 101         uint8_t input2 = next_byte ();
 102         if ((input2 & 0xC0) != 0x80)
 103           return {CODEPOINT_INVALID};
 104
 105         uint8_t input3 = next_byte ();
 106         if ((input3 & 0xC0) != 0x80)
 107           return {CODEPOINT_INVALID};
 108
 109         uint8_t input4 = next_byte ();
 110         if ((input4 & 0xC0) != 0x80)
 111           return {CODEPOINT_INVALID};
 112
 113         uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
 114                           | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
 115         return {output};
 116       }
 117     else
 118       {
 119         return {CODEPOINT_INVALID};
 120       }
 121   }
 122
 123 protected:
 124   // This method must be called by the constructor to initialize the input
 125   // source. We cannot move this to the constructor because it calls a
 126   // virtual method .
 127   void init ()
 128   {
 129     // Check if the input source is valid as utf-8 and copy all characters to
 130     // `chars`.
 131     Codepoint char32 = next_codepoint ();
 132     while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
 133       {
 134         chars.push_back (char32);
 135         char32 = next_codepoint ();
 136       }
 137
 138     if (char32 == CODEPOINT_INVALID)
 139       {
 140         // Input source is not valid as utf-8.
 141         is_valid_utf8 = false;
 142       }
 143   }
 144
 145 public:
 146   InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
 147
 148   virtual ~InputSource () {}
 149
 150   // Checks if input source is a valid UTF-8 string
 151   bool is_valid () { return is_valid_utf8; }
 152
 153   // get the next UTF-8 character
 154   Codepoint next ()
 155   {
 156     if (pos >= chars.size ())
 157       return Codepoint::eof ();
 158     else
 159       {
 160         Codepoint c = chars[pos];
 161         pos++;
 162         return c;
 163       }
 164   }
 165
 166   // Returns codepoint if input source is a valid UTF-8 string. Returns
 167   // nullopt otherwise.
 168   tl::optional<std::vector<Codepoint>> get_chars ()
 169   {
 170     if (is_valid ())
 171       return {chars};
 172     else
 173       return tl::nullopt;
 174   }
 175 };
 176
 177 class FileInputSource : public InputSource
 178 {
 179 private:
 180   // Input source file.
 181   FILE *input;
 182
 183   int next_byte () override { return fgetc (input); }
 184
 185 public:
 186   // Create new input source from file.
 187   FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
 188 };
 189
 190 class BufferInputSource : public InputSource
 191 {
 192 private:
 193   const std::string &buffer;
 194   size_t offs;
 195
 196   int next_byte () override
 197   {
 198     if (offs >= buffer.size ())
 199       return EOF;
 200     return static_cast<uint8_t> (buffer.at (offs++));
 201   }
 202
 203 public:
 204   // Create new input source from file.
 205   BufferInputSource (const std::string &b, size_t offset)
 206     : InputSource (), buffer (b), offs (offset)
 207   {
 208     init ();
 209   }
 210 };
 211
 212 } // namespace Rust
 213
 214 #endif