]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/rust/lex/rust-input-source.h
03bf43b5b3b480afe2c8ab74d1d368bfbcf1c351
[thirdparty/gcc.git] / gcc / rust / lex / rust-input-source.h
1 // Copyright (C) 2020-2023 Free Software Foundation, Inc.
2
3 // This file is part of GCC.
4
5 // GCC is free software; you can redistribute it and/or modify it under
6 // the terms of the GNU General Public License as published by the Free
7 // Software Foundation; either version 3, or (at your option) any later
8 // version.
9
10 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 // for more details.
14
15 // You should have received a copy of the GNU General Public License
16 // along with GCC; see the file COPYING3. If not see
17 // <http://www.gnu.org/licenses/>.
18
19 #ifndef RUST_INPUT_SOURCE_H
20 #define RUST_INPUT_SOURCE_H
21
22 #include "rust-codepoint.h"
23 #include "optional.h"
24
25 namespace Rust {
26
// The three bytes that together form the UTF-8 byte order mark (EF BB BF),
// in the order they appear in the input.
constexpr uint8_t UTF8_BOM1{0xEF};
constexpr uint8_t UTF8_BOM2{0xBB};
constexpr uint8_t UTF8_BOM3{0xBF};
30
31 // Input source wrapper thing.
32 class InputSource
33 {
34 private:
35 // position of current character
36 unsigned int pos;
37 std::vector<Codepoint> chars;
38 bool is_valid_utf8;
39
40 // Overload operator () to return next char from input stream.
41 virtual int next_byte () = 0;
42
43 Codepoint next_codepoint ()
44 {
45 uint32_t input = next_byte ();
46
47 if ((int32_t) input == EOF)
48 return Codepoint::eof ();
49 else if (input <= MAX_ASCII_CODEPOINT)
50 {
51 // ascii -- 1 byte
52 return {input};
53 }
54 else if ((input & 0xC0) == 0x80)
55 {
56 // invalid (continuation; can't be first char)
57 return {CODEPOINT_INVALID};
58 }
59 else if ((input & 0xE0) == 0xC0)
60 {
61 // 2 bytes
62 uint8_t input2 = next_byte ();
63 if ((input2 & 0xC0) != 0x80)
64 return {CODEPOINT_INVALID};
65
66 uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
67 return output;
68 }
69 else if ((input & 0xF0) == 0xE0)
70 {
71 // 3 bytes or UTF-8 BOM
72 uint8_t input2 = next_byte ();
73 // If the second byte is equal to 0xBB then the input is no longer a
74 // valid UTF-8 char. Then, we check if the third byte makes up a UTF
75 // BOM.
76 if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
77 {
78 uint8_t input3 = next_byte ();
79 if (input3 == UTF8_BOM3)
80 // found BOM
81 return next_codepoint ();
82 else
83 return {CODEPOINT_INVALID};
84 }
85
86 if ((input2 & 0xC0) != 0x80)
87 return {CODEPOINT_INVALID};
88
89 uint8_t input3 = next_byte ();
90
91 if ((input3 & 0xC0) != 0x80)
92 return {CODEPOINT_INVALID};
93
94 uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
95 | ((input3 & 0x3F) << 0);
96 return {output};
97 }
98 else if ((input & 0xF8) == 0xF0)
99 {
100 // 4 bytes
101 uint8_t input2 = next_byte ();
102 if ((input2 & 0xC0) != 0x80)
103 return {CODEPOINT_INVALID};
104
105 uint8_t input3 = next_byte ();
106 if ((input3 & 0xC0) != 0x80)
107 return {CODEPOINT_INVALID};
108
109 uint8_t input4 = next_byte ();
110 if ((input4 & 0xC0) != 0x80)
111 return {CODEPOINT_INVALID};
112
113 uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
114 | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
115 return {output};
116 }
117 else
118 {
119 return {CODEPOINT_INVALID};
120 }
121 }
122
123 protected:
124 // This method must be called by the constructor to initialize the input
125 // source. We cannot move this to the constructor because it calls a
126 // virtual method .
127 void init ()
128 {
129 // Check if the input source is valid as utf-8 and copy all characters to
130 // `chars`.
131 Codepoint char32 = next_codepoint ();
132 while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
133 {
134 chars.push_back (char32);
135 char32 = next_codepoint ();
136 }
137
138 if (char32 == CODEPOINT_INVALID)
139 {
140 // Input source is not valid as utf-8.
141 is_valid_utf8 = false;
142 }
143 }
144
145 public:
146 InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
147
148 virtual ~InputSource () {}
149
150 // Checks if input source is a valid UTF-8 string
151 bool is_valid () { return is_valid_utf8; }
152
153 // get the next UTF-8 character
154 Codepoint next ()
155 {
156 if (pos >= chars.size ())
157 return Codepoint::eof ();
158 else
159 {
160 Codepoint c = chars[pos];
161 pos++;
162 return c;
163 }
164 }
165
166 // Returns codepoint if input source is a valid UTF-8 string. Returns
167 // nullopt otherwise.
168 tl::optional<std::vector<Codepoint>> get_chars ()
169 {
170 if (is_valid ())
171 return {chars};
172 else
173 return tl::nullopt;
174 }
175 };
176
177 class FileInputSource : public InputSource
178 {
179 private:
180 // Input source file.
181 FILE *input;
182
183 int next_byte () override { return fgetc (input); }
184
185 public:
186 // Create new input source from file.
187 FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
188 };
189
190 class BufferInputSource : public InputSource
191 {
192 private:
193 const std::string &buffer;
194 size_t offs;
195
196 int next_byte () override
197 {
198 if (offs >= buffer.size ())
199 return EOF;
200 return static_cast<uint8_t> (buffer.at (offs++));
201 }
202
203 public:
204 // Create new input source from file.
205 BufferInputSource (const std::string &b, size_t offset)
206 : InputSource (), buffer (b), offs (offset)
207 {
208 init ();
209 }
210 };
211
212 } // namespace Rust
213
214 #endif