]>
Commit | Line | Data |
---|---|---|
5fee5ec3 IB |
1 | /** |
2 | * Implements the lexical analyzer, which converts source code into lexical tokens. | |
3 | * | |
4 | * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical) | |
5 | * | |
f99303eb | 6 | * Copyright: Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved |
c43b5909 IB |
7 | * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright) |
8 | * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) | |
5fee5ec3 IB |
9 | * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d) |
10 | * Documentation: https://dlang.org/phobos/dmd_lexer.html | |
11 | * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d | |
12 | */ | |
13 | ||
14 | module dmd.lexer; | |
15 | ||
16 | import core.stdc.ctype; | |
17 | import core.stdc.errno; | |
18 | import core.stdc.stdarg; | |
19 | import core.stdc.stdio; | |
20 | import core.stdc.stdlib : getenv; | |
21 | import core.stdc.string; | |
22 | import core.stdc.time; | |
23 | ||
24 | import dmd.entity; | |
8da8c7d3 | 25 | import dmd.errorsink; |
5fee5ec3 IB |
26 | import dmd.id; |
27 | import dmd.identifier; | |
f99303eb | 28 | import dmd.location; |
0fb57034 | 29 | import dmd.root.array; |
5fee5ec3 | 30 | import dmd.root.ctfloat; |
0fb57034 | 31 | import dmd.common.outbuffer; |
5fee5ec3 IB |
32 | import dmd.root.port; |
33 | import dmd.root.rmem; | |
34 | import dmd.root.string; | |
c43b5909 | 35 | import dmd.root.utf; |
5fee5ec3 | 36 | import dmd.tokens; |
5fee5ec3 IB |
37 | import dmd.utils; |
38 | ||
39 | nothrow: | |
40 | ||
5fee5ec3 IB |
41 | version (DMDLIB) |
42 | { | |
43 | version = LocOffset; | |
44 | } | |
45 | ||
46 | /*********************************************************** | |
47 | */ | |
48 | class Lexer | |
49 | { | |
50 | private __gshared OutBuffer stringbuffer; | |
51 | ||
52 | Loc scanloc; // for error messages | |
53 | Loc prevloc; // location of token before current | |
54 | ||
55 | const(char)* p; // current character | |
56 | ||
57 | Token token; | |
58 | ||
59 | // For ImportC | |
60 | bool Ccompile; /// true if compiling ImportC | |
61 | ||
62 | // The following are valid only if (Ccompile == true) | |
1027dc45 IB |
63 | ubyte boolsize; /// size of a C _Bool, default 1 |
64 | ubyte shortsize; /// size of a C short, default 2 | |
65 | ubyte intsize; /// size of a C int, default 4 | |
5fee5ec3 | 66 | ubyte longsize; /// size of C long, 4 or 8 |
1027dc45 | 67 | ubyte long_longsize; /// size of a C long long, default 8 |
5fee5ec3 IB |
68 | ubyte long_doublesize; /// size of C long double, 8 or D real.sizeof |
69 | ubyte wchar_tsize; /// size of C wchar_t, 2 or 4 | |
70 | ||
8da8c7d3 IB |
71 | ErrorSink eSink; /// send error messages through this interface |
72 | ||
5fee5ec3 IB |
73 | private |
74 | { | |
75 | const(char)* base; // pointer to start of buffer | |
76 | const(char)* end; // pointer to last element of buffer | |
77 | const(char)* line; // start of current line | |
78 | ||
79 | bool doDocComment; // collect doc comment information | |
80 | bool anyToken; // seen at least one token | |
81 | bool commentToken; // comments are TOK.comment's | |
235d5a96 | 82 | bool tokenizeNewlines; // newlines are turned into TOK.endOfLine's |
8977f4be | 83 | |
f99303eb | 84 | bool whitespaceToken; // tokenize whitespaces (only for DMDLIB) |
8977f4be | 85 | |
5fee5ec3 IB |
86 | int inTokenStringConstant; // can be larger than 1 when in nested q{} strings |
87 | int lastDocLine; // last line of previous doc comment | |
88 | ||
89 | Token* tokenFreelist; | |
f99303eb IB |
90 | uint versionNumber; |
91 | const(char)[] vendor; | |
5fee5ec3 IB |
92 | } |
93 | ||
94 | nothrow: | |
95 | ||
96 | /********************* | |
97 | * Creates a Lexer for the source code base[begoffset..endoffset+1]. | |
98 | * The last character, base[endoffset], must be null (0) or EOF (0x1A). | |
99 | * | |
100 | * Params: | |
101 | * filename = used for error messages | |
102 | * base = source code, must be terminated by a null (0) or EOF (0x1A) character | |
103 | * begoffset = starting offset into base[] | |
104 | * endoffset = the last offset to read into base[] | |
105 | * doDocComment = handle documentation comments | |
106 | * commentToken = comments become TOK.comment's | |
8da8c7d3 | 107 | * errorSink = where error messages go, must not be null |
f99303eb IB |
108 | * vendor = name of the vendor |
109 | * versionNumber = version of the caller | |
5fee5ec3 IB |
110 | */ |
this(const(char)* filename, const(char)* base, size_t begoffset,
    size_t endoffset, bool doDocComment, bool commentToken,
    ErrorSink errorSink,
    const(char)[] vendor = "DLF", uint versionNumber = 1) pure scope
{
    // Scanning starts at line 1, column 1 of `filename`.
    scanloc = Loc(filename, 1, 1);
    token = Token.init;

    // Buffer bounds and the initial read position.
    this.base = base;
    this.end = base + endoffset;
    this.p = base + begoffset;
    this.line = this.p;

    // Lexing options and per-file state.
    this.doDocComment = doDocComment;
    this.commentToken = commentToken;
    this.tokenizeNewlines = false;
    this.inTokenStringConstant = 0;
    this.lastDocLine = 0;

    this.eSink = errorSink;
    assert(errorSink);
    this.versionNumber = versionNumber;
    this.vendor = vendor;

    // Without a leading "#!" there is nothing more to set up.
    if (!p || p[0] != '#' || p[1] != '!')
        return;

    /* The first line starts with '#!': ignore the whole line.
     *
     * Note: We do allow malformed UTF-8 on shebang line.
     * It could have a meaning if the native system
     * encoding is not Unicode. See test compilable/test13512.d
     * for example encoded in KOI-8.
     * We also allow bidirectional control characters.
     * We do not execute the shebang line, so it can't be used
     * to conceal code. It is up to the shell to sanitize it.
     */
    p += 2;
    while (*p != '\n' && *p != 0 && *p != 0x1A)
        ++p;
    if (*p == '\n')
        ++p;        // step over the newline; a 0 / 0x1A terminator is left in place
    endOfLine();
}
166 | ||
f99303eb IB |
167 | /*********************** |
168 | * Alternative entry point for DMDLIB, adds `whitespaceToken` | |
169 | */ | |
this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
    bool doDocComment, bool commentToken, bool whitespaceToken,
    ErrorSink errorSink)
{
    // Reuse the primary constructor; `vendor` and `versionNumber` take their defaults.
    this(filename, base, begoffset, endoffset, doDocComment, commentToken, errorSink);

    // The only extra state this overload configures.
    this.whitespaceToken = whitespaceToken;
}
178 | ||
610d7898 IB |
179 | /****************** |
180 | * Used for unittests for a mock Lexer | |
181 | */ | |
this(ErrorSink errorSink) scope
{
    // Only the error sink is set; no source buffer is attached.
    assert(errorSink);
    eSink = errorSink;
}
610d7898 IB |
183 | |
184 | /************************************** | |
185 | * Reset lexer to lex #define's | |
186 | */ | |
final void resetDefineLines(const(char)[] slice)
{
    // Re-point the buffer at `slice`; the byte just past it must be a 0 terminator.
    const(char)* start = slice.ptr;
    base = start;
    end = start + slice.length;
    assert(*end == 0);

    // Restart scanning from the beginning of the new buffer.
    p = start;
    line = start;

    // Request a TOK.endOfLine at the next line break (see scan()).
    tokenizeNewlines = true;
    inTokenStringConstant = 0;
    lastDocLine = 0;
    scanloc = Loc("#defines", 1, 1);
}
199 | ||
200 | /********************************** | |
201 | * Set up for next #define line. | |
202 | * p should be at start of next line. | |
203 | */ | |
final void nextDefineLine()
{
    // scan() clears this flag each time it emits TOK.endOfLine, so re-arm it here.
    this.tokenizeNewlines = true;
}
208 | ||
f99303eb IB |
209 | /*************** |
210 | * Range interface | |
211 | */ | |
8977f4be | 212 | |
f99303eb IB |
/// Returns: `true` once the current token is `TOK.endOfFile`.
final bool empty() const pure @property @nogc @safe
{
    return token.value == TOK.endOfFile;
}
8977f4be | 217 | |
f99303eb IB |
/// Returns: the kind of the current token.
final TOK front() const pure @property @nogc @safe
{
    const TOK current = token.value;
    return current;
}
8977f4be | 222 | |
f99303eb IB |
/// Advances to the next token; the token kind returned by `nextToken` is discarded.
final void popFront()
{
    cast(void) nextToken();
}
227 | ||
5fee5ec3 IB |
228 | /// Returns: a newly allocated `Token`. |
Token* allocateToken() pure nothrow @safe
{
    // Free list is empty: hand out a brand new token.
    if (tokenFreelist is null)
        return new Token();

    // Otherwise pop the head of the free list and detach it from the chain.
    Token* recycled = tokenFreelist;
    tokenFreelist = recycled.next;
    recycled.next = null;
    return recycled;
}
240 | ||
241 | /// Frees the given token by returning it to the freelist. | |
private void releaseToken(Token* token) pure nothrow @nogc @safe
{
    // Under the GC the token is wiped before reuse.
    // NOTE(review): presumably so stale pointers inside it do not keep memory alive - confirm.
    if (mem.isGCEnabled)
    {
        *token = Token.init;
    }

    // Push the token onto the front of the free list.
    token.next = tokenFreelist;
    tokenFreelist = token;
}
249 | ||
/**
 * Advances `token` to the next token.
 *
 * If tokens were already produced by peeking, the first of them is copied
 * into `token` and its storage goes back to the free list; otherwise a
 * fresh token is scanned from the source buffer.
 *
 * Returns: the kind of the new current token
 */
final TOK nextToken()
{
    prevloc = token.loc;

    Token* lookahead = token.next;
    if (lookahead is null)
    {
        // No peeked tokens pending: scan directly into the current token.
        scan(&token);
        return token.value;
    }

    // Consume the head of the look-ahead chain (the copy carries its `next` link along).
    memcpy(&token, lookahead, Token.sizeof);
    releaseToken(lookahead);
    return token.value;
}
266 | ||
267 | /*********************** | |
268 | * Look ahead at next token's value. | |
269 | */ | |
final TOK peekNext()
{
    // peek() scans the token once and caches it behind `token`.
    Token* ahead = peek(&token);
    return ahead.value;
}
274 | ||
275 | /*********************** | |
276 | * Look 2 tokens ahead at value. | |
277 | */ | |
final TOK peekNext2()
{
    // The first peek yields the next token, the second the one after it.
    return peek(peek(&token)).value;
}
283 | ||
284 | /**************************** | |
285 | * Turn next token in buffer into a token. | |
235d5a96 IB |
286 | * Params: |
287 | * t = the token to set the resulting Token to | |
5fee5ec3 IB |
288 | */ |
final void scan(Token* t)
{
    // Line number on entry; compared with a comment's starting line when
    // deciding how getDocComment() should treat it.
    const lastLine = scanloc.linnum;
    Loc startLoc;
    t.blockComment = null;
    t.lineComment = null;

    // Each iteration either returns a finished token or skips something
    // (white space, comments, invalid characters) and tries again.
    while (1)
    {
        t.ptr = p;
        //printf("p = %p, *p = '%c'\n",p,*p);
        t.loc = loc();
        switch (*p)
        {
            case 0:
            case 0x1A:
                t.value = TOK.endOfFile; // end of file
                // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
                return;
            case ' ':
                // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
                while ((cast(size_t)p) % uint.sizeof)
                {
                    if (*p != ' ')
                        goto LendSkipFourSpaces;
                    p++;
                }
                while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
                    p += 4;
                // Skip over any remaining space on the line.
                while (*p == ' ')
                    p++;
            LendSkipFourSpaces:
                version (DMDLIB)
                {
                    if (whitespaceToken)
                    {
                        t.value = TOK.whitespace;
                        return;
                    }
                }
                continue; // skip white space
            case '\t':
            case '\v':
            case '\f':
                p++;
                version (DMDLIB)
                {
                    if (whitespaceToken)
                    {
                        t.value = TOK.whitespace;
                        return;
                    }
                }
                continue; // skip white space
            case '\r':
                p++;
                if (*p != '\n') // if CR stands by itself
                {
                    endOfLine();
                    if (tokenizeNewlines)
                    {
                        t.value = TOK.endOfLine;
                        tokenizeNewlines = false;
                        return;
                    }
                }
                version (DMDLIB)
                {
                    if (whitespaceToken)
                    {
                        t.value = TOK.whitespace;
                        return;
                    }
                }
                continue; // skip white space
            case '\n':
                p++;
                endOfLine();
                if (tokenizeNewlines)
                {
                    t.value = TOK.endOfLine;
                    tokenizeNewlines = false;
                    return;
                }
                version (DMDLIB)
                {
                    if (whitespaceToken)
                    {
                        t.value = TOK.whitespace;
                        return;
                    }
                }
                continue; // skip white space
            case '0':
                if (!isZeroSecond(p[1])) // if numeric literal does not continue
                {
                    ++p;
                    t.unsvalue = 0;
                    t.value = TOK.int32Literal;
                    return;
                }
                goto Lnumber;

            case '1': .. case '9':
                if (!isDigitSecond(p[1])) // if numeric literal does not continue
                {
                    t.unsvalue = *p - '0';
                    ++p;
                    t.value = TOK.int32Literal;
                    return;
                }
            Lnumber:
                t.value = number(t);
                return;

            case '\'':
                if (issinglechar(p[1]) && p[2] == '\'')
                {
                    t.unsvalue = p[1]; // simple one character literal
                    t.value = TOK.charLiteral;
                    p += 3;
                }
                else if (Ccompile)
                {
                    clexerCharConstant(*t, 0);
                }
                else
                {
                    t.value = charConstant(t);
                }
                return;

            case 'u':
            case 'U':
            case 'L':
                if (!Ccompile)
                    goto case_ident;
                if (p[1] == '\'') // C wide character constant
                {
                    char c = *p;
                    // Convert L to u or U according to the size of wchar_t.
                    // This mirrors the wide string literal branch below, which
                    // gives L the same postfix as `u` when wchar_tsize == 2 and
                    // the same postfix as `U` otherwise.
                    if (c == 'L')
                        c = (wchar_tsize == 2) ? 'u' : 'U';
                    ++p;
                    clexerCharConstant(*t, c);
                    return;
                }
                else if (p[1] == '\"') // C wide string literal
                {
                    const c = *p;
                    ++p;
                    escapeStringConstant(t);
                    t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
                                c == 'u' ? 'w' :
                                'd';
                    return;
                }
                else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
                {
                    p += 2;
                    escapeStringConstant(t);
                    return;
                }
                goto case_ident;

            case 'r':
                if (Ccompile || p[1] != '"')
                    goto case_ident;
                p++;
                goto case '`';
            case '`':
                if (Ccompile)
                    goto default;
                wysiwygStringConstant(t);
                return;
            case 'q':
                if (Ccompile)
                    goto case_ident;
                if (p[1] == '"')
                {
                    p++;
                    delimitedStringConstant(t);
                    return;
                }
                else if (p[1] == '{')
                {
                    p++;
                    tokenStringConstant(t);
                    return;
                }
                else
                    goto case_ident;
            case '"':
                escapeStringConstant(t);
                return;
            case 'a':
            case 'b':
            case 'c':
            case 'd':
            case 'e':
            case 'f':
            case 'g':
            case 'h':
            case 'i':
            case 'j':
            case 'k':
            case 'l':
            case 'm':
            case 'n':
            case 'o':
            case 'p':
            /*case 'q': case 'r':*/
            case 's':
            case 't':
            //case 'u':
            case 'v':
            case 'w':
            case 'x':
            case 'y':
            case 'z':
            case 'A':
            case 'B':
            case 'C':
            case 'D':
            case 'E':
            case 'F':
            case 'G':
            case 'H':
            case 'I':
            case 'J':
            case 'K':
            //case 'L':
            case 'M':
            case 'N':
            case 'O':
            case 'P':
            case 'Q':
            case 'R':
            case 'S':
            case 'T':
            //case 'U':
            case 'V':
            case 'W':
            case 'X':
            case 'Y':
            case 'Z':
            case '_':
            case_ident:
                {
                    // Consume the remaining identifier characters, including
                    // multi-byte UTF-8 sequences that decode to isUniAlpha() characters.
                    while (1)
                    {
                        const c = *++p;
                        if (isidchar(c))
                            continue;
                        else if (c & 0x80)
                        {
                            const s = p;
                            const u = decodeUTF();
                            if (isUniAlpha(u))
                                continue;
                            error(t.loc, "char 0x%04x not allowed in identifier", u);
                            p = s;
                        }
                        break;
                    }
                    Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
                    t.ident = id;
                    t.value = cast(TOK)id.getValue();

                    anyToken = 1;

                    /* Different keywords for C and D
                     */
                    if (Ccompile)
                    {
                        if (t.value != TOK.identifier)
                        {
                            t.value = Ckeywords[t.value]; // filter out D keywords
                        }
                    }
                    else if (t.value >= FirstCKeyword)
                        t.value = TOK.identifier; // filter out C keywords

                    else if (*t.ptr == '_') // if special identifier token
                    {
                        // Lazy initialization
                        TimeStampInfo.initialize(t.loc, eSink);

                        if (id == Id.DATE)
                        {
                            t.ustring = TimeStampInfo.date.ptr;
                            goto Lstr;
                        }
                        else if (id == Id.TIME)
                        {
                            t.ustring = TimeStampInfo.time.ptr;
                            goto Lstr;
                        }
                        else if (id == Id.VENDOR)
                        {
                            t.ustring = vendor.xarraydup.ptr;
                            goto Lstr;
                        }
                        else if (id == Id.TIMESTAMP)
                        {
                            t.ustring = TimeStampInfo.timestamp.ptr;
                        Lstr:
                            t.value = TOK.string_;
                            t.postfix = 0;
                            t.len = cast(uint)strlen(t.ustring);
                        }
                        else if (id == Id.VERSIONX)
                        {
                            t.value = TOK.int64Literal;
                            t.unsvalue = versionNumber;
                        }
                        else if (id == Id.EOFX)
                        {
                            t.value = TOK.endOfFile;
                            // Advance scanner to end of file
                            while (!(*p == 0 || *p == 0x1A))
                                p++;
                        }
                    }
                    //printf("t.value = %d\n",t.value);
                    return;
                }
            case '/':
                p++;
                switch (*p)
                {
                    case '=':
                        p++;
                        t.value = TOK.divAssign;
                        return;
                    case '*':
                        p++;
                        startLoc = loc();
                        while (1)
                        {
                            // Inner loop: advance to the next '/' while keeping line info up to date.
                            while (1)
                            {
                                const c = *p;
                                switch (c)
                                {
                                    case '/':
                                        break;
                                    case '\n':
                                        endOfLine();
                                        p++;
                                        continue;
                                    case '\r':
                                        p++;
                                        if (*p != '\n')
                                            endOfLine();
                                        continue;
                                    case 0:
                                    case 0x1A:
                                        error(t.loc, "unterminated /* */ comment");
                                        p = end;
                                        t.loc = loc();
                                        t.value = TOK.endOfFile;
                                        return;
                                    default:
                                        if (c & 0x80)
                                        {
                                            const u = decodeUTF();
                                            if (u == PS || u == LS)
                                                endOfLine();
                                        }
                                        p++;
                                        continue;
                                }
                                break;
                            }
                            p++;
                            // The '/' closes the comment only when preceded by '*',
                            // and that '*' must not be the one that opened it ("/*/").
                            if (p[-2] == '*' && p - 3 != t.ptr)
                                break;
                        }
                        if (commentToken)
                        {
                            t.loc = startLoc;
                            t.value = TOK.comment;
                            return;
                        }
                        else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
                        {
                            // if /** but not /**/
                            getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                            lastDocLine = scanloc.linnum;
                        }
                        continue;
                    case '/': // do // style comments
                        startLoc = loc();
                        while (1)
                        {
                            const c = *++p;
                            switch (c)
                            {
                                case '\n':
                                    break;
                                case '\r':
                                    if (p[1] == '\n')
                                        p++;
                                    break;
                                case 0:
                                case 0x1A:
                                    if (commentToken)
                                    {
                                        p = end;
                                        t.loc = startLoc;
                                        t.value = TOK.comment;
                                        return;
                                    }
                                    if (doDocComment && t.ptr[2] == '/')
                                    {
                                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                                        lastDocLine = scanloc.linnum;
                                    }
                                    p = end;
                                    t.loc = loc();
                                    t.value = TOK.endOfFile;
                                    return;
                                default:
                                    if (c & 0x80)
                                    {
                                        const u = decodeUTF();
                                        if (u == PS || u == LS)
                                            break;
                                    }
                                    continue;
                            }
                            break;
                        }
                        if (commentToken)
                        {
                            version (DMDLIB) {}
                            else
                            {
                                p++;
                                endOfLine();
                            }
                            t.loc = startLoc;
                            t.value = TOK.comment;
                            return;
                        }
                        if (doDocComment && t.ptr[2] == '/')
                        {
                            getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                            lastDocLine = scanloc.linnum;
                        }
                        p++;
                        endOfLine();
                        continue;
                    case '+':
                        // Nesting /+ +/ comments are not recognized when compiling C;
                        // there the '/' falls through to TOK.div below.
                        if (!Ccompile)
                        {
                            int nest;
                            startLoc = loc();
                            p++;
                            nest = 1;
                            while (1)
                            {
                                char c = *p;
                                switch (c)
                                {
                                    case '/':
                                        p++;
                                        if (*p == '+')
                                        {
                                            p++;
                                            nest++;
                                        }
                                        continue;
                                    case '+':
                                        p++;
                                        if (*p == '/')
                                        {
                                            p++;
                                            if (--nest == 0)
                                                break;
                                        }
                                        continue;
                                    case '\r':
                                        p++;
                                        if (*p != '\n')
                                            endOfLine();
                                        continue;
                                    case '\n':
                                        endOfLine();
                                        p++;
                                        continue;
                                    case 0:
                                    case 0x1A:
                                        error(t.loc, "unterminated /+ +/ comment");
                                        p = end;
                                        t.loc = loc();
                                        t.value = TOK.endOfFile;
                                        return;
                                    default:
                                        if (c & 0x80)
                                        {
                                            uint u = decodeUTF();
                                            if (u == PS || u == LS)
                                                endOfLine();
                                        }
                                        p++;
                                        continue;
                                }
                                break;
                            }
                            if (commentToken)
                            {
                                t.loc = startLoc;
                                t.value = TOK.comment;
                                return;
                            }
                            if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
                            {
                                // if /++ but not /++/
                                getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                                lastDocLine = scanloc.linnum;
                            }
                            continue;
                        }
                        break;
                    default:
                        break;
                }
                t.value = TOK.div;
                return;
            case '.':
                p++;
                if (isdigit(*p))
                {
                    /* Note that we don't allow ._1 and ._ as being
                     * valid floating point numbers.
                     */
                    p--;
                    t.value = inreal(t);
                }
                else if (p[0] == '.')
                {
                    if (p[1] == '.')
                    {
                        p += 2;
                        t.value = TOK.dotDotDot;
                    }
                    else
                    {
                        p++;
                        t.value = TOK.slice;
                    }
                }
                else
                    t.value = TOK.dot;
                return;
            case '&':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.andAssign;
                }
                else if (*p == '&')
                {
                    p++;
                    t.value = TOK.andAnd;
                }
                else
                    t.value = TOK.and;
                return;
            case '|':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.orAssign;
                }
                else if (*p == '|')
                {
                    p++;
                    t.value = TOK.orOr;
                }
                else
                    t.value = TOK.or;
                return;
            case '-':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.minAssign;
                }
                else if (*p == '-')
                {
                    p++;
                    t.value = TOK.minusMinus;
                }
                else if (*p == '>')
                {
                    ++p;
                    t.value = TOK.arrow;
                }
                else
                    t.value = TOK.min;
                return;
            case '+':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.addAssign;
                }
                else if (*p == '+')
                {
                    p++;
                    t.value = TOK.plusPlus;
                }
                else
                    t.value = TOK.add;
                return;
            case '<':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.lessOrEqual; // <=
                }
                else if (*p == '<')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.leftShiftAssign; // <<=
                    }
                    else
                        t.value = TOK.leftShift; // <<
                }
                else if (*p == ':' && Ccompile)
                {
                    ++p;
                    t.value = TOK.leftBracket; // <:
                }
                else if (*p == '%' && Ccompile)
                {
                    ++p;
                    t.value = TOK.leftCurly; // <%
                }
                else
                    t.value = TOK.lessThan; // <
                return;
            case '>':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.greaterOrEqual; // >=
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.rightShiftAssign; // >>=
                    }
                    else if (*p == '>')
                    {
                        p++;
                        if (*p == '=')
                        {
                            p++;
                            t.value = TOK.unsignedRightShiftAssign; // >>>=
                        }
                        else
                            t.value = TOK.unsignedRightShift; // >>>
                    }
                    else
                        t.value = TOK.rightShift; // >>
                }
                else
                    t.value = TOK.greaterThan; // >
                return;
            case '!':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.notEqual; // !=
                }
                else
                    t.value = TOK.not; // !
                return;
            case '=':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.equal; // ==
                }
                else if (*p == '>')
                {
                    p++;
                    t.value = TOK.goesTo; // =>
                }
                else
                    t.value = TOK.assign; // =
                return;
            case '~':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.concatenateAssign; // ~=
                }
                else
                    t.value = TOK.tilde; // ~
                return;
            case '^':
                p++;
                if (*p == '^')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.powAssign; // ^^=
                    }
                    else
                        t.value = TOK.pow; // ^^
                }
                else if (*p == '=')
                {
                    p++;
                    t.value = TOK.xorAssign; // ^=
                }
                else
                    t.value = TOK.xor; // ^
                return;
            case '(':
                p++;
                t.value = TOK.leftParenthesis;
                return;
            case ')':
                p++;
                t.value = TOK.rightParenthesis;
                return;
            case '[':
                p++;
                t.value = TOK.leftBracket;
                return;
            case ']':
                p++;
                t.value = TOK.rightBracket;
                return;
            case '{':
                p++;
                t.value = TOK.leftCurly;
                return;
            case '}':
                p++;
                t.value = TOK.rightCurly;
                return;
            case '?':
                p++;
                t.value = TOK.question;
                return;
            case ',':
                p++;
                t.value = TOK.comma;
                return;
            case ';':
                p++;
                t.value = TOK.semicolon;
                return;
            case ':':
                p++;
                if (*p == ':')
                {
                    ++p;
                    t.value = TOK.colonColon;
                }
                else if (*p == '>' && Ccompile)
                {
                    ++p;
                    t.value = TOK.rightBracket;
                }
                else
                    t.value = TOK.colon;
                return;
            case '$':
                p++;
                t.value = TOK.dollar;
                return;
            case '@':
                p++;
                t.value = TOK.at;
                return;
            case '*':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.mulAssign;
                }
                else
                    t.value = TOK.mul;
                return;
            case '%':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.modAssign;
                }
                else if (*p == '>' && Ccompile)
                {
                    ++p;
                    t.value = TOK.rightCurly;
                }
                else if (*p == ':' && Ccompile)
                {
                    goto case '#'; // %: means #
                }
                else
                    t.value = TOK.mod;
                return;
            case '#':
                {
                    // https://issues.dlang.org/show_bug.cgi?id=22825
                    // Special token sequences are terminated by newlines,
                    // and should not be skipped over.
                    this.tokenizeNewlines = true;
                    p++;
                    if (parseSpecialTokenSequence())
                        continue;
                    t.value = TOK.pound;
                    return;
                }
            default:
                {
                    dchar c = *p;
                    if (c & 0x80)
                    {
                        c = decodeUTF();
                        // Check for start of unicode identifier
                        if (isUniAlpha(c))
                            goto case_ident;
                        if (c == PS || c == LS)
                        {
                            endOfLine();
                            p++;
                            if (tokenizeNewlines)
                            {
                                t.value = TOK.endOfLine;
                                tokenizeNewlines = false;
                                return;
                            }
                            continue;
                        }
                    }
                    // Report the stray character, then skip it and keep scanning.
                    if (c < 0x80 && isprint(c))
                        error(t.loc, "character '%c' is not a valid token", c);
                    else
                        error(t.loc, "character 0x%02x is not a valid token", c);
                    p++;
                    continue;
                    // assert(0);
                }
        }
    }
}
1163 | ||
/**
 * Returns the token that follows `ct` without consuming anything.
 *
 * The result is cached in `ct.next`, so peeking repeatedly from the same
 * token scans the source only once.
 *
 * Params:
 *      ct = token to look past
 * Returns:
 *      the token after `ct`
 */
final Token* peek(Token* ct)
{
    // Already scanned on an earlier peek.
    if (ct.next)
        return ct.next;

    // Scan one more token and link it behind `ct`.
    Token* following = allocateToken();
    scan(following);
    ct.next = following;
    return following;
}
1177 | ||
1178 | /********************************* | |
1179 | * tk is on the opening (. | |
1180 | * Look ahead and return token that is past the closing ). | |
1181 | */ | |
final Token* peekPastParen(Token* tk)
{
    //printf("peekPastParen()\n");
    int parens = 1;     // open '(' count, including the one `tk` starts on
    int curlynest = 0;  // open '{' count seen while looking ahead

    for (;;)
    {
        tk = peek(tk);
        //tk.print();
        switch (tk.value)
        {
            case TOK.leftParenthesis:
                ++parens;
                break;

            case TOK.rightParenthesis:
                // Matching ')' found: the result is the token right after it.
                if (--parens == 0)
                    return peek(tk);
                break;

            case TOK.leftCurly:
                ++curlynest;
                break;

            case TOK.rightCurly:
                // A '}' with no '{' seen before it ends the search.
                if (--curlynest < 0)
                    return tk;
                break;

            case TOK.semicolon:
                // A ';' outside of any '{ }' ends the search.
                if (curlynest == 0)
                    return tk;
                break;

            case TOK.endOfFile:
                return tk;

            default:
                break;
        }
    }
}
1221 | ||
/*******************************************
 * Parse the escape sequence at the current scan position `p`.
 *
 * Forwards to the location-taking overload using the current token's
 * location and the lexer's `Ccompile` setting; `p` is advanced by it.
 * Params:
 *  c2 = receives the second character produced by the overload, if any
 * Returns:
 *  the decoded character
 */
private uint escapeSequence(out dchar c2)
{
    const dchar decoded = Lexer.escapeSequence(token.loc, p, Ccompile, c2);
    return decoded;
}
1229 | ||
/********
 * Parse the given string literal escape sequence into a single character.
 * D https://dlang.org/spec/lex.html#escape_sequences
 * C11 6.4.4.4
 *
 * The end-of-file sentinel (0 or 0x1A) is never consumed: when the input
 * ends inside the escape, `sequence` is left pointing at the sentinel so
 * the caller can report the unterminated literal.
 * Params:
 *  loc = location to use for error messages
 *  sequence = pointer to string with escape sequence to parse (the character
 *             right after the backslash). Updated to point past the end of
 *             the escape sequence
 *  Ccompile = true for compile C11 escape sequences
 *  c2 = returns second `dchar` of html entity with 2 code units, otherwise stays `dchar.init`
 * Returns:
 *  the escape sequence as a single character
 */
private dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile, out dchar c2)
{
    const(char)* p = sequence; // cache sequence reference on stack
    scope(exit) sequence = p;  // publish the final position on every exit path

    uint c = *p;
    int ndigits;
    switch (c)
    {
    case '\'':
    case '"':
    case '?':
    case '\\':
    Lconsume:
        p++;
        break;
    case 'a':
        c = 7;
        goto Lconsume;
    case 'b':
        c = 8;
        goto Lconsume;
    case 'f':
        c = 12;
        goto Lconsume;
    case 'n':
        c = 10;
        goto Lconsume;
    case 'r':
        c = 13;
        goto Lconsume;
    case 't':
        c = 9;
        goto Lconsume;
    case 'v':
        c = 11;
        goto Lconsume;
    case 'u':
        ndigits = 4;
        goto Lhex;
    case 'U':
        ndigits = 8;
        goto Lhex;
    case 'x':
        ndigits = 2;
    Lhex:
        p++;
        c = *p;
        if (ishex(cast(char)c))
        {
            uint v = 0;
            int n = 0;
            if (Ccompile && ndigits == 2)
            {
                /* C11 6.4.4.4-7 one to infinity hex digits
                 */
                do
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                } while (ishex(cast(char)c));
            }
            else
            {
                // D: exactly `ndigits` hex digits are expected
                while (1)
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                    if (++n == ndigits)
                        break;
                    if (!ishex(cast(char)c))
                    {
                        error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
                        break;
                    }
                }
                // \u and \U must name a valid code point; \x is a raw byte
                if (ndigits != 2 && !utf_isValidDchar(v))
                {
                    error(loc, "invalid UTF character \\U%08x", v);
                    v = '?'; // recover with valid UTF character
                }
            }
            c = v;
        }
        else
        {
            error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
            // Skip the offending character, but never step over the
            // end-of-file sentinel: the caller must still see it.
            if (c != 0 && c != 0x1A)
                p++;
        }
        break;
    case '&':
        if (Ccompile)
            goto default;

        // named character entity
        for (const idstart = ++p; 1; p++)
        {
            switch (*p)
            {
            case ';':
                auto entity = HtmlNamedEntity(idstart[0 .. p - idstart]);
                c = entity[0];
                if (entity == entity.init)
                {
                    error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
                    c = '?';
                }
                // some entities expand to two code points
                if (entity[1] != entity.init[1])
                    c2 = entity[1];

                p++;
                break;
            default:
                // entity names are letters, with digits allowed after the first character
                if (isalpha(*p) || (p != idstart && isdigit(*p)))
                    continue;
                error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
                c = '?';
                break;
            }
            break;
        }
        break;
    case 0:
    case 0x1A:
        // end of file
        c = '\\';
        break;
    default:
        if (isoctal(cast(char)c))
        {
            // up to three octal digits
            uint v = 0;
            int n = 0;
            do
            {
                v = v * 8 + (c - '0');
                c = *++p;
            }
            while (++n < 3 && isoctal(cast(char)c));
            c = v;
            if (c > 0xFF)
                error(loc, "escape octal sequence \\%03o is larger than \\377", c);
        }
        else
        {
            error(loc, "undefined escape sequence \\%c", c);
            p++;
        }
        break;
    }
    return c;
}
1406 | ||
/**
Lex a wysiwyg string. On entry `p` points at the opening delimiter, i.e. the
character just before the string contents; that same character (backtick or
double-quote) is what terminates the literal.

Line endings are normalized: `\r\n` and a lone `\r` are both stored as `\n`.
Reaching end of file first reports an error and yields an empty string,
leaving `p` on the end-of-file character.
Params:
    result = pointer to the token that accepts the result
*/
private void wysiwygStringConstant(Token* result)
{
    result.value = TOK.string_;
    const Loc begin = loc();
    const closing = *p++;
    stringbuffer.setsize(0);
    for (;;)
    {
        dchar ch = *p++;
        if (ch == '\n')
            endOfLine();
        else if (ch == '\r')
        {
            // drop the `\r` of a `\r\n` pair; the `\n` is handled next time round
            if (*p == '\n')
                continue;
            ch = '\n';
            endOfLine();
        }
        else if (ch == 0 || ch == 0x1A)
        {
            error("unterminated string constant starting at %s", begin.toChars());
            result.setString();
            // back up so `p` is on the end-of-file character again
            --p;
            return;
        }
        else if (ch == closing)
        {
            result.setString(stringbuffer);
            stringPostfix(result);
            return;
        }
        else if (ch & 0x80)
        {
            // multi-byte UTF-8: decode from the lead byte and re-encode
            --p;
            const decoded = decodeUTF();
            ++p;
            if (decoded == PS || decoded == LS)
                endOfLine();
            stringbuffer.writeUTF8(decoded);
            continue;
        }
        stringbuffer.writeByte(ch);
    }
}
1465 | ||
5fee5ec3 IB |
/**
Lex a delimited string. Some examples of delimited strings are:
---
q"(foo(xxx))"      // "foo(xxx)"
q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
q"/foo]/"          // "foo]"
q"HERE
foo
HERE"              // "foo\n"
---
It is assumed that `p` points to the opening double-quote '"'.

The first character after the quote selects the delimiter: one of `( { [ <`
nests with its closing partner, an identifier starts a heredoc, and any other
character is its own closing delimiter. The C `<ctype.h>` classifiers are
only consulted for ASCII values, since decoded code points above that range
are outside their domain.
Params:
    result = pointer to the token that accepts the result
*/
private void delimitedStringConstant(Token* result)
{
    result.value = TOK.string_;
    Loc start = loc();
    dchar delimleft = 0;
    dchar delimright = 0;
    uint nest = 1;          // 1 if the delimiter pair nests
    uint nestcount = ~0; // dead assignment, needed to suppress warning
    Identifier hereid = null; // heredoc identifier, if any
    uint blankrol = 0;      // 1 while the rest of the heredoc opening line must be blank
    uint startline = 0;     // 1 when the next character starts a line
    p++;
    stringbuffer.setsize(0);
    while (1)
    {
        const s = p; // start of this character, for re-scanning as an identifier
        dchar c = *p++;
        //printf("c = '%c'\n", c);
        switch (c)
        {
        case '\n':
        Lnextline:
            endOfLine();
            startline = 1;
            if (blankrol)
            {
                blankrol = 0;
                continue;
            }
            if (hereid)
            {
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        case '\r':
            if (*p == '\n')
                continue; // ignore
            c = '\n'; // treat EndOfLine as \n character
            goto Lnextline;
        case 0:
        case 0x1A:
            error("unterminated delimited string constant starting at %s", start.toChars());
            result.setString();
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            return;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                p++;
                if (c == PS || c == LS)
                    goto Lnextline;
            }
            break;
        }
        if (delimleft == 0)
        {
            // first character: decide what kind of delimiter this is
            delimleft = c;
            nest = 1;
            nestcount = 1;
            if (c == '(')
                delimright = ')';
            else if (c == '{')
                delimright = '}';
            else if (c == '[')
                delimright = ']';
            else if (c == '<')
                delimright = '>';
            else if ((c < 0x80 && isalpha(c)) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
            {
                // Start of identifier; must be a heredoc
                Token tok;
                p = s;
                scan(&tok); // read in heredoc identifier
                if (tok.value != TOK.identifier)
                {
                    error("identifier expected for heredoc, not %s", tok.toChars());
                    delimright = c;
                }
                else
                {
                    hereid = tok.ident;
                    //printf("hereid = '%s'\n", hereid.toChars());
                    blankrol = 1;
                }
                nest = 0;
            }
            else
            {
                delimright = c;
                nest = 0;
                if (c < 0x80 && isspace(c))
                    error("delimiter cannot be whitespace");
            }
        }
        else
        {
            if (blankrol)
            {
                error("heredoc rest of line should be blank");
                blankrol = 0;
                continue;
            }
            if (nest == 1)
            {
                if (c == delimleft)
                    nestcount++;
                else if (c == delimright)
                {
                    nestcount--;
                    if (nestcount == 0)
                        goto Ldone;
                }
            }
            else if (c == delimright)
                goto Ldone;
            if (startline && ((c < 0x80 && isalpha(c)) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
            {
                // An identifier at the start of a line may be the heredoc terminator
                Token tok;
                auto psave = p;
                p = s;
                scan(&tok); // read in possible heredoc identifier
                //printf("endid = '%s'\n", tok.ident.toChars());
                if (tok.value == TOK.identifier && tok.ident is hereid)
                {
                    /* should check that rest of line is blank
                     */
                    goto Ldone;
                }
                p = psave; // not the terminator: resume after the current character
            }
            stringbuffer.writeUTF8(c);
            startline = 0;
        }
    }
Ldone:
    if (*p == '"')
        p++;
    else if (hereid)
        error("delimited string must end in `%s\"`", hereid.toChars());
    else if (delimright < 0x80 && isspace(delimright))
        error("delimited string must end in `\"`");
    else
        error(token.loc, "delimited string must end in `%c\"`", delimright);
    result.setString(stringbuffer);
    stringPostfix(result);
}
1630 | ||
/**
Lex a token string. Some examples of token strings are:
---
q{ foo(xxx) }    // " foo(xxx) "
q{foo$(LPAREN)}  // "foo$(LPAREN)"
q{{foo}"}"}      // "{foo}"}""
---
It is assumed that `p` points to the opening curly-brace.

The contents are scanned as ordinary tokens, only to find the matching
closing brace; the string value is the raw source text between the braces.
Params:
    result = pointer to the token that accepts the result
*/
private void tokenStringConstant(Token* result)
{
    result.value = TOK.string_;

    uint depth = 1;
    const begin = loc();
    const contents = ++p; // first character after the opening brace
    inTokenStringConstant++;
    scope(exit) inTokenStringConstant--;
    for (;;)
    {
        Token inner;
        scan(&inner);
        const kind = inner.value;
        if (kind == TOK.leftCurly)
            ++depth;
        else if (kind == TOK.rightCurly)
        {
            if (--depth == 0)
            {
                // `p` is just past the closing brace; leave the brace out
                result.setString(contents, p - 1 - contents);
                stringPostfix(result);
                return;
            }
        }
        else if (kind == TOK.endOfFile)
        {
            error("unterminated token string constant starting at %s", begin.toChars());
            result.setString();
            return;
        }
    }
}
1677 | ||
/**
Scan a quoted string, resolving escape sequences while the string value is
accumulated. The finished string is stored in the token `t`.
On entry `p` must point at the opening quote; the same quote character
terminates the literal.

When lexing C (`Ccompile`), a line ending inside the literal makes it
unterminated and no string postfix is read.
Params:
    t = the token to set the resulting string to
* References:
*   D https://dlang.org/spec/lex.html#double_quoted_strings
*   ImportC C11 6.4.5
*/
private void escapeStringConstant(Token* t)
{
    t.value = TOK.string_;

    const begin = loc();
    const quote = *p++; // opening quote
    stringbuffer.setsize(0);
    for (;;)
    {
        dchar ch = *p++;
        dchar second;
        switch (ch)
        {
        case '\\':
        {
            const lead = *p;
            const isEntity = lead == '&' && !Ccompile;
            const isUnicode = lead == 'u' || lead == 'U';
            ch = escapeSequence(second);
            if (!isEntity && !isUnicode)
                break; // any other escape is stored as a single byte below
            stringbuffer.writeUTF8(ch);
            // a named entity may expand to two code points
            if (isEntity && second != dchar.init)
                stringbuffer.writeUTF8(second);
            continue;
        }
        case '\r':
            if (*p == '\n')
                continue; // ignore
            ch = '\n'; // treat EndOfLine as \n character
            goto case '\n';
        case '\n':
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\'':
        case '"':
            // the other kind of quote is ordinary content
            if (ch != quote)
                break;
            t.setString(stringbuffer);
            if (!Ccompile)
                stringPostfix(t);
            return;
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
        Lunterminated:
            error("unterminated string constant starting at %s", begin.toChars());
            t.setString();
            return;
        default:
            if (!(ch & 0x80))
                break;
            // multi-byte UTF-8: decode from the lead byte and re-encode
            p--;
            ch = decodeUTF();
            if (ch == LS || ch == PS)
            {
                ch = '\n';
                endOfLine();
                if (Ccompile)
                    goto Lunterminated;
            }
            p++;
            stringbuffer.writeUTF8(ch);
            continue;
        }
        stringbuffer.writeByte(ch);
    }
}
1774 | ||
/**************************************
 * Lex a D character literal. `p` is on the opening single quote.
 *
 * The character value is stored in `t.unsvalue`; on any error it is set
 * to '?' after the error has been reported.
 * Params:
 *  t = token that receives the character value
 * Returns:
 *  `TOK.charLiteral`, `TOK.wcharLiteral` or `TOK.dcharLiteral`, depending
 *  on the escape used or the size of the decoded character
 * Reference:
 *  https://dlang.org/spec/lex.html#characterliteral
 */
private TOK charConstant(Token* t)
{
    TOK kind = TOK.charLiteral;
    //printf("Lexer::charConstant\n");
    ++p; // step over the opening quote
    dchar ch = *p++;
    dchar second;
    if (ch == '\\')
    {
        // the escape form decides the literal's width
        const lead = *p;
        if (lead == 'u')
            kind = TOK.wcharLiteral;
        else if (lead == 'U' || lead == '&')
            kind = TOK.dcharLiteral;
        t.unsvalue = escapeSequence(second);
        if (second != dchar.init)
        {
            error("html entity requires 2 code units, use a string instead of a character");
            t.unsvalue = '?';
        }
    }
    else
    {
        bool broken = false; // set when the literal ends before any character
        if (ch == '\n')
        {
            endOfLine();
            broken = true;
        }
        else if (ch == '\r' || ch == '\'')
            broken = true;
        else if (ch == 0 || ch == 0x1A)
        {
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            broken = true;
        }
        else if (ch & 0x80)
        {
            p--;
            ch = decodeUTF();
            p++;
            if (ch == LS || ch == PS)
            {
                endOfLine();
                broken = true;
            }
            else if (ch < 0xD800 || (ch >= 0xE000 && ch < 0xFFFE))
                kind = TOK.wcharLiteral;
            else
                kind = TOK.dcharLiteral;
        }
        if (broken)
        {
            error("unterminated character constant");
            t.unsvalue = '?';
            return kind;
        }
        t.unsvalue = ch;
    }

    if (*p == '\'')
    {
        p++;
        return kind;
    }

    // No closing quote where expected: skip ahead to a closing quote or to
    // something that plausibly ends the literal, for error recovery.
    while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
           *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
    {
        if (*p & 0x80)
        {
            const seqStart = p;
            const decoded = decodeUTF();
            if (decoded == LS || decoded == PS)
            {
                p = seqStart; // leave the line separator for the scanner
                break;
            }
        }
        p++;
    }

    if (*p == '\'')
    {
        error("character constant has multiple characters");
        p++;
    }
    else
        error("unterminated character constant");
    t.unsvalue = '?';
    return kind;
}
1870 | ||
/***************************************
 * Lex C character constant.
 * Parser is on the opening quote.
 *
 * The constant is first lexed as an escape string; its bytes are then
 * packed into an integer according to `prefix`. The token becomes
 * `TOK.charLiteral` when the string is a single byte and
 * `TOK.int32Literal` otherwise. An empty constant is reported and the
 * token is turned into `TOK.semicolon`.
 * Params:
 *      t = token to fill in
 *      prefix = one of `u`, `U` or 0.
 * Reference:
 *      C11 6.4.4.4
 */
private void clexerCharConstant(ref Token t, char prefix)
{
    escapeStringConstant(&t);
    const(char)[] str = t.ustring[0 .. t.len];
    const n = str.length;
    const loc = t.loc;
    if (n == 0)
    {
        error(loc, "empty character constant");
        t.value = TOK.semicolon;
        return;
    }

    uint u;
    switch (prefix)
    {
    case 0:
        if (n == 1) // fast case
        {
            u = str[0];
        }
        else if (n > 4)
            error(loc, "max number of chars in character literal is 4, had %d",
                cast(int)n);
        else
        {
            // pack the bytes so that the first character lands in the
            // highest-indexed byte of `u` that is used
            foreach (i, c; str)
                (cast(char*)&u)[n - 1 - i] = c;
        }
        break;

    case 'u':
        // up to two characters, each of which must fit in 16 bits
        dchar d1;
        size_t idx;
        auto msg = utf_decodeChar(str, idx, d1);
        dchar d2 = 0;
        if (idx < n && !msg)
            msg = utf_decodeChar(str, idx, d2);
        if (msg)
            error(loc, "%.*s", cast(int)msg.length, msg.ptr);
        else if (idx < n)
            error(loc, "max number of chars in 16 bit character literal is 2, had %d",
                cast(int)((n + 1) >> 1));
        else if (d1 > 0xFFFF) // 0x1_0000 itself already needs 17 bits
            error(loc, "%d does not fit in 16 bits", d1);
        else if (d2 > 0xFFFF)
            error(loc, "%d does not fit in 16 bits", d2);
        u = d1;
        if (d2)
            u = (d1 << 16) | d2;
        break;

    case 'U':
        // exactly one character
        dchar d;
        size_t idx;
        auto msg = utf_decodeChar(str, idx, d);
        if (msg)
            error(loc, "%.*s", cast(int)msg.length, msg.ptr);
        else if (idx < n)
            error(loc, "max number of chars in 32 bit character literal is 1, had %d",
                cast(int)((n + 3) >> 2));
        u = d;
        break;

    default:
        assert(0);
    }
    t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
    t.unsvalue = u;
}
1950 | ||
/***************************************
 * Read the optional postfix of a string literal.
 *
 * If `p` is on one of `c`, `w` or `d`, that character is stored in
 * `t.postfix` and consumed; otherwise `t.postfix` is set to 0 and `p`
 * is left untouched.
 * Params:
 *  t = string token whose postfix is set
 */
private void stringPostfix(Token* t) pure @nogc
{
    const suffix = *p;
    if (suffix == 'c' || suffix == 'w' || suffix == 'd')
    {
        t.postfix = suffix;
        p++;
    }
    else
        t.postfix = 0;
}
1969 | ||
/**************************************
 * Read in a number.
 * If it's an integer, its value is stored in `t.unsvalue`.
 * Integers can be binary, octal, decimal or hex.
 * Handle the suffixes U, UL, LU, L, etc.
 * If the text turns out to be a floating point literal, scanning restarts
 * from the first character and is delegated to `inreal`.
 * When lexing C (`Ccompile`), suffix handling is delegated to `cnumber`.
 * Params:
 *  t = token that receives the integer value
 * Returns:
 *  the token kind of the integer literal, or whatever `inreal` /
 *  `cnumber` returns
 */
private TOK number(Token* t)
{
    int base = 10;
    const start = p; // kept so Lreal can rewind and so errors can quote the text
    ulong n = 0; // unsigned >=64 bit integer type
    int d; // value of the digit just read
    bool err = false; // an error was already reported for this literal
    bool overflow = false;
    bool anyBinaryDigitsNoSingleUS = false;
    bool anyHexDigitsNoSingleUS = false;
    char errorDigit = 0; // first digit that is not valid in `base`, if any
    dchar c = *p;
    if (c == '0')
    {
        // A leading 0: the next character selects the base or the literal kind
        ++p;
        c = *p;
        switch (c)
        {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
            base = 8;
            break;

        case '8':
        case '9':
            // not an octal digit; remember it so it is reported at Ldone
            errorDigit = cast(char) c;
            base = 8;
            break;
        case 'x':
        case 'X':
            ++p;
            base = 16;
            break;
        case 'b':
        case 'B':
            ++p;
            base = 2;
            break;
        case '.':
            if (p[1] == '.')
                goto Ldone; // if ".."
            if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
            {
                if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                    goto Lreal;  // if `0.f` or `0.L`
                goto Ldone; // if ".identifier" or ".unicode"
            }
            goto Lreal; // '.' is part of current token
        case 'i':
        case 'f':
        case 'F':
            goto Lreal;
        case '_':
            if (Ccompile)
                error("embedded `_` not allowed");
            ++p;
            base = 8;
            break;
        case 'L':
            if (p[1] == 'i')
                goto Lreal;
            break;
        default:
            break;
        }
    }
    // Accumulate digits until something that cannot be part of the integer
    while (1)
    {
        c = *p;
        switch (c)
        {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            ++p;
            d = c - '0';
            break;
        case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
        case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
            ++p;
            if (base != 16)
            {
                // outside hex, an exponent or float suffix means a real literal
                if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
                    goto Lreal;
            }
            if (c >= 'a')
                d = c + 10 - 'a';
            else
                d = c + 10 - 'A';
            break;
        case 'L':
            if (p[1] == 'i')
                goto Lreal;
            goto Ldone;
        case '.':
            if (p[1] == '.')
                goto Ldone; // if ".."
            if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
            {
                if (Ccompile && base == 10 &&
                    (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                    goto Lreal; // if `1.e6` or `1.f` or `1.L`
                goto Ldone; // if ".identifier" or ".unicode"
            }
            if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                goto Ldone; // if ".identifier" or ".unicode"
            if (base == 2)
                goto Ldone; // if ".identifier" or ".unicode"
            goto Lreal; // otherwise as part of a floating point literal
        case 'p':
        case 'P':
        case 'i':
        Lreal:
            // rewind to the first character and lex the text as a real literal
            p = start;
            return inreal(t);
        case '_':
            // digit separator; not accepted when lexing C
            if (Ccompile)
                goto default;
            ++p;
            continue;
        default:
            goto Ldone;
        }
        // got a digit here, set any necessary flags, check for errors
        anyHexDigitsNoSingleUS = true;
        anyBinaryDigitsNoSingleUS = true;
        if (!errorDigit && d >= base)
        {
            errorDigit = cast(char) c;
        }
        // Avoid expensive overflow check if we aren't at risk of overflow
        if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
            n = n * base + d;
        else
        {
            import core.checkedint : mulu, addu;

            n = mulu(n, base, overflow);
            n = addu(n, d, overflow);
        }
    }
Ldone:
    // Report at most one error for the digits themselves
    if (errorDigit)
    {
        error(token.loc, "%s digit expected, not `%c`", base == 2 ? "binary".ptr :
                                                         base == 8 ? "octal".ptr :
                                                         "decimal".ptr, errorDigit);
        err = true;
    }
    if (overflow && !err)
    {
        error("integer overflow");
        err = true;
    }
    // A `0b` or `0x` prefix with no digits after it
    if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
        (base == 16 && !anyHexDigitsNoSingleUS))
        error(token.loc, "`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);

    t.unsvalue = n;

    if (Ccompile)
        return cnumber(base, n);

    enum FLAGS : int
    {
        none = 0,
        decimal = 1, // decimal
        unsigned = 2, // u or U suffix
        long_ = 4, // L suffix
    }

    FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
    // Parse trailing 'u', 'U', 'l' or 'L' in any combination
    const psuffix = p;
    while (1)
    {
        FLAGS f;
        switch (*p)
        {
        case 'U':
        case 'u':
            f = FLAGS.unsigned;
            goto L1;
        case 'l':
            f = FLAGS.long_;
            error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
            goto L1;
        case 'L':
            f = FLAGS.long_;
        L1:
            p++;
            // the same suffix given twice
            if ((flags & f) && !err)
            {
                error("unrecognized token");
                err = true;
            }
            flags = cast(FLAGS)(flags | f);
            continue;
        default:
            break;
        }
        break;
    }
    if (base == 8 && n >= 8)
    {
        if (err)
            // can't translate invalid octal value, just show a generic message
            error("octal literals larger than 7 are no longer supported");
        else
            error(token.loc, "octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
                n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
    }
    // Pick the literal's token kind from the base, the suffixes and the magnitude of n
    TOK result;
    switch (flags)
    {
    case FLAGS.none:
        /* Octal or Hexadecimal constant.
         * First that fits: int, uint, long, ulong
         */
        if (n & 0x8000000000000000L)
            result = TOK.uns64Literal;
        else if (n & 0xFFFFFFFF00000000L)
            result = TOK.int64Literal;
        else if (n & 0x80000000)
            result = TOK.uns32Literal;
        else
            result = TOK.int32Literal;
        break;
    case FLAGS.decimal:
        /* First that fits: int, long, long long
         */
        if (n & 0x8000000000000000L)
        {
            result = TOK.uns64Literal;
        }
        else if (n & 0xFFFFFFFF80000000L)
            result = TOK.int64Literal;
        else
            result = TOK.int32Literal;
        break;
    case FLAGS.unsigned:
    case FLAGS.decimal | FLAGS.unsigned:
        /* First that fits: uint, ulong
         */
        if (n & 0xFFFFFFFF00000000L)
            result = TOK.uns64Literal;
        else
            result = TOK.uns32Literal;
        break;
    case FLAGS.decimal | FLAGS.long_:
        if (n & 0x8000000000000000L)
        {
            if (!err)
            {
                error("signed integer overflow");
                err = true;
            }
            result = TOK.uns64Literal;
        }
        else
            result = TOK.int64Literal;
        break;
    case FLAGS.long_:
        if (n & 0x8000000000000000L)
            result = TOK.uns64Literal;
        else
            result = TOK.int64Literal;
        break;
    case FLAGS.unsigned | FLAGS.long_:
    case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
        result = TOK.uns64Literal;
        break;
    default:
        debug
        {
            printf("%x\n", flags);
        }
        assert(0);
    }
    return result;
}
2284 | ||
2285 | /************************************** | |
2286 | * Lex C integer-suffix | |
2287 | * Params: | |
2288 | * base = number base | |
2289 | * n = raw integer value | |
2290 | * Returns: | |
2291 | * token value | |
2292 | */ | |
f99303eb | 2293 | private TOK cnumber(int base, ulong n) |
5fee5ec3 IB |
2294 | { |
2295 | /* C11 6.4.4.1 | |
2296 | * Parse trailing suffixes: | |
2297 | * u or U | |
2298 | * l or L | |
2299 | * ll or LL | |
2300 | */ | |
2301 | enum FLAGS : uint | |
2302 | { | |
2303 | octalhex = 1, // octal or hexadecimal | |
2304 | decimal = 2, // decimal | |
2305 | unsigned = 4, // u or U suffix | |
2306 | long_ = 8, // l or L suffix | |
2307 | llong = 0x10 // ll or LL | |
2308 | } | |
2309 | FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex; | |
2310 | bool err; | |
2311 | Lsuffixes: | |
2312 | while (1) | |
2313 | { | |
2314 | FLAGS f; | |
2315 | const cs = *p; | |
2316 | switch (cs) | |
2317 | { | |
2318 | case 'U': | |
2319 | case 'u': | |
2320 | f = FLAGS.unsigned; | |
2321 | break; | |
2322 | ||
2323 | case 'l': | |
2324 | case 'L': | |
2325 | f = FLAGS.long_; | |
2326 | if (cs == p[1]) | |
2327 | { | |
2328 | f = FLAGS.long_ | FLAGS.llong; | |
2329 | ++p; | |
2330 | } | |
2331 | break; | |
2332 | ||
2333 | default: | |
2334 | break Lsuffixes; | |
2335 | } | |
2336 | ++p; | |
2337 | if ((flags & f) && !err) | |
2338 | { | |
2339 | error("duplicate integer suffixes"); | |
2340 | err = true; | |
2341 | } | |
2342 | flags = cast(FLAGS)(flags | f); | |
2343 | } | |
2344 | ||
5fee5ec3 IB |
2345 | TOK result = TOK.int32Literal; // default |
2346 | switch (flags) | |
2347 | { | |
2348 | /* Since D doesn't have a variable sized `long` or `unsigned long` type, | |
2349 | * this code deviates from C by picking D int, uint, long, or ulong instead | |
2350 | */ | |
2351 | ||
2352 | case FLAGS.octalhex: | |
2353 | /* Octal or Hexadecimal constant. | |
2354 | * First that fits: int, unsigned, long, unsigned long, | |
2355 | * long long, unsigned long long | |
2356 | */ | |
6384eff5 IB |
2357 | if (n & 0x8000000000000000L) |
2358 | result = TOK.uns64Literal; // unsigned long | |
2359 | else if (n & 0xFFFFFFFF00000000L) | |
2360 | result = TOK.int64Literal; // long | |
2361 | else if (n & 0x80000000) | |
2362 | result = TOK.uns32Literal; | |
5fee5ec3 | 2363 | else |
6384eff5 | 2364 | result = TOK.int32Literal; |
5fee5ec3 IB |
2365 | break; |
2366 | ||
2367 | case FLAGS.decimal: | |
2368 | /* First that fits: int, long, long long | |
2369 | */ | |
6384eff5 IB |
2370 | if (n & 0x8000000000000000L) |
2371 | result = TOK.uns64Literal; // unsigned long | |
2372 | else if (n & 0xFFFFFFFF80000000L) | |
2373 | result = TOK.int64Literal; // long | |
5fee5ec3 | 2374 | else |
6384eff5 | 2375 | result = TOK.int32Literal; |
5fee5ec3 IB |
2376 | break; |
2377 | ||
2378 | case FLAGS.octalhex | FLAGS.unsigned: | |
2379 | case FLAGS.decimal | FLAGS.unsigned: | |
2380 | /* First that fits: unsigned, unsigned long, unsigned long long | |
2381 | */ | |
6384eff5 IB |
2382 | if (n & 0xFFFFFFFF00000000L) |
2383 | result = TOK.uns64Literal; // unsigned long | |
5fee5ec3 | 2384 | else |
6384eff5 | 2385 | result = TOK.uns32Literal; |
5fee5ec3 IB |
2386 | break; |
2387 | ||
2388 | case FLAGS.decimal | FLAGS.long_: | |
2389 | /* First that fits: long, long long | |
2390 | */ | |
1027dc45 | 2391 | if (longsize == 4 || long_longsize == 4) |
5fee5ec3 | 2392 | { |
6384eff5 | 2393 | if (n & 0xFFFFFFFF_80000000L) |
5fee5ec3 IB |
2394 | result = TOK.int64Literal; |
2395 | else | |
6384eff5 | 2396 | result = TOK.int32Literal; // long |
5fee5ec3 IB |
2397 | } |
2398 | else | |
2399 | { | |
6384eff5 | 2400 | result = TOK.int64Literal; // long |
5fee5ec3 IB |
2401 | } |
2402 | break; | |
2403 | ||
2404 | case FLAGS.octalhex | FLAGS.long_: | |
2405 | /* First that fits: long, unsigned long, long long, | |
2406 | * unsigned long long | |
2407 | */ | |
1027dc45 | 2408 | if (longsize == 4 || long_longsize == 4) |
5fee5ec3 IB |
2409 | { |
2410 | if (n & 0x8000000000000000L) | |
2411 | result = TOK.uns64Literal; | |
2412 | else if (n & 0xFFFFFFFF00000000L) | |
2413 | result = TOK.int64Literal; | |
2414 | else if (n & 0x80000000) | |
2415 | result = TOK.uns32Literal; // unsigned long | |
2416 | else | |
2417 | result = TOK.int32Literal; // long | |
2418 | } | |
2419 | else | |
2420 | { | |
2421 | if (n & 0x80000000_00000000L) | |
2422 | result = TOK.uns64Literal; // unsigned long | |
2423 | else | |
2424 | result = TOK.int64Literal; // long | |
2425 | } | |
2426 | break; | |
2427 | ||
2428 | case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_: | |
2429 | case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_: | |
2430 | /* First that fits: unsigned long, unsigned long long | |
2431 | */ | |
1027dc45 | 2432 | if (longsize == 4 || long_longsize == 4) |
5fee5ec3 IB |
2433 | { |
2434 | if (n & 0xFFFFFFFF00000000L) | |
2435 | result = TOK.uns64Literal; | |
2436 | else | |
2437 | result = TOK.uns32Literal; // unsigned long | |
2438 | } | |
2439 | else | |
2440 | { | |
2441 | result = TOK.uns64Literal; // unsigned long | |
2442 | } | |
2443 | break; | |
2444 | ||
2445 | case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong: | |
2446 | /* First that fits: long long, unsigned long long | |
2447 | */ | |
2448 | if (n & 0x8000000000000000L) | |
2449 | result = TOK.uns64Literal; | |
2450 | else | |
2451 | result = TOK.int64Literal; | |
2452 | break; | |
2453 | ||
2454 | case FLAGS.decimal | FLAGS.long_ | FLAGS.llong: | |
2455 | /* long long | |
2456 | */ | |
2457 | result = TOK.int64Literal; | |
2458 | break; | |
2459 | ||
2460 | case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong: | |
2461 | case FLAGS.decimal | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong: | |
2462 | result = TOK.uns64Literal; | |
2463 | break; | |
2464 | ||
2465 | default: | |
2466 | debug printf("%x\n",flags); | |
2467 | assert(0); | |
2468 | } | |
2469 | return result; | |
2470 | } | |
2471 | ||
2472 | /************************************** | |
2473 | * Read in characters, converting them to real. | |
2474 | * Bugs: | |
2475 | * Exponent overflow not detected. | |
2476 | * Too much requested precision is not detected. | |
2477 | */ | |
2478 | private TOK inreal(Token* t) | |
2479 | { | |
2480 | //printf("Lexer::inreal()\n"); | |
2481 | debug | |
2482 | { | |
2483 | assert(*p == '.' || isdigit(*p)); | |
2484 | } | |
2485 | bool isWellformedString = true; | |
2486 | stringbuffer.setsize(0); | |
2487 | auto pstart = p; | |
2488 | bool hex = false; | |
2489 | dchar c = *p++; | |
2490 | // Leading '0x' | |
2491 | if (c == '0') | |
2492 | { | |
2493 | c = *p++; | |
2494 | if (c == 'x' || c == 'X') | |
2495 | { | |
2496 | hex = true; | |
2497 | c = *p++; | |
2498 | } | |
2499 | } | |
2500 | // Digits to left of '.' | |
2501 | while (1) | |
2502 | { | |
2503 | if (c == '.') | |
2504 | { | |
2505 | c = *p++; | |
2506 | break; | |
2507 | } | |
2508 | if (isdigit(c) || (hex && isxdigit(c)) || c == '_') | |
2509 | { | |
2510 | c = *p++; | |
2511 | continue; | |
2512 | } | |
2513 | break; | |
2514 | } | |
2515 | // Digits to right of '.' | |
2516 | while (1) | |
2517 | { | |
2518 | if (isdigit(c) || (hex && isxdigit(c)) || c == '_') | |
2519 | { | |
2520 | c = *p++; | |
2521 | continue; | |
2522 | } | |
2523 | break; | |
2524 | } | |
2525 | if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P'))) | |
2526 | { | |
2527 | c = *p++; | |
2528 | if (c == '-' || c == '+') | |
2529 | { | |
2530 | c = *p++; | |
2531 | } | |
2532 | bool anyexp = false; | |
2533 | while (1) | |
2534 | { | |
2535 | if (isdigit(c)) | |
2536 | { | |
2537 | anyexp = true; | |
2538 | c = *p++; | |
2539 | continue; | |
2540 | } | |
2541 | if (c == '_') | |
2542 | { | |
2543 | if (Ccompile) | |
2544 | error("embedded `_` in numeric literals not allowed"); | |
2545 | c = *p++; | |
2546 | continue; | |
2547 | } | |
2548 | if (!anyexp) | |
2549 | { | |
2550 | error("missing exponent"); | |
2551 | isWellformedString = false; | |
2552 | } | |
2553 | break; | |
2554 | } | |
2555 | } | |
2556 | else if (hex) | |
2557 | { | |
2558 | error("exponent required for hex float"); | |
2559 | isWellformedString = false; | |
2560 | } | |
2561 | --p; | |
2562 | while (pstart < p) | |
2563 | { | |
2564 | if (*pstart != '_') | |
2565 | stringbuffer.writeByte(*pstart); | |
2566 | ++pstart; | |
2567 | } | |
2568 | stringbuffer.writeByte(0); | |
2569 | auto sbufptr = cast(const(char)*)stringbuffer[].ptr; | |
2570 | TOK result; | |
2571 | bool isOutOfRange = false; | |
b7a586be | 2572 | t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, isOutOfRange) : CTFloat.zero); |
5fee5ec3 IB |
2573 | switch (*p) |
2574 | { | |
2575 | case 'F': | |
2576 | case 'f': | |
2577 | if (isWellformedString && !isOutOfRange) | |
2578 | isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr); | |
2579 | result = TOK.float32Literal; | |
2580 | p++; | |
2581 | break; | |
2582 | default: | |
2583 | if (isWellformedString && !isOutOfRange) | |
2584 | isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr); | |
2585 | result = TOK.float64Literal; | |
2586 | break; | |
2587 | case 'l': | |
2588 | if (!Ccompile) | |
2589 | error("use 'L' suffix instead of 'l'"); | |
2590 | goto case 'L'; | |
2591 | case 'L': | |
2592 | ++p; | |
2593 | if (Ccompile && long_doublesize == 8) | |
2594 | goto default; | |
2595 | result = TOK.float80Literal; | |
2596 | break; | |
2597 | } | |
2598 | if ((*p == 'i' || *p == 'I') && !Ccompile) | |
2599 | { | |
2600 | if (*p == 'I') | |
2601 | error("use 'i' suffix instead of 'I'"); | |
2602 | p++; | |
2603 | switch (result) | |
2604 | { | |
2605 | case TOK.float32Literal: | |
2606 | result = TOK.imaginary32Literal; | |
2607 | break; | |
2608 | case TOK.float64Literal: | |
2609 | result = TOK.imaginary64Literal; | |
2610 | break; | |
2611 | case TOK.float80Literal: | |
2612 | result = TOK.imaginary80Literal; | |
2613 | break; | |
2614 | default: | |
2615 | break; | |
2616 | } | |
2617 | } | |
2618 | const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal); | |
fbdaa581 | 2619 | if (isOutOfRange && !isLong && (!Ccompile || hex)) |
5fee5ec3 | 2620 | { |
fbdaa581 IB |
2621 | /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex |
2622 | */ | |
c8dfa79c IB |
2623 | const char* suffix = result == TOK.float32Literal ? "f" : result == TOK.float80Literal ? "L" : ""; |
2624 | const char* type = [TOK.float32Literal: "`float`".ptr, | |
2625 | TOK.float64Literal: "`double`".ptr, | |
2626 | TOK.float80Literal: "`real` for the current target".ptr][result]; | |
2627 | error(scanloc, "number `%s%s` is not representable as a %s", sbufptr, suffix, type); | |
2628 | const char* extra = result == TOK.float64Literal ? "`real` literals can be written using the `L` suffix. " : ""; | |
8da8c7d3 | 2629 | eSink.errorSupplemental(scanloc, "%shttps://dlang.org/spec/lex.html#floatliteral", extra); |
5fee5ec3 IB |
2630 | } |
2631 | debug | |
2632 | { | |
2633 | switch (result) | |
2634 | { | |
2635 | case TOK.float32Literal: | |
2636 | case TOK.float64Literal: | |
2637 | case TOK.float80Literal: | |
2638 | case TOK.imaginary32Literal: | |
2639 | case TOK.imaginary64Literal: | |
2640 | case TOK.imaginary80Literal: | |
2641 | break; | |
2642 | default: | |
2643 | assert(0); | |
2644 | } | |
2645 | } | |
2646 | return result; | |
2647 | } | |
2648 | ||
2649 | final Loc loc() pure @nogc | |
2650 | { | |
2651 | scanloc.charnum = cast(uint)(1 + p - line); | |
2652 | version (LocOffset) | |
2653 | scanloc.fileOffset = cast(uint)(p - base); | |
2654 | return scanloc; | |
2655 | } | |
2656 | ||
8da8c7d3 IB |
2657 | void error(T...)(const(char)* format, T args) |
2658 | { | |
2659 | eSink.error(token.loc, format, args); | |
2660 | } | |
2661 | ||
2662 | void error(T...)(const ref Loc loc, const(char)* format, T args) | |
5fee5ec3 | 2663 | { |
8da8c7d3 | 2664 | eSink.error(loc, format, args); |
5fee5ec3 IB |
2665 | } |
2666 | ||
8da8c7d3 | 2667 | final void deprecation(const(char)* format) |
5fee5ec3 | 2668 | { |
8da8c7d3 | 2669 | eSink.deprecation(token.loc, format); |
5fee5ec3 IB |
2670 | } |
2671 | ||
8da8c7d3 | 2672 | final void deprecationSupplemental(const(char)* format) |
5fee5ec3 | 2673 | { |
8da8c7d3 | 2674 | eSink.deprecationSupplemental(token.loc, format); |
5fee5ec3 IB |
2675 | } |
2676 | ||
7e287503 IB |
2677 | /*************************************** |
2678 | * Parse special token sequence: | |
2679 | * Returns: | |
2680 | * true if the special token sequence was handled | |
2681 | * References: | |
2682 | * https://dlang.org/spec/lex.html#special-token-sequence | |
2683 | */ | |
2684 | bool parseSpecialTokenSequence() | |
2685 | { | |
2686 | Token n; | |
2687 | scan(&n); | |
2688 | if (n.value == TOK.identifier) | |
2689 | { | |
2690 | if (n.ident == Id.line) | |
2691 | { | |
2692 | poundLine(n, false); | |
2693 | return true; | |
2694 | } | |
2695 | else | |
2696 | { | |
2697 | const locx = loc(); | |
8da8c7d3 | 2698 | eSink.warning(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars()); |
7e287503 IB |
2699 | } |
2700 | } | |
2701 | else if (n.value == TOK.if_) | |
2702 | { | |
2703 | error("C preprocessor directive `#if` is not supported, use `version` or `static if`"); | |
2704 | } | |
2705 | return false; | |
2706 | } | |
2707 | ||
5fee5ec3 IB |
2708 | /********************************************* |
2709 | * Parse line/file preprocessor directive: | |
2710 | * #line linnum [filespec] | |
2711 | * Allow __LINE__ for linnum, and __FILE__ for filespec. | |
2712 | * Accept linemarker format: | |
2713 | * # linnum [filespec] {flags} | |
2714 | * There can be zero or more flags, which are one of the digits 1..4, and | |
2715 | * must be in ascending order. The flags are ignored. | |
2716 | * Params: | |
2717 | * tok = token we're on, which is linnum of linemarker | |
2718 | * linemarker = true if line marker format and lexer is on linnum | |
2719 | * References: | |
2720 | * linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html | |
2721 | */ | |
7e287503 | 2722 | final void poundLine(ref Token tok, bool linemarker) |
5fee5ec3 IB |
2723 | { |
2724 | auto linnum = this.scanloc.linnum; | |
2725 | const(char)* filespec = null; | |
5fee5ec3 IB |
2726 | bool flags; |
2727 | ||
2728 | if (!linemarker) | |
2729 | scan(&tok); | |
2730 | if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal) | |
2731 | { | |
235d5a96 IB |
2732 | const lin = cast(int)(tok.unsvalue); |
2733 | if (lin != tok.unsvalue) | |
2734 | { | |
2735 | error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue); | |
2736 | skipToNextLine(); | |
2737 | return; | |
2738 | } | |
5fee5ec3 IB |
2739 | else |
2740 | linnum = lin; | |
2741 | } | |
2742 | else if (tok.value == TOK.line) // #line __LINE__ | |
2743 | { | |
2744 | } | |
2745 | else | |
235d5a96 IB |
2746 | { |
2747 | error(tok.loc, "positive integer argument expected following `#line`"); | |
2748 | if (tok.value != TOK.endOfLine) | |
2749 | skipToNextLine(); | |
2750 | return; | |
2751 | } | |
5fee5ec3 IB |
2752 | while (1) |
2753 | { | |
235d5a96 IB |
2754 | scan(&tok); |
2755 | switch (tok.value) | |
5fee5ec3 | 2756 | { |
235d5a96 IB |
2757 | case TOK.endOfFile: |
2758 | case TOK.endOfLine: | |
5fee5ec3 IB |
2759 | if (!inTokenStringConstant) |
2760 | { | |
2761 | this.scanloc.linnum = linnum; | |
2762 | if (filespec) | |
2763 | this.scanloc.filename = filespec; | |
2764 | } | |
2765 | return; | |
235d5a96 | 2766 | case TOK.file: |
5fee5ec3 IB |
2767 | if (filespec || flags) |
2768 | goto Lerr; | |
235d5a96 IB |
2769 | filespec = mem.xstrdup(scanloc.filename); |
2770 | continue; | |
2771 | case TOK.string_: | |
5fee5ec3 IB |
2772 | if (filespec || flags) |
2773 | goto Lerr; | |
235d5a96 | 2774 | if (tok.ptr[0] != '"' || tok.postfix != 0) |
1027dc45 | 2775 | goto Lerr; |
235d5a96 | 2776 | filespec = tok.ustring; |
5fee5ec3 | 2777 | continue; |
235d5a96 IB |
2778 | case TOK.int32Literal: |
2779 | if (!filespec) | |
2780 | goto Lerr; | |
2781 | if (linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4) | |
5fee5ec3 | 2782 | { |
235d5a96 IB |
2783 | flags = true; // linemarker flags seen |
2784 | continue; | |
5fee5ec3 IB |
2785 | } |
2786 | goto Lerr; | |
235d5a96 IB |
2787 | default: |
2788 | goto Lerr; | |
5fee5ec3 IB |
2789 | } |
2790 | } | |
2791 | Lerr: | |
235d5a96 IB |
2792 | if (filespec is null) |
2793 | error(tok.loc, "invalid filename for `#line` directive"); | |
2794 | else if (linemarker) | |
2795 | error(tok.loc, "invalid flag for line marker directive"); | |
2796 | else if (!Ccompile) | |
2797 | error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars()); | |
2798 | if (tok.value != TOK.endOfLine) | |
2799 | skipToNextLine(); | |
5fee5ec3 IB |
2800 | } |
2801 | ||
0fb57034 IB |
2802 | /*************************************** |
2803 | * Scan forward to start of next line. | |
610d7898 IB |
2804 | * Params: |
2805 | * defines = send characters to `defines` | |
0fb57034 | 2806 | */ |
610d7898 | 2807 | final void skipToNextLine(OutBuffer* defines = null) |
0fb57034 IB |
2808 | { |
2809 | while (1) | |
2810 | { | |
2811 | switch (*p) | |
2812 | { | |
2813 | case 0: | |
2814 | case 0x1A: | |
2815 | return; // do not advance p | |
2816 | ||
2817 | case '\n': | |
2818 | ++p; | |
2819 | break; | |
2820 | ||
2821 | case '\r': | |
2822 | ++p; | |
2823 | if (p[0] == '\n') | |
2824 | ++p; | |
2825 | break; | |
2826 | ||
2827 | default: | |
610d7898 IB |
2828 | if (defines) |
2829 | defines.writeByte(*p); // don't care about Unicode line endings for C | |
2830 | else if (*p & 0x80) | |
0fb57034 IB |
2831 | { |
2832 | const u = decodeUTF(); | |
2833 | if (u == PS || u == LS) | |
2834 | { | |
2835 | ++p; | |
2836 | break; | |
2837 | } | |
2838 | } | |
2839 | ++p; | |
2840 | continue; | |
2841 | } | |
2842 | break; | |
2843 | } | |
2844 | endOfLine(); | |
235d5a96 | 2845 | tokenizeNewlines = false; |
0fb57034 IB |
2846 | } |
2847 | ||
5fee5ec3 IB |
2848 | /******************************************** |
2849 | * Decode UTF character. | |
2850 | * Issue error messages for invalid sequences. | |
2851 | * Return decoded character, advance p to last character in UTF sequence. | |
2852 | */ | |
2853 | private uint decodeUTF() | |
6d799f0a IB |
2854 | { |
2855 | string msg; | |
2856 | auto result = decodeUTFpure(msg); | |
2857 | ||
2858 | if (msg) | |
8da8c7d3 | 2859 | error(token.loc, "%.*s", cast(int)msg.length, msg.ptr); |
6d799f0a IB |
2860 | return result; |
2861 | } | |
2862 | ||
2863 | /******************************************** | |
2864 | * Same as above, but the potential error message is stored to the | |
2865 | * msg parameter instead of being issued. | |
2866 | */ | |
2867 | private pure uint decodeUTFpure(out string msg) | |
5fee5ec3 IB |
2868 | { |
2869 | const s = p; | |
2870 | assert(*s & 0x80); | |
2871 | // Check length of remaining string up to 4 UTF-8 characters | |
2872 | size_t len; | |
2873 | for (len = 1; len < 4 && s[len]; len++) | |
2874 | { | |
2875 | } | |
2876 | size_t idx = 0; | |
2877 | dchar u; | |
6d799f0a | 2878 | msg = utf_decodeChar(s[0 .. len], idx, u); |
5fee5ec3 | 2879 | p += idx - 1; |
6d799f0a IB |
2880 | if (!msg && isBidiControl(u)) |
2881 | msg = "Bidirectional control characters are disallowed for security reasons."; | |
5fee5ec3 IB |
2882 | return u; |
2883 | } | |
2884 | ||
2885 | /*************************************************** | |
2886 | * Parse doc comment embedded between t.ptr and p. | |
2887 | * Remove trailing blanks and tabs from lines. | |
2888 | * Replace all newlines with \n. | |
2889 | * Remove leading comment character from each line. | |
2890 | * Decide if it's a lineComment or a blockComment. | |
2891 | * Append to previous one for this token. | |
2892 | * | |
2893 | * If newParagraph is true, an extra newline will be | |
2894 | * added between adjoining doc comments. | |
2895 | */ | |
2896 | private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure | |
2897 | { | |
2898 | /* ct tells us which kind of comment it is: '/', '*', or '+' | |
2899 | */ | |
2900 | const ct = t.ptr[2]; | |
2901 | /* Start of comment text skips over / * *, / + +, or / / / | |
2902 | */ | |
2903 | const(char)* q = t.ptr + 3; // start of comment text | |
2904 | const(char)* qend = p; | |
2905 | if (ct == '*' || ct == '+') | |
2906 | qend -= 2; | |
2907 | /* Scan over initial row of ****'s or ++++'s or ////'s | |
2908 | */ | |
2909 | for (; q < qend; q++) | |
2910 | { | |
2911 | if (*q != ct) | |
2912 | break; | |
2913 | } | |
2914 | /* Remove leading spaces until start of the comment | |
2915 | */ | |
2916 | int linestart = 0; | |
2917 | if (ct == '/') | |
2918 | { | |
2919 | while (q < qend && (*q == ' ' || *q == '\t')) | |
2920 | ++q; | |
2921 | } | |
2922 | else if (q < qend) | |
2923 | { | |
2924 | if (*q == '\r') | |
2925 | { | |
2926 | ++q; | |
2927 | if (q < qend && *q == '\n') | |
2928 | ++q; | |
2929 | linestart = 1; | |
2930 | } | |
2931 | else if (*q == '\n') | |
2932 | { | |
2933 | ++q; | |
2934 | linestart = 1; | |
2935 | } | |
2936 | } | |
2937 | /* Remove trailing row of ****'s or ++++'s | |
2938 | */ | |
2939 | if (ct != '/') | |
2940 | { | |
2941 | for (; q < qend; qend--) | |
2942 | { | |
2943 | if (qend[-1] != ct) | |
2944 | break; | |
2945 | } | |
2946 | } | |
2947 | /* Comment is now [q .. qend]. | |
2948 | * Canonicalize it into buf[]. | |
2949 | */ | |
2950 | OutBuffer buf; | |
2951 | ||
2952 | void trimTrailingWhitespace() | |
2953 | { | |
2954 | const s = buf[]; | |
2955 | auto len = s.length; | |
2956 | while (len && (s[len - 1] == ' ' || s[len - 1] == '\t')) | |
2957 | --len; | |
2958 | buf.setsize(len); | |
2959 | } | |
2960 | ||
2961 | for (; q < qend; q++) | |
2962 | { | |
2963 | char c = *q; | |
2964 | switch (c) | |
2965 | { | |
2966 | case '*': | |
2967 | case '+': | |
2968 | if (linestart && c == ct) | |
2969 | { | |
2970 | linestart = 0; | |
2971 | /* Trim preceding whitespace up to preceding \n | |
2972 | */ | |
2973 | trimTrailingWhitespace(); | |
2974 | continue; | |
2975 | } | |
2976 | break; | |
2977 | case ' ': | |
2978 | case '\t': | |
2979 | break; | |
2980 | case '\r': | |
2981 | if (q[1] == '\n') | |
2982 | continue; // skip the \r | |
2983 | goto Lnewline; | |
2984 | default: | |
2985 | if (c == 226) | |
2986 | { | |
2987 | // If LS or PS | |
2988 | if (q[1] == 128 && (q[2] == 168 || q[2] == 169)) | |
2989 | { | |
2990 | q += 2; | |
2991 | goto Lnewline; | |
2992 | } | |
2993 | } | |
2994 | linestart = 0; | |
2995 | break; | |
2996 | Lnewline: | |
2997 | c = '\n'; // replace all newlines with \n | |
2998 | goto case; | |
2999 | case '\n': | |
3000 | linestart = 1; | |
3001 | /* Trim trailing whitespace | |
3002 | */ | |
3003 | trimTrailingWhitespace(); | |
3004 | break; | |
3005 | } | |
3006 | buf.writeByte(c); | |
3007 | } | |
3008 | /* Trim trailing whitespace (if the last line does not have newline) | |
3009 | */ | |
3010 | trimTrailingWhitespace(); | |
3011 | ||
3012 | // Always end with a newline | |
3013 | const s = buf[]; | |
3014 | if (s.length == 0 || s[$ - 1] != '\n') | |
3015 | buf.writeByte('\n'); | |
3016 | ||
3017 | // It's a line comment if the start of the doc comment comes | |
3018 | // after other non-whitespace on the same line. | |
3019 | auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment; | |
3020 | // Combine with previous doc comment, if any | |
3021 | if (*dc) | |
3022 | *dc = combineComments(*dc, buf[], newParagraph).toDString(); | |
3023 | else | |
3024 | *dc = buf.extractSlice(true); | |
3025 | } | |
3026 | ||
3027 | /******************************************** | |
3028 | * Combine two document comments into one, | |
3029 | * separated by an extra newline if newParagraph is true. | |
3030 | */ | |
3031 | static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure | |
3032 | { | |
31350635 | 3033 | //debug printf("Lexer::combineComments('%*.s', '%*.s', '%i')\n", cast(int) c1.length, c1.ptr, cast(int) c2.length, c2.ptr, newParagraph); |
5fee5ec3 IB |
3034 | const(int) newParagraphSize = newParagraph ? 1 : 0; // Size of the combining '\n' |
3035 | if (!c1) | |
3036 | return c2.ptr; | |
3037 | if (!c2) | |
3038 | return c1.ptr; | |
3039 | ||
3040 | int insertNewLine = 0; | |
3041 | if (c1.length && c1[$ - 1] != '\n') | |
3042 | insertNewLine = 1; | |
3043 | const retSize = c1.length + insertNewLine + newParagraphSize + c2.length; | |
3044 | auto p = cast(char*)mem.xmalloc_noscan(retSize + 1); | |
3045 | p[0 .. c1.length] = c1[]; | |
3046 | if (insertNewLine) | |
3047 | p[c1.length] = '\n'; | |
3048 | if (newParagraph) | |
3049 | p[c1.length + insertNewLine] = '\n'; | |
3050 | p[retSize - c2.length .. retSize] = c2[]; | |
3051 | p[retSize] = 0; | |
3052 | return p; | |
3053 | } | |
3054 | ||
0fb57034 IB |
3055 | /************************** |
3056 | * `p` should be at start of next line | |
3057 | */ | |
3058 | private void endOfLine() pure @nogc @safe | |
5fee5ec3 IB |
3059 | { |
3060 | scanloc.linnum++; | |
3061 | line = p; | |
3062 | } | |
3063 | } | |
3064 | ||
6384eff5 IB |
3065 | |
3066 | /******************************* Private *****************************************/ | |
3067 | ||
3068 | private: | |
3069 | ||
5fee5ec3 IB |
/// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
    private __gshared bool initdone = false;

    // Note: Those properties need to be guarded by a call to `initialize`
    // The API isn't safe, and quite brittle, but it was left this way
    // over performance concerns.
    // This is currently only called once, from the lexer.
    __gshared char[11 + 1] date;
    __gshared char[8 + 1] time;
    __gshared char[24 + 1] timestamp;

    /**
     * Fill `date`, `time` and `timestamp` on the first call; later calls do nothing.
     *
     * The time is taken from the environment variable `SOURCE_DATE_EPOCH`
     * when it is set, otherwise from the current time.
     * Params:
     *      loc = location used when reporting an invalid `SOURCE_DATE_EPOCH`
     *      eSink = where that error is reported
     */
    public static void initialize(const ref Loc loc, ErrorSink eSink) nothrow
    {
        if (!initdone)
        {
            initdone = true;

            time_t now;
            // https://issues.dlang.org/show_bug.cgi?id=20444
            auto epoch = getenv("SOURCE_DATE_EPOCH");
            if (epoch is null)
                .time(&now);
            else if (!now.parseDigits(epoch.toDString()))
                eSink.error(loc, "value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", epoch);

            // The pieces are picked out of the ctime() text by fixed offsets
            const text = ctime(&now);
            assert(text);
            snprintf(date.ptr, date.length, "%.6s %.4s", text + 4, text + 20);
            snprintf(time.ptr, time.length, "%.8s", text + 11);
            snprintf(timestamp.ptr, timestamp.length, "%.24s", text);
        }
    }
}
3105 | ||
6384eff5 IB |
/// Unicode line terminators recognized in addition to `\n` and `\r`
private enum
{
    LS = 0x2028,    // UTF line separator
    PS = 0x2029,    // UTF paragraph separator
}
3108 | ||
/********************************************
 * Do our own char maps.
 *
 * `cmtable[c]` holds the `CM*` classification bits of character `c`.
 */
private static immutable cmtable = ()
{
    ubyte[256] table;
    foreach (const ch; 0 .. table.length)
    {
        ubyte bits;

        if (ch >= '0' && ch <= '7')
            bits |= CMoctal;
        if (c_isxdigit(ch))
            bits |= CMhex;
        if (ch == '_' || c_isalnum(ch))
            bits |= CMidchar;

        // Characters flagged both CMzerosecond and CMdigitsecond
        const bool digitSecond =
            (ch >= '0' && ch <= '9') ||
            ch == 'e' || ch == 'E' ||
            ch == 'f' || ch == 'F' ||
            ch == 'l' || ch == 'L' ||
            ch == 'p' || ch == 'P' ||
            ch == 'u' || ch == 'U' ||
            ch == 'i' ||
            ch == '.' ||
            ch == '_';
        // The base prefixes are flagged CMzerosecond only
        const bool zeroSecond = digitSecond ||
            ch == 'x' || ch == 'X' ||
            ch == 'b' || ch == 'B';
        if (zeroSecond)
            bits |= CMzerosecond;
        if (digitSecond)
            bits |= CMdigitsecond;

        // Every 7 bit character except these is flagged CMsinglechar
        const bool excluded =
            ch == '\\' || ch == '\n' || ch == '\r' ||
            ch == 0 || ch == 0x1A || ch == '\'';
        if (!excluded && !(ch & 0x80))
            bits |= CMsinglechar;

        table[ch] = bits;
    }
    return table;
}();
3164 | ||
/// Classification bits stored in `cmtable`
private
{
    enum CMoctal       = 1 << 0;
    enum CMhex         = 1 << 1;
    enum CMidchar      = 1 << 2;
    enum CMzerosecond  = 1 << 3;
    enum CMdigitsecond = 1 << 4;
    enum CMsinglechar  = 1 << 5;
}
3174 | ||
/// Returns: true if `cmtable` flags `c` as `CMoctal`
private bool isoctal(const char c) pure @nogc @safe
{
    const bits = cmtable[c];
    return (bits & CMoctal) ? true : false;
}
3179 | ||
/// Returns: true if `cmtable` flags `c` as `CMhex`
private bool ishex(const char c) pure @nogc @safe
{
    const bits = cmtable[c];
    return (bits & CMhex) ? true : false;
}
3184 | ||
/// Returns: true if `cmtable` flags `c` as `CMidchar`
private bool isidchar(const char c) pure @nogc @safe
{
    const bits = cmtable[c];
    return (bits & CMidchar) ? true : false;
}
3189 | ||
/// Returns: true if `cmtable` flags `c` as `CMzerosecond`
private bool isZeroSecond(const char c) pure @nogc @safe
{
    const bits = cmtable[c];
    return (bits & CMzerosecond) ? true : false;
}
3194 | ||
/// Returns: true if `cmtable` flags `c` as `CMdigitsecond`
private bool isDigitSecond(const char c) pure @nogc @safe
{
    const bits = cmtable[c];
    return (bits & CMdigitsecond) ? true : false;
}
3199 | ||
/// Returns: true if `cmtable` flags `c` as `CMsinglechar`
private bool issinglechar(const char c) pure @nogc @safe
{
    const bits = cmtable[c];
    return (bits & CMsinglechar) ? true : false;
}
3204 | ||
/// Returns: true if `c` is one of `0`-`9`, `a`-`f` or `A`-`F`
private bool c_isxdigit(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;

        default:
            return false;
    }
}
3211 | ||
/// Returns: true if `c` is one of `0`-`9`, `a`-`z` or `A`-`Z`
private bool c_isalnum(const int c) pure @nogc @safe
{
    const bool digit = '0' <= c && c <= '9';
    const bool lower = 'a' <= c && c <= 'z';
    const bool upper = 'A' <= c && c <= 'Z';
    return digit || lower || upper;
}
3218 | ||
3219 | /******************************* Unittest *****************************************/ | |
3220 | ||
5fee5ec3 IB |
/// Valid escape sequences: each must decode to the expected character and
/// consume the whole input.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    ErrorSink errorSink = new ErrorSinkStderr;

    // `sequence` is the text that follows the backslash of the escape
    void test(T)(string sequence, T expected, bool Ccompile = false)
    {
        auto p = cast(const(char)*)sequence.ptr;
        dchar c2;
        Lexer lexer = new Lexer(errorSink);
        assert(expected == lexer.escapeSequence(Loc.initial, p, Ccompile, c2));
        assert(p == sequence.ptr + sequence.length);
    }

    // Single character escapes
    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    // Two digit hex escapes
    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    // Octal escapes
    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    // Four and eight digit Unicode escapes
    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    test(`U0001f603`, '\U0001f603');

    // Named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');
}
6384eff5 | 3269 | |
5fee5ec3 IB |
// Checks that Lexer.escapeSequence reports the expected diagnostic, return
// value and scan length for malformed escape sequences.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    /* Error sink that compares each reported error against the message the
     * current test case expects, and records that an error was reported.
     */
    static class ErrorSinkTest : ErrorSinkNull
    {
        nothrow:
        extern (C++):
        override:

        import core.stdc.stdio;
        import core.stdc.stdarg;

        string expected;
        bool gotError;

        void error(const ref Loc loc, const(char)* format, ...)
        {
            gotError = true;
            char[100] buffer = void;
            va_list ap;
            va_start(ap, format);
            const written = vsnprintf(buffer.ptr, buffer.length, format, ap);
            va_end(ap);
            // vsnprintf returns a negative value on failure and the untruncated
            // length when the message does not fit; neither is a valid slice bound.
            assert(written >= 0 && cast(size_t) written < buffer.length);
            auto actual = buffer[0 .. written];
            assert(expected == actual);
        }
    }

    ErrorSinkTest errorSink = new ErrorSinkTest;

    /* Runs escapeSequence over `sequence` (the text that follows the backslash)
     * and checks that an error was reported with exactly `expectedError`, that
     * the returned character is `expectedReturnValue`, and that the scan pointer
     * advanced by `expectedScanLength` bytes.
     */
    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        errorSink.expected = expectedError;
        errorSink.gotError = false;
        auto p = cast(const(char)*)sequence.ptr;
        Lexer lexer = new Lexer(errorSink);
        dchar c2; // out-argument required by escapeSequence; not inspected here
        auto actualReturnValue = lexer.escapeSequence(Loc.initial, p, Ccompile, c2);
        assert(errorSink.gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
    }

    // Characters that do not start any escape sequence. The last case passes
    // Ccompile = true, where a named entity is reported as an undefined `\&`.
    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    // Too few hex digits.
    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    // Well-formed escapes whose value is rejected as an invalid UTF character.
    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    // A non-hex character directly after the x / u / U introducer.
    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    // Named entities: unknown name, and a known name missing its semicolon.
    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    // Octal value above \377.
    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
}
6384eff5 IB |
3347 | |
// Smoke test: lexing "int" yields TOK.int32, then TOK.endOfFile on every
// further call.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    string source = "int"; // the literal's implicit null terminator is relied upon
    ErrorSink sink = new ErrorSinkStderr;
    scope Lexer lexer = new Lexer(null, source.ptr, 0, source.length, false, false, sink);

    TOK kind = lexer.nextToken();
    assert(kind == TOK.int32);

    // Asking for more tokens after the input is exhausted keeps
    // returning end-of-file.
    foreach (_; 0 .. 3)
    {
        kind = lexer.nextToken();
        assert(kind == TOK.endOfFile);
    }
}
3367 | ||
// Malformed input must still make the lexer arrive at TOK.endOfFile, and stay
// there on the following call.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    // Lexer diagnostics are expected for these inputs, so they are discarded.
    ErrorSink quietSink = new ErrorSinkNull;

    // Every input must end with 0 or 0x1A.
    static immutable char[][] inputs =
    [
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (input; inputs)
    {
        // The trailing terminator byte is excluded from the length handed to the lexer.
        scope Lexer lexer = new Lexer(null, input.ptr, 0, input.length-1, false, false, quietSink);

        // Pull tokens until end-of-file, making at most input.length calls in
        // total so that a lexer stuck on bad input cannot loop forever.
        TOK current = lexer.nextToken();
        for (size_t calls = 1; current != TOK.endOfFile && calls < input.length; ++calls)
            current = lexer.nextToken();
        assert(current == TOK.endOfFile);

        current = lexer.nextToken();
        assert(current == TOK.endOfFile);
    }
}