]> git.ipfire.org Git - people/ms/gcc.git/blame - gcc/d/dmd/lexer.d
d: Merge upstream dmd, druntime 4ca4140e58, phobos 454dff14d.
[people/ms/gcc.git] / gcc / d / dmd / lexer.d
CommitLineData
5fee5ec3
IB
/**
 * Implements the lexical analyzer, which converts source code into lexical tokens.
 *
 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
 *
 * Copyright:   Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
 * Documentation:  https://dlang.org/phobos/dmd_lexer.html
 * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
 */
13
14module dmd.lexer;
15
16import core.stdc.ctype;
17import core.stdc.errno;
18import core.stdc.stdarg;
19import core.stdc.stdio;
20import core.stdc.stdlib : getenv;
21import core.stdc.string;
22import core.stdc.time;
23
24import dmd.entity;
8da8c7d3 25import dmd.errorsink;
5fee5ec3
IB
26import dmd.id;
27import dmd.identifier;
f99303eb 28import dmd.location;
0fb57034 29import dmd.root.array;
5fee5ec3 30import dmd.root.ctfloat;
0fb57034 31import dmd.common.outbuffer;
5fee5ec3
IB
32import dmd.root.port;
33import dmd.root.rmem;
34import dmd.root.string;
c43b5909 35import dmd.root.utf;
5fee5ec3 36import dmd.tokens;
5fee5ec3
IB
37import dmd.utils;
38
39nothrow:
40
5fee5ec3
IB
// When built with the DMDLIB version identifier, also define `LocOffset`.
version (DMDLIB)
{
    version = LocOffset;
}
45
46/***********************************************************
47 */
48class Lexer
49{
// Scratch buffer used while lexing string literals; the string-constant
// routines reset it with `setsize(0)` before collecting characters.
private __gshared OutBuffer stringbuffer;

Loc scanloc;                // for error messages
Loc prevloc;                // location of token before current

const(char)* p;             // current character

Token token;                // the current token; `nextToken()` overwrites it

// For ImportC
bool Ccompile;              /// true if compiling ImportC

// The following are valid only if (Ccompile == true)
ubyte boolsize;             /// size of a C _Bool, default 1
ubyte shortsize;            /// size of a C short, default 2
ubyte intsize;              /// size of a C int, default 4
ubyte longsize;             /// size of C long, 4 or 8
ubyte long_longsize;        /// size of a C long long, default 8
ubyte long_doublesize;      /// size of C long double, 8 or D real.sizeof
ubyte wchar_tsize;          /// size of C wchar_t, 2 or 4

ErrorSink eSink;            /// send error messages through this interface

private
{
    const(char)* base;      // pointer to start of buffer
    const(char)* end;       // pointer to last element of buffer
    const(char)* line;      // start of current line

    bool doDocComment;      // collect doc comment information
    bool anyToken;          // seen at least one token
    bool commentToken;      // comments are TOK.comment's
    bool tokenizeNewlines;  // newlines are turned into TOK.endOfLine's

    bool whitespaceToken;   // tokenize whitespaces (only for DMDLIB)

    int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
    int lastDocLine;        // last line of previous doc comment

    Token* tokenFreelist;   // released tokens, reused by allocateToken()
    uint versionNumber;     // value produced for the Id.VERSIONX special identifier
    const(char)[] vendor;   // string produced for the Id.VENDOR special identifier
}
93
94 nothrow:
95
/*********************
 * Creates a Lexer for the source code base[begoffset..endoffset+1].
 * The last character, base[endoffset], must be null (0) or EOF (0x1A).
 *
 * If the text at `begoffset` starts with `#!`, the remainder of that line
 * is skipped, so scanning begins on the line after it.
 *
 * Params:
 *  filename = used for error messages
 *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
 *  begoffset = starting offset into base[]
 *  endoffset = the last offset to read into base[]
 *  doDocComment = handle documentation comments
 *  commentToken = comments become TOK.comment's
 *  errorSink = where error messages go, must not be null
 *  vendor = name of the vendor
 *  versionNumber = version of the caller
 */
this(const(char)* filename, const(char)* base, size_t begoffset,
    size_t endoffset, bool doDocComment, bool commentToken,
    ErrorSink errorSink,
    const(char)[] vendor = "DLF", uint versionNumber = 1) pure scope
{
    scanloc = Loc(filename, 1, 1);
    // debug printf("Lexer::Lexer(%p)\n", base);
    // debug printf("lexer.filename = %s\n", filename);
    token = Token.init;
    this.base = base;
    this.end = base + endoffset;
    p = base + begoffset;
    line = p;
    this.doDocComment = doDocComment;
    this.commentToken = commentToken;
    this.tokenizeNewlines = false;
    this.inTokenStringConstant = 0;
    this.lastDocLine = 0;
    this.eSink = errorSink;
    assert(errorSink);
    this.versionNumber = versionNumber;
    this.vendor = vendor;
    //initKeywords();
    /* If first line starts with '#!', ignore the line
     */
    if (p && p[0] == '#' && p[1] == '!')
    {
        p += 2;
        for (;;p++)
        {
            char c = *p;
            switch (c)
            {
            case '\n':
                // step past the newline, then leave the loop like the EOF cases
                p++;
                goto case;
            case 0:
            case 0x1A:
                break;

            default:
                // Note: We do allow malformed UTF-8 on shebang line.
                // It could have a meaning if the native system
                // encoding is not Unicode. See test compilable/test13512.d
                // for example encoded in KOI-8.
                // We also allow bidirectional control characters.
                // We do not execute the shebang line, so it can't be used
                // to conceal code. It is up to the shell to sanitize it.
                continue;
            }
            break;
        }
        endOfLine();
    }
}
166
f99303eb
IB
/***********************
 * Alternative entry point for DMDLIB, adds `whitespaceToken`.
 *
 * Forwards to the primary constructor (leaving its `vendor` and
 * `versionNumber` parameters at their defaults) and then records
 * `whitespaceToken`.
 *
 * Params:
 *  filename = used for error messages
 *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
 *  begoffset = starting offset into base[]
 *  endoffset = the last offset to read into base[]
 *  doDocComment = handle documentation comments
 *  commentToken = comments become TOK.comment's
 *  whitespaceToken = tokenize whitespaces
 *  errorSink = where error messages go, must not be null
 */
this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
    bool doDocComment, bool commentToken, bool whitespaceToken,
    ErrorSink errorSink
    )
{
    this(filename, base, begoffset, endoffset, doDocComment, commentToken, errorSink);
    this.whitespaceToken = whitespaceToken;
}
178
610d7898
IB
/******************
 * Used for unittests for a mock Lexer.
 *
 * Only the error sink is set; no source buffer is attached.
 *
 * Params:
 *  errorSink = where error messages go, must not be null
 */
this(ErrorSink errorSink) scope
{
    assert(errorSink);
    this.eSink = errorSink;
}
610d7898
IB
183
/**************************************
 * Reset lexer to lex #define's.
 *
 * Points the lexer at `slice`, restarts the location at line 1, column 1
 * of the pseudo-file "#defines", and turns on newline tokenization.
 *
 * Params:
 *  slice = text to lex; the character just past its end must be 0
 */
final void resetDefineLines(const(char)[] slice)
{
    base = slice.ptr;
    end = base + slice.length;
    assert(*end == 0);
    p = base;
    line = p;
    tokenizeNewlines = true;
    inTokenStringConstant = 0;
    lastDocLine = 0;
    scanloc = Loc("#defines", 1, 1);
}
199
/**********************************
 * Set up for next #define line.
 * p should be at start of next line.
 *
 * Re-arms `tokenizeNewlines`, which `scan` clears each time it
 * returns a TOK.endOfLine.
 */
final void nextDefineLine()
{
    tokenizeNewlines = true;
}
208
f99303eb
IB
209 /***************
210 * Range interface
211 */
8977f4be 212
f99303eb
IB
/// Range primitive: true once the current token is TOK.endOfFile.
final bool empty() const pure @property @nogc @safe
{
    return front() == TOK.endOfFile;
}
8977f4be 217
f99303eb
IB
/// Range primitive: the kind (`value`) of the current token.
final TOK front() const pure @property @nogc @safe
{
    return token.value;
}
8977f4be 222
f99303eb
IB
/// Range primitive: advance to the next token via `nextToken()`.
final void popFront()
{
    nextToken();
}
227
5fee5ec3
IB
/// Returns: a newly allocated `Token`.
/// A token from the freelist is reused when one is available (its `next`
/// link is cleared first); otherwise a fresh `Token` is allocated.
Token* allocateToken() pure nothrow @safe
{
    if (tokenFreelist)
    {
        Token* t = tokenFreelist;
        tokenFreelist = t.next;
        t.next = null;
        return t;
    }
    return new Token();
}
240
/// Frees the given token by returning it to the freelist.
/// When `mem.isGCEnabled` is set the token is reset to `Token.init` first.
private void releaseToken(Token* token) pure nothrow @nogc @safe
{
    if (mem.isGCEnabled)
        *token = Token.init;
    // push onto the head of the freelist
    token.next = tokenFreelist;
    tokenFreelist = token;
}
249
/***********************
 * Advance to the next token.
 *
 * Saves the current token's location in `prevloc`. If a token was already
 * peeked (chained through `token.next`), it is copied into `token` and its
 * storage is released; otherwise a new token is scanned from the source.
 *
 * Returns:
 *  the kind of the new current token
 */
final TOK nextToken()
{
    prevloc = token.loc;
    if (token.next)
    {
        Token* t = token.next;
        // copies the whole token, including its own `next` link
        memcpy(&token, t, Token.sizeof);
        releaseToken(t);
    }
    else
    {
        scan(&token);
    }
    //printf(token.toChars());
    return token.value;
}
266
/***********************
 * Look ahead at next token's value.
 *
 * Returns:
 *  the kind of the token following the current one; the current token is unchanged
 */
final TOK peekNext()
{
    return peek(&token).value;
}
274
/***********************
 * Look 2 tokens ahead at value.
 *
 * Returns:
 *  the kind of the second token after the current one; the current token is unchanged
 */
final TOK peekNext2()
{
    Token* t = peek(&token);
    return peek(t).value;
}
283
/****************************
 * Turn next token in buffer into a token.
 *
 * Whitespace and comments are skipped unless the corresponding
 * `whitespaceToken` (DMDLIB builds only) or `commentToken` flag is set.
 * When `tokenizeNewlines` is set, the next line ending is returned as
 * TOK.endOfLine and the flag is cleared. At a 0 or 0x1A character
 * TOK.endOfFile is returned without advancing, so repeated calls keep
 * returning it.
 *
 * Params:
 *  t = the token to set the resulting Token to
 */
final void scan(Token* t)
{
    const lastLine = scanloc.linnum;
    Loc startLoc;
    t.blockComment = null;
    t.lineComment = null;

    while (1)
    {
        t.ptr = p;
        //printf("p = %p, *p = '%c'\n",p,*p);
        t.loc = loc();
        switch (*p)
        {
        case 0:
        case 0x1A:
            t.value = TOK.endOfFile; // end of file
            // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
            return;
        case ' ':
            // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
            while ((cast(size_t)p) % uint.sizeof)
            {
                if (*p != ' ')
                    goto LendSkipFourSpaces;
                p++;
            }
            while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
                p += 4;
            // Skip over any remaining space on the line.
            while (*p == ' ')
                p++;
        LendSkipFourSpaces:
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\t':
        case '\v':
        case '\f':
            p++;
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\r':
            p++;
            if (*p != '\n') // if CR stands by itself
            {
                endOfLine();
                if (tokenizeNewlines)
                {
                    t.value = TOK.endOfLine;
                    tokenizeNewlines = false;
                    return;
                }
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\n':
            p++;
            endOfLine();
            if (tokenizeNewlines)
            {
                t.value = TOK.endOfLine;
                tokenizeNewlines = false;
                return;
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '0':
            if (!isZeroSecond(p[1])) // if numeric literal does not continue
            {
                ++p;
                t.unsvalue = 0;
                t.value = TOK.int32Literal;
                return;
            }
            goto Lnumber;

        case '1': .. case '9':
            if (!isDigitSecond(p[1])) // if numeric literal does not continue
            {
                t.unsvalue = *p - '0';
                ++p;
                t.value = TOK.int32Literal;
                return;
            }
        Lnumber:
            t.value = number(t);
            return;

        case '\'':
            if (issinglechar(p[1]) && p[2] == '\'')
            {
                t.unsvalue = p[1]; // simple one character literal
                t.value = TOK.charLiteral;
                p += 3;
            }
            else if (Ccompile)
            {
                clexerCharConstant(*t, 0);
            }
            else
            {
                t.value = charConstant(t);
            }
            return;

        case 'u':
        case 'U':
        case 'L':
            if (!Ccompile)
                goto case_ident;
            if (p[1] == '\'') // C wide character constant
            {
                char c = *p;
                // NOTE(review): the string-literal branch below maps `L` with
                // wchar_tsize == 2 to the same postfix as `u`; here
                // wchar_tsize == 4 selects 'u'. Confirm against
                // clexerCharConstant that this mapping is intended.
                if (c == 'L') // convert L to u or U
                    c = (wchar_tsize == 4) ? 'u' : 'U';
                ++p;
                clexerCharConstant(*t, c);
                return;
            }
            else if (p[1] == '\"') // C wide string literal
            {
                const c = *p;
                ++p;
                escapeStringConstant(t);
                t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
                            c == 'u' ? 'w' :
                            'd';
                return;
            }
            else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
            {
                p += 2;
                escapeStringConstant(t);
                return;
            }
            goto case_ident;

        case 'r':
            if (Ccompile || p[1] != '"')
                goto case_ident;
            p++;
            goto case '`';
        case '`':
            if (Ccompile)
                goto default;
            wysiwygStringConstant(t);
            return;
        case 'q':
            if (Ccompile)
                goto case_ident;
            if (p[1] == '"')
            {
                p++;
                delimitedStringConstant(t);
                return;
            }
            else if (p[1] == '{')
            {
                p++;
                tokenStringConstant(t);
                return;
            }
            else
                goto case_ident;
        case '"':
            escapeStringConstant(t);
            return;
        // Identifier start characters. 'q', 'r', 'u', 'L' and 'U' have their
        // own cases above and jump to case_ident when they do not start a literal.
        case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
        case 'g':
        case 'h':
        case 'i':
        case 'j':
        case 'k':
        case 'l':
        case 'm':
        case 'n':
        case 'o':
        case 'p':
        /*case 'q': case 'r':*/
        case 's':
        case 't':
        //case 'u':
        case 'v':
        case 'w':
        case 'x':
        case 'y':
        case 'z':
        case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
        case 'G':
        case 'H':
        case 'I':
        case 'J':
        case 'K':
        //case 'L':
        case 'M':
        case 'N':
        case 'O':
        case 'P':
        case 'Q':
        case 'R':
        case 'S':
        case 'T':
        //case 'U':
        case 'V':
        case 'W':
        case 'X':
        case 'Y':
        case 'Z':
        case '_':
        case_ident:
            {
                while (1)
                {
                    const c = *++p;
                    if (isidchar(c))
                        continue;
                    else if (c & 0x80)
                    {
                        const s = p;
                        const u = decodeUTF();
                        if (isUniAlpha(u))
                            continue;
                        error(t.loc, "char 0x%04x not allowed in identifier", u);
                        // rewind so the offending character is scanned as its own token
                        p = s;
                    }
                    break;
                }
                Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
                t.ident = id;
                t.value = cast(TOK)id.getValue();

                anyToken = 1;

                /* Different keywords for C and D
                 */
                if (Ccompile)
                {
                    if (t.value != TOK.identifier)
                    {
                        t.value = Ckeywords[t.value]; // filter out D keywords
                    }
                }
                else if (t.value >= FirstCKeyword)
                    t.value = TOK.identifier; // filter out C keywords

                else if (*t.ptr == '_') // if special identifier token
                {
                    // Lazy initialization
                    TimeStampInfo.initialize(t.loc, eSink);

                    if (id == Id.DATE)
                    {
                        t.ustring = TimeStampInfo.date.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.TIME)
                    {
                        t.ustring = TimeStampInfo.time.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.VENDOR)
                    {
                        t.ustring = vendor.xarraydup.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.TIMESTAMP)
                    {
                        t.ustring = TimeStampInfo.timestamp.ptr;
                    Lstr:
                        t.value = TOK.string_;
                        t.postfix = 0;
                        t.len = cast(uint)strlen(t.ustring);
                    }
                    else if (id == Id.VERSIONX)
                    {
                        t.value = TOK.int64Literal;
                        t.unsvalue = versionNumber;
                    }
                    else if (id == Id.EOFX)
                    {
                        t.value = TOK.endOfFile;
                        // Advance scanner to end of file
                        while (!(*p == 0 || *p == 0x1A))
                            p++;
                    }
                }
                //printf("t.value = %d\n",t.value);
                return;
            }
        case '/':
            p++;
            switch (*p)
            {
            case '=':
                p++;
                t.value = TOK.divAssign;
                return;
            case '*':
                p++;
                startLoc = loc();
                while (1)
                {
                    // inner loop: advance to the next '/'
                    while (1)
                    {
                        const c = *p;
                        switch (c)
                        {
                        case '/':
                            break;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case 0:
                        case 0x1A:
                            error(t.loc, "unterminated /* */ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                const u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    p++;
                    // the '/' closes the comment only if preceded by a '*'
                    // that is not the '*' of the opening delimiter
                    if (p[-2] == '*' && p - 3 != t.ptr)
                        break;
                }
                if (commentToken)
                {
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
                {
                    // if /** but not /**/
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                continue;
            case '/': // do // style comments
                startLoc = loc();
                while (1)
                {
                    const c = *++p;
                    switch (c)
                    {
                    case '\n':
                        break;
                    case '\r':
                        if (p[1] == '\n')
                            p++;
                        break;
                    case 0:
                    case 0x1A:
                        if (commentToken)
                        {
                            p = end;
                            t.loc = startLoc;
                            t.value = TOK.comment;
                            return;
                        }
                        if (doDocComment && t.ptr[2] == '/')
                        {
                            getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                            lastDocLine = scanloc.linnum;
                        }
                        p = end;
                        t.loc = loc();
                        t.value = TOK.endOfFile;
                        return;
                    default:
                        if (c & 0x80)
                        {
                            const u = decodeUTF();
                            if (u == PS || u == LS)
                                break;
                        }
                        continue;
                    }
                    break;
                }
                if (commentToken)
                {
                    version (DMDLIB) {}
                    else
                    {
                        p++;
                        endOfLine();
                    }
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                if (doDocComment && t.ptr[2] == '/')
                {
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                p++;
                endOfLine();
                continue;
            case '+':
                if (!Ccompile)
                {
                    int nest;
                    startLoc = loc();
                    p++;
                    nest = 1;
                    while (1)
                    {
                        char c = *p;
                        switch (c)
                        {
                        case '/':
                            p++;
                            if (*p == '+')
                            {
                                p++;
                                nest++;
                            }
                            continue;
                        case '+':
                            p++;
                            if (*p == '/')
                            {
                                p++;
                                if (--nest == 0)
                                    break;
                            }
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case 0:
                        case 0x1A:
                            error(t.loc, "unterminated /+ +/ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                uint u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    if (commentToken)
                    {
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    }
                    if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
                    {
                        // if /++ but not /++/
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    continue;
                }
                break;
            default:
                break;
            }
            t.value = TOK.div;
            return;
        case '.':
            p++;
            if (isdigit(*p))
            {
                /* Note that we don't allow ._1 and ._ as being
                 * valid floating point numbers.
                 */
                p--;
                t.value = inreal(t);
            }
            else if (p[0] == '.')
            {
                if (p[1] == '.')
                {
                    p += 2;
                    t.value = TOK.dotDotDot;
                }
                else
                {
                    p++;
                    t.value = TOK.slice;
                }
            }
            else
                t.value = TOK.dot;
            return;
        case '&':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.andAssign;
            }
            else if (*p == '&')
            {
                p++;
                t.value = TOK.andAnd;
            }
            else
                t.value = TOK.and;
            return;
        case '|':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.orAssign;
            }
            else if (*p == '|')
            {
                p++;
                t.value = TOK.orOr;
            }
            else
                t.value = TOK.or;
            return;
        case '-':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.minAssign;
            }
            else if (*p == '-')
            {
                p++;
                t.value = TOK.minusMinus;
            }
            else if (*p == '>')
            {
                ++p;
                t.value = TOK.arrow;
            }
            else
                t.value = TOK.min;
            return;
        case '+':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.addAssign;
            }
            else if (*p == '+')
            {
                p++;
                t.value = TOK.plusPlus;
            }
            else
                t.value = TOK.add;
            return;
        case '<':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.lessOrEqual; // <=
            }
            else if (*p == '<')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.leftShiftAssign; // <<=
                }
                else
                    t.value = TOK.leftShift; // <<
            }
            else if (*p == ':' && Ccompile)
            {
                ++p;
                t.value = TOK.leftBracket; // <:
            }
            else if (*p == '%' && Ccompile)
            {
                ++p;
                t.value = TOK.leftCurly; // <%
            }
            else
                t.value = TOK.lessThan; // <
            return;
        case '>':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.greaterOrEqual; // >=
            }
            else if (*p == '>')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.rightShiftAssign; // >>=
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.unsignedRightShiftAssign; // >>>=
                    }
                    else
                        t.value = TOK.unsignedRightShift; // >>>
                }
                else
                    t.value = TOK.rightShift; // >>
            }
            else
                t.value = TOK.greaterThan; // >
            return;
        case '!':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.notEqual; // !=
            }
            else
                t.value = TOK.not; // !
            return;
        case '=':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.equal; // ==
            }
            else if (*p == '>')
            {
                p++;
                t.value = TOK.goesTo; // =>
            }
            else
                t.value = TOK.assign; // =
            return;
        case '~':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.concatenateAssign; // ~=
            }
            else
                t.value = TOK.tilde; // ~
            return;
        case '^':
            p++;
            if (*p == '^')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.powAssign; // ^^=
                }
                else
                    t.value = TOK.pow; // ^^
            }
            else if (*p == '=')
            {
                p++;
                t.value = TOK.xorAssign; // ^=
            }
            else
                t.value = TOK.xor; // ^
            return;
        case '(':
            p++;
            t.value = TOK.leftParenthesis;
            return;
        case ')':
            p++;
            t.value = TOK.rightParenthesis;
            return;
        case '[':
            p++;
            t.value = TOK.leftBracket;
            return;
        case ']':
            p++;
            t.value = TOK.rightBracket;
            return;
        case '{':
            p++;
            t.value = TOK.leftCurly;
            return;
        case '}':
            p++;
            t.value = TOK.rightCurly;
            return;
        case '?':
            p++;
            t.value = TOK.question;
            return;
        case ',':
            p++;
            t.value = TOK.comma;
            return;
        case ';':
            p++;
            t.value = TOK.semicolon;
            return;
        case ':':
            p++;
            if (*p == ':')
            {
                ++p;
                t.value = TOK.colonColon;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightBracket;
            }
            else
                t.value = TOK.colon;
            return;
        case '$':
            p++;
            t.value = TOK.dollar;
            return;
        case '@':
            p++;
            t.value = TOK.at;
            return;
        case '*':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.mulAssign;
            }
            else
                t.value = TOK.mul;
            return;
        case '%':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.modAssign;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightCurly;
            }
            else if (*p == ':' && Ccompile)
            {
                goto case '#'; // %: means #
            }
            else
                t.value = TOK.mod;
            return;
        case '#':
            {
                // https://issues.dlang.org/show_bug.cgi?id=22825
                // Special token sequences are terminated by newlines,
                // and should not be skipped over.
                this.tokenizeNewlines = true;
                p++;
                if (parseSpecialTokenSequence())
                    continue;
                t.value = TOK.pound;
                return;
            }
        default:
            {
                dchar c = *p;
                if (c & 0x80)
                {
                    c = decodeUTF();
                    // Check for start of unicode identifier
                    if (isUniAlpha(c))
                        goto case_ident;
                    if (c == PS || c == LS)
                    {
                        endOfLine();
                        p++;
                        if (tokenizeNewlines)
                        {
                            t.value = TOK.endOfLine;
                            tokenizeNewlines = false;
                            return;
                        }
                        continue;
                    }
                }
                if (c < 0x80 && isprint(c))
                    error(t.loc, "character '%c' is not a valid token", c);
                else
                    error(t.loc, "character 0x%02x is not a valid token", c);
                p++;
                continue;
                // assert(0);
            }
        }
    }
}
1163
/***********************
 * Look ahead at the token following `ct`.
 *
 * If `ct` already has a successor chained through `next`, that token is
 * returned. Otherwise a token is allocated, scanned from the source, and
 * linked as `ct.next`.
 *
 * Params:
 *  ct = token to look past
 * Returns:
 *  the token after `ct`
 */
final Token* peek(Token* ct)
{
    Token* t;
    if (ct.next)
        t = ct.next;
    else
    {
        t = allocateToken();
        scan(t);
        ct.next = t;
    }
    return t;
}
1177
/*********************************
 * tk is on the opening (.
 * Look ahead and return token that is past the closing ).
 *
 * The look-ahead also stops early, returning the token it stopped on, at:
 * a `}` with no matching `{` seen since the start, a `;` while not inside
 * any `{ }`, or end of file.
 *
 * Params:
 *  tk = token on the opening parenthesis
 * Returns:
 *  the token after the matching `)`, or the token the search stopped on
 */
final Token* peekPastParen(Token* tk)
{
    //printf("peekPastParen()\n");
    int parens = 1;
    int curlynest = 0;
    while (1)
    {
        tk = peek(tk);
        //tk.print();
        switch (tk.value)
        {
        case TOK.leftParenthesis:
            parens++;
            continue;
        case TOK.rightParenthesis:
            --parens;
            if (parens)
                continue;
            // matching ')' found: step to the token after it
            tk = peek(tk);
            break;
        case TOK.leftCurly:
            curlynest++;
            continue;
        case TOK.rightCurly:
            if (--curlynest >= 0)
                continue;
            break;
        case TOK.semicolon:
            if (curlynest)
                continue;
            break;
        case TOK.endOfFile:
            break;
        default:
            continue;
        }
        return tk;
    }
}
1221
/*******************************************
 * Parse escape sequence.
 *
 * Parses at the lexer's current position `p` (which is advanced past the
 * sequence), reporting errors at the current token's location and using
 * the lexer's `Ccompile` setting.
 *
 * Params:
 *  c2 = receives the second character of a two-character named entity
 * Returns:
 *  the escape sequence as a single character
 */
private uint escapeSequence(out dchar c2)
{
    return Lexer.escapeSequence(token.loc, p, Ccompile, c2);
}
1229
/********
 * Parse the given string literal escape sequence into a single character.
 * D https://dlang.org/spec/lex.html#escape_sequences
 * C11 6.4.4.4
 *
 * Malformed sequences are reported through `error` and parsing continues
 * with a substitute value.
 *
 * Params:
 *  loc = location to use for error messages
 *  sequence = pointer to string with escape sequence to parse. Updated to
 *             point past the end of the escape sequence
 *  Ccompile = true for compile C11 escape sequences
 *  c2 = returns second `dchar` of html entity with 2 code units, otherwise stays `dchar.init`
 * Returns:
 *  the escape sequence as a single character
 */
private dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile, out dchar c2)
{
    const(char)* p = sequence; // cache sequence reference on stack
    scope(exit) sequence = p;

    uint c = *p;
    int ndigits;
    switch (c)
    {
    case '\'':
    case '"':
    case '?':
    case '\\':
    Lconsume:
        p++;
        break;
    case 'a':
        c = 7;
        goto Lconsume;
    case 'b':
        c = 8;
        goto Lconsume;
    case 'f':
        c = 12;
        goto Lconsume;
    case 'n':
        c = 10;
        goto Lconsume;
    case 'r':
        c = 13;
        goto Lconsume;
    case 't':
        c = 9;
        goto Lconsume;
    case 'v':
        c = 11;
        goto Lconsume;
    case 'u':
        ndigits = 4;
        goto Lhex;
    case 'U':
        ndigits = 8;
        goto Lhex;
    case 'x':
        ndigits = 2;
    Lhex:
        p++;
        c = *p;
        if (ishex(cast(char)c))
        {
            uint v = 0;
            int n = 0;
            if (Ccompile && ndigits == 2)
            {
                /* C11 6.4.4.4-7 one to infinity hex digits
                 */
                // NOTE(review): `v` is a uint and nothing here bounds the
                // number of digits accumulated — confirm whether long
                // sequences should be diagnosed.
                do
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                } while (ishex(cast(char)c));
            }
            else
            {
                // exactly `ndigits` hex digits are expected
                while (1)
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                    if (++n == ndigits)
                        break;
                    if (!ishex(cast(char)c))
                    {
                        error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
                        break;
                    }
                }
                if (ndigits != 2 && !utf_isValidDchar(v))
                {
                    error(loc, "invalid UTF character \\U%08x", v);
                    v = '?'; // recover with valid UTF character
                }
            }
            c = v;
        }
        else
        {
            error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
            p++;
        }
        break;
    case '&':
        if (Ccompile)
            goto default;

        // named character entity
        for (const idstart = ++p; 1; p++)
        {
            switch (*p)
            {
            case ';':
                auto entity = HtmlNamedEntity(idstart[0 .. p - idstart]);
                c = entity[0];
                if (entity == entity.init)
                {
                    error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
                    c = '?';
                }
                if (entity[1] != entity.init[1])
                    c2 = entity[1];

                p++;
                break;
            default:
                // entity names: letters, plus digits after the first character
                if (isalpha(*p) || (p != idstart && isdigit(*p)))
                    continue;
                error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
                c = '?';
                break;
            }
            break;
        }
        break;
    case 0:
    case 0x1A:
        // end of file
        c = '\\';
        break;
    default:
        if (isoctal(cast(char)c))
        {
            // up to three octal digits
            uint v = 0;
            int n = 0;
            do
            {
                v = v * 8 + (c - '0');
                c = *++p;
            }
            while (++n < 3 && isoctal(cast(char)c));
            c = v;
            if (c > 0xFF)
                error(loc, "escape octal sequence \\%03o is larger than \\377", c);
        }
        else
        {
            error(loc, "undefined escape sequence \\%c", c);
            p++;
        }
        break;
    }
    return c;
}
1406
/**
Lex a wysiwyg string. `p` must be pointing to the first character before the
contents of the string literal. The character pointed to by `p` will be used as
the terminating character (i.e. backtick or double-quote).

A lone CR is stored as `\n`, and the CR of a CR LF pair is dropped. If end of
file is reached before the terminator, an error is reported and `p` is left
on the end-of-file character.
Params:
    result = pointer to the token that accepts the result
*/
private void wysiwygStringConstant(Token* result)
{
    result.value = TOK.string_;
    Loc start = loc();
    auto terminator = p[0];
    p++;
    stringbuffer.setsize(0);
    while (1)
    {
        dchar c = p[0];
        p++;
        switch (c)
        {
        case '\n':
            endOfLine();
            break;
        case '\r':
            if (p[0] == '\n')
                continue; // ignore
            c = '\n'; // treat EndOfLine as \n character
            endOfLine();
            break;
        case 0:
        case 0x1A:
            error("unterminated string constant starting at %s", start.toChars());
            result.setString();
            // rewind `p` so it points to the EOF character
            p--;
            return;
        default:
            if (c == terminator)
            {
                result.setString(stringbuffer);
                stringPostfix(result);
                return;
            }
            else if (c & 0x80)
            {
                // step back onto the lead byte before decoding
                p--;
                const u = decodeUTF();
                p++;
                if (u == PS || u == LS)
                    endOfLine();
                stringbuffer.writeUTF8(u);
                continue;
            }
            break;
        }
        stringbuffer.writeByte(c);
    }
}
1465
5fee5ec3
IB
/**
Lex a delimited string. Some examples of delimited strings are:
---
q"(foo(xxx))"      // "foo(xxx)"
q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
q"/foo]/"          // "foo]"
q"HERE
foo
HERE"              // "foo\n"
---
It is assumed that `p` points to the opening double-quote '"'.

The first character after the quote selects the delimiter: one of the
bracket characters `(`, `{`, `[`, `<` nests and is closed by its counterpart,
an identifier start character begins a heredoc, and any other character
closes the string at its next occurrence.

Params:
    result = pointer to the token that accepts the result
*/
private void delimitedStringConstant(Token* result)
{
    /* The C ctype functions are only defined for arguments in the
     * `unsigned char` range, while `c` below may hold any decoded code
     * point. Keep ctype calls to ASCII and use isUniAlpha for the rest.
     */
    static bool isIdStart(dchar ch)
    {
        return ch < 0x80 ? (isalpha(ch) || ch == '_') : isUniAlpha(ch);
    }
    static bool isAsciiSpace(dchar ch)
    {
        return ch < 0x80 && isspace(ch);
    }

    result.value = TOK.string_;
    Loc start = loc();
    dchar delimleft = 0;
    dchar delimright = 0;
    uint nest = 1;
    uint nestcount = ~0; // dead assignment, needed to suppress warning
    Identifier hereid = null;
    uint blankrol = 0;      // set while the rest of the heredoc opening line must be blank
    uint startline = 0;     // set when the previous character ended a line
    p++;
    stringbuffer.setsize(0);
    while (1)
    {
        const s = p;        // start of the current character, for re-scanning identifiers
        dchar c = *p++;
        //printf("c = '%c'\n", c);
        switch (c)
        {
        case '\n':
        Lnextline:
            endOfLine();
            startline = 1;
            if (blankrol)
            {
                blankrol = 0;
                continue;
            }
            if (hereid)
            {
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        case '\r':
            if (*p == '\n')
                continue; // ignore
            c = '\n'; // treat EndOfLine as \n character
            goto Lnextline;
        case 0:
        case 0x1A:
            error("unterminated delimited string constant starting at %s", start.toChars());
            result.setString();
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            return;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                p++;
                if (c == PS || c == LS)
                    goto Lnextline;
            }
            break;
        }
        if (delimleft == 0)
        {
            // First character after the opening quote: work out the delimiter.
            delimleft = c;
            nest = 1;
            nestcount = 1;
            if (c == '(')
                delimright = ')';
            else if (c == '{')
                delimright = '}';
            else if (c == '[')
                delimright = ']';
            else if (c == '<')
                delimright = '>';
            else if (isIdStart(c))
            {
                // Start of identifier; must be a heredoc
                Token tok;
                p = s;
                scan(&tok); // read in heredoc identifier
                if (tok.value != TOK.identifier)
                {
                    error("identifier expected for heredoc, not %s", tok.toChars());
                    delimright = c;
                }
                else
                {
                    hereid = tok.ident;
                    //printf("hereid = '%s'\n", hereid.toChars());
                    blankrol = 1;
                }
                nest = 0;
            }
            else
            {
                delimright = c;
                nest = 0;
                if (isAsciiSpace(c))
                    error("delimiter cannot be whitespace");
            }
        }
        else
        {
            if (blankrol)
            {
                error("heredoc rest of line should be blank");
                blankrol = 0;
                continue;
            }
            if (nest == 1)
            {
                if (c == delimleft)
                    nestcount++;
                else if (c == delimright)
                {
                    nestcount--;
                    if (nestcount == 0)
                        goto Ldone;
                }
            }
            else if (c == delimright)
                goto Ldone;
            if (startline && isIdStart(c) && hereid)
            {
                // A line starting with an identifier may be the heredoc terminator.
                Token tok;
                auto psave = p;
                p = s;
                scan(&tok); // read in possible heredoc identifier
                //printf("endid = '%s'\n", tok.ident.toChars());
                if (tok.value == TOK.identifier && tok.ident is hereid)
                {
                    /* should check that rest of line is blank
                     */
                    goto Ldone;
                }
                p = psave;  // not the terminator: resume after the current character
            }
            stringbuffer.writeUTF8(c);
            startline = 0;
        }
    }
Ldone:
    if (*p == '"')
        p++;
    else if (hereid)
        error("delimited string must end in `%s\"`", hereid.toChars());
    else if (isAsciiSpace(delimright))
        error("delimited string must end in `\"`");
    else
        error(token.loc, "delimited string must end in `%c\"`", delimright);
    result.setString(stringbuffer);
    stringPostfix(result);
}
1630
/**
Lex a token string. Some examples of token strings are:
---
q{ foo(xxx) }    // " foo(xxx) "
q{foo$(LPAREN)}  // "foo$(LPAREN)"
q{{foo}"}"}      // "{foo}"}""
---
It is assumed that `p` points to the opening curly-brace.

The contents are tokenized with `scan` only to find the matching closing
brace; the resulting string is the raw source text between the braces.

Params:
    result = pointer to the token that accepts the result
*/
private void tokenStringConstant(Token* result)
{
    result.value = TOK.string_;

    uint depth = 1;             // the opening brace is already counted
    const openedAt = loc();
    const contents = ++p;       // first character after the opening brace
    inTokenStringConstant++;
    scope(exit) inTokenStringConstant--;
    for (;;)
    {
        Token tok;
        scan(&tok);
        if (tok.value == TOK.leftCurly)
        {
            ++depth;
        }
        else if (tok.value == TOK.rightCurly)
        {
            if (--depth == 0)
            {
                // `p` is just past the closing brace; exclude it from the string
                result.setString(contents, p - 1 - contents);
                stringPostfix(result);
                return;
            }
        }
        else if (tok.value == TOK.endOfFile)
        {
            error("unterminated token string constant starting at %s", openedAt.toChars());
            result.setString();
            return;
        }
    }
}
1677
/**
Scan a quoted string while building the processed string value by
handling escape sequences. The result is returned in the given `t` token.
This function assumes that `p` currently points to the opening quote
of the string.

When `Ccompile` is set, an end of line inside the literal ends it with an
"unterminated string constant" error, `\&` is not treated as a named
entity, and no string postfix is read.

Params:
    t = the token to set the resulting string to
References:
    D https://dlang.org/spec/lex.html#double_quoted_strings
    ImportC C11 6.4.5
*/
private void escapeStringConstant(Token* t)
{
    t.value = TOK.string_;

    const start = loc();
    const tc = *p++;        // opening quote
    stringbuffer.setsize(0);
    while (1)
    {
        dchar c = *p++;
        dchar c2;
        switch (c)
        {
        case '\\':
            switch (*p)
            {
            case '&':
                if (Ccompile)
                    goto default;

                // Named entity: may yield a second code unit in `c2`.
                c = escapeSequence(c2);
                stringbuffer.writeUTF8(c);
                if (c2 != dchar.init)
                    stringbuffer.writeUTF8(c2);
                continue;
            case 'u':
            case 'U':
                c = escapeSequence(c2);
                stringbuffer.writeUTF8(c);
                continue;
            default:
                // Other escapes are stored as a single raw byte below.
                c = escapeSequence(c2);
                break;
            }
            break;
        case '\n':
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\r':
            if (*p == '\n')
                continue; // ignore
            c = '\n'; // treat EndOfLine as \n character
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\'':
        case '"':
            if (c != tc)
                goto default;
            t.setString(stringbuffer);
            if (!Ccompile)
                stringPostfix(t);
            return;
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
        Lunterminated:
            error("unterminated string constant starting at %s", start.toChars());
            t.setString();
            return;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                /* decodeUTF leaves `p` on the last byte of the sequence.
                 * Step past it before any end-of-line handling, as the other
                 * string scanners do, so that neither endOfLine() nor the
                 * ImportC unterminated exit runs with `p` still inside the
                 * separator's encoding.
                 */
                p++;
                if (c == LS || c == PS)
                {
                    c = '\n';
                    endOfLine();
                    if (Ccompile)
                        goto Lunterminated;
                }
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        }
        stringbuffer.writeByte(c);
    }
}
1774
/**************************************
 * Lex a D character literal. `p` is on the opening single quote.
 *
 * On any malformed literal an error is reported and `t.unsvalue` is set
 * to '?'.
 * Params:
 *  t = token whose `unsvalue` receives the character value
 * Returns:
 *  TOK.charLiteral, TOK.wcharLiteral or TOK.dcharLiteral
 * Reference:
 *  https://dlang.org/spec/lex.html#characterliteral
 */
private TOK charConstant(Token* t)
{
    TOK kind = TOK.charLiteral;

    // Common exit for the "no closing quote" cases detected on the first character.
    TOK unterminated()
    {
        error("unterminated character constant");
        t.unsvalue = '?';
        return kind;
    }

    //printf("Lexer::charConstant\n");
    p++;                        // skip the opening quote
    dchar c = *p++;
    if (c == '\\')
    {
        // The escape introducer decides the literal's width.
        const introducer = *p;
        if (introducer == 'u')
            kind = TOK.wcharLiteral;
        else if (introducer == 'U' || introducer == '&')
            kind = TOK.dcharLiteral;

        dchar second;
        t.unsvalue = escapeSequence(second);
        if (second != dchar.init)
        {
            error("html entity requires 2 code units, use a string instead of a character");
            t.unsvalue = '?';
        }
    }
    else if (c == '\n')
    {
        endOfLine();
        return unterminated();
    }
    else if (c == '\r' || c == '\'')
    {
        return unterminated();
    }
    else if (c == 0 || c == 0x1A)
    {
        // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
        p--;
        return unterminated();
    }
    else
    {
        if (c & 0x80)
        {
            p--;
            c = decodeUTF();
            p++;
            if (c == LS || c == PS)
            {
                endOfLine();
                return unterminated();
            }
            if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                kind = TOK.wcharLiteral;
            else
                kind = TOK.dcharLiteral;
        }
        t.unsvalue = c;
    }

    if (*p == '\'')
    {
        p++;                    // well-formed: consume the closing quote
        return kind;
    }

    // More than one character, or no closing quote: skip ahead to a
    // plausible resynchronization point before reporting.
    while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
           *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
    {
        if (*p & 0x80)
        {
            const seqStart = p;
            const u = decodeUTF();
            if (u == LS || u == PS)
            {
                p = seqStart;   // leave the line separator for the caller
                break;
            }
        }
        p++;
    }

    if (*p == '\'')
    {
        error("character constant has multiple characters");
        p++;
    }
    else
        error("unterminated character constant");
    t.unsvalue = '?';
    return kind;
}
1870
/***************************************
 * Lex C character constant.
 * Parser is on the opening quote.
 *
 * The literal is first scanned as a string by `escapeStringConstant`; the
 * resulting bytes are then folded into a single integer value.
 * An empty constant is reported and turned into a TOK.semicolon token.
 * Params:
 *  t = token to fill in
 *  prefix = one of `u`, `U` or 0.
 * Reference:
 *  C11 6.4.4.4
 */
private void clexerCharConstant(ref Token t, char prefix)
{
    escapeStringConstant(&t);
    const(char)[] str = t.ustring[0 .. t.len];
    const n = str.length;
    const loc = t.loc;
    if (n == 0)
    {
        error(loc, "empty character constant");
        t.value = TOK.semicolon;
        return;
    }

    uint u;
    switch (prefix)
    {
        case 0:
            if (n == 1) // fast case
            {
                u = str[0];
            }
            else if (n > 4)
                error(loc, "max number of chars in character literal is 4, had %d",
                    cast(int)n);
            else
            {
                /* Pack the characters with the first one most significant.
                 * Done arithmetically rather than by writing bytes of `u`
                 * through a pointer, so the value does not depend on the
                 * byte order of the host running the compiler.
                 */
                foreach (c; str)
                    u = (u << 8) | c;
            }
            break;

        case 'u':
            dchar d1;
            size_t idx;
            auto msg = utf_decodeChar(str, idx, d1);
            dchar d2 = 0;
            if (idx < n && !msg)
                msg = utf_decodeChar(str, idx, d2);
            if (msg)
                error(loc, "%.*s", cast(int)msg.length, msg.ptr);
            else if (idx < n)
                error(loc, "max number of chars in 16 bit character literal is 2, had %d",
                    cast(int)((n + 1) >> 1));
            else if (d1 >= 0x1_0000)    // 0x1_0000 itself already needs 17 bits
                error(loc, "%d does not fit in 16 bits", d1);
            else if (d2 >= 0x1_0000)
                error(loc, "%d does not fit in 16 bits", d2);
            u = d1;
            if (d2)
                u = (d1 << 16) | d2;
            break;

        case 'U':
            dchar d;
            size_t idx;
            auto msg = utf_decodeChar(str, idx, d);
            if (msg)
                error(loc, "%.*s", cast(int)msg.length, msg.ptr);
            else if (idx < n)
                error(loc, "max number of chars in 32 bit character literal is 1, had %d",
                    cast(int)((n + 3) >> 2));
            u = d;
            break;

        default:
            assert(0);
    }
    // A single byte is a char literal; anything longer is reported as an int.
    t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
    t.unsvalue = u;
}
1950
/***************************************
 * Get postfix of string literal.
 *
 * If `p` is on one of `c`, `w` or `d`, that character is stored in
 * `t.postfix` and consumed; otherwise `t.postfix` is set to 0 and `p`
 * is left alone.
 */
private void stringPostfix(Token* t) pure @nogc
{
    const ch = *p;
    if (ch == 'c' || ch == 'w' || ch == 'd')
    {
        t.postfix = ch;
        p++;
    }
    else
    {
        t.postfix = 0;
    }
}
1969
/**************************************
 * Read in a number.
 *
 * Integers may be decimal, octal, binary or hexadecimal; the value is
 * stored in `t.unsvalue`. Anything that turns out to be a floating point
 * literal is re-lexed from its first character by `inreal`. For ImportC
 * the suffix handling is delegated to `cnumber`; otherwise the `u`, `U`
 * and `L` suffixes are handled here.
 * Params:
 *  t = token receiving the value
 * Returns:
 *  the TOK of the integer or floating point literal
 */
private TOK number(Token* t)
{
    int base = 10;
    const start = p;
    ulong n = 0; // unsigned >=64 bit integer type
    int d;
    bool err = false;
    bool overflow = false;
    bool anyDigits = false;     // a digit was seen after the radix prefix
    char errorDigit = 0;        // first digit that is invalid for `base`
    dchar c = *p;
    if (c == '0')
    {
        // A leading zero may introduce a radix prefix or a floating literal.
        ++p;
        c = *p;
        switch (c)
        {
        case '0': .. case '7':
            base = 8;
            break;

        case '8':
        case '9':
            errorDigit = cast(char) c;
            base = 8;
            break;

        case 'x':
        case 'X':
            ++p;
            base = 16;
            break;

        case 'b':
        case 'B':
            ++p;
            base = 2;
            break;

        case '.':
            if (p[1] == '.')
                goto Ldone; // if ".."
            if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
            {
                if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                    goto Lreal; // if `0.f` or `0.L`
                goto Ldone; // if ".identifier" or ".unicode"
            }
            goto Lreal; // '.' is part of current token

        case 'i':
        case 'f':
        case 'F':
            goto Lreal;

        case '_':
            if (Ccompile)
                error("embedded `_` not allowed");
            ++p;
            base = 8;
            break;

        case 'L':
            if (p[1] == 'i')
                goto Lreal;
            break;

        default:
            break;
        }
    }
    while (1)
    {
        c = *p;
        switch (c)
        {
        case '0': .. case '9':
            ++p;
            d = c - '0';
            break;

        case 'a': .. case 'f':
        case 'A': .. case 'F':
            ++p;
            // Outside hex, `e`/`f` start an exponent or a float suffix.
            if (base != 16 && (c == 'e' || c == 'E' || c == 'f' || c == 'F'))
                goto Lreal;
            if (c >= 'a')
                d = c + 10 - 'a';
            else
                d = c + 10 - 'A';
            break;

        case 'L':
            if (p[1] == 'i')
                goto Lreal;
            goto Ldone;

        case '.':
            if (p[1] == '.')
                goto Ldone; // if ".."
            if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
            {
                if (Ccompile && base == 10 &&
                    (p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
                    goto Lreal; // if `1.e6` or `1.f` or `1.L`
                goto Ldone; // if ".identifier" or ".unicode"
            }
            if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                goto Ldone; // if ".identifier" or ".unicode"
            if (base == 2)
                goto Ldone; // if ".identifier" or ".unicode"
            goto Lreal; // otherwise as part of a floating point literal

        case 'p':
        case 'P':
        case 'i':
        Lreal:
            // Floating point after all: rewind and let inreal lex it.
            p = start;
            return inreal(t);

        case '_':
            if (Ccompile)
                goto default;
            ++p;
            continue;

        default:
            goto Ldone;
        }
        // got a digit here, set any necessary flags, check for errors
        anyDigits = true;
        if (!errorDigit && d >= base)
            errorDigit = cast(char) c;

        // Avoid expensive overflow check if we aren't at risk of overflow
        if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
            n = n * base + d;
        else
        {
            import core.checkedint : mulu, addu;

            n = mulu(n, base, overflow);
            n = addu(n, d, overflow);
        }
    }
Ldone:
    if (errorDigit)
    {
        error(token.loc, "%s digit expected, not `%c`", base == 2 ? "binary".ptr :
                                                         base == 8 ? "octal".ptr :
                                                         "decimal".ptr, errorDigit);
        err = true;
    }
    if (overflow && !err)
    {
        error("integer overflow");
        err = true;
    }
    // `0x` or `0b` with no digits after the prefix
    if ((base == 2 || base == 16) && !anyDigits)
        error(token.loc, "`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);

    t.unsvalue = n;

    if (Ccompile)
        return cnumber(base, n);

    enum FLAGS : int
    {
        none = 0,
        decimal = 1, // decimal
        unsigned = 2, // u or U suffix
        long_ = 4, // L suffix
    }

    // Bit masks used to pick the first type the value fits in.
    enum ulong bit63 = 0x8000_0000_0000_0000;
    enum ulong above32 = 0xFFFF_FFFF_0000_0000;
    enum ulong above31 = 0xFFFF_FFFF_8000_0000;
    enum ulong bit31 = 0x8000_0000;

    FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
    // Parse trailing 'u', 'U', 'l' or 'L' in any combination
    const psuffix = p;
Lsuffixes:
    while (1)
    {
        FLAGS f;
        switch (*p)
        {
        case 'U':
        case 'u':
            f = FLAGS.unsigned;
            break;
        case 'l':
            f = FLAGS.long_;
            error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
            break;
        case 'L':
            f = FLAGS.long_;
            break;
        default:
            break Lsuffixes;
        }
        p++;
        // the same kind of suffix given twice
        if ((flags & f) && !err)
        {
            error("unrecognized token");
            err = true;
        }
        flags = cast(FLAGS)(flags | f);
    }
    if (base == 8 && n >= 8)
    {
        if (err)
            // can't translate invalid octal value, just show a generic message
            error("octal literals larger than 7 are no longer supported");
        else
            error(token.loc, "octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
                n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
    }
    TOK result;
    switch (flags)
    {
    case FLAGS.none:
        /* Octal or Hexadecimal constant.
         * First that fits: int, uint, long, ulong
         */
        result = (n & bit63)   ? TOK.uns64Literal
               : (n & above32) ? TOK.int64Literal
               : (n & bit31)   ? TOK.uns32Literal
               :                 TOK.int32Literal;
        break;

    case FLAGS.decimal:
        /* First that fits: int, long, long long
         */
        result = (n & bit63)   ? TOK.uns64Literal
               : (n & above31) ? TOK.int64Literal
               :                 TOK.int32Literal;
        break;

    case FLAGS.unsigned:
    case FLAGS.decimal | FLAGS.unsigned:
        /* First that fits: uint, ulong
         */
        result = (n & above32) ? TOK.uns64Literal : TOK.uns32Literal;
        break;

    case FLAGS.decimal | FLAGS.long_:
        if (n & bit63)
        {
            if (!err)
            {
                error("signed integer overflow");
                err = true;
            }
            result = TOK.uns64Literal;
        }
        else
            result = TOK.int64Literal;
        break;

    case FLAGS.long_:
        result = (n & bit63) ? TOK.uns64Literal : TOK.int64Literal;
        break;

    case FLAGS.unsigned | FLAGS.long_:
    case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
        result = TOK.uns64Literal;
        break;

    default:
        debug
        {
            printf("%x\n", flags);
        }
        assert(0);
    }
    return result;
}
2284
/**************************************
 * Lex C integer-suffix
 *
 * Consumes any `u`/`U`, `l`/`L`, `ll`/`LL` suffixes at `p` and picks the
 * token kind from the suffixes, the base and the magnitude of `n`.
 * Params:
 *  base = number base
 *  n = raw integer value
 * Returns:
 *  token value
 */
private TOK cnumber(int base, ulong n)
{
    /* C11 6.4.4.1
     * Parse trailing suffixes:
     *  u or U
     *  l or L
     *  ll or LL
     */
    enum FLAGS : uint
    {
        octalhex = 1, // octal or hexadecimal
        decimal = 2, // decimal
        unsigned = 4, // u or U suffix
        long_ = 8, // l or L suffix
        llong = 0x10 // ll or LL
    }

    // Bit masks used to pick the first type the value fits in.
    enum ulong bit63 = 0x8000_0000_0000_0000;
    enum ulong above32 = 0xFFFF_FFFF_0000_0000;
    enum ulong above31 = 0xFFFF_FFFF_8000_0000;
    enum ulong bit31 = 0x8000_0000;

    FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
    bool err;
    for (;;)
    {
        FLAGS f;
        const cs = *p;
        if (cs == 'U' || cs == 'u')
        {
            f = FLAGS.unsigned;
        }
        else if (cs == 'l' || cs == 'L')
        {
            f = FLAGS.long_;
            if (cs == p[1])     // `ll` or `LL`; mixed case does not count
            {
                f = FLAGS.long_ | FLAGS.llong;
                ++p;
            }
        }
        else
            break;

        ++p;
        if ((flags & f) && !err)
        {
            error("duplicate integer suffixes");
            err = true;
        }
        flags = cast(FLAGS)(flags | f);
    }

    // Condition the `l`-suffixed cases below use to allow 32 bit results.
    const bool narrowLong = longsize == 4 || long_longsize == 4;

    TOK result = TOK.int32Literal; // default
    switch (flags)
    {
        /* Since D doesn't have a variable sized `long` or `unsigned long` type,
         * this code deviates from C by picking D int, uint, long, or ulong instead
         */

        case FLAGS.octalhex:
            /* Octal or Hexadecimal constant.
             * First that fits: int, unsigned, long, unsigned long,
             * long long, unsigned long long
             */
            result = (n & bit63)   ? TOK.uns64Literal  // unsigned long
                   : (n & above32) ? TOK.int64Literal  // long
                   : (n & bit31)   ? TOK.uns32Literal
                   :                 TOK.int32Literal;
            break;

        case FLAGS.decimal:
            /* First that fits: int, long, long long
             */
            result = (n & bit63)   ? TOK.uns64Literal  // unsigned long
                   : (n & above31) ? TOK.int64Literal  // long
                   :                 TOK.int32Literal;
            break;

        case FLAGS.octalhex | FLAGS.unsigned:
        case FLAGS.decimal | FLAGS.unsigned:
            /* First that fits: unsigned, unsigned long, unsigned long long
             */
            result = (n & above32) ? TOK.uns64Literal : TOK.uns32Literal;
            break;

        case FLAGS.decimal | FLAGS.long_:
            /* First that fits: long, long long
             */
            if (narrowLong)
                result = (n & above31) ? TOK.int64Literal : TOK.int32Literal;
            else
                result = TOK.int64Literal;
            break;

        case FLAGS.octalhex | FLAGS.long_:
            /* First that fits: long, unsigned long, long long,
             * unsigned long long
             */
            if (narrowLong)
                result = (n & bit63)   ? TOK.uns64Literal
                       : (n & above32) ? TOK.int64Literal
                       : (n & bit31)   ? TOK.uns32Literal  // unsigned long
                       :                 TOK.int32Literal; // long
            else
                result = (n & bit63) ? TOK.uns64Literal : TOK.int64Literal;
            break;

        case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
        case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
            /* First that fits: unsigned long, unsigned long long
             */
            if (narrowLong)
                result = (n & above32) ? TOK.uns64Literal : TOK.uns32Literal;
            else
                result = TOK.uns64Literal;
            break;

        case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
            /* First that fits: long long, unsigned long long
             */
            result = (n & bit63) ? TOK.uns64Literal : TOK.int64Literal;
            break;

        case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
            /* long long
             */
            result = TOK.int64Literal;
            break;

        case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
        case FLAGS.decimal | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
            result = TOK.uns64Literal;
            break;

        default:
            debug printf("%x\n",flags);
            assert(0);
    }
    return result;
}
2471
/**************************************
 * Read in characters, converting them to real.
 *
 * `p` is on the first character of the literal (a digit or '.'). The
 * literal's text, with `_` separators removed, is collected in
 * `stringbuffer` and parsed by `CTFloat.parse` into `t.floatvalue`; a
 * malformed literal yields `CTFloat.zero`. The `f`/`F`, `L` and `i`
 * suffixes select the resulting token kind.
 * Params:
 *  t = token receiving the value
 * Returns:
 *  one of the float or imaginary literal TOKs
 * Bugs:
 *  Exponent overflow not detected.
 *  Too much requested precision is not detected.
 */
private TOK inreal(Token* t)
{
    //printf("Lexer::inreal()\n");
    debug
    {
        assert(*p == '.' || isdigit(*p));
    }
    bool isWellformedString = true;
    stringbuffer.setsize(0);
    auto pstart = p;
    bool hex = false;
    dchar c = *p++;
    // Leading '0x'
    if (c == '0')
    {
        c = *p++;
        if (c == 'x' || c == 'X')
        {
            hex = true;
            c = *p++;
        }
    }
    // Digits to left of '.'
    while (1)
    {
        if (c == '.')
        {
            c = *p++;
            break;
        }
        if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
        {
            c = *p++;
            continue;
        }
        break;
    }
    // Digits to right of '.'
    while (1)
    {
        if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
        {
            c = *p++;
            continue;
        }
        break;
    }
    if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
    {
        c = *p++;
        if (c == '-' || c == '+')
        {
            c = *p++;
        }
        bool anyexp = false;
        while (1)
        {
            if (isdigit(c))
            {
                anyexp = true;
                c = *p++;
                continue;
            }
            if (c == '_')
            {
                if (Ccompile)
                    error("embedded `_` in numeric literals not allowed");
                c = *p++;
                continue;
            }
            if (!anyexp)
            {
                error("missing exponent");
                isWellformedString = false;
            }
            break;
        }
    }
    else if (hex)
    {
        error("exponent required for hex float");
        isWellformedString = false;
    }
    --p;    // `c` was read one past the literal's last character
    // Copy the literal, minus `_` separators, into a 0-terminated buffer.
    while (pstart < p)
    {
        if (*pstart != '_')
            stringbuffer.writeByte(*pstart);
        ++pstart;
    }
    stringbuffer.writeByte(0);
    auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
    TOK result;
    bool isOutOfRange = false;
    t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, isOutOfRange) : CTFloat.zero);
    switch (*p)
    {
    case 'F':
    case 'f':
        if (isWellformedString && !isOutOfRange)
            isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
        result = TOK.float32Literal;
        p++;
        break;
    default:
        if (isWellformedString && !isOutOfRange)
            isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
        result = TOK.float64Literal;
        break;
    case 'l':
        if (!Ccompile)
            error("use 'L' suffix instead of 'l'");
        goto case 'L';
    case 'L':
        ++p;
        if (Ccompile && long_doublesize == 8)
            goto default;
        result = TOK.float80Literal;
        break;
    }
    if ((*p == 'i' || *p == 'I') && !Ccompile)
    {
        if (*p == 'I')
            error("use 'i' suffix instead of 'I'");
        p++;
        switch (result)
        {
        case TOK.float32Literal:
            result = TOK.imaginary32Literal;
            break;
        case TOK.float64Literal:
            result = TOK.imaginary64Literal;
            break;
        case TOK.float80Literal:
            result = TOK.imaginary80Literal;
            break;
        default:
            break;
        }
    }
    const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
    if (isOutOfRange && !isLong && (!Ccompile || hex))
    {
        /* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
         */
        /* `result` may already be an imaginary kind here, so pick the
         * message pieces with a switch covering every kind that can reach
         * this point, rather than a lookup that only knows the float kinds.
         */
        const(char)* suffix;
        const(char)* type;
        switch (result)
        {
        case TOK.float32Literal:
            suffix = "f";
            type = "`float`";
            break;
        case TOK.imaginary32Literal:
            suffix = "fi";
            type = "`ifloat`";
            break;
        case TOK.imaginary64Literal:
            suffix = "i";
            type = "`idouble`";
            break;
        default:
            suffix = "";
            type = "`double`";
            break;
        }
        error(scanloc, "number `%s%s` is not representable as a %s", sbufptr, suffix, type);
        const char* extra = result == TOK.float64Literal ? "`real` literals can be written using the `L` suffix. " : "";
        eSink.errorSupplemental(scanloc, "%shttps://dlang.org/spec/lex.html#floatliteral", extra);
    }
    debug
    {
        switch (result)
        {
        case TOK.float32Literal:
        case TOK.float64Literal:
        case TOK.float80Literal:
        case TOK.imaginary32Literal:
        case TOK.imaginary64Literal:
        case TOK.imaginary80Literal:
            break;
        default:
            assert(0);
        }
    }
    return result;
}
2648
/**
 * Refresh `scanloc` from the current scan pointer and return it.
 *
 * The column is the 1-based distance of `p` from `line`; with
 * `version (LocOffset)` the offset of `p` from `base` is stored as well.
 */
final Loc loc() pure @nogc
{
    const column = cast(uint)(1 + p - line);
    scanloc.charnum = column;
    version (LocOffset)
    {
        scanloc.fileOffset = cast(uint)(p - base);
    }
    return scanloc;
}
2656
8da8c7d3
IB
/// Report an error at the location of the current token (`token.loc`).
void error(T...)(const(char)* format, T args)
{
    error(token.loc, format, args);
}

/// Report an error at `loc` through the error sink `eSink`.
void error(T...)(const ref Loc loc, const(char)* format, T args)
{
    eSink.error(loc, format, args);
}
2666
/// Report a deprecation at the location of the current token, via `eSink`.
final void deprecation(const(char)* format)
{
    eSink.deprecation(token.loc, format);
}
2671
/// Report a supplemental deprecation message at the location of the current token, via `eSink`.
final void deprecationSupplemental(const(char)* format)
{
    eSink.deprecationSupplemental(token.loc, format);
}
2676
7e287503
IB
/***************************************
 * Parse special token sequence:
 *
 * Scans the token following the `#`. `#line` is handed to `poundLine`;
 * any other identifier produces a warning and `#if` produces an error.
 * Returns:
 *  true if the special token sequence was handled
 * References:
 *  https://dlang.org/spec/lex.html#special-token-sequence
 */
bool parseSpecialTokenSequence()
{
    Token directive;
    scan(&directive);
    switch (directive.value)
    {
    case TOK.identifier:
        if (directive.ident == Id.line)
        {
            poundLine(directive, false);
            return true;
        }
        const where = loc();
        eSink.warning(where, "C preprocessor directive `#%s` is not supported", directive.ident.toChars());
        return false;

    case TOK.if_:
        error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
        return false;

    default:
        return false;
    }
}
2707
5fee5ec3
IB
/*********************************************
 * Parse line/file preprocessor directive:
 *    #line linnum [filespec]
 * Allow __LINE__ for linnum, and __FILE__ for filespec.
 * Accept linemarker format:
 *    # linnum [filespec] {flags}
 * There can be zero or more flags, which are one of the digits 1..4, and
 * must be in ascending order. The flags are ignored.
 *
 * The new line number and file name are applied to `scanloc` when the end
 * of the directive's line is reached, unless the lexer is inside a token
 * string. On a malformed directive an error is reported and the rest of
 * the line is skipped.
 * Params:
 *  tok = token we're on, which is linnum of linemarker
 *  linemarker = true if line marker format and lexer is on linnum
 * References:
 *  linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
 */
final void poundLine(ref Token tok, bool linemarker)
{
    auto linnum = this.scanloc.linnum;
    const(char)* filespec = null;
    bool flags;     // set once a linemarker flag has been seen

    if (!linemarker)
        scan(&tok);

    if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
    {
        // Reject values that do not survive the round trip through `int`.
        const lin = cast(int)(tok.unsvalue);
        if (lin != tok.unsvalue)
        {
            error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue);
            skipToNextLine();
            return;
        }
        linnum = lin;
    }
    else if (tok.value != TOK.line) // `#line __LINE__` keeps the current number
    {
        error(tok.loc, "positive integer argument expected following `#line`");
        if (tok.value != TOK.endOfLine)
            skipToNextLine();
        return;
    }

Lscan:
    for (;;)
    {
        scan(&tok);
        switch (tok.value)
        {
        case TOK.endOfFile:
        case TOK.endOfLine:
            if (!inTokenStringConstant)
            {
                this.scanloc.linnum = linnum;
                if (filespec)
                    this.scanloc.filename = filespec;
            }
            return;

        case TOK.file:
            if (filespec || flags)
                break Lscan;
            filespec = mem.xstrdup(scanloc.filename);
            continue;

        case TOK.string_:
            if (filespec || flags)
                break Lscan;
            if (tok.ptr[0] != '"' || tok.postfix != 0)
                break Lscan;
            filespec = tok.ustring;
            continue;

        case TOK.int32Literal:
            if (!filespec)
                break Lscan;
            if (linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4)
            {
                flags = true; // linemarker flags seen
                continue;
            }
            break Lscan;

        default:
            break Lscan;
        }
    }

    // Reached only when the directive is malformed.
    if (filespec is null)
        error(tok.loc, "invalid filename for `#line` directive");
    else if (linemarker)
        error(tok.loc, "invalid flag for line marker directive");
    else if (!Ccompile)
        error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars());
    if (tok.value != TOK.endOfLine)
        skipToNextLine();
}
2801
0fb57034
IB
/***************************************
 * Advance `p` past the remainder of the current line.
 *
 * Stops without advancing, and without updating any line state, when a
 * NUL or 0x1A byte is reached. Otherwise consumes the line terminator,
 * calls `endOfLine()` and clears `tokenizeNewlines`.
 *
 * Params:
 *      defines = if not null, every skipped byte is appended to it; in that
 *                mode Unicode line separators are not treated as line endings
 */
final void skipToNextLine(OutBuffer* defines = null)
{
    for (;;)
    {
        const char c = *p;

        if (c == 0 || c == 0x1A)
            return;             // end of input: leave p where it is

        if (c == '\n')
        {
            ++p;
            break;
        }

        if (c == '\r')
        {
            ++p;
            if (*p == '\n')     // treat \r\n as a single terminator
                ++p;
            break;
        }

        if (defines)
            defines.writeByte(c);   // don't care about Unicode line endings for C
        else if (c & 0x80)
        {
            // decodeUTF() leaves p on the last byte of the sequence
            const u = decodeUTF();
            if (u == PS || u == LS)
            {
                ++p;
                break;
            }
        }
        ++p;
    }
    endOfLine();
    tokenizeNewlines = false;
}
2847
5fee5ec3
IB
/********************************************
 * Decode the UTF character at `p`, reporting any decoding problem
 * as an error at `token.loc`.
 *
 * `p` is advanced as described for `decodeUTFpure`.
 *
 * Returns: the decoded character
 */
private uint decodeUTF()
{
    string errmsg;
    const decoded = decodeUTFpure(errmsg);

    if (errmsg)
        error(token.loc, "%.*s", cast(int)errmsg.length, errmsg.ptr);
    return decoded;
}
2862
/********************************************
 * Decode the UTF character at `p` without issuing diagnostics.
 *
 * `p` must point at a byte with the high bit set. Afterwards `p` has been
 * moved forward by one less than the index reported by `utf_decodeChar`,
 * i.e. it is meant to rest on the last byte of the sequence.
 *
 * Params:
 *      msg = receives the error text (decode failure, or a bidirectional
 *            control character), or stays null on success
 * Returns: the decoded character
 */
private pure uint decodeUTFpure(out string msg)
{
    const start = p;
    assert(*start & 0x80);

    // Look at no more than 4 bytes, and never beyond a terminating 0.
    size_t avail = 1;
    while (avail < 4 && start[avail])
        ++avail;

    size_t consumed = 0;
    dchar ch;
    msg = utf_decodeChar(start[0 .. avail], consumed, ch);
    p += consumed - 1;

    if (!msg && isBidiControl(ch))
        msg = "Bidirectional control characters are disallowed for security reasons.";
    return ch;
}
2884
/***************************************************
 * Extract the doc comment lying between `t.ptr` and `p` and attach it to `t`.
 *
 * The text is canonicalized:
 *  - the opening (and, for block comments, closing) comment characters
 *    and their decorative rows are dropped,
 *  - trailing blanks and tabs are removed from every line,
 *  - every kind of line ending becomes a single \n,
 *  - the leading comment character of each line is removed,
 *  - the result always ends with \n.
 *
 * The result is stored as a line comment or a block comment of `t`,
 * appended to whatever comment is already stored there.
 *
 * Params:
 *      t = token whose `ptr` marks the start of the comment and which receives it
 *      lineComment = non-zero if the comment may qualify as a line comment
 *      newParagraph = if true, an extra newline separates it from an
 *                     already present doc comment
 */
private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
{
    // The third character of the opener tells the flavour: '/', '*' or '+'.
    const kind = t.ptr[2];

    // Text starts after the 3-character opener ( / * *, / + + or / / / ).
    const(char)* cur = t.ptr + 3;
    const(char)* end = p;
    if (kind == '*' || kind == '+')
        end -= 2;               // exclude the closing two characters

    // Skip the initial row of ****'s, ++++'s or ////'s.
    while (cur < end && *cur == kind)
        ++cur;

    // Skip what precedes the actual text: blanks for `///`,
    // a single line ending for the block forms.
    int linestart = 0;
    if (kind == '/')
    {
        while (cur < end && (*cur == ' ' || *cur == '\t'))
            ++cur;
    }
    else if (cur < end)
    {
        if (*cur == '\r')
        {
            ++cur;
            if (cur < end && *cur == '\n')
                ++cur;
            linestart = 1;
        }
        else if (*cur == '\n')
        {
            ++cur;
            linestart = 1;
        }
    }

    // Drop the trailing row of ****'s or ++++'s.
    if (kind != '/')
    {
        while (cur < end && end[-1] == kind)
            --end;
    }

    // The comment is now [cur .. end]; canonicalize it into buf[].
    OutBuffer buf;

    // Shrink buf so that it no longer ends in blanks or tabs.
    void dropTrailingBlanks()
    {
        const text = buf[];
        size_t keep = text.length;
        for (; keep != 0; --keep)
        {
            const last = text[keep - 1];
            if (last != ' ' && last != '\t')
                break;
        }
        buf.setsize(keep);
    }

    for (; cur < end; ++cur)
    {
        char c = *cur;
        bool newline = false;

        if (c == '*' || c == '+')
        {
            if (linestart && c == kind)
            {
                // Leading comment character of a line: drop it together
                // with the whitespace written before it.
                linestart = 0;
                dropTrailingBlanks();
                continue;
            }
            // otherwise copied as-is; linestart is deliberately left alone
        }
        else if (c == ' ' || c == '\t')
        {
            // copied as-is; does not end the "start of line" state
        }
        else if (c == '\r')
        {
            if (cur[1] == '\n')
                continue;       // skip the \r, the \n is handled next
            newline = true;
        }
        else if (c == '\n')
        {
            newline = true;
        }
        else if (c == 226 && cur[1] == 128 && (cur[2] == 168 || cur[2] == 169))
        {
            // UTF-8 encoded LS or PS
            cur += 2;
            newline = true;
        }
        else
        {
            linestart = 0;
        }

        if (newline)
        {
            c = '\n';           // replace all newlines with \n
            linestart = 1;
            dropTrailingBlanks();
        }
        buf.writeByte(c);
    }

    // The last line may lack a newline but still carry trailing blanks.
    dropTrailingBlanks();

    // Always end with a newline.
    const s = buf[];
    if (s.length == 0 || s[$ - 1] != '\n')
        buf.writeByte('\n');

    // It's a line comment if the start of the doc comment comes
    // after other non-whitespace on the same line.
    auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;

    // Combine with the previous doc comment, if any.
    if (*dc)
        *dc = combineComments(*dc, buf[], newParagraph).toDString();
    else
        *dc = buf.extractSlice(true);
}
3026
/********************************************
 * Join two doc comments into one.
 *
 * If `c1` is non-empty and does not end in a newline, one is inserted
 * after it; if `newParagraph` is true a further newline follows.
 *
 * Params:
 *      c1 = first comment
 *      c2 = second comment
 *      newParagraph = separate the two comments with an extra newline
 * Returns:
 *      `c2.ptr` when `!c1`, `c1.ptr` when `!c2`, otherwise a newly
 *      allocated, zero-terminated buffer holding the combined text
 */
static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
{
    if (!c1)
        return c2.ptr;
    if (!c2)
        return c1.ptr;

    // c1 has to end its last line before anything is appended to it.
    const bool needSeparator = c1.length != 0 && c1[$ - 1] != '\n';
    const size_t total = c1.length + (needSeparator ? 1 : 0) + (newParagraph ? 1 : 0) + c2.length;

    auto merged = cast(char*)mem.xmalloc_noscan(total + 1);
    merged[0 .. c1.length] = c1[];

    size_t at = c1.length;      // next position to write
    if (needSeparator)
        merged[at++] = '\n';
    if (newParagraph)
        merged[at++] = '\n';    // the combining '\n'

    merged[at .. total] = c2[];
    merged[total] = 0;
    return merged;
}
3054
0fb57034
IB
/**************************
 * Record that a new line begins at `p`: remembers `p` as the start of
 * the line and bumps the line number of `scanloc`.
 *
 * `p` should be at start of next line.
 */
private void endOfLine() pure @nogc @safe
{
    line = p;
    ++scanloc.linnum;
}
3063}
3064
6384eff5
IB
3065
3066/******************************* Private *****************************************/
3067
3068private:
3069
5fee5ec3
IB
/// Backing storage for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
    private __gshared bool initdone = false;

    // Note: these buffers are only meaningful after `initialize` has run.
    // The API isn't safe, and quite brittle, but it was left this way
    // over performance concerns.
    // This is currently only called once, from the lexer.
    __gshared char[11 + 1] date;
    __gshared char[8 + 1] time;
    __gshared char[24 + 1] timestamp;

    /**
     * Fill `date`, `time` and `timestamp` on the first call; later calls
     * do nothing.
     *
     * The time is taken from the environment variable `SOURCE_DATE_EPOCH`
     * when it is set, otherwise from the system clock.
     *
     * Params:
     *      loc = location used when reporting a malformed `SOURCE_DATE_EPOCH`
     *      eSink = receiver of that error
     */
    public static void initialize(const ref Loc loc, ErrorSink eSink) nothrow
    {
        if (initdone)
            return;
        initdone = true;

        time_t now;
        // https://issues.dlang.org/show_bug.cgi?id=20444
        auto epoch = getenv("SOURCE_DATE_EPOCH");
        if (!epoch)
        {
            // `.time` is the module-scope function, not the member array
            .time(&now);
        }
        else if (!now.parseDigits(epoch.toDString()))
        {
            eSink.error(loc, "value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", epoch);
        }

        const stamp = ctime(&now);
        assert(stamp);

        // Slice the pieces out of the text produced by ctime().
        snprintf(date.ptr, date.length, "%.6s %.4s", stamp + 4, stamp + 20);
        snprintf(time.ptr, time.length, "%.8s", stamp + 11);
        snprintf(timestamp.ptr, timestamp.length, "%.24s", stamp);
    }
}
3105
6384eff5
IB
/// Unicode line terminators recognized in addition to \n and \r
private enum
{
    LS = 0x2028,    // UTF line separator
    PS = 0x2029,    // UTF paragraph separator
}
3108
3109/********************************************
3110 * Do our own char maps
3111 */
3112private static immutable cmtable = ()
3113{
3114 ubyte[256] table;
3115 foreach (const c; 0 .. table.length)
3116 {
3117 if ('0' <= c && c <= '7')
3118 table[c] |= CMoctal;
3119 if (c_isxdigit(c))
3120 table[c] |= CMhex;
3121 if (c_isalnum(c) || c == '_')
3122 table[c] |= CMidchar;
3123
3124 switch (c)
3125 {
3126 case 'x': case 'X':
3127 case 'b': case 'B':
3128 table[c] |= CMzerosecond;
3129 break;
3130
3131 case '0': .. case '9':
3132 case 'e': case 'E':
3133 case 'f': case 'F':
3134 case 'l': case 'L':
3135 case 'p': case 'P':
3136 case 'u': case 'U':
3137 case 'i':
3138 case '.':
3139 case '_':
3140 table[c] |= CMzerosecond | CMdigitsecond;
3141 break;
3142
3143 default:
3144 break;
3145 }
3146
3147 switch (c)
3148 {
3149 case '\\':
3150 case '\n':
3151 case '\r':
3152 case 0:
3153 case 0x1A:
3154 case '\'':
3155 break;
3156 default:
3157 if (!(c & 0x80))
3158 table[c] |= CMsinglechar;
3159 break;
3160 }
3161 }
3162 return table;
3163}();
3164
/// Bit flags stored in `cmtable`
private enum : int
{
    CMoctal       = 1 << 0,     // 0x1
    CMhex         = 1 << 1,     // 0x2
    CMidchar      = 1 << 2,     // 0x4
    CMzerosecond  = 1 << 3,     // 0x8
    CMdigitsecond = 1 << 4,     // 0x10
    CMsinglechar  = 1 << 5,     // 0x20
}
3174
/// Returns: true if `c` has the `CMoctal` flag in `cmtable`
private bool isoctal(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMoctal;
    return masked != 0;
}
3179
/// Returns: true if `c` has the `CMhex` flag in `cmtable`
private bool ishex(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMhex;
    return masked != 0;
}
3184
/// Returns: true if `c` has the `CMidchar` flag in `cmtable`
private bool isidchar(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMidchar;
    return masked != 0;
}
3189
/// Returns: true if `c` has the `CMzerosecond` flag in `cmtable`
private bool isZeroSecond(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMzerosecond;
    return masked != 0;
}
3194
/// Returns: true if `c` has the `CMdigitsecond` flag in `cmtable`
private bool isDigitSecond(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMdigitsecond;
    return masked != 0;
}
3199
/// Returns: true if `c` has the `CMsinglechar` flag in `cmtable`
private bool issinglechar(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMsinglechar;
    return masked != 0;
}
3204
/// Returns: true if `c` is an ASCII hexadecimal digit (0-9, a-f, A-F)
private bool c_isxdigit(const int c) pure @nogc @safe
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'f')
        return true;
    return c >= 'A' && c <= 'F';
}
3211
/// Returns: true if `c` is an ASCII letter or decimal digit
private bool c_isalnum(const int c) pure @nogc @safe
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'z')
        return true;
    return c >= 'A' && c <= 'Z';
}
3218
3219/******************************* Unittest *****************************************/
3220
5fee5ec3
IB
// Escape sequences that must be accepted: each one has to yield the
// expected character and consume its whole input.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    ErrorSink errorSink = new ErrorSinkStderr;

    // `sequence` is the text following the backslash of an escape.
    void test(T)(string sequence, T expected, bool Ccompile = false)
    {
        auto cursor = cast(const(char)*)sequence.ptr;
        dchar c2;
        Lexer lexer = new Lexer(errorSink);
        const actual = lexer.escapeSequence(Loc.initial, cursor, Ccompile, c2);
        assert(expected == actual);
        assert(cursor == sequence.ptr + sequence.length);
    }

    // simple escapes
    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    // two-digit hex escapes
    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    // octal escapes
    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    // \u with four hex digits
    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    // \U with eight hex digits
    test(`U0001f603`, '\U0001f603');

    // named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');
}
6384eff5 3269
5fee5ec3
IB
// Escape sequences that must be rejected: each one has to report exactly
// the expected message, return the expected character and consume the
// expected number of input characters.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    // Error sink comparing every reported error against `expected`.
    static class ErrorSinkTest : ErrorSinkNull
    {
        nothrow:
        extern (C++):
        override:

        import core.stdc.stdio;
        import core.stdc.stdarg;

        string expected;
        bool gotError;

        void error(const ref Loc loc, const(char)* format, ...)
        {
            gotError = true;
            char[100] text = void;
            va_list args;
            va_start(args, format);
            const written = vsnprintf(text.ptr, text.length, format, args);
            auto actual = text[0 .. written];
            va_end(args);
            assert(expected == actual);
        }
    }

    ErrorSinkTest errorSink = new ErrorSinkTest;

    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
    {
        errorSink.expected = expectedError;
        errorSink.gotError = false;

        auto cursor = cast(const(char)*)sequence.ptr;
        Lexer lexer = new Lexer(errorSink);
        dchar c2;
        const actualReturnValue = lexer.escapeSequence(Loc.initial, cursor, Ccompile, c2);

        assert(errorSink.gotError);
        assert(expectedReturnValue == actualReturnValue);

        const actualScanLength = cursor - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
    }

    // unknown escape characters; the last case is run with Ccompile set
    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);
    test("&quot;", `undefined escape sequence \&`, '&', 1, true);

    // too few hex digits
    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1",   `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12",  `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0",       `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00",      `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000",     `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000",    `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f",   `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6",  `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    // values reported as invalid UTF characters
    test("ud800",     `invalid UTF character \U0000d800`, '?', 5);
    test("udfff",     `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    // non-hex character right after the introducer
    test("xg0",       `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000",     `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    // named entities
    test("&BAD;", `unnamed character entity &BAD;`,   '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    // octal value too large
    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
}
6384eff5
IB
3347
// Smoke test: a single keyword is lexed, after which every further
// request keeps returning TOK.endOfFile.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    /* Not much here, just trying things out.
     */
    string text = "int"; // We rely on the implicit null-terminator
    ErrorSink errorSink = new ErrorSinkStderr;
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, false, false, errorSink);

    TOK tok = lex1.nextToken();
    //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32);
    assert(tok == TOK.int32);

    // The end of file is sticky: ask three times.
    foreach (_; 0 .. 3)
    {
        tok = lex1.nextToken();
        assert(tok == TOK.endOfFile);
    }
}
3367
// Malformed input: whatever the bytes are, lexing has to reach
// TOK.endOfFile and stay there.
unittest
{
    fprintf(stderr, "Lexer.unittest %d\n", __LINE__);

    // We don't want to see Lexer error output during these tests.
    ErrorSink errorSink = new ErrorSinkNull;

    static immutable char[][] testcases =
    [   // Testcase must end with 0 or 0x1A.
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (testcase; testcases)
    {
        scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, false, false, errorSink);
        TOK tok = lex2.nextToken();

        // Allow at most one token per input byte before giving up.
        for (size_t iterations = 1; tok != TOK.endOfFile && iterations++ < testcase.length; )
            tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);

        // The end of file must be reported again on the next request.
        tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);
    }
}
3402}