]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/d/dmd/lexer.c
Merge dmd upstream 6d5b853d3
[thirdparty/gcc.git] / gcc / d / dmd / lexer.c
CommitLineData
b4c522fa
IB
1
2/* Compiler implementation of the D programming language
f3ed896c 3 * Copyright (C) 1999-2019 by The D Language Foundation, All Rights Reserved
b4c522fa
IB
4 * written by Walter Bright
5 * http://www.digitalmars.com
6 * Distributed under the Boost Software License, Version 1.0.
7 * http://www.boost.org/LICENSE_1_0.txt
8 * https://github.com/D-Programming-Language/dmd/blob/master/src/lexer.c
9 */
10
11/* Lexical Analyzer */
12
f9ab59ff 13#include "root/dsystem.h" // for time() and ctime()
b4c522fa
IB
14#include "root/rmem.h"
15
16#include "mars.h"
17#include "lexer.h"
18#include "utf.h"
19#include "identifier.h"
20#include "id.h"
21
22extern int HtmlNamedEntity(const utf8_t *p, size_t length);
23
24#define LS 0x2028 // UTF line separator
25#define PS 0x2029 // UTF paragraph separator
26
27/********************************************
28 * Do our own char maps
29 */
30
31static unsigned char cmtable[256];
32
33const int CMoctal = 0x1;
34const int CMhex = 0x2;
35const int CMidchar = 0x4;
36
37inline bool isoctal (utf8_t c) { return (cmtable[c] & CMoctal) != 0; }
38inline bool ishex (utf8_t c) { return (cmtable[c] & CMhex) != 0; }
39inline bool isidchar(utf8_t c) { return (cmtable[c] & CMidchar) != 0; }
40
41struct CMTableInitializer
42{
43 CMTableInitializer();
44};
45
46static CMTableInitializer cmtableinitializer;
47
48CMTableInitializer::CMTableInitializer()
49{
50 for (unsigned c = 0; c < 256; c++)
51 {
52 if ('0' <= c && c <= '7')
53 cmtable[c] |= CMoctal;
54 if (isxdigit(c))
55 cmtable[c] |= CMhex;
56 if (isalnum(c) || c == '_')
57 cmtable[c] |= CMidchar;
58 }
59}
60
61/*************************** Lexer ********************************************/
62
63OutBuffer Lexer::stringbuffer;
64
65Lexer::Lexer(const char *filename,
66 const utf8_t *base, size_t begoffset, size_t endoffset,
67 bool doDocComment, bool commentToken)
68{
69 scanloc = Loc(filename, 1, 1);
70 //printf("Lexer::Lexer(%p,%d)\n",base,length);
71 //printf("lexer.filename = %s\n", filename);
72 this->token = Token();
73 this->token.ptr = NULL;
74 this->token.value = TOKreserved;
75 this->token.blockComment = NULL;
76 this->token.lineComment = NULL;
77 this->base = base;
78 this->end = base + endoffset;
79 p = base + begoffset;
80 line = p;
81 this->doDocComment = doDocComment;
82 this->anyToken = 0;
83 this->commentToken = commentToken;
84 this->errors = false;
85 //initKeywords();
86
87 /* If first line starts with '#!', ignore the line
88 */
89
90 if (p[0] == '#' && p[1] =='!')
91 {
92 p += 2;
93 while (1)
94 {
95 utf8_t c = *p++;
96 switch (c)
97 {
98 case 0:
99 case 0x1A:
100 p--;
101 /* fall through */
102
103 case '\n':
104 break;
105
106 default:
107 continue;
108 }
109 break;
110 }
111 endOfLine();
112 }
113}
114
115
116void Lexer::endOfLine()
117{
118 scanloc.linnum++;
119 line = p;
120}
121
122
123void Lexer::error(const char *format, ...)
124{
125 va_list ap;
126 va_start(ap, format);
127 ::verror(token.loc, format, ap);
128 va_end(ap);
129 errors = true;
130}
131
132void Lexer::error(Loc loc, const char *format, ...)
133{
134 va_list ap;
135 va_start(ap, format);
136 ::verror(loc, format, ap);
137 va_end(ap);
138 errors = true;
139}
140
141void Lexer::deprecation(const char *format, ...)
142{
143 va_list ap;
144 va_start(ap, format);
145 ::vdeprecation(token.loc, format, ap);
146 va_end(ap);
147 if (global.params.useDeprecated == DIAGNOSTICerror)
148 errors = true;
149}
150
151TOK Lexer::nextToken()
152{
153 if (token.next)
154 {
155 Token *t = token.next;
156 memcpy(&token,t,sizeof(Token));
157 t->free();
158 }
159 else
160 {
161 scan(&token);
162 }
163 //token.print();
164 return token.value;
165}
166
167Token *Lexer::peek(Token *ct)
168{
169 Token *t;
170 if (ct->next)
171 t = ct->next;
172 else
173 {
174 t = Token::alloc();
175 scan(t);
176 ct->next = t;
177 }
178 return t;
179}
180
181/***********************
182 * Look ahead at next token's value.
183 */
184
185TOK Lexer::peekNext()
186{
187 return peek(&token)->value;
188}
189
190/***********************
191 * Look 2 tokens ahead at value.
192 */
193
194TOK Lexer::peekNext2()
195{
196 Token *t = peek(&token);
197 return peek(t)->value;
198}
199
200/*********************************
201 * tk is on the opening (.
202 * Look ahead and return token that is past the closing ).
203 */
204
205Token *Lexer::peekPastParen(Token *tk)
206{
207 //printf("peekPastParen()\n");
208 int parens = 1;
209 int curlynest = 0;
210 while (1)
211 {
212 tk = peek(tk);
213 //tk->print();
214 switch (tk->value)
215 {
216 case TOKlparen:
217 parens++;
218 continue;
219
220 case TOKrparen:
221 --parens;
222 if (parens)
223 continue;
224 tk = peek(tk);
225 break;
226
227 case TOKlcurly:
228 curlynest++;
229 continue;
230
231 case TOKrcurly:
232 if (--curlynest >= 0)
233 continue;
234 break;
235
236 case TOKsemicolon:
237 if (curlynest)
238 continue;
239 break;
240
241 case TOKeof:
242 break;
243
244 default:
245 continue;
246 }
247 return tk;
248 }
249}
250
251/****************************
252 * Turn next token in buffer into a token.
253 */
254
/* Scan one token starting at `p` and store it in *t.
 *
 * White space and line ends are skipped. Comments are skipped too unless
 * commentToken is set, in which case each comment is returned as
 * TOKcomment. When doDocComment is set, doc-style comments are handed to
 * getDocComment(). On return t->ptr and t->loc describe where the token
 * started and t->value holds its kind; `p` is left after the token
 * (or on the 0 / 0x1A marker for TOKeof).
 */
255void Lexer::scan(Token *t)
256{
257 unsigned lastLine = scanloc.linnum;
258 Loc startLoc;
259
260 t->blockComment = NULL;
261 t->lineComment = NULL;
262 while (1)
263 {
264 t->ptr = p;
265 //printf("p = %p, *p = '%c'\n",p,*p);
266 t->loc = loc();
267 switch (*p)
268 {
269 case 0:
270 case 0x1A:
271 t->value = TOKeof; // end of file
272 return;
273
274 case ' ':
275 case '\t':
276 case '\v':
277 case '\f':
278 p++;
279 continue; // skip white space
280
281 case '\r':
282 p++;
/* For CR LF the line is counted by the '\n' case on the next iteration. */
283 if (*p != '\n') // if CR stands by itself
284 endOfLine();
285 continue; // skip white space
286
287 case '\n':
288 p++;
289 endOfLine();
290 continue; // skip white space
291
292 case '0': case '1': case '2': case '3': case '4':
293 case '5': case '6': case '7': case '8': case '9':
294 t->value = number(t);
295 return;
296
297 case '\'':
298 t->value = charConstant(t);
299 return;
300
/* r"..." is lexed like a `...` string; a lone 'r' starts an identifier. */
301 case 'r':
302 if (p[1] != '"')
303 goto case_ident;
304 p++;
305 /* fall through */
306 case '`':
307 t->value = wysiwygStringConstant(t, *p);
308 return;
309
/* x"..." hex string; a lone 'x' starts an identifier. */
310 case 'x':
311 if (p[1] != '"')
312 goto case_ident;
313 p++;
314 t->value = hexStringConstant(t);
315 return;
316
/* q"..." delimited string, q{...} token string; otherwise an identifier. */
317 case 'q':
318 if (p[1] == '"')
319 {
320 p++;
321 t->value = delimitedStringConstant(t);
322 return;
323 }
324 else if (p[1] == '{')
325 {
326 p++;
327 t->value = tokenStringConstant(t);
328 return;
329 }
330 else
331 goto case_ident;
332
333 case '"':
334 t->value = escapeStringConstant(t);
335 return;
336
337 case 'a': case 'b': case 'c': case 'd': case 'e':
338 case 'f': case 'g': case 'h': case 'i': case 'j':
339 case 'k': case 'l': case 'm': case 'n': case 'o':
340 case 'p': /*case 'q': case 'r':*/ case 's': case 't':
341 case 'u': case 'v': case 'w': /*case 'x':*/ case 'y':
342 case 'z':
343 case 'A': case 'B': case 'C': case 'D': case 'E':
344 case 'F': case 'G': case 'H': case 'I': case 'J':
345 case 'K': case 'L': case 'M': case 'N': case 'O':
346 case 'P': case 'Q': case 'R': case 'S': case 'T':
347 case 'U': case 'V': case 'W': case 'X': case 'Y':
348 case 'Z':
349 case '_':
350 case_ident:
351 { utf8_t c;
352
/* Consume identifier characters; a non-ASCII byte is decoded and must be isUniAlpha. */
353 while (1)
354 {
355 c = *++p;
356 if (isidchar(c))
357 continue;
358 else if (c & 0x80)
359 { const utf8_t *s = p;
360 unsigned u = decodeUTF();
361 if (isUniAlpha(u))
362 continue;
363 error("char 0x%04x not allowed in identifier", u);
364 p = s;
365 }
366 break;
367 }
368
369 Identifier *id = Identifier::idPool((const char *)t->ptr, p - t->ptr);
370 t->ident = id;
371 t->value = (TOK) id->getValue();
372 anyToken = 1;
373 if (*t->ptr == '_') // if special identifier token
374 {
375 static bool initdone = false;
376 static char date[11+1];
377 static char time[8+1];
378 static char timestamp[24+1];
379
380 if (!initdone) // lazy evaluation
381 {
382 initdone = true;
383 time_t ct;
384 ::time(&ct);
385 char *p = ctime(&ct);
386 assert(p);
/* ctime() text has the form "Www Mmm dd hh:mm:ss yyyy\n":
 * offset 4 is "Mmm dd", offset 11 is "hh:mm:ss", offset 20 is "yyyy". */
387 sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
388 sprintf(&time[0], "%.8s", p + 11);
389 sprintf(&timestamp[0], "%.24s", p);
390 }
391
392 if (id == Id::DATE)
393 {
394 t->ustring = (utf8_t *)date;
395 goto Lstr;
396 }
397 else if (id == Id::TIME)
398 {
399 t->ustring = (utf8_t *)time;
400 goto Lstr;
401 }
402 else if (id == Id::VENDOR)
403 {
404 t->ustring = (utf8_t *)const_cast<char *>(global.vendor);
405 goto Lstr;
406 }
407 else if (id == Id::TIMESTAMP)
408 {
409 t->ustring = (utf8_t *)timestamp;
410 Lstr:
411 t->value = TOKstring;
412 t->postfix = 0;
413 t->len = (unsigned)strlen((char *)t->ustring);
414 }
415 else if (id == Id::VERSIONX)
416 { unsigned major = 0;
417 unsigned minor = 0;
418 bool point = false;
419
/* Parse "major.minor" from global.version, skipping its first character;
 * the token value becomes major * 1000 + minor. */
420 for (const char *p = global.version + 1; 1; p++)
421 {
422 c = *p;
423 if (isdigit((utf8_t)c))
424 minor = minor * 10 + c - '0';
425 else if (c == '.')
426 {
427 if (point)
428 break; // ignore everything after second '.'
429 point = true;
430 major = minor;
431 minor = 0;
432 }
433 else
434 break;
435 }
436 t->value = TOKint64v;
437 t->uns64value = major * 1000 + minor;
438 }
439 else if (id == Id::EOFX)
440 {
441 t->value = TOKeof;
442 // Advance scanner to end of file
443 while (!(*p == 0 || *p == 0x1A))
444 p++;
445 }
446 }
447 //printf("t->value = %d\n",t->value);
448 return;
449 }
450
451 case '/':
452 p++;
453 switch (*p)
454 {
455 case '=':
456 p++;
457 t->value = TOKdivass;
458 return;
459
460 case '*':
461 p++;
462 startLoc = loc();
463 while (1)
464 {
465 while (1)
466 { utf8_t c = *p;
467 switch (c)
468 {
469 case '/':
470 break;
471
472 case '\n':
473 endOfLine();
474 p++;
475 continue;
476
477 case '\r':
478 p++;
479 if (*p != '\n')
480 endOfLine();
481 continue;
482
483 case 0:
484 case 0x1A:
485 error("unterminated /* */ comment");
486 p = end;
487 t->loc = loc();
488 t->value = TOKeof;
489 return;
490
491 default:
492 if (c & 0x80)
493 { unsigned u = decodeUTF();
494 if (u == PS || u == LS)
495 endOfLine();
496 }
497 p++;
498 continue;
499 }
500 break;
501 }
502 p++;
/* A '/' closes the comment only when preceded by '*', and that '*' must not
 * be the one that opened the comment (so "/" + "*" + "/" alone does not close). */
503 if (p[-2] == '*' && p - 3 != t->ptr)
504 break;
505 }
506 if (commentToken)
507 {
508 t->loc = startLoc;
509 t->value = TOKcomment;
510 return;
511 }
512 else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr)
513 { // if /** but not /**/
514 getDocComment(t, lastLine == startLoc.linnum);
515 }
516 continue;
517
518 case '/': // do // style comments
519 startLoc = loc();
520 while (1)
521 { utf8_t c = *++p;
522 switch (c)
523 {
524 case '\n':
525 break;
526
527 case '\r':
528 if (p[1] == '\n')
529 p++;
530 break;
531
532 case 0:
533 case 0x1A:
534 if (commentToken)
535 {
536 p = end;
537 t->loc = startLoc;
538 t->value = TOKcomment;
539 return;
540 }
541 if (doDocComment && t->ptr[2] == '/')
542 getDocComment(t, lastLine == startLoc.linnum);
543 p = end;
544 t->loc = loc();
545 t->value = TOKeof;
546 return;
547
548 default:
549 if (c & 0x80)
550 { unsigned u = decodeUTF();
551 if (u == PS || u == LS)
552 break;
553 }
554 continue;
555 }
556 break;
557 }
558
559 if (commentToken)
560 {
561 p++;
562 endOfLine();
563 t->loc = startLoc;
564 t->value = TOKcomment;
565 return;
566 }
567 if (doDocComment && t->ptr[2] == '/')
568 getDocComment(t, lastLine == startLoc.linnum);
569
570 p++;
571 endOfLine();
572 continue;
573
574 case '+':
575 { int nest;
576
577 startLoc = loc();
578 p++;
579 nest = 1;
/* Nesting comment: each "/+" raises the depth and each "+/" lowers it;
 * the comment ends when the depth reaches zero. */
580 while (1)
581 { utf8_t c = *p;
582 switch (c)
583 {
584 case '/':
585 p++;
586 if (*p == '+')
587 {
588 p++;
589 nest++;
590 }
591 continue;
592
593 case '+':
594 p++;
595 if (*p == '/')
596 {
597 p++;
598 if (--nest == 0)
599 break;
600 }
601 continue;
602
603 case '\r':
604 p++;
605 if (*p != '\n')
606 endOfLine();
607 continue;
608
609 case '\n':
610 endOfLine();
611 p++;
612 continue;
613
614 case 0:
615 case 0x1A:
616 error("unterminated /+ +/ comment");
617 p = end;
618 t->loc = loc();
619 t->value = TOKeof;
620 return;
621
622 default:
623 if (c & 0x80)
624 { unsigned u = decodeUTF();
625 if (u == PS || u == LS)
626 endOfLine();
627 }
628 p++;
629 continue;
630 }
631 break;
632 }
633 if (commentToken)
634 {
635 t->loc = startLoc;
636 t->value = TOKcomment;
637 return;
638 }
639 if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr)
640 { // if /++ but not /++/
641 getDocComment(t, lastLine == startLoc.linnum);
642 }
643 continue;
644 }
645 default:
646 break;
647 }
648 t->value = TOKdiv;
649 return;
650
651 case '.':
652 p++;
653 if (isdigit(*p))
654 { /* Note that we don't allow ._1 and ._ as being
655 * valid floating point numbers.
656 */
657 p--;
658 t->value = inreal(t);
659 }
660 else if (p[0] == '.')
661 {
662 if (p[1] == '.')
663 { p += 2;
664 t->value = TOKdotdotdot;
665 }
666 else
667 { p++;
668 t->value = TOKslice;
669 }
670 }
671 else
672 t->value = TOKdot;
673 return;
674
675 case '&':
676 p++;
677 if (*p == '=')
678 { p++;
679 t->value = TOKandass;
680 }
681 else if (*p == '&')
682 { p++;
683 t->value = TOKandand;
684 }
685 else
686 t->value = TOKand;
687 return;
688
689 case '|':
690 p++;
691 if (*p == '=')
692 { p++;
693 t->value = TOKorass;
694 }
695 else if (*p == '|')
696 { p++;
697 t->value = TOKoror;
698 }
699 else
700 t->value = TOKor;
701 return;
702
703 case '-':
704 p++;
705 if (*p == '=')
706 { p++;
707 t->value = TOKminass;
708 }
709 else if (*p == '-')
710 { p++;
711 t->value = TOKminusminus;
712 }
713 else
714 t->value = TOKmin;
715 return;
716
717 case '+':
718 p++;
719 if (*p == '=')
720 { p++;
721 t->value = TOKaddass;
722 }
723 else if (*p == '+')
724 { p++;
725 t->value = TOKplusplus;
726 }
727 else
728 t->value = TOKadd;
729 return;
730
731 case '<':
732 p++;
733 if (*p == '=')
734 { p++;
735 t->value = TOKle; // <=
736 }
737 else if (*p == '<')
738 { p++;
739 if (*p == '=')
740 { p++;
741 t->value = TOKshlass; // <<=
742 }
743 else
744 t->value = TOKshl; // <<
745 }
746 else if (*p == '>')
747 { p++;
748 if (*p == '=')
749 { p++;
750 t->value = TOKleg; // <>=
751 }
752 else
753 t->value = TOKlg; // <>
754 }
755 else
756 t->value = TOKlt; // <
757 return;
758
759 case '>':
760 p++;
761 if (*p == '=')
762 { p++;
763 t->value = TOKge; // >=
764 }
765 else if (*p == '>')
766 { p++;
767 if (*p == '=')
768 { p++;
769 t->value = TOKshrass; // >>=
770 }
771 else if (*p == '>')
772 { p++;
773 if (*p == '=')
774 { p++;
775 t->value = TOKushrass; // >>>=
776 }
777 else
778 t->value = TOKushr; // >>>
779 }
780 else
781 t->value = TOKshr; // >>
782 }
783 else
784 t->value = TOKgt; // >
785 return;
786
787 case '!':
788 p++;
789 if (*p == '=')
790 { p++;
791 t->value = TOKnotequal; // !=
792 }
793 else if (*p == '<')
794 { p++;
795 if (*p == '>')
796 { p++;
797 if (*p == '=')
798 { p++;
799 t->value = TOKunord; // !<>=
800 }
801 else
802 t->value = TOKue; // !<>
803 }
804 else if (*p == '=')
805 { p++;
806 t->value = TOKug; // !<=
807 }
808 else
809 t->value = TOKuge; // !<
810 }
811 else if (*p == '>')
812 { p++;
813 if (*p == '=')
814 { p++;
815 t->value = TOKul; // !>=
816 }
817 else
818 t->value = TOKule; // !>
819 }
820 else
821 t->value = TOKnot; // !
822 return;
823
824 case '=':
825 p++;
826 if (*p == '=')
827 { p++;
828 t->value = TOKequal; // ==
829 }
830 else if (*p == '>')
831 { p++;
832 t->value = TOKgoesto; // =>
833 }
834 else
835 t->value = TOKassign; // =
836 return;
837
838 case '~':
839 p++;
840 if (*p == '=')
841 { p++;
842 t->value = TOKcatass; // ~=
843 }
844 else
845 t->value = TOKtilde; // ~
846 return;
847
848 case '^':
849 p++;
850 if (*p == '^')
851 { p++;
852 if (*p == '=')
853 { p++;
854 t->value = TOKpowass; // ^^=
855 }
856 else
857 t->value = TOKpow; // ^^
858 }
859 else if (*p == '=')
860 { p++;
861 t->value = TOKxorass; // ^=
862 }
863 else
864 t->value = TOKxor; // ^
865 return;
866
867 case '(': p++; t->value = TOKlparen; return;
868 case ')': p++; t->value = TOKrparen; return;
869 case '[': p++; t->value = TOKlbracket; return;
870 case ']': p++; t->value = TOKrbracket; return;
871 case '{': p++; t->value = TOKlcurly; return;
872 case '}': p++; t->value = TOKrcurly; return;
873 case '?': p++; t->value = TOKquestion; return;
874 case ',': p++; t->value = TOKcomma; return;
875 case ';': p++; t->value = TOKsemicolon; return;
876 case ':': p++; t->value = TOKcolon; return;
877 case '$': p++; t->value = TOKdollar; return;
878 case '@': p++; t->value = TOKat; return;
879
880 case '*':
881 p++;
882 if (*p == '=')
883 { p++;
884 t->value = TOKmulass;
885 }
886 else
887 t->value = TOKmul;
888 return;
889 case '%':
890 p++;
891 if (*p == '=')
892 { p++;
893 t->value = TOKmodass;
894 }
895 else
896 t->value = TOKmod;
897 return;
898
899 case '#':
900 {
901 p++;
/* The token after '#' is scanned (and consumed) to look for the identifier `line`.
 * If found, poundLine() handles the directive; otherwise only TOKpound is returned. */
902 Token n;
903 scan(&n);
904 if (n.value == TOKidentifier && n.ident == Id::line)
905 {
906 poundLine();
907 continue;
908 }
909 else
910 {
911 t->value = TOKpound;
912 return;
913 }
914 }
915
916 default:
917 { unsigned c = *p;
918
919 if (c & 0x80)
920 { c = decodeUTF();
921
922 // Check for start of unicode identifier
923 if (isUniAlpha(c))
924 goto case_ident;
925
926 if (c == PS || c == LS)
927 {
928 endOfLine();
929 p++;
930 continue;
931 }
932 }
/* Anything else is reported as an invalid character and skipped. */
933 if (c < 0x80 && isprint(c))
934 error("character '%c' is not a valid token", c);
935 else
936 error("character 0x%02x is not a valid token", c);
937 p++;
938 continue;
939 }
940 }
941 }
942}
943
944/*******************************************
945 * Parse escape sequence.
946 */
947
948unsigned Lexer::escapeSequence()
949{ unsigned c = *p;
950
951 int n;
952 int ndigits;
953
954 switch (c)
955 {
956 case '\'':
957 case '"':
958 case '?':
959 case '\\':
960 Lconsume:
961 p++;
962 break;
963
964 case 'a': c = 7; goto Lconsume;
965 case 'b': c = 8; goto Lconsume;
966 case 'f': c = 12; goto Lconsume;
967 case 'n': c = 10; goto Lconsume;
968 case 'r': c = 13; goto Lconsume;
969 case 't': c = 9; goto Lconsume;
970 case 'v': c = 11; goto Lconsume;
971
972 case 'u':
973 ndigits = 4;
974 goto Lhex;
975 case 'U':
976 ndigits = 8;
977 goto Lhex;
978 case 'x':
979 ndigits = 2;
980 Lhex:
981 p++;
982 c = *p;
983 if (ishex((utf8_t)c))
984 { unsigned v;
985
986 n = 0;
987 v = 0;
988 while (1)
989 {
990 if (isdigit((utf8_t)c))
991 c -= '0';
992 else if (islower(c))
993 c -= 'a' - 10;
994 else
995 c -= 'A' - 10;
996 v = v * 16 + c;
997 c = *++p;
998 if (++n == ndigits)
999 break;
1000 if (!ishex((utf8_t)c))
1001 { error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
1002 break;
1003 }
1004 }
1005 if (ndigits != 2 && !utf_isValidDchar(v))
1006 { error("invalid UTF character \\U%08x", v);
1007 v = '?'; // recover with valid UTF character
1008 }
1009 c = v;
1010 }
1011 else
1012 error("undefined escape hex sequence \\%c",c);
1013 break;
1014
1015 case '&': // named character entity
1016 for (const utf8_t *idstart = ++p; 1; p++)
1017 {
1018 switch (*p)
1019 {
1020 case ';':
1021 c = HtmlNamedEntity(idstart, p - idstart);
1022 if (c == ~0U)
1023 { error("unnamed character entity &%.*s;", (int)(p - idstart), idstart);
1024 c = ' ';
1025 }
1026 p++;
1027 break;
1028
1029 default:
1030 if (isalpha(*p) ||
1031 (p != idstart && isdigit(*p)))
1032 continue;
1033 error("unterminated named entity &%.*s;", (int)(p - idstart + 1), idstart);
1034 break;
1035 }
1036 break;
1037 }
1038 break;
1039
1040 case 0:
1041 case 0x1A: // end of file
1042 c = '\\';
1043 break;
1044
1045 default:
1046 if (isoctal((utf8_t)c))
1047 { unsigned v;
1048
1049 n = 0;
1050 v = 0;
1051 do
1052 {
1053 v = v * 8 + (c - '0');
1054 c = *++p;
1055 } while (++n < 3 && isoctal((utf8_t)c));
1056 c = v;
1057 if (c > 0xFF)
1058 error("escape octal sequence \\%03o is larger than \\377", c);
1059 }
1060 else
1061 error("undefined escape sequence \\%c",c);
1062 break;
1063 }
1064 return c;
1065}
1066
1067/**************************************
1068 */
1069
1070TOK Lexer::wysiwygStringConstant(Token *t, int tc)
1071{
1072 int c;
1073 Loc start = loc();
1074
1075 p++;
1076 stringbuffer.reset();
1077 while (1)
1078 {
1079 c = *p++;
1080 switch (c)
1081 {
1082 case '\n':
1083 endOfLine();
1084 break;
1085
1086 case '\r':
1087 if (*p == '\n')
1088 continue; // ignore
1089 c = '\n'; // treat EndOfLine as \n character
1090 endOfLine();
1091 break;
1092
1093 case 0:
1094 case 0x1A:
1095 error("unterminated string constant starting at %s", start.toChars());
1096 t->ustring = (utf8_t *)const_cast<char *>("");
1097 t->len = 0;
1098 t->postfix = 0;
1099 return TOKstring;
1100
1101 case '"':
1102 case '`':
1103 if (c == tc)
1104 {
1105 t->len = (unsigned)stringbuffer.offset;
1106 stringbuffer.writeByte(0);
1107 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
1108 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1109 stringPostfix(t);
1110 return TOKstring;
1111 }
1112 break;
1113
1114 default:
1115 if (c & 0x80)
1116 { p--;
1117 unsigned u = decodeUTF();
1118 p++;
1119 if (u == PS || u == LS)
1120 endOfLine();
1121 stringbuffer.writeUTF8(u);
1122 continue;
1123 }
1124 break;
1125 }
1126 stringbuffer.writeByte(c);
1127 }
1128}
1129
1130/**************************************
1131 * Lex hex strings:
1132 * x"0A ae 34FE BD"
1133 */
1134
1135TOK Lexer::hexStringConstant(Token *t)
1136{
1137 unsigned c;
1138 Loc start = loc();
1139 unsigned n = 0;
1140 unsigned v = ~0; // dead assignment, needed to suppress warning
1141
1142 p++;
1143 stringbuffer.reset();
1144 while (1)
1145 {
1146 c = *p++;
1147 switch (c)
1148 {
1149 case ' ':
1150 case '\t':
1151 case '\v':
1152 case '\f':
1153 continue; // skip white space
1154
1155 case '\r':
1156 if (*p == '\n')
1157 continue; // ignore
1158 // Treat isolated '\r' as if it were a '\n'
1159 /* fall through */
1160 case '\n':
1161 endOfLine();
1162 continue;
1163
1164 case 0:
1165 case 0x1A:
1166 error("unterminated string constant starting at %s", start.toChars());
1167 t->ustring = (utf8_t *)const_cast<char *>("");
1168 t->len = 0;
1169 t->postfix = 0;
1170 return TOKxstring;
1171
1172 case '"':
1173 if (n & 1)
1174 { error("odd number (%d) of hex characters in hex string", n);
1175 stringbuffer.writeByte(v);
1176 }
1177 t->len = (unsigned)stringbuffer.offset;
1178 stringbuffer.writeByte(0);
1179 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
1180 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1181 stringPostfix(t);
1182 return TOKxstring;
1183
1184 default:
1185 if (c >= '0' && c <= '9')
1186 c -= '0';
1187 else if (c >= 'a' && c <= 'f')
1188 c -= 'a' - 10;
1189 else if (c >= 'A' && c <= 'F')
1190 c -= 'A' - 10;
1191 else if (c & 0x80)
1192 { p--;
1193 unsigned u = decodeUTF();
1194 p++;
1195 if (u == PS || u == LS)
1196 endOfLine();
1197 else
1198 error("non-hex character \\u%04x in hex string", u);
1199 }
1200 else
1201 error("non-hex character '%c' in hex string", c);
1202 if (n & 1)
1203 { v = (v << 4) | c;
1204 stringbuffer.writeByte(v);
1205 }
1206 else
1207 v = c;
1208 n++;
1209 break;
1210 }
1211 }
1212}
1213
1214
1215/**************************************
1216 * Lex delimited strings:
1217 * q"(foo(xxx))" // "foo(xxx)"
1218 * q"[foo(]" // "foo("
1219 * q"/foo]/" // "foo]"
1220 * q"HERE
1221 * foo
1222 * HERE" // "foo\n"
1223 * Input:
1224 * p is on the "
1225 */
1226
1227TOK Lexer::delimitedStringConstant(Token *t)
1228{
1229 unsigned c;
1230 Loc start = loc();
1231 unsigned delimleft = 0;
1232 unsigned delimright = 0;
1233 unsigned nest = 1;
1234 unsigned nestcount = ~0; // dead assignment, needed to suppress warning
1235 Identifier *hereid = NULL;
1236 unsigned blankrol = 0;
1237 unsigned startline = 0;
1238
1239 p++;
1240 stringbuffer.reset();
1241 while (1)
1242 {
1243 c = *p++;
1244 //printf("c = '%c'\n", c);
1245 switch (c)
1246 {
1247 case '\n':
1248 Lnextline:
1249 endOfLine();
1250 startline = 1;
1251 if (blankrol)
1252 { blankrol = 0;
1253 continue;
1254 }
1255 if (hereid)
1256 {
1257 stringbuffer.writeUTF8(c);
1258 continue;
1259 }
1260 break;
1261
1262 case '\r':
1263 if (*p == '\n')
1264 continue; // ignore
1265 c = '\n'; // treat EndOfLine as \n character
1266 goto Lnextline;
1267
1268 case 0:
1269 case 0x1A:
1270 error("unterminated delimited string constant starting at %s", start.toChars());
1271 t->ustring = (utf8_t *)const_cast<char *>("");
1272 t->len = 0;
1273 t->postfix = 0;
1274 return TOKstring;
1275
1276 default:
1277 if (c & 0x80)
1278 { p--;
1279 c = decodeUTF();
1280 p++;
1281 if (c == PS || c == LS)
1282 goto Lnextline;
1283 }
1284 break;
1285 }
1286 if (delimleft == 0)
1287 { delimleft = c;
1288 nest = 1;
1289 nestcount = 1;
1290 if (c == '(')
1291 delimright = ')';
1292 else if (c == '{')
1293 delimright = '}';
1294 else if (c == '[')
1295 delimright = ']';
1296 else if (c == '<')
1297 delimright = '>';
1298 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1299 { // Start of identifier; must be a heredoc
1300 Token tok;
1301 p--;
1302 scan(&tok); // read in heredoc identifier
1303 if (tok.value != TOKidentifier)
1304 { error("identifier expected for heredoc, not %s", tok.toChars());
1305 delimright = c;
1306 }
1307 else
1308 { hereid = tok.ident;
1309 //printf("hereid = '%s'\n", hereid->toChars());
1310 blankrol = 1;
1311 }
1312 nest = 0;
1313 }
1314 else
1315 { delimright = c;
1316 nest = 0;
1317 if (isspace(c))
1318 error("delimiter cannot be whitespace");
1319 }
1320 }
1321 else
1322 {
1323 if (blankrol)
1324 { error("heredoc rest of line should be blank");
1325 blankrol = 0;
1326 continue;
1327 }
1328 if (nest == 1)
1329 {
1330 if (c == delimleft)
1331 nestcount++;
1332 else if (c == delimright)
1333 { nestcount--;
1334 if (nestcount == 0)
1335 goto Ldone;
1336 }
1337 }
1338 else if (c == delimright)
1339 goto Ldone;
1340 if (startline && isalpha(c) && hereid)
1341 { Token tok;
1342 const utf8_t *psave = p;
1343 p--;
1344 scan(&tok); // read in possible heredoc identifier
1345 //printf("endid = '%s'\n", tok.ident->toChars());
1346 if (tok.value == TOKidentifier && tok.ident->equals(hereid))
1347 { /* should check that rest of line is blank
1348 */
1349 goto Ldone;
1350 }
1351 p = psave;
1352 }
1353 stringbuffer.writeUTF8(c);
1354 startline = 0;
1355 }
1356 }
1357
1358Ldone:
1359 if (*p == '"')
1360 p++;
1361 else if (hereid)
1362 error("delimited string must end in %s\"", hereid->toChars());
1363 else
1364 error("delimited string must end in %c\"", delimright);
1365 t->len = (unsigned)stringbuffer.offset;
1366 stringbuffer.writeByte(0);
1367 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
1368 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1369 stringPostfix(t);
1370 return TOKstring;
1371}
1372
1373/**************************************
1374 * Lex delimited strings:
1375 * q{ foo(xxx) } // " foo(xxx) "
1376 * q{foo(} // "foo("
1377 * q{{foo}"}"} // "{foo}"}""
1378 * Input:
1379 * p is on the q
1380 */
1381
1382TOK Lexer::tokenStringConstant(Token *t)
1383{
1384 unsigned nest = 1;
1385 Loc start = loc();
1386 const utf8_t *pstart = ++p;
1387
1388 while (1)
1389 { Token tok;
1390
1391 scan(&tok);
1392 switch (tok.value)
1393 {
1394 case TOKlcurly:
1395 nest++;
1396 continue;
1397
1398 case TOKrcurly:
1399 if (--nest == 0)
1400 {
1401 t->len = (unsigned)(p - 1 - pstart);
1402 t->ustring = (utf8_t *)mem.xmalloc(t->len + 1);
1403 memcpy(t->ustring, pstart, t->len);
1404 t->ustring[t->len] = 0;
1405 stringPostfix(t);
1406 return TOKstring;
1407 }
1408 continue;
1409
1410 case TOKeof:
1411 error("unterminated token string constant starting at %s", start.toChars());
1412 t->ustring = (utf8_t *)const_cast<char *>("");
1413 t->len = 0;
1414 t->postfix = 0;
1415 return TOKstring;
1416
1417 default:
1418 continue;
1419 }
1420 }
1421}
1422
1423
1424
1425/**************************************
1426 */
1427
1428TOK Lexer::escapeStringConstant(Token *t)
1429{
1430 unsigned c;
1431 Loc start = loc();
1432
1433 p++;
1434 stringbuffer.reset();
1435 while (1)
1436 {
1437 c = *p++;
1438 switch (c)
1439 {
1440 case '\\':
1441 switch (*p)
1442 {
1443 case 'u':
1444 case 'U':
1445 case '&':
1446 c = escapeSequence();
1447 stringbuffer.writeUTF8(c);
1448 continue;
1449
1450 default:
1451 c = escapeSequence();
1452 break;
1453 }
1454 break;
1455 case '\n':
1456 endOfLine();
1457 break;
1458
1459 case '\r':
1460 if (*p == '\n')
1461 continue; // ignore
1462 c = '\n'; // treat EndOfLine as \n character
1463 endOfLine();
1464 break;
1465
1466 case '"':
1467 t->len = (unsigned)stringbuffer.offset;
1468 stringbuffer.writeByte(0);
1469 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
1470 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1471 stringPostfix(t);
1472 return TOKstring;
1473
1474 case 0:
1475 case 0x1A:
1476 p--;
1477 error("unterminated string constant starting at %s", start.toChars());
1478 t->ustring = (utf8_t *)const_cast<char *>("");
1479 t->len = 0;
1480 t->postfix = 0;
1481 return TOKstring;
1482
1483 default:
1484 if (c & 0x80)
1485 {
1486 p--;
1487 c = decodeUTF();
1488 if (c == LS || c == PS)
1489 { c = '\n';
1490 endOfLine();
1491 }
1492 p++;
1493 stringbuffer.writeUTF8(c);
1494 continue;
1495 }
1496 break;
1497 }
1498 stringbuffer.writeByte(c);
1499 }
1500}
1501
1502/**************************************
1503 */
1504
1505TOK Lexer::charConstant(Token *t)
1506{
1507 unsigned c;
1508 TOK tk = TOKcharv;
1509
1510 //printf("Lexer::charConstant\n");
1511 p++;
1512 c = *p++;
1513 switch (c)
1514 {
1515 case '\\':
1516 switch (*p)
1517 {
1518 case 'u':
1519 t->uns64value = escapeSequence();
1520 tk = TOKwcharv;
1521 break;
1522
1523 case 'U':
1524 case '&':
1525 t->uns64value = escapeSequence();
1526 tk = TOKdcharv;
1527 break;
1528
1529 default:
1530 t->uns64value = escapeSequence();
1531 break;
1532 }
1533 break;
1534 case '\n':
1535 L1:
1536 endOfLine();
1537 /* fall through */
1538 case '\r':
1539 case 0:
1540 case 0x1A:
1541 case '\'':
1542 error("unterminated character constant");
1543 t->uns64value = '?';
1544 return tk;
1545
1546 default:
1547 if (c & 0x80)
1548 {
1549 p--;
1550 c = decodeUTF();
1551 p++;
1552 if (c == LS || c == PS)
1553 goto L1;
1554 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1555 tk = TOKwcharv;
1556 else
1557 tk = TOKdcharv;
1558 }
1559 t->uns64value = c;
1560 break;
1561 }
1562
1563 if (*p != '\'')
1564 {
1565 error("unterminated character constant");
1566 t->uns64value = '?';
1567 return tk;
1568 }
1569 p++;
1570 return tk;
1571}
1572
1573/***************************************
1574 * Get postfix of string literal.
1575 */
1576
1577void Lexer::stringPostfix(Token *t)
1578{
1579 switch (*p)
1580 {
1581 case 'c':
1582 case 'w':
1583 case 'd':
1584 t->postfix = *p;
1585 p++;
1586 break;
1587
1588 default:
1589 t->postfix = 0;
1590 break;
1591 }
1592}
1593
1594/**************************************
1595 * Read in a number.
1596 * If it's an integer, store it in tok.TKutok.Vlong.
1597 * integers can be decimal, octal or hex
1598 * Handle the suffixes U, UL, LU, L, etc.
1599 * If it's double, store it in tok.TKutok.Vdouble.
1600 * Returns:
1601 * TKnum
1602 * TKdouble,...
1603 */
1604
1605TOK Lexer::number(Token *t)
1606{
1607 int base = 10;
1608 const utf8_t *start = p;
1609 unsigned c;
1610 uinteger_t n = 0; // unsigned >=64 bit integer type
1611 int d;
1612 bool err = false;
1613 bool overflow = false;
1614
1615 c = *p;
1616 if (c == '0')
1617 {
1618 ++p;
1619 c = *p;
1620 switch (c)
1621 {
1622 case '0': case '1': case '2': case '3':
1623 case '4': case '5': case '6': case '7':
1624 n = c - '0';
1625 ++p;
1626 base = 8;
1627 break;
1628
1629 case 'x':
1630 case 'X':
1631 ++p;
1632 base = 16;
1633 break;
1634
1635 case 'b':
1636 case 'B':
1637 ++p;
1638 base = 2;
1639 break;
1640
1641 case '.':
1642 if (p[1] == '.')
1643 goto Ldone; // if ".."
1644 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
1645 goto Ldone; // if ".identifier" or ".unicode"
1646 goto Lreal; // '.' is part of current token
1647
1648 case 'i':
1649 case 'f':
1650 case 'F':
1651 goto Lreal;
1652
1653 case '_':
1654 ++p;
1655 base = 8;
1656 break;
1657
1658 case 'L':
1659 if (p[1] == 'i')
1660 goto Lreal;
1661 break;
1662
1663 default:
1664 break;
1665 }
1666 }
1667
1668 while (1)
1669 {
1670 c = *p;
1671 switch (c)
1672 {
1673 case '0': case '1':
1674 ++p;
1675 d = c - '0';
1676 break;
1677
1678 case '2': case '3':
1679 case '4': case '5': case '6': case '7':
1680 if (base == 2 && !err)
1681 {
1682 error("binary digit expected");
1683 err = true;
1684 }
1685 ++p;
1686 d = c - '0';
1687 break;
1688
1689 case '8': case '9':
1690 ++p;
1691 if (base < 10 && !err)
1692 {
1693 error("radix %d digit expected, not '%c'", base, c);
1694 err = true;
1695 }
1696 d = c - '0';
1697 break;
1698
1699 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1700 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1701 ++p;
1702 if (base != 16)
1703 {
1704 if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
1705 goto Lreal;
1706 if (!err)
1707 {
1708 error("radix %d digit expected, not '%c'", base, c);
1709 err = true;
1710 }
1711 }
1712 if (c >= 'a')
1713 d = c + 10 - 'a';
1714 else
1715 d = c + 10 - 'A';
1716 break;
1717
1718 case 'L':
1719 if (p[1] == 'i')
1720 goto Lreal;
1721 goto Ldone;
1722
1723 case '.':
1724 if (p[1] == '.')
1725 goto Ldone; // if ".."
1726 if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
1727 goto Ldone; // if ".identifier" or ".unicode"
1728 goto Lreal; // otherwise as part of a floating point literal
1729
1730 case 'p':
1731 case 'P':
1732 case 'i':
1733 Lreal:
1734 p = start;
1735 return inreal(t);
1736
1737 case '_':
1738 ++p;
1739 continue;
1740
1741 default:
1742 goto Ldone;
1743 }
1744
1745 uinteger_t n2 = n * base;
1746 if ((n2 / base != n || n2 + d < n))
1747 {
1748 overflow = true;
1749 }
1750 n = n2 + d;
1751
1752 // if n needs more than 64 bits
1753 if (sizeof(n) > 8 &&
1754 n > 0xFFFFFFFFFFFFFFFFULL)
1755 {
1756 overflow = true;
1757 }
1758 }
1759
1760Ldone:
1761
1762 if (overflow && !err)
1763 {
1764 error("integer overflow");
1765 err = true;
1766 }
1767
1768 enum FLAGS
1769 {
1770 FLAGS_none = 0,
1771 FLAGS_decimal = 1, // decimal
1772 FLAGS_unsigned = 2, // u or U suffix
1773 FLAGS_long = 4, // L suffix
1774 };
1775
1776 unsigned flags = (base == 10) ? FLAGS_decimal : FLAGS_none;
1777
1778 // Parse trailing 'u', 'U', 'l' or 'L' in any combination
1779 const utf8_t *psuffix = p;
1780 while (1)
1781 {
1782 utf8_t f;
1783 switch (*p)
1784 {
1785 case 'U':
1786 case 'u':
1787 f = FLAGS_unsigned;
1788 goto L1;
1789
1790 case 'l':
1791 f = FLAGS_long;
1792 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
1793 goto L1;
1794
1795 case 'L':
1796 f = FLAGS_long;
1797 L1:
1798 p++;
1799 if ((flags & f) && !err)
1800 {
1801 error("unrecognized token");
1802 err = true;
1803 }
1804 flags = (FLAGS) (flags | f);
1805 continue;
1806 default:
1807 break;
1808 }
1809 break;
1810 }
1811
1812 if (base == 8 && n >= 8)
1813 error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead",
1814 n, p - psuffix, psuffix, n, p - psuffix, psuffix);
1815
1816 TOK result;
1817 switch (flags)
1818 {
1819 case FLAGS_none:
1820 /* Octal or Hexadecimal constant.
1821 * First that fits: int, uint, long, ulong
1822 */
1823 if (n & 0x8000000000000000LL)
1824 result = TOKuns64v;
1825 else if (n & 0xFFFFFFFF00000000LL)
1826 result = TOKint64v;
1827 else if (n & 0x80000000)
1828 result = TOKuns32v;
1829 else
1830 result = TOKint32v;
1831 break;
1832
1833 case FLAGS_decimal:
1834 /* First that fits: int, long, long long
1835 */
1836 if (n & 0x8000000000000000LL)
1837 {
1838 if (!err)
1839 {
1840 error("signed integer overflow");
1841 err = true;
1842 }
1843 result = TOKuns64v;
1844 }
1845 else if (n & 0xFFFFFFFF80000000LL)
1846 result = TOKint64v;
1847 else
1848 result = TOKint32v;
1849 break;
1850
1851 case FLAGS_unsigned:
1852 case FLAGS_decimal | FLAGS_unsigned:
1853 /* First that fits: uint, ulong
1854 */
1855 if (n & 0xFFFFFFFF00000000LL)
1856 result = TOKuns64v;
1857 else
1858 result = TOKuns32v;
1859 break;
1860
1861 case FLAGS_decimal | FLAGS_long:
1862 if (n & 0x8000000000000000LL)
1863 {
1864 if (!err)
1865 {
1866 error("signed integer overflow");
1867 err = true;
1868 }
1869 result = TOKuns64v;
1870 }
1871 else
1872 result = TOKint64v;
1873 break;
1874
1875 case FLAGS_long:
1876 if (n & 0x8000000000000000LL)
1877 result = TOKuns64v;
1878 else
1879 result = TOKint64v;
1880 break;
1881
1882 case FLAGS_unsigned | FLAGS_long:
1883 case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
1884 result = TOKuns64v;
1885 break;
1886
1887 default:
1888 assert(0);
1889 }
1890 t->uns64value = n;
1891 return result;
1892}
1893
1894/**************************************
1895 * Read in characters, converting them to real.
1896 * Bugs:
1897 * Exponent overflow not detected.
1898 * Too much requested precision is not detected.
1899 */
1900
1901TOK Lexer::inreal(Token *t)
1902{
1903 //printf("Lexer::inreal()\n");
1904 bool isWellformedString = true;
1905 stringbuffer.reset();
1906 const utf8_t *pstart = p;
1907 char hex = 0;
1908 unsigned c = *p++;
1909
1910 // Leading '0x'
1911 if (c == '0')
1912 {
1913 c = *p++;
1914 if (c == 'x' || c == 'X')
1915 {
1916 hex = true;
1917 c = *p++;
1918 }
1919 }
1920
1921 // Digits to left of '.'
1922 while (1)
1923 {
1924 if (c == '.')
1925 {
1926 c = *p++;
1927 break;
1928 }
1929 if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
1930 {
1931 c = *p++;
1932 continue;
1933 }
1934 break;
1935 }
1936
1937 // Digits to right of '.'
1938 while (1)
1939 {
1940 if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
1941 {
1942 c = *p++;
1943 continue;
1944 }
1945 break;
1946 }
1947
1948 if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
1949 {
1950 c = *p++;
1951 if (c == '-' || c == '+')
1952 {
1953 c = *p++;
1954 }
1955 bool anyexp = false;
1956 while (1)
1957 {
1958 if (isdigit(c))
1959 {
1960 anyexp = true;
1961 c = *p++;
1962 continue;
1963 }
1964 if (c == '_')
1965 {
1966 c = *p++;
1967 continue;
1968 }
1969 if (!anyexp)
1970 {
1971 error("missing exponent");
1972 isWellformedString = false;
1973 }
1974 break;
1975 }
1976 }
1977 else if (hex)
1978 {
1979 error("exponent required for hex float");
1980 isWellformedString = false;
1981 }
1982 --p;
1983 while (pstart < p)
1984 {
1985 if (*pstart != '_')
1986 stringbuffer.writeByte(*pstart);
1987 ++pstart;
1988 }
1989
1990 stringbuffer.writeByte(0);
1991 const char *sbufptr = (char *)stringbuffer.data;
1992 TOK result;
1993 bool isOutOfRange = false;
1994 t->floatvalue = (isWellformedString ? CTFloat::parse(sbufptr, &isOutOfRange) : CTFloat::zero);
1995 errno = 0;
1996 switch (*p)
1997 {
1998 case 'F':
1999 case 'f':
2000 if (isWellformedString && !isOutOfRange)
2001 isOutOfRange = Port::isFloat32LiteralOutOfRange(sbufptr);
2002 result = TOKfloat32v;
2003 p++;
2004 break;
2005
2006 default:
2007 if (isWellformedString && !isOutOfRange)
2008 isOutOfRange = Port::isFloat64LiteralOutOfRange(sbufptr);
2009 result = TOKfloat64v;
2010 break;
2011
2012 case 'l':
2013 error("use 'L' suffix instead of 'l'");
2014 /* fall through */
2015 case 'L':
2016 result = TOKfloat80v;
2017 p++;
2018 break;
2019 }
2020 if (*p == 'i' || *p == 'I')
2021 {
2022 if (*p == 'I')
2023 error("use 'i' suffix instead of 'I'");
2024 p++;
2025 switch (result)
2026 {
2027 case TOKfloat32v:
2028 result = TOKimaginary32v;
2029 break;
2030 case TOKfloat64v:
2031 result = TOKimaginary64v;
2032 break;
2033 case TOKfloat80v:
2034 result = TOKimaginary80v;
2035 break;
2036 default: break;
2037 }
2038 }
2039 const bool isLong = (result == TOKfloat80v || result == TOKimaginary80v);
2040 if (isOutOfRange && !isLong)
2041 {
2042 const char *suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : "";
2043 error(scanloc, "number '%s%s' is not representable", (char *)stringbuffer.data, suffix);
2044 }
2045 return result;
2046}
2047
2048/*********************************************
2049 * parse:
2050 * #line linnum [filespec]
2051 * also allow __LINE__ for linnum, and __FILE__ for filespec
2052 */
2053
2054void Lexer::poundLine()
2055{
2056 Token tok;
2057 int linnum = this->scanloc.linnum;
2058 char *filespec = NULL;
2059 Loc loc = this->loc();
2060
2061 scan(&tok);
2062 if (tok.value == TOKint32v || tok.value == TOKint64v)
2063 {
2064 int lin = (int)(tok.uns64value - 1);
2065 if ((unsigned)lin != tok.uns64value - 1)
2066 error("line number %lld out of range", (unsigned long long)tok.uns64value);
2067 else
2068 linnum = lin;
2069 }
2070 else if (tok.value == TOKline)
2071 {
2072 }
2073 else
2074 goto Lerr;
2075
2076 while (1)
2077 {
2078 switch (*p)
2079 {
2080 case 0:
2081 case 0x1A:
2082 case '\n':
2083 Lnewline:
2084 this->scanloc.linnum = linnum;
2085 if (filespec)
2086 this->scanloc.filename = filespec;
2087 return;
2088
2089 case '\r':
2090 p++;
2091 if (*p != '\n')
2092 { p--;
2093 goto Lnewline;
2094 }
2095 continue;
2096
2097 case ' ':
2098 case '\t':
2099 case '\v':
2100 case '\f':
2101 p++;
2102 continue; // skip white space
2103
2104 case '_':
2105 if (memcmp(p, "__FILE__", 8) == 0)
2106 {
2107 p += 8;
2108 filespec = mem.xstrdup(scanloc.filename);
2109 continue;
2110 }
2111 goto Lerr;
2112
2113 case '"':
2114 if (filespec)
2115 goto Lerr;
2116 stringbuffer.reset();
2117 p++;
2118 while (1)
2119 { unsigned c;
2120
2121 c = *p;
2122 switch (c)
2123 {
2124 case '\n':
2125 case '\r':
2126 case 0:
2127 case 0x1A:
2128 goto Lerr;
2129
2130 case '"':
2131 stringbuffer.writeByte(0);
2132 filespec = mem.xstrdup((char *)stringbuffer.data);
2133 p++;
2134 break;
2135
2136 default:
2137 if (c & 0x80)
2138 { unsigned u = decodeUTF();
2139 if (u == PS || u == LS)
2140 goto Lerr;
2141 }
2142 stringbuffer.writeByte(c);
2143 p++;
2144 continue;
2145 }
2146 break;
2147 }
2148 continue;
2149
2150 default:
2151 if (*p & 0x80)
2152 { unsigned u = decodeUTF();
2153 if (u == PS || u == LS)
2154 goto Lnewline;
2155 }
2156 goto Lerr;
2157 }
2158 }
2159
2160Lerr:
2161 error(loc, "#line integer [\"filespec\"]\\n expected");
2162}
2163
2164
2165/********************************************
2166 * Decode UTF character.
2167 * Issue error messages for invalid sequences.
2168 * Return decoded character, advance p to last character in UTF sequence.
2169 */
2170
2171unsigned Lexer::decodeUTF()
2172{
2173 dchar_t u;
2174 utf8_t c;
2175 const utf8_t *s = p;
2176 size_t len;
2177 size_t idx;
2178 const char *msg;
2179
2180 c = *s;
2181 assert(c & 0x80);
2182
2183 // Check length of remaining string up to 6 UTF-8 characters
2184 for (len = 1; len < 6 && s[len]; len++)
2185 ;
2186
2187 idx = 0;
2188 msg = utf_decodeChar(s, len, &idx, &u);
2189 p += idx - 1;
2190 if (msg)
2191 {
2192 error("%s", msg);
2193 }
2194 return u;
2195}
2196
2197
2198/***************************************************
2199 * Parse doc comment embedded between t->ptr and p.
2200 * Remove trailing blanks and tabs from lines.
2201 * Replace all newlines with \n.
2202 * Remove leading comment character from each line.
2203 * Decide if it's a lineComment or a blockComment.
2204 * Append to previous one for this token.
2205 */
2206
2207void Lexer::getDocComment(Token *t, unsigned lineComment)
2208{
2209 /* ct tells us which kind of comment it is: '/', '*', or '+'
2210 */
2211 utf8_t ct = t->ptr[2];
2212
2213 /* Start of comment text skips over / * *, / + +, or / / /
2214 */
2215 const utf8_t *q = t->ptr + 3; // start of comment text
2216
2217 const utf8_t *qend = p;
2218 if (ct == '*' || ct == '+')
2219 qend -= 2;
2220
2221 /* Scan over initial row of ****'s or ++++'s or ////'s
2222 */
2223 for (; q < qend; q++)
2224 {
2225 if (*q != ct)
2226 break;
2227 }
2228
2229 /* Remove leading spaces until start of the comment
2230 */
2231 int linestart = 0;
2232 if (ct == '/')
2233 {
2234 while (q < qend && (*q == ' ' || *q == '\t'))
2235 ++q;
2236 }
2237 else if (q < qend)
2238 {
2239 if (*q == '\r')
2240 {
2241 ++q;
2242 if (q < qend && *q == '\n')
2243 ++q;
2244 linestart = 1;
2245 }
2246 else if (*q == '\n')
2247 {
2248 ++q;
2249 linestart = 1;
2250 }
2251 }
2252
2253 /* Remove trailing row of ****'s or ++++'s
2254 */
2255 if (ct != '/')
2256 {
2257 for (; q < qend; qend--)
2258 {
2259 if (qend[-1] != ct)
2260 break;
2261 }
2262 }
2263
2264 /* Comment is now [q .. qend].
2265 * Canonicalize it into buf[].
2266 */
2267 OutBuffer buf;
2268
2269 for (; q < qend; q++)
2270 {
2271 utf8_t c = *q;
2272
2273 switch (c)
2274 {
2275 case '*':
2276 case '+':
2277 if (linestart && c == ct)
2278 { linestart = 0;
2279 /* Trim preceding whitespace up to preceding \n
2280 */
2281 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2282 buf.offset--;
2283 continue;
2284 }
2285 break;
2286
2287 case ' ':
2288 case '\t':
2289 break;
2290
2291 case '\r':
2292 if (q[1] == '\n')
2293 continue; // skip the \r
2294 goto Lnewline;
2295
2296 default:
2297 if (c == 226)
2298 {
2299 // If LS or PS
2300 if (q[1] == 128 &&
2301 (q[2] == 168 || q[2] == 169))
2302 {
2303 q += 2;
2304 goto Lnewline;
2305 }
2306 }
2307 linestart = 0;
2308 break;
2309
2310 Lnewline:
2311 c = '\n'; // replace all newlines with \n
2312 /* fall through */
2313 case '\n':
2314 linestart = 1;
2315
2316 /* Trim trailing whitespace
2317 */
2318 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2319 buf.offset--;
2320
2321 break;
2322 }
2323 buf.writeByte(c);
2324 }
2325
2326 /* Trim trailing whitespace (if the last line does not have newline)
2327 */
2328 if (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2329 {
2330 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2331 buf.offset--;
2332 }
2333
2334 // Always end with a newline
2335 if (!buf.offset || buf.data[buf.offset - 1] != '\n')
2336 buf.writeByte('\n');
2337
2338 buf.writeByte(0);
2339
2340 // It's a line comment if the start of the doc comment comes
2341 // after other non-whitespace on the same line.
2342 const utf8_t** dc = (lineComment && anyToken)
2343 ? &t->lineComment
2344 : &t->blockComment;
2345
2346 // Combine with previous doc comment, if any
2347 if (*dc)
2348 *dc = combineComments(*dc, (utf8_t *)buf.data);
2349 else
2350 *dc = (utf8_t *)buf.extractData();
2351}
2352
2353/********************************************
2354 * Combine two document comments into one,
2355 * separated by a newline.
2356 */
2357
2358const utf8_t *Lexer::combineComments(const utf8_t *c1, const utf8_t *c2)
2359{
2360 //printf("Lexer::combineComments('%s', '%s')\n", c1, c2);
2361
2362 const utf8_t *c = c2;
2363
2364 if (c1)
2365 {
2366 c = c1;
2367 if (c2)
2368 {
2369 size_t len1 = strlen((const char *)c1);
2370 size_t len2 = strlen((const char *)c2);
2371
2372 int insertNewLine = 0;
2373 if (len1 && c1[len1 - 1] != '\n')
2374 {
2375 ++len1;
2376 insertNewLine = 1;
2377 }
2378
2379 utf8_t *p = (utf8_t *)mem.xmalloc(len1 + 1 + len2 + 1);
2380 memcpy(p, c1, len1 - insertNewLine);
2381 if (insertNewLine)
2382 p[len1 - 1] = '\n';
2383
2384 p[len1] = '\n';
2385
2386 memcpy(p + len1 + 1, c2, len2);
2387 p[len1 + 1 + len2] = 0;
2388 c = p;
2389 }
2390 }
2391 return c;
2392}