]>
Commit | Line | Data |
---|---|---|
b4c522fa IB |
1 | |
2 | /* Compiler implementation of the D programming language | |
f3ed896c | 3 | * Copyright (C) 1999-2019 by The D Language Foundation, All Rights Reserved |
b4c522fa IB |
4 | * written by Walter Bright |
5 | * http://www.digitalmars.com | |
6 | * Distributed under the Boost Software License, Version 1.0. | |
7 | * http://www.boost.org/LICENSE_1_0.txt | |
8 | * https://github.com/D-Programming-Language/dmd/blob/master/src/lexer.c | |
9 | */ | |
10 | ||
11 | /* Lexical Analyzer */ | |
12 | ||
f9ab59ff | 13 | #include "root/dsystem.h" // for time() and ctime() |
b4c522fa IB |
14 | #include "root/rmem.h" |
15 | ||
16 | #include "mars.h" | |
17 | #include "lexer.h" | |
18 | #include "utf.h" | |
19 | #include "identifier.h" | |
20 | #include "id.h" | |
21 | ||
22 | extern int HtmlNamedEntity(const utf8_t *p, size_t length); | |
23 | ||
24 | #define LS 0x2028 // UTF line separator | |
25 | #define PS 0x2029 // UTF paragraph separator | |
26 | ||
27 | /******************************************** | |
28 | * Do our own char maps | |
29 | */ | |
30 | ||
31 | static unsigned char cmtable[256]; | |
32 | ||
33 | const int CMoctal = 0x1; | |
34 | const int CMhex = 0x2; | |
35 | const int CMidchar = 0x4; | |
36 | ||
37 | inline bool isoctal (utf8_t c) { return (cmtable[c] & CMoctal) != 0; } | |
38 | inline bool ishex (utf8_t c) { return (cmtable[c] & CMhex) != 0; } | |
39 | inline bool isidchar(utf8_t c) { return (cmtable[c] & CMidchar) != 0; } | |
40 | ||
41 | struct CMTableInitializer | |
42 | { | |
43 | CMTableInitializer(); | |
44 | }; | |
45 | ||
46 | static CMTableInitializer cmtableinitializer; | |
47 | ||
48 | CMTableInitializer::CMTableInitializer() | |
49 | { | |
50 | for (unsigned c = 0; c < 256; c++) | |
51 | { | |
52 | if ('0' <= c && c <= '7') | |
53 | cmtable[c] |= CMoctal; | |
54 | if (isxdigit(c)) | |
55 | cmtable[c] |= CMhex; | |
56 | if (isalnum(c) || c == '_') | |
57 | cmtable[c] |= CMidchar; | |
58 | } | |
59 | } | |
60 | ||
61 | /*************************** Lexer ********************************************/ | |
62 | ||
// Definition of the static member `Lexer::stringbuffer`. Being static, the one
// buffer is shared by all Lexer objects; the string-constant scanners in this
// file reset() it, append the decoded bytes, then copy the result out.
63 | OutBuffer Lexer::stringbuffer; | |
64 | ||
65 | Lexer::Lexer(const char *filename, | |
66 | const utf8_t *base, size_t begoffset, size_t endoffset, | |
67 | bool doDocComment, bool commentToken) | |
68 | { | |
69 | scanloc = Loc(filename, 1, 1); | |
70 | //printf("Lexer::Lexer(%p,%d)\n",base,length); | |
71 | //printf("lexer.filename = %s\n", filename); | |
72 | this->token = Token(); | |
73 | this->token.ptr = NULL; | |
74 | this->token.value = TOKreserved; | |
75 | this->token.blockComment = NULL; | |
76 | this->token.lineComment = NULL; | |
77 | this->base = base; | |
78 | this->end = base + endoffset; | |
79 | p = base + begoffset; | |
80 | line = p; | |
81 | this->doDocComment = doDocComment; | |
82 | this->anyToken = 0; | |
83 | this->commentToken = commentToken; | |
84 | this->errors = false; | |
85 | //initKeywords(); | |
86 | ||
87 | /* If first line starts with '#!', ignore the line | |
88 | */ | |
89 | ||
90 | if (p[0] == '#' && p[1] =='!') | |
91 | { | |
92 | p += 2; | |
93 | while (1) | |
94 | { | |
95 | utf8_t c = *p++; | |
96 | switch (c) | |
97 | { | |
98 | case 0: | |
99 | case 0x1A: | |
100 | p--; | |
101 | /* fall through */ | |
102 | ||
103 | case '\n': | |
104 | break; | |
105 | ||
106 | default: | |
107 | continue; | |
108 | } | |
109 | break; | |
110 | } | |
111 | endOfLine(); | |
112 | } | |
113 | } | |
114 | ||
115 | ||
116 | void Lexer::endOfLine() | |
117 | { | |
118 | scanloc.linnum++; | |
119 | line = p; | |
120 | } | |
121 | ||
122 | ||
123 | void Lexer::error(const char *format, ...) | |
124 | { | |
125 | va_list ap; | |
126 | va_start(ap, format); | |
127 | ::verror(token.loc, format, ap); | |
128 | va_end(ap); | |
129 | errors = true; | |
130 | } | |
131 | ||
132 | void Lexer::error(Loc loc, const char *format, ...) | |
133 | { | |
134 | va_list ap; | |
135 | va_start(ap, format); | |
136 | ::verror(loc, format, ap); | |
137 | va_end(ap); | |
138 | errors = true; | |
139 | } | |
140 | ||
141 | void Lexer::deprecation(const char *format, ...) | |
142 | { | |
143 | va_list ap; | |
144 | va_start(ap, format); | |
145 | ::vdeprecation(token.loc, format, ap); | |
146 | va_end(ap); | |
147 | if (global.params.useDeprecated == DIAGNOSTICerror) | |
148 | errors = true; | |
149 | } | |
150 | ||
151 | TOK Lexer::nextToken() | |
152 | { | |
153 | if (token.next) | |
154 | { | |
155 | Token *t = token.next; | |
156 | memcpy(&token,t,sizeof(Token)); | |
157 | t->free(); | |
158 | } | |
159 | else | |
160 | { | |
161 | scan(&token); | |
162 | } | |
163 | //token.print(); | |
164 | return token.value; | |
165 | } | |
166 | ||
167 | Token *Lexer::peek(Token *ct) | |
168 | { | |
169 | Token *t; | |
170 | if (ct->next) | |
171 | t = ct->next; | |
172 | else | |
173 | { | |
174 | t = Token::alloc(); | |
175 | scan(t); | |
176 | ct->next = t; | |
177 | } | |
178 | return t; | |
179 | } | |
180 | ||
181 | /*********************** | |
182 | * Look ahead at next token's value. | |
183 | */ | |
184 | ||
185 | TOK Lexer::peekNext() | |
186 | { | |
187 | return peek(&token)->value; | |
188 | } | |
189 | ||
190 | /*********************** | |
191 | * Look 2 tokens ahead at value. | |
192 | */ | |
193 | ||
194 | TOK Lexer::peekNext2() | |
195 | { | |
196 | Token *t = peek(&token); | |
197 | return peek(t)->value; | |
198 | } | |
199 | ||
200 | /********************************* | |
201 | * tk is on the opening (. | |
202 | * Look ahead and return token that is past the closing ). | |
203 | */ | |
204 | ||
205 | Token *Lexer::peekPastParen(Token *tk) | |
206 | { | |
207 | //printf("peekPastParen()\n"); | |
208 | int parens = 1; | |
209 | int curlynest = 0; | |
210 | while (1) | |
211 | { | |
212 | tk = peek(tk); | |
213 | //tk->print(); | |
214 | switch (tk->value) | |
215 | { | |
216 | case TOKlparen: | |
217 | parens++; | |
218 | continue; | |
219 | ||
220 | case TOKrparen: | |
221 | --parens; | |
222 | if (parens) | |
223 | continue; | |
224 | tk = peek(tk); | |
225 | break; | |
226 | ||
227 | case TOKlcurly: | |
228 | curlynest++; | |
229 | continue; | |
230 | ||
231 | case TOKrcurly: | |
232 | if (--curlynest >= 0) | |
233 | continue; | |
234 | break; | |
235 | ||
236 | case TOKsemicolon: | |
237 | if (curlynest) | |
238 | continue; | |
239 | break; | |
240 | ||
241 | case TOKeof: | |
242 | break; | |
243 | ||
244 | default: | |
245 | continue; | |
246 | } | |
247 | return tk; | |
248 | } | |
249 | } | |
250 | ||
251 | /**************************** | |
252 | * Turn next token in buffer into a token. | |
253 | */ | |
254 | ||
255 | void Lexer::scan(Token *t) | |
256 | { | |
257 | unsigned lastLine = scanloc.linnum; | |
258 | Loc startLoc; | |
259 | ||
260 | t->blockComment = NULL; | |
261 | t->lineComment = NULL; | |
262 | while (1) | |
263 | { | |
264 | t->ptr = p; | |
265 | //printf("p = %p, *p = '%c'\n",p,*p); | |
266 | t->loc = loc(); | |
267 | switch (*p) | |
268 | { | |
269 | case 0: | |
270 | case 0x1A: | |
271 | t->value = TOKeof; // end of file | |
272 | return; | |
273 | ||
274 | case ' ': | |
275 | case '\t': | |
276 | case '\v': | |
277 | case '\f': | |
278 | p++; | |
279 | continue; // skip white space | |
280 | ||
281 | case '\r': | |
282 | p++; | |
283 | if (*p != '\n') // if CR stands by itself | |
284 | endOfLine(); | |
285 | continue; // skip white space | |
286 | ||
287 | case '\n': | |
288 | p++; | |
289 | endOfLine(); | |
290 | continue; // skip white space | |
291 | ||
292 | case '0': case '1': case '2': case '3': case '4': | |
293 | case '5': case '6': case '7': case '8': case '9': | |
294 | t->value = number(t); | |
295 | return; | |
296 | ||
297 | case '\'': | |
298 | t->value = charConstant(t); | |
299 | return; | |
300 | ||
301 | case 'r': | |
302 | if (p[1] != '"') | |
303 | goto case_ident; | |
304 | p++; | |
305 | /* fall through */ | |
306 | case '`': | |
307 | t->value = wysiwygStringConstant(t, *p); | |
308 | return; | |
309 | ||
310 | case 'x': | |
311 | if (p[1] != '"') | |
312 | goto case_ident; | |
313 | p++; | |
314 | t->value = hexStringConstant(t); | |
315 | return; | |
316 | ||
317 | case 'q': | |
318 | if (p[1] == '"') | |
319 | { | |
320 | p++; | |
321 | t->value = delimitedStringConstant(t); | |
322 | return; | |
323 | } | |
324 | else if (p[1] == '{') | |
325 | { | |
326 | p++; | |
327 | t->value = tokenStringConstant(t); | |
328 | return; | |
329 | } | |
330 | else | |
331 | goto case_ident; | |
332 | ||
333 | case '"': | |
334 | t->value = escapeStringConstant(t); | |
335 | return; | |
336 | ||
337 | case 'a': case 'b': case 'c': case 'd': case 'e': | |
338 | case 'f': case 'g': case 'h': case 'i': case 'j': | |
339 | case 'k': case 'l': case 'm': case 'n': case 'o': | |
340 | case 'p': /*case 'q': case 'r':*/ case 's': case 't': | |
341 | case 'u': case 'v': case 'w': /*case 'x':*/ case 'y': | |
342 | case 'z': | |
343 | case 'A': case 'B': case 'C': case 'D': case 'E': | |
344 | case 'F': case 'G': case 'H': case 'I': case 'J': | |
345 | case 'K': case 'L': case 'M': case 'N': case 'O': | |
346 | case 'P': case 'Q': case 'R': case 'S': case 'T': | |
347 | case 'U': case 'V': case 'W': case 'X': case 'Y': | |
348 | case 'Z': | |
349 | case '_': | |
350 | case_ident: | |
351 | { utf8_t c; | |
352 | ||
353 | while (1) | |
354 | { | |
355 | c = *++p; | |
356 | if (isidchar(c)) | |
357 | continue; | |
358 | else if (c & 0x80) | |
359 | { const utf8_t *s = p; | |
360 | unsigned u = decodeUTF(); | |
361 | if (isUniAlpha(u)) | |
362 | continue; | |
363 | error("char 0x%04x not allowed in identifier", u); | |
364 | p = s; | |
365 | } | |
366 | break; | |
367 | } | |
368 | ||
369 | Identifier *id = Identifier::idPool((const char *)t->ptr, p - t->ptr); | |
370 | t->ident = id; | |
371 | t->value = (TOK) id->getValue(); | |
372 | anyToken = 1; | |
373 | if (*t->ptr == '_') // if special identifier token | |
374 | { | |
375 | static bool initdone = false; | |
376 | static char date[11+1]; | |
377 | static char time[8+1]; | |
378 | static char timestamp[24+1]; | |
379 | ||
380 | if (!initdone) // lazy evaluation | |
381 | { | |
382 | initdone = true; | |
383 | time_t ct; | |
384 | ::time(&ct); | |
385 | char *p = ctime(&ct); | |
386 | assert(p); | |
387 | sprintf(&date[0], "%.6s %.4s", p + 4, p + 20); | |
388 | sprintf(&time[0], "%.8s", p + 11); | |
389 | sprintf(×tamp[0], "%.24s", p); | |
390 | } | |
391 | ||
392 | if (id == Id::DATE) | |
393 | { | |
394 | t->ustring = (utf8_t *)date; | |
395 | goto Lstr; | |
396 | } | |
397 | else if (id == Id::TIME) | |
398 | { | |
399 | t->ustring = (utf8_t *)time; | |
400 | goto Lstr; | |
401 | } | |
402 | else if (id == Id::VENDOR) | |
403 | { | |
404 | t->ustring = (utf8_t *)const_cast<char *>(global.vendor); | |
405 | goto Lstr; | |
406 | } | |
407 | else if (id == Id::TIMESTAMP) | |
408 | { | |
409 | t->ustring = (utf8_t *)timestamp; | |
410 | Lstr: | |
411 | t->value = TOKstring; | |
412 | t->postfix = 0; | |
413 | t->len = (unsigned)strlen((char *)t->ustring); | |
414 | } | |
415 | else if (id == Id::VERSIONX) | |
416 | { unsigned major = 0; | |
417 | unsigned minor = 0; | |
418 | bool point = false; | |
419 | ||
420 | for (const char *p = global.version + 1; 1; p++) | |
421 | { | |
422 | c = *p; | |
423 | if (isdigit((utf8_t)c)) | |
424 | minor = minor * 10 + c - '0'; | |
425 | else if (c == '.') | |
426 | { | |
427 | if (point) | |
428 | break; // ignore everything after second '.' | |
429 | point = true; | |
430 | major = minor; | |
431 | minor = 0; | |
432 | } | |
433 | else | |
434 | break; | |
435 | } | |
436 | t->value = TOKint64v; | |
437 | t->uns64value = major * 1000 + minor; | |
438 | } | |
439 | else if (id == Id::EOFX) | |
440 | { | |
441 | t->value = TOKeof; | |
442 | // Advance scanner to end of file | |
443 | while (!(*p == 0 || *p == 0x1A)) | |
444 | p++; | |
445 | } | |
446 | } | |
447 | //printf("t->value = %d\n",t->value); | |
448 | return; | |
449 | } | |
450 | ||
451 | case '/': | |
452 | p++; | |
453 | switch (*p) | |
454 | { | |
455 | case '=': | |
456 | p++; | |
457 | t->value = TOKdivass; | |
458 | return; | |
459 | ||
460 | case '*': | |
461 | p++; | |
462 | startLoc = loc(); | |
463 | while (1) | |
464 | { | |
465 | while (1) | |
466 | { utf8_t c = *p; | |
467 | switch (c) | |
468 | { | |
469 | case '/': | |
470 | break; | |
471 | ||
472 | case '\n': | |
473 | endOfLine(); | |
474 | p++; | |
475 | continue; | |
476 | ||
477 | case '\r': | |
478 | p++; | |
479 | if (*p != '\n') | |
480 | endOfLine(); | |
481 | continue; | |
482 | ||
483 | case 0: | |
484 | case 0x1A: | |
485 | error("unterminated /* */ comment"); | |
486 | p = end; | |
487 | t->loc = loc(); | |
488 | t->value = TOKeof; | |
489 | return; | |
490 | ||
491 | default: | |
492 | if (c & 0x80) | |
493 | { unsigned u = decodeUTF(); | |
494 | if (u == PS || u == LS) | |
495 | endOfLine(); | |
496 | } | |
497 | p++; | |
498 | continue; | |
499 | } | |
500 | break; | |
501 | } | |
502 | p++; | |
503 | if (p[-2] == '*' && p - 3 != t->ptr) | |
504 | break; | |
505 | } | |
506 | if (commentToken) | |
507 | { | |
508 | t->loc = startLoc; | |
509 | t->value = TOKcomment; | |
510 | return; | |
511 | } | |
512 | else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr) | |
513 | { // if /** but not /**/ | |
514 | getDocComment(t, lastLine == startLoc.linnum); | |
515 | } | |
516 | continue; | |
517 | ||
518 | case '/': // do // style comments | |
519 | startLoc = loc(); | |
520 | while (1) | |
521 | { utf8_t c = *++p; | |
522 | switch (c) | |
523 | { | |
524 | case '\n': | |
525 | break; | |
526 | ||
527 | case '\r': | |
528 | if (p[1] == '\n') | |
529 | p++; | |
530 | break; | |
531 | ||
532 | case 0: | |
533 | case 0x1A: | |
534 | if (commentToken) | |
535 | { | |
536 | p = end; | |
537 | t->loc = startLoc; | |
538 | t->value = TOKcomment; | |
539 | return; | |
540 | } | |
541 | if (doDocComment && t->ptr[2] == '/') | |
542 | getDocComment(t, lastLine == startLoc.linnum); | |
543 | p = end; | |
544 | t->loc = loc(); | |
545 | t->value = TOKeof; | |
546 | return; | |
547 | ||
548 | default: | |
549 | if (c & 0x80) | |
550 | { unsigned u = decodeUTF(); | |
551 | if (u == PS || u == LS) | |
552 | break; | |
553 | } | |
554 | continue; | |
555 | } | |
556 | break; | |
557 | } | |
558 | ||
559 | if (commentToken) | |
560 | { | |
561 | p++; | |
562 | endOfLine(); | |
563 | t->loc = startLoc; | |
564 | t->value = TOKcomment; | |
565 | return; | |
566 | } | |
567 | if (doDocComment && t->ptr[2] == '/') | |
568 | getDocComment(t, lastLine == startLoc.linnum); | |
569 | ||
570 | p++; | |
571 | endOfLine(); | |
572 | continue; | |
573 | ||
574 | case '+': | |
575 | { int nest; | |
576 | ||
577 | startLoc = loc(); | |
578 | p++; | |
579 | nest = 1; | |
580 | while (1) | |
581 | { utf8_t c = *p; | |
582 | switch (c) | |
583 | { | |
584 | case '/': | |
585 | p++; | |
586 | if (*p == '+') | |
587 | { | |
588 | p++; | |
589 | nest++; | |
590 | } | |
591 | continue; | |
592 | ||
593 | case '+': | |
594 | p++; | |
595 | if (*p == '/') | |
596 | { | |
597 | p++; | |
598 | if (--nest == 0) | |
599 | break; | |
600 | } | |
601 | continue; | |
602 | ||
603 | case '\r': | |
604 | p++; | |
605 | if (*p != '\n') | |
606 | endOfLine(); | |
607 | continue; | |
608 | ||
609 | case '\n': | |
610 | endOfLine(); | |
611 | p++; | |
612 | continue; | |
613 | ||
614 | case 0: | |
615 | case 0x1A: | |
616 | error("unterminated /+ +/ comment"); | |
617 | p = end; | |
618 | t->loc = loc(); | |
619 | t->value = TOKeof; | |
620 | return; | |
621 | ||
622 | default: | |
623 | if (c & 0x80) | |
624 | { unsigned u = decodeUTF(); | |
625 | if (u == PS || u == LS) | |
626 | endOfLine(); | |
627 | } | |
628 | p++; | |
629 | continue; | |
630 | } | |
631 | break; | |
632 | } | |
633 | if (commentToken) | |
634 | { | |
635 | t->loc = startLoc; | |
636 | t->value = TOKcomment; | |
637 | return; | |
638 | } | |
639 | if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr) | |
640 | { // if /++ but not /++/ | |
641 | getDocComment(t, lastLine == startLoc.linnum); | |
642 | } | |
643 | continue; | |
644 | } | |
645 | default: | |
646 | break; | |
647 | } | |
648 | t->value = TOKdiv; | |
649 | return; | |
650 | ||
651 | case '.': | |
652 | p++; | |
653 | if (isdigit(*p)) | |
654 | { /* Note that we don't allow ._1 and ._ as being | |
655 | * valid floating point numbers. | |
656 | */ | |
657 | p--; | |
658 | t->value = inreal(t); | |
659 | } | |
660 | else if (p[0] == '.') | |
661 | { | |
662 | if (p[1] == '.') | |
663 | { p += 2; | |
664 | t->value = TOKdotdotdot; | |
665 | } | |
666 | else | |
667 | { p++; | |
668 | t->value = TOKslice; | |
669 | } | |
670 | } | |
671 | else | |
672 | t->value = TOKdot; | |
673 | return; | |
674 | ||
675 | case '&': | |
676 | p++; | |
677 | if (*p == '=') | |
678 | { p++; | |
679 | t->value = TOKandass; | |
680 | } | |
681 | else if (*p == '&') | |
682 | { p++; | |
683 | t->value = TOKandand; | |
684 | } | |
685 | else | |
686 | t->value = TOKand; | |
687 | return; | |
688 | ||
689 | case '|': | |
690 | p++; | |
691 | if (*p == '=') | |
692 | { p++; | |
693 | t->value = TOKorass; | |
694 | } | |
695 | else if (*p == '|') | |
696 | { p++; | |
697 | t->value = TOKoror; | |
698 | } | |
699 | else | |
700 | t->value = TOKor; | |
701 | return; | |
702 | ||
703 | case '-': | |
704 | p++; | |
705 | if (*p == '=') | |
706 | { p++; | |
707 | t->value = TOKminass; | |
708 | } | |
709 | else if (*p == '-') | |
710 | { p++; | |
711 | t->value = TOKminusminus; | |
712 | } | |
713 | else | |
714 | t->value = TOKmin; | |
715 | return; | |
716 | ||
717 | case '+': | |
718 | p++; | |
719 | if (*p == '=') | |
720 | { p++; | |
721 | t->value = TOKaddass; | |
722 | } | |
723 | else if (*p == '+') | |
724 | { p++; | |
725 | t->value = TOKplusplus; | |
726 | } | |
727 | else | |
728 | t->value = TOKadd; | |
729 | return; | |
730 | ||
731 | case '<': | |
732 | p++; | |
733 | if (*p == '=') | |
734 | { p++; | |
735 | t->value = TOKle; // <= | |
736 | } | |
737 | else if (*p == '<') | |
738 | { p++; | |
739 | if (*p == '=') | |
740 | { p++; | |
741 | t->value = TOKshlass; // <<= | |
742 | } | |
743 | else | |
744 | t->value = TOKshl; // << | |
745 | } | |
746 | else if (*p == '>') | |
747 | { p++; | |
748 | if (*p == '=') | |
749 | { p++; | |
750 | t->value = TOKleg; // <>= | |
751 | } | |
752 | else | |
753 | t->value = TOKlg; // <> | |
754 | } | |
755 | else | |
756 | t->value = TOKlt; // < | |
757 | return; | |
758 | ||
759 | case '>': | |
760 | p++; | |
761 | if (*p == '=') | |
762 | { p++; | |
763 | t->value = TOKge; // >= | |
764 | } | |
765 | else if (*p == '>') | |
766 | { p++; | |
767 | if (*p == '=') | |
768 | { p++; | |
769 | t->value = TOKshrass; // >>= | |
770 | } | |
771 | else if (*p == '>') | |
772 | { p++; | |
773 | if (*p == '=') | |
774 | { p++; | |
775 | t->value = TOKushrass; // >>>= | |
776 | } | |
777 | else | |
778 | t->value = TOKushr; // >>> | |
779 | } | |
780 | else | |
781 | t->value = TOKshr; // >> | |
782 | } | |
783 | else | |
784 | t->value = TOKgt; // > | |
785 | return; | |
786 | ||
787 | case '!': | |
788 | p++; | |
789 | if (*p == '=') | |
790 | { p++; | |
791 | t->value = TOKnotequal; // != | |
792 | } | |
793 | else if (*p == '<') | |
794 | { p++; | |
795 | if (*p == '>') | |
796 | { p++; | |
797 | if (*p == '=') | |
798 | { p++; | |
799 | t->value = TOKunord; // !<>= | |
800 | } | |
801 | else | |
802 | t->value = TOKue; // !<> | |
803 | } | |
804 | else if (*p == '=') | |
805 | { p++; | |
806 | t->value = TOKug; // !<= | |
807 | } | |
808 | else | |
809 | t->value = TOKuge; // !< | |
810 | } | |
811 | else if (*p == '>') | |
812 | { p++; | |
813 | if (*p == '=') | |
814 | { p++; | |
815 | t->value = TOKul; // !>= | |
816 | } | |
817 | else | |
818 | t->value = TOKule; // !> | |
819 | } | |
820 | else | |
821 | t->value = TOKnot; // ! | |
822 | return; | |
823 | ||
824 | case '=': | |
825 | p++; | |
826 | if (*p == '=') | |
827 | { p++; | |
828 | t->value = TOKequal; // == | |
829 | } | |
830 | else if (*p == '>') | |
831 | { p++; | |
832 | t->value = TOKgoesto; // => | |
833 | } | |
834 | else | |
835 | t->value = TOKassign; // = | |
836 | return; | |
837 | ||
838 | case '~': | |
839 | p++; | |
840 | if (*p == '=') | |
841 | { p++; | |
842 | t->value = TOKcatass; // ~= | |
843 | } | |
844 | else | |
845 | t->value = TOKtilde; // ~ | |
846 | return; | |
847 | ||
848 | case '^': | |
849 | p++; | |
850 | if (*p == '^') | |
851 | { p++; | |
852 | if (*p == '=') | |
853 | { p++; | |
854 | t->value = TOKpowass; // ^^= | |
855 | } | |
856 | else | |
857 | t->value = TOKpow; // ^^ | |
858 | } | |
859 | else if (*p == '=') | |
860 | { p++; | |
861 | t->value = TOKxorass; // ^= | |
862 | } | |
863 | else | |
864 | t->value = TOKxor; // ^ | |
865 | return; | |
866 | ||
867 | case '(': p++; t->value = TOKlparen; return; | |
868 | case ')': p++; t->value = TOKrparen; return; | |
869 | case '[': p++; t->value = TOKlbracket; return; | |
870 | case ']': p++; t->value = TOKrbracket; return; | |
871 | case '{': p++; t->value = TOKlcurly; return; | |
872 | case '}': p++; t->value = TOKrcurly; return; | |
873 | case '?': p++; t->value = TOKquestion; return; | |
874 | case ',': p++; t->value = TOKcomma; return; | |
875 | case ';': p++; t->value = TOKsemicolon; return; | |
876 | case ':': p++; t->value = TOKcolon; return; | |
877 | case '$': p++; t->value = TOKdollar; return; | |
878 | case '@': p++; t->value = TOKat; return; | |
879 | ||
880 | case '*': | |
881 | p++; | |
882 | if (*p == '=') | |
883 | { p++; | |
884 | t->value = TOKmulass; | |
885 | } | |
886 | else | |
887 | t->value = TOKmul; | |
888 | return; | |
889 | case '%': | |
890 | p++; | |
891 | if (*p == '=') | |
892 | { p++; | |
893 | t->value = TOKmodass; | |
894 | } | |
895 | else | |
896 | t->value = TOKmod; | |
897 | return; | |
898 | ||
899 | case '#': | |
900 | { | |
901 | p++; | |
902 | Token n; | |
903 | scan(&n); | |
904 | if (n.value == TOKidentifier && n.ident == Id::line) | |
905 | { | |
906 | poundLine(); | |
907 | continue; | |
908 | } | |
909 | else | |
910 | { | |
911 | t->value = TOKpound; | |
912 | return; | |
913 | } | |
914 | } | |
915 | ||
916 | default: | |
917 | { unsigned c = *p; | |
918 | ||
919 | if (c & 0x80) | |
920 | { c = decodeUTF(); | |
921 | ||
922 | // Check for start of unicode identifier | |
923 | if (isUniAlpha(c)) | |
924 | goto case_ident; | |
925 | ||
926 | if (c == PS || c == LS) | |
927 | { | |
928 | endOfLine(); | |
929 | p++; | |
930 | continue; | |
931 | } | |
932 | } | |
933 | if (c < 0x80 && isprint(c)) | |
934 | error("character '%c' is not a valid token", c); | |
935 | else | |
936 | error("character 0x%02x is not a valid token", c); | |
937 | p++; | |
938 | continue; | |
939 | } | |
940 | } | |
941 | } | |
942 | } | |
943 | ||
944 | /******************************************* | |
945 | * Parse escape sequence. | |
946 | */ | |
947 | ||
948 | unsigned Lexer::escapeSequence() | |
949 | { unsigned c = *p; | |
950 | ||
951 | int n; | |
952 | int ndigits; | |
953 | ||
954 | switch (c) | |
955 | { | |
956 | case '\'': | |
957 | case '"': | |
958 | case '?': | |
959 | case '\\': | |
960 | Lconsume: | |
961 | p++; | |
962 | break; | |
963 | ||
964 | case 'a': c = 7; goto Lconsume; | |
965 | case 'b': c = 8; goto Lconsume; | |
966 | case 'f': c = 12; goto Lconsume; | |
967 | case 'n': c = 10; goto Lconsume; | |
968 | case 'r': c = 13; goto Lconsume; | |
969 | case 't': c = 9; goto Lconsume; | |
970 | case 'v': c = 11; goto Lconsume; | |
971 | ||
972 | case 'u': | |
973 | ndigits = 4; | |
974 | goto Lhex; | |
975 | case 'U': | |
976 | ndigits = 8; | |
977 | goto Lhex; | |
978 | case 'x': | |
979 | ndigits = 2; | |
980 | Lhex: | |
981 | p++; | |
982 | c = *p; | |
983 | if (ishex((utf8_t)c)) | |
984 | { unsigned v; | |
985 | ||
986 | n = 0; | |
987 | v = 0; | |
988 | while (1) | |
989 | { | |
990 | if (isdigit((utf8_t)c)) | |
991 | c -= '0'; | |
992 | else if (islower(c)) | |
993 | c -= 'a' - 10; | |
994 | else | |
995 | c -= 'A' - 10; | |
996 | v = v * 16 + c; | |
997 | c = *++p; | |
998 | if (++n == ndigits) | |
999 | break; | |
1000 | if (!ishex((utf8_t)c)) | |
1001 | { error("escape hex sequence has %d hex digits instead of %d", n, ndigits); | |
1002 | break; | |
1003 | } | |
1004 | } | |
1005 | if (ndigits != 2 && !utf_isValidDchar(v)) | |
1006 | { error("invalid UTF character \\U%08x", v); | |
1007 | v = '?'; // recover with valid UTF character | |
1008 | } | |
1009 | c = v; | |
1010 | } | |
1011 | else | |
1012 | error("undefined escape hex sequence \\%c",c); | |
1013 | break; | |
1014 | ||
1015 | case '&': // named character entity | |
1016 | for (const utf8_t *idstart = ++p; 1; p++) | |
1017 | { | |
1018 | switch (*p) | |
1019 | { | |
1020 | case ';': | |
1021 | c = HtmlNamedEntity(idstart, p - idstart); | |
1022 | if (c == ~0U) | |
1023 | { error("unnamed character entity &%.*s;", (int)(p - idstart), idstart); | |
1024 | c = ' '; | |
1025 | } | |
1026 | p++; | |
1027 | break; | |
1028 | ||
1029 | default: | |
1030 | if (isalpha(*p) || | |
1031 | (p != idstart && isdigit(*p))) | |
1032 | continue; | |
1033 | error("unterminated named entity &%.*s;", (int)(p - idstart + 1), idstart); | |
1034 | break; | |
1035 | } | |
1036 | break; | |
1037 | } | |
1038 | break; | |
1039 | ||
1040 | case 0: | |
1041 | case 0x1A: // end of file | |
1042 | c = '\\'; | |
1043 | break; | |
1044 | ||
1045 | default: | |
1046 | if (isoctal((utf8_t)c)) | |
1047 | { unsigned v; | |
1048 | ||
1049 | n = 0; | |
1050 | v = 0; | |
1051 | do | |
1052 | { | |
1053 | v = v * 8 + (c - '0'); | |
1054 | c = *++p; | |
1055 | } while (++n < 3 && isoctal((utf8_t)c)); | |
1056 | c = v; | |
1057 | if (c > 0xFF) | |
1058 | error("escape octal sequence \\%03o is larger than \\377", c); | |
1059 | } | |
1060 | else | |
1061 | error("undefined escape sequence \\%c",c); | |
1062 | break; | |
1063 | } | |
1064 | return c; | |
1065 | } | |
1066 | ||
1067 | /************************************** | |
1068 | */ | |
1069 | ||
1070 | TOK Lexer::wysiwygStringConstant(Token *t, int tc) | |
1071 | { | |
1072 | int c; | |
1073 | Loc start = loc(); | |
1074 | ||
1075 | p++; | |
1076 | stringbuffer.reset(); | |
1077 | while (1) | |
1078 | { | |
1079 | c = *p++; | |
1080 | switch (c) | |
1081 | { | |
1082 | case '\n': | |
1083 | endOfLine(); | |
1084 | break; | |
1085 | ||
1086 | case '\r': | |
1087 | if (*p == '\n') | |
1088 | continue; // ignore | |
1089 | c = '\n'; // treat EndOfLine as \n character | |
1090 | endOfLine(); | |
1091 | break; | |
1092 | ||
1093 | case 0: | |
1094 | case 0x1A: | |
1095 | error("unterminated string constant starting at %s", start.toChars()); | |
1096 | t->ustring = (utf8_t *)const_cast<char *>(""); | |
1097 | t->len = 0; | |
1098 | t->postfix = 0; | |
1099 | return TOKstring; | |
1100 | ||
1101 | case '"': | |
1102 | case '`': | |
1103 | if (c == tc) | |
1104 | { | |
1105 | t->len = (unsigned)stringbuffer.offset; | |
1106 | stringbuffer.writeByte(0); | |
1107 | t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); | |
1108 | memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
1109 | stringPostfix(t); | |
1110 | return TOKstring; | |
1111 | } | |
1112 | break; | |
1113 | ||
1114 | default: | |
1115 | if (c & 0x80) | |
1116 | { p--; | |
1117 | unsigned u = decodeUTF(); | |
1118 | p++; | |
1119 | if (u == PS || u == LS) | |
1120 | endOfLine(); | |
1121 | stringbuffer.writeUTF8(u); | |
1122 | continue; | |
1123 | } | |
1124 | break; | |
1125 | } | |
1126 | stringbuffer.writeByte(c); | |
1127 | } | |
1128 | } | |
1129 | ||
1130 | /************************************** | |
1131 | * Lex hex strings: | |
1132 | * x"0A ae 34FE BD" | |
1133 | */ | |
1134 | ||
1135 | TOK Lexer::hexStringConstant(Token *t) | |
1136 | { | |
1137 | unsigned c; | |
1138 | Loc start = loc(); | |
1139 | unsigned n = 0; | |
1140 | unsigned v = ~0; // dead assignment, needed to suppress warning | |
1141 | ||
1142 | p++; | |
1143 | stringbuffer.reset(); | |
1144 | while (1) | |
1145 | { | |
1146 | c = *p++; | |
1147 | switch (c) | |
1148 | { | |
1149 | case ' ': | |
1150 | case '\t': | |
1151 | case '\v': | |
1152 | case '\f': | |
1153 | continue; // skip white space | |
1154 | ||
1155 | case '\r': | |
1156 | if (*p == '\n') | |
1157 | continue; // ignore | |
1158 | // Treat isolated '\r' as if it were a '\n' | |
1159 | /* fall through */ | |
1160 | case '\n': | |
1161 | endOfLine(); | |
1162 | continue; | |
1163 | ||
1164 | case 0: | |
1165 | case 0x1A: | |
1166 | error("unterminated string constant starting at %s", start.toChars()); | |
1167 | t->ustring = (utf8_t *)const_cast<char *>(""); | |
1168 | t->len = 0; | |
1169 | t->postfix = 0; | |
1170 | return TOKxstring; | |
1171 | ||
1172 | case '"': | |
1173 | if (n & 1) | |
1174 | { error("odd number (%d) of hex characters in hex string", n); | |
1175 | stringbuffer.writeByte(v); | |
1176 | } | |
1177 | t->len = (unsigned)stringbuffer.offset; | |
1178 | stringbuffer.writeByte(0); | |
1179 | t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); | |
1180 | memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
1181 | stringPostfix(t); | |
1182 | return TOKxstring; | |
1183 | ||
1184 | default: | |
1185 | if (c >= '0' && c <= '9') | |
1186 | c -= '0'; | |
1187 | else if (c >= 'a' && c <= 'f') | |
1188 | c -= 'a' - 10; | |
1189 | else if (c >= 'A' && c <= 'F') | |
1190 | c -= 'A' - 10; | |
1191 | else if (c & 0x80) | |
1192 | { p--; | |
1193 | unsigned u = decodeUTF(); | |
1194 | p++; | |
1195 | if (u == PS || u == LS) | |
1196 | endOfLine(); | |
1197 | else | |
1198 | error("non-hex character \\u%04x in hex string", u); | |
1199 | } | |
1200 | else | |
1201 | error("non-hex character '%c' in hex string", c); | |
1202 | if (n & 1) | |
1203 | { v = (v << 4) | c; | |
1204 | stringbuffer.writeByte(v); | |
1205 | } | |
1206 | else | |
1207 | v = c; | |
1208 | n++; | |
1209 | break; | |
1210 | } | |
1211 | } | |
1212 | } | |
1213 | ||
1214 | ||
1215 | /************************************** | |
1216 | * Lex delimited strings: | |
1217 | * q"(foo(xxx))" // "foo(xxx)" | |
1218 | * q"[foo(]" // "foo(" | |
1219 | * q"/foo]/" // "foo]" | |
1220 | * q"HERE | |
1221 | * foo | |
1222 | * HERE" // "foo\n" | |
1223 | * Input: | |
1224 | * p is on the " | |
1225 | */ | |
1226 | ||
1227 | TOK Lexer::delimitedStringConstant(Token *t) | |
1228 | { | |
1229 | unsigned c; | |
1230 | Loc start = loc(); | |
1231 | unsigned delimleft = 0; | |
1232 | unsigned delimright = 0; | |
1233 | unsigned nest = 1; | |
1234 | unsigned nestcount = ~0; // dead assignment, needed to suppress warning | |
1235 | Identifier *hereid = NULL; | |
1236 | unsigned blankrol = 0; | |
1237 | unsigned startline = 0; | |
1238 | ||
1239 | p++; | |
1240 | stringbuffer.reset(); | |
1241 | while (1) | |
1242 | { | |
1243 | c = *p++; | |
1244 | //printf("c = '%c'\n", c); | |
1245 | switch (c) | |
1246 | { | |
1247 | case '\n': | |
1248 | Lnextline: | |
1249 | endOfLine(); | |
1250 | startline = 1; | |
1251 | if (blankrol) | |
1252 | { blankrol = 0; | |
1253 | continue; | |
1254 | } | |
1255 | if (hereid) | |
1256 | { | |
1257 | stringbuffer.writeUTF8(c); | |
1258 | continue; | |
1259 | } | |
1260 | break; | |
1261 | ||
1262 | case '\r': | |
1263 | if (*p == '\n') | |
1264 | continue; // ignore | |
1265 | c = '\n'; // treat EndOfLine as \n character | |
1266 | goto Lnextline; | |
1267 | ||
1268 | case 0: | |
1269 | case 0x1A: | |
1270 | error("unterminated delimited string constant starting at %s", start.toChars()); | |
1271 | t->ustring = (utf8_t *)const_cast<char *>(""); | |
1272 | t->len = 0; | |
1273 | t->postfix = 0; | |
1274 | return TOKstring; | |
1275 | ||
1276 | default: | |
1277 | if (c & 0x80) | |
1278 | { p--; | |
1279 | c = decodeUTF(); | |
1280 | p++; | |
1281 | if (c == PS || c == LS) | |
1282 | goto Lnextline; | |
1283 | } | |
1284 | break; | |
1285 | } | |
1286 | if (delimleft == 0) | |
1287 | { delimleft = c; | |
1288 | nest = 1; | |
1289 | nestcount = 1; | |
1290 | if (c == '(') | |
1291 | delimright = ')'; | |
1292 | else if (c == '{') | |
1293 | delimright = '}'; | |
1294 | else if (c == '[') | |
1295 | delimright = ']'; | |
1296 | else if (c == '<') | |
1297 | delimright = '>'; | |
1298 | else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) | |
1299 | { // Start of identifier; must be a heredoc | |
1300 | Token tok; | |
1301 | p--; | |
1302 | scan(&tok); // read in heredoc identifier | |
1303 | if (tok.value != TOKidentifier) | |
1304 | { error("identifier expected for heredoc, not %s", tok.toChars()); | |
1305 | delimright = c; | |
1306 | } | |
1307 | else | |
1308 | { hereid = tok.ident; | |
1309 | //printf("hereid = '%s'\n", hereid->toChars()); | |
1310 | blankrol = 1; | |
1311 | } | |
1312 | nest = 0; | |
1313 | } | |
1314 | else | |
1315 | { delimright = c; | |
1316 | nest = 0; | |
1317 | if (isspace(c)) | |
1318 | error("delimiter cannot be whitespace"); | |
1319 | } | |
1320 | } | |
1321 | else | |
1322 | { | |
1323 | if (blankrol) | |
1324 | { error("heredoc rest of line should be blank"); | |
1325 | blankrol = 0; | |
1326 | continue; | |
1327 | } | |
1328 | if (nest == 1) | |
1329 | { | |
1330 | if (c == delimleft) | |
1331 | nestcount++; | |
1332 | else if (c == delimright) | |
1333 | { nestcount--; | |
1334 | if (nestcount == 0) | |
1335 | goto Ldone; | |
1336 | } | |
1337 | } | |
1338 | else if (c == delimright) | |
1339 | goto Ldone; | |
1340 | if (startline && isalpha(c) && hereid) | |
1341 | { Token tok; | |
1342 | const utf8_t *psave = p; | |
1343 | p--; | |
1344 | scan(&tok); // read in possible heredoc identifier | |
1345 | //printf("endid = '%s'\n", tok.ident->toChars()); | |
1346 | if (tok.value == TOKidentifier && tok.ident->equals(hereid)) | |
1347 | { /* should check that rest of line is blank | |
1348 | */ | |
1349 | goto Ldone; | |
1350 | } | |
1351 | p = psave; | |
1352 | } | |
1353 | stringbuffer.writeUTF8(c); | |
1354 | startline = 0; | |
1355 | } | |
1356 | } | |
1357 | ||
1358 | Ldone: | |
1359 | if (*p == '"') | |
1360 | p++; | |
1361 | else if (hereid) | |
1362 | error("delimited string must end in %s\"", hereid->toChars()); | |
1363 | else | |
1364 | error("delimited string must end in %c\"", delimright); | |
1365 | t->len = (unsigned)stringbuffer.offset; | |
1366 | stringbuffer.writeByte(0); | |
1367 | t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); | |
1368 | memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
1369 | stringPostfix(t); | |
1370 | return TOKstring; | |
1371 | } | |
1372 | ||
1373 | /************************************** | |
1374 | * Lex delimited strings: | |
1375 | * q{ foo(xxx) } // " foo(xxx) " | |
1376 | * q{foo(} // "foo(" | |
1377 | * q{{foo}"}"} // "{foo}"}"" | |
1378 | * Input: | |
1379 | * p is on the q | |
1380 | */ | |
1381 | ||
1382 | TOK Lexer::tokenStringConstant(Token *t) | |
1383 | { | |
1384 | unsigned nest = 1; | |
1385 | Loc start = loc(); | |
1386 | const utf8_t *pstart = ++p; | |
1387 | ||
1388 | while (1) | |
1389 | { Token tok; | |
1390 | ||
1391 | scan(&tok); | |
1392 | switch (tok.value) | |
1393 | { | |
1394 | case TOKlcurly: | |
1395 | nest++; | |
1396 | continue; | |
1397 | ||
1398 | case TOKrcurly: | |
1399 | if (--nest == 0) | |
1400 | { | |
1401 | t->len = (unsigned)(p - 1 - pstart); | |
1402 | t->ustring = (utf8_t *)mem.xmalloc(t->len + 1); | |
1403 | memcpy(t->ustring, pstart, t->len); | |
1404 | t->ustring[t->len] = 0; | |
1405 | stringPostfix(t); | |
1406 | return TOKstring; | |
1407 | } | |
1408 | continue; | |
1409 | ||
1410 | case TOKeof: | |
1411 | error("unterminated token string constant starting at %s", start.toChars()); | |
1412 | t->ustring = (utf8_t *)const_cast<char *>(""); | |
1413 | t->len = 0; | |
1414 | t->postfix = 0; | |
1415 | return TOKstring; | |
1416 | ||
1417 | default: | |
1418 | continue; | |
1419 | } | |
1420 | } | |
1421 | } | |
1422 | ||
1423 | ||
1424 | ||
1425 | /************************************** | |
1426 | */ | |
1427 | ||
1428 | TOK Lexer::escapeStringConstant(Token *t) | |
1429 | { | |
1430 | unsigned c; | |
1431 | Loc start = loc(); | |
1432 | ||
1433 | p++; | |
1434 | stringbuffer.reset(); | |
1435 | while (1) | |
1436 | { | |
1437 | c = *p++; | |
1438 | switch (c) | |
1439 | { | |
1440 | case '\\': | |
1441 | switch (*p) | |
1442 | { | |
1443 | case 'u': | |
1444 | case 'U': | |
1445 | case '&': | |
1446 | c = escapeSequence(); | |
1447 | stringbuffer.writeUTF8(c); | |
1448 | continue; | |
1449 | ||
1450 | default: | |
1451 | c = escapeSequence(); | |
1452 | break; | |
1453 | } | |
1454 | break; | |
1455 | case '\n': | |
1456 | endOfLine(); | |
1457 | break; | |
1458 | ||
1459 | case '\r': | |
1460 | if (*p == '\n') | |
1461 | continue; // ignore | |
1462 | c = '\n'; // treat EndOfLine as \n character | |
1463 | endOfLine(); | |
1464 | break; | |
1465 | ||
1466 | case '"': | |
1467 | t->len = (unsigned)stringbuffer.offset; | |
1468 | stringbuffer.writeByte(0); | |
1469 | t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); | |
1470 | memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
1471 | stringPostfix(t); | |
1472 | return TOKstring; | |
1473 | ||
1474 | case 0: | |
1475 | case 0x1A: | |
1476 | p--; | |
1477 | error("unterminated string constant starting at %s", start.toChars()); | |
1478 | t->ustring = (utf8_t *)const_cast<char *>(""); | |
1479 | t->len = 0; | |
1480 | t->postfix = 0; | |
1481 | return TOKstring; | |
1482 | ||
1483 | default: | |
1484 | if (c & 0x80) | |
1485 | { | |
1486 | p--; | |
1487 | c = decodeUTF(); | |
1488 | if (c == LS || c == PS) | |
1489 | { c = '\n'; | |
1490 | endOfLine(); | |
1491 | } | |
1492 | p++; | |
1493 | stringbuffer.writeUTF8(c); | |
1494 | continue; | |
1495 | } | |
1496 | break; | |
1497 | } | |
1498 | stringbuffer.writeByte(c); | |
1499 | } | |
1500 | } | |
1501 | ||
1502 | /************************************** | |
1503 | */ | |
1504 | ||
1505 | TOK Lexer::charConstant(Token *t) | |
1506 | { | |
1507 | unsigned c; | |
1508 | TOK tk = TOKcharv; | |
1509 | ||
1510 | //printf("Lexer::charConstant\n"); | |
1511 | p++; | |
1512 | c = *p++; | |
1513 | switch (c) | |
1514 | { | |
1515 | case '\\': | |
1516 | switch (*p) | |
1517 | { | |
1518 | case 'u': | |
1519 | t->uns64value = escapeSequence(); | |
1520 | tk = TOKwcharv; | |
1521 | break; | |
1522 | ||
1523 | case 'U': | |
1524 | case '&': | |
1525 | t->uns64value = escapeSequence(); | |
1526 | tk = TOKdcharv; | |
1527 | break; | |
1528 | ||
1529 | default: | |
1530 | t->uns64value = escapeSequence(); | |
1531 | break; | |
1532 | } | |
1533 | break; | |
1534 | case '\n': | |
1535 | L1: | |
1536 | endOfLine(); | |
1537 | /* fall through */ | |
1538 | case '\r': | |
1539 | case 0: | |
1540 | case 0x1A: | |
1541 | case '\'': | |
1542 | error("unterminated character constant"); | |
1543 | t->uns64value = '?'; | |
1544 | return tk; | |
1545 | ||
1546 | default: | |
1547 | if (c & 0x80) | |
1548 | { | |
1549 | p--; | |
1550 | c = decodeUTF(); | |
1551 | p++; | |
1552 | if (c == LS || c == PS) | |
1553 | goto L1; | |
1554 | if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE)) | |
1555 | tk = TOKwcharv; | |
1556 | else | |
1557 | tk = TOKdcharv; | |
1558 | } | |
1559 | t->uns64value = c; | |
1560 | break; | |
1561 | } | |
1562 | ||
1563 | if (*p != '\'') | |
1564 | { | |
1565 | error("unterminated character constant"); | |
1566 | t->uns64value = '?'; | |
1567 | return tk; | |
1568 | } | |
1569 | p++; | |
1570 | return tk; | |
1571 | } | |
1572 | ||
1573 | /*************************************** | |
1574 | * Get postfix of string literal. | |
1575 | */ | |
1576 | ||
1577 | void Lexer::stringPostfix(Token *t) | |
1578 | { | |
1579 | switch (*p) | |
1580 | { | |
1581 | case 'c': | |
1582 | case 'w': | |
1583 | case 'd': | |
1584 | t->postfix = *p; | |
1585 | p++; | |
1586 | break; | |
1587 | ||
1588 | default: | |
1589 | t->postfix = 0; | |
1590 | break; | |
1591 | } | |
1592 | } | |
1593 | ||
1594 | /************************************** | |
1595 | * Read in a number. | |
1596 | * If it's an integer, store it in tok.TKutok.Vlong. | |
1597 | * integers can be decimal, octal or hex | |
1598 | * Handle the suffixes U, UL, LU, L, etc. | |
1599 | * If it's double, store it in tok.TKutok.Vdouble. | |
1600 | * Returns: | |
1601 | * TKnum | |
1602 | * TKdouble,... | |
1603 | */ | |
1604 | ||
1605 | TOK Lexer::number(Token *t) | |
1606 | { | |
1607 | int base = 10; | |
1608 | const utf8_t *start = p; | |
1609 | unsigned c; | |
1610 | uinteger_t n = 0; // unsigned >=64 bit integer type | |
1611 | int d; | |
1612 | bool err = false; | |
1613 | bool overflow = false; | |
1614 | ||
1615 | c = *p; | |
1616 | if (c == '0') | |
1617 | { | |
1618 | ++p; | |
1619 | c = *p; | |
1620 | switch (c) | |
1621 | { | |
1622 | case '0': case '1': case '2': case '3': | |
1623 | case '4': case '5': case '6': case '7': | |
1624 | n = c - '0'; | |
1625 | ++p; | |
1626 | base = 8; | |
1627 | break; | |
1628 | ||
1629 | case 'x': | |
1630 | case 'X': | |
1631 | ++p; | |
1632 | base = 16; | |
1633 | break; | |
1634 | ||
1635 | case 'b': | |
1636 | case 'B': | |
1637 | ++p; | |
1638 | base = 2; | |
1639 | break; | |
1640 | ||
1641 | case '.': | |
1642 | if (p[1] == '.') | |
1643 | goto Ldone; // if ".." | |
1644 | if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80) | |
1645 | goto Ldone; // if ".identifier" or ".unicode" | |
1646 | goto Lreal; // '.' is part of current token | |
1647 | ||
1648 | case 'i': | |
1649 | case 'f': | |
1650 | case 'F': | |
1651 | goto Lreal; | |
1652 | ||
1653 | case '_': | |
1654 | ++p; | |
1655 | base = 8; | |
1656 | break; | |
1657 | ||
1658 | case 'L': | |
1659 | if (p[1] == 'i') | |
1660 | goto Lreal; | |
1661 | break; | |
1662 | ||
1663 | default: | |
1664 | break; | |
1665 | } | |
1666 | } | |
1667 | ||
1668 | while (1) | |
1669 | { | |
1670 | c = *p; | |
1671 | switch (c) | |
1672 | { | |
1673 | case '0': case '1': | |
1674 | ++p; | |
1675 | d = c - '0'; | |
1676 | break; | |
1677 | ||
1678 | case '2': case '3': | |
1679 | case '4': case '5': case '6': case '7': | |
1680 | if (base == 2 && !err) | |
1681 | { | |
1682 | error("binary digit expected"); | |
1683 | err = true; | |
1684 | } | |
1685 | ++p; | |
1686 | d = c - '0'; | |
1687 | break; | |
1688 | ||
1689 | case '8': case '9': | |
1690 | ++p; | |
1691 | if (base < 10 && !err) | |
1692 | { | |
1693 | error("radix %d digit expected, not '%c'", base, c); | |
1694 | err = true; | |
1695 | } | |
1696 | d = c - '0'; | |
1697 | break; | |
1698 | ||
1699 | case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': | |
1700 | case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': | |
1701 | ++p; | |
1702 | if (base != 16) | |
1703 | { | |
1704 | if (c == 'e' || c == 'E' || c == 'f' || c == 'F') | |
1705 | goto Lreal; | |
1706 | if (!err) | |
1707 | { | |
1708 | error("radix %d digit expected, not '%c'", base, c); | |
1709 | err = true; | |
1710 | } | |
1711 | } | |
1712 | if (c >= 'a') | |
1713 | d = c + 10 - 'a'; | |
1714 | else | |
1715 | d = c + 10 - 'A'; | |
1716 | break; | |
1717 | ||
1718 | case 'L': | |
1719 | if (p[1] == 'i') | |
1720 | goto Lreal; | |
1721 | goto Ldone; | |
1722 | ||
1723 | case '.': | |
1724 | if (p[1] == '.') | |
1725 | goto Ldone; // if ".." | |
1726 | if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)) | |
1727 | goto Ldone; // if ".identifier" or ".unicode" | |
1728 | goto Lreal; // otherwise as part of a floating point literal | |
1729 | ||
1730 | case 'p': | |
1731 | case 'P': | |
1732 | case 'i': | |
1733 | Lreal: | |
1734 | p = start; | |
1735 | return inreal(t); | |
1736 | ||
1737 | case '_': | |
1738 | ++p; | |
1739 | continue; | |
1740 | ||
1741 | default: | |
1742 | goto Ldone; | |
1743 | } | |
1744 | ||
1745 | uinteger_t n2 = n * base; | |
1746 | if ((n2 / base != n || n2 + d < n)) | |
1747 | { | |
1748 | overflow = true; | |
1749 | } | |
1750 | n = n2 + d; | |
1751 | ||
1752 | // if n needs more than 64 bits | |
1753 | if (sizeof(n) > 8 && | |
1754 | n > 0xFFFFFFFFFFFFFFFFULL) | |
1755 | { | |
1756 | overflow = true; | |
1757 | } | |
1758 | } | |
1759 | ||
1760 | Ldone: | |
1761 | ||
1762 | if (overflow && !err) | |
1763 | { | |
1764 | error("integer overflow"); | |
1765 | err = true; | |
1766 | } | |
1767 | ||
1768 | enum FLAGS | |
1769 | { | |
1770 | FLAGS_none = 0, | |
1771 | FLAGS_decimal = 1, // decimal | |
1772 | FLAGS_unsigned = 2, // u or U suffix | |
1773 | FLAGS_long = 4, // L suffix | |
1774 | }; | |
1775 | ||
1776 | unsigned flags = (base == 10) ? FLAGS_decimal : FLAGS_none; | |
1777 | ||
1778 | // Parse trailing 'u', 'U', 'l' or 'L' in any combination | |
1779 | const utf8_t *psuffix = p; | |
1780 | while (1) | |
1781 | { | |
1782 | utf8_t f; | |
1783 | switch (*p) | |
1784 | { | |
1785 | case 'U': | |
1786 | case 'u': | |
1787 | f = FLAGS_unsigned; | |
1788 | goto L1; | |
1789 | ||
1790 | case 'l': | |
1791 | f = FLAGS_long; | |
1792 | error("lower case integer suffix 'l' is not allowed. Please use 'L' instead"); | |
1793 | goto L1; | |
1794 | ||
1795 | case 'L': | |
1796 | f = FLAGS_long; | |
1797 | L1: | |
1798 | p++; | |
1799 | if ((flags & f) && !err) | |
1800 | { | |
1801 | error("unrecognized token"); | |
1802 | err = true; | |
1803 | } | |
1804 | flags = (FLAGS) (flags | f); | |
1805 | continue; | |
1806 | default: | |
1807 | break; | |
1808 | } | |
1809 | break; | |
1810 | } | |
1811 | ||
1812 | if (base == 8 && n >= 8) | |
1813 | error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead", | |
1814 | n, p - psuffix, psuffix, n, p - psuffix, psuffix); | |
1815 | ||
1816 | TOK result; | |
1817 | switch (flags) | |
1818 | { | |
1819 | case FLAGS_none: | |
1820 | /* Octal or Hexadecimal constant. | |
1821 | * First that fits: int, uint, long, ulong | |
1822 | */ | |
1823 | if (n & 0x8000000000000000LL) | |
1824 | result = TOKuns64v; | |
1825 | else if (n & 0xFFFFFFFF00000000LL) | |
1826 | result = TOKint64v; | |
1827 | else if (n & 0x80000000) | |
1828 | result = TOKuns32v; | |
1829 | else | |
1830 | result = TOKint32v; | |
1831 | break; | |
1832 | ||
1833 | case FLAGS_decimal: | |
1834 | /* First that fits: int, long, long long | |
1835 | */ | |
1836 | if (n & 0x8000000000000000LL) | |
1837 | { | |
1838 | if (!err) | |
1839 | { | |
1840 | error("signed integer overflow"); | |
1841 | err = true; | |
1842 | } | |
1843 | result = TOKuns64v; | |
1844 | } | |
1845 | else if (n & 0xFFFFFFFF80000000LL) | |
1846 | result = TOKint64v; | |
1847 | else | |
1848 | result = TOKint32v; | |
1849 | break; | |
1850 | ||
1851 | case FLAGS_unsigned: | |
1852 | case FLAGS_decimal | FLAGS_unsigned: | |
1853 | /* First that fits: uint, ulong | |
1854 | */ | |
1855 | if (n & 0xFFFFFFFF00000000LL) | |
1856 | result = TOKuns64v; | |
1857 | else | |
1858 | result = TOKuns32v; | |
1859 | break; | |
1860 | ||
1861 | case FLAGS_decimal | FLAGS_long: | |
1862 | if (n & 0x8000000000000000LL) | |
1863 | { | |
1864 | if (!err) | |
1865 | { | |
1866 | error("signed integer overflow"); | |
1867 | err = true; | |
1868 | } | |
1869 | result = TOKuns64v; | |
1870 | } | |
1871 | else | |
1872 | result = TOKint64v; | |
1873 | break; | |
1874 | ||
1875 | case FLAGS_long: | |
1876 | if (n & 0x8000000000000000LL) | |
1877 | result = TOKuns64v; | |
1878 | else | |
1879 | result = TOKint64v; | |
1880 | break; | |
1881 | ||
1882 | case FLAGS_unsigned | FLAGS_long: | |
1883 | case FLAGS_decimal | FLAGS_unsigned | FLAGS_long: | |
1884 | result = TOKuns64v; | |
1885 | break; | |
1886 | ||
1887 | default: | |
1888 | assert(0); | |
1889 | } | |
1890 | t->uns64value = n; | |
1891 | return result; | |
1892 | } | |
1893 | ||
1894 | /************************************** | |
1895 | * Read in characters, converting them to real. | |
1896 | * Bugs: | |
1897 | * Exponent overflow not detected. | |
1898 | * Too much requested precision is not detected. | |
1899 | */ | |
1900 | ||
1901 | TOK Lexer::inreal(Token *t) | |
1902 | { | |
1903 | //printf("Lexer::inreal()\n"); | |
1904 | bool isWellformedString = true; | |
1905 | stringbuffer.reset(); | |
1906 | const utf8_t *pstart = p; | |
1907 | char hex = 0; | |
1908 | unsigned c = *p++; | |
1909 | ||
1910 | // Leading '0x' | |
1911 | if (c == '0') | |
1912 | { | |
1913 | c = *p++; | |
1914 | if (c == 'x' || c == 'X') | |
1915 | { | |
1916 | hex = true; | |
1917 | c = *p++; | |
1918 | } | |
1919 | } | |
1920 | ||
1921 | // Digits to left of '.' | |
1922 | while (1) | |
1923 | { | |
1924 | if (c == '.') | |
1925 | { | |
1926 | c = *p++; | |
1927 | break; | |
1928 | } | |
1929 | if (isdigit(c) || (hex && isxdigit(c)) || c == '_') | |
1930 | { | |
1931 | c = *p++; | |
1932 | continue; | |
1933 | } | |
1934 | break; | |
1935 | } | |
1936 | ||
1937 | // Digits to right of '.' | |
1938 | while (1) | |
1939 | { | |
1940 | if (isdigit(c) || (hex && isxdigit(c)) || c == '_') | |
1941 | { | |
1942 | c = *p++; | |
1943 | continue; | |
1944 | } | |
1945 | break; | |
1946 | } | |
1947 | ||
1948 | if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P'))) | |
1949 | { | |
1950 | c = *p++; | |
1951 | if (c == '-' || c == '+') | |
1952 | { | |
1953 | c = *p++; | |
1954 | } | |
1955 | bool anyexp = false; | |
1956 | while (1) | |
1957 | { | |
1958 | if (isdigit(c)) | |
1959 | { | |
1960 | anyexp = true; | |
1961 | c = *p++; | |
1962 | continue; | |
1963 | } | |
1964 | if (c == '_') | |
1965 | { | |
1966 | c = *p++; | |
1967 | continue; | |
1968 | } | |
1969 | if (!anyexp) | |
1970 | { | |
1971 | error("missing exponent"); | |
1972 | isWellformedString = false; | |
1973 | } | |
1974 | break; | |
1975 | } | |
1976 | } | |
1977 | else if (hex) | |
1978 | { | |
1979 | error("exponent required for hex float"); | |
1980 | isWellformedString = false; | |
1981 | } | |
1982 | --p; | |
1983 | while (pstart < p) | |
1984 | { | |
1985 | if (*pstart != '_') | |
1986 | stringbuffer.writeByte(*pstart); | |
1987 | ++pstart; | |
1988 | } | |
1989 | ||
1990 | stringbuffer.writeByte(0); | |
1991 | const char *sbufptr = (char *)stringbuffer.data; | |
1992 | TOK result; | |
1993 | bool isOutOfRange = false; | |
1994 | t->floatvalue = (isWellformedString ? CTFloat::parse(sbufptr, &isOutOfRange) : CTFloat::zero); | |
1995 | errno = 0; | |
1996 | switch (*p) | |
1997 | { | |
1998 | case 'F': | |
1999 | case 'f': | |
2000 | if (isWellformedString && !isOutOfRange) | |
2001 | isOutOfRange = Port::isFloat32LiteralOutOfRange(sbufptr); | |
2002 | result = TOKfloat32v; | |
2003 | p++; | |
2004 | break; | |
2005 | ||
2006 | default: | |
2007 | if (isWellformedString && !isOutOfRange) | |
2008 | isOutOfRange = Port::isFloat64LiteralOutOfRange(sbufptr); | |
2009 | result = TOKfloat64v; | |
2010 | break; | |
2011 | ||
2012 | case 'l': | |
2013 | error("use 'L' suffix instead of 'l'"); | |
2014 | /* fall through */ | |
2015 | case 'L': | |
2016 | result = TOKfloat80v; | |
2017 | p++; | |
2018 | break; | |
2019 | } | |
2020 | if (*p == 'i' || *p == 'I') | |
2021 | { | |
2022 | if (*p == 'I') | |
2023 | error("use 'i' suffix instead of 'I'"); | |
2024 | p++; | |
2025 | switch (result) | |
2026 | { | |
2027 | case TOKfloat32v: | |
2028 | result = TOKimaginary32v; | |
2029 | break; | |
2030 | case TOKfloat64v: | |
2031 | result = TOKimaginary64v; | |
2032 | break; | |
2033 | case TOKfloat80v: | |
2034 | result = TOKimaginary80v; | |
2035 | break; | |
2036 | default: break; | |
2037 | } | |
2038 | } | |
2039 | const bool isLong = (result == TOKfloat80v || result == TOKimaginary80v); | |
2040 | if (isOutOfRange && !isLong) | |
2041 | { | |
2042 | const char *suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : ""; | |
2043 | error(scanloc, "number '%s%s' is not representable", (char *)stringbuffer.data, suffix); | |
2044 | } | |
2045 | return result; | |
2046 | } | |
2047 | ||
2048 | /********************************************* | |
2049 | * parse: | |
2050 | * #line linnum [filespec] | |
2051 | * also allow __LINE__ for linnum, and __FILE__ for filespec | |
2052 | */ | |
2053 | ||
2054 | void Lexer::poundLine() | |
2055 | { | |
2056 | Token tok; | |
2057 | int linnum = this->scanloc.linnum; | |
2058 | char *filespec = NULL; | |
2059 | Loc loc = this->loc(); | |
2060 | ||
2061 | scan(&tok); | |
2062 | if (tok.value == TOKint32v || tok.value == TOKint64v) | |
2063 | { | |
2064 | int lin = (int)(tok.uns64value - 1); | |
2065 | if ((unsigned)lin != tok.uns64value - 1) | |
2066 | error("line number %lld out of range", (unsigned long long)tok.uns64value); | |
2067 | else | |
2068 | linnum = lin; | |
2069 | } | |
2070 | else if (tok.value == TOKline) | |
2071 | { | |
2072 | } | |
2073 | else | |
2074 | goto Lerr; | |
2075 | ||
2076 | while (1) | |
2077 | { | |
2078 | switch (*p) | |
2079 | { | |
2080 | case 0: | |
2081 | case 0x1A: | |
2082 | case '\n': | |
2083 | Lnewline: | |
2084 | this->scanloc.linnum = linnum; | |
2085 | if (filespec) | |
2086 | this->scanloc.filename = filespec; | |
2087 | return; | |
2088 | ||
2089 | case '\r': | |
2090 | p++; | |
2091 | if (*p != '\n') | |
2092 | { p--; | |
2093 | goto Lnewline; | |
2094 | } | |
2095 | continue; | |
2096 | ||
2097 | case ' ': | |
2098 | case '\t': | |
2099 | case '\v': | |
2100 | case '\f': | |
2101 | p++; | |
2102 | continue; // skip white space | |
2103 | ||
2104 | case '_': | |
2105 | if (memcmp(p, "__FILE__", 8) == 0) | |
2106 | { | |
2107 | p += 8; | |
2108 | filespec = mem.xstrdup(scanloc.filename); | |
2109 | continue; | |
2110 | } | |
2111 | goto Lerr; | |
2112 | ||
2113 | case '"': | |
2114 | if (filespec) | |
2115 | goto Lerr; | |
2116 | stringbuffer.reset(); | |
2117 | p++; | |
2118 | while (1) | |
2119 | { unsigned c; | |
2120 | ||
2121 | c = *p; | |
2122 | switch (c) | |
2123 | { | |
2124 | case '\n': | |
2125 | case '\r': | |
2126 | case 0: | |
2127 | case 0x1A: | |
2128 | goto Lerr; | |
2129 | ||
2130 | case '"': | |
2131 | stringbuffer.writeByte(0); | |
2132 | filespec = mem.xstrdup((char *)stringbuffer.data); | |
2133 | p++; | |
2134 | break; | |
2135 | ||
2136 | default: | |
2137 | if (c & 0x80) | |
2138 | { unsigned u = decodeUTF(); | |
2139 | if (u == PS || u == LS) | |
2140 | goto Lerr; | |
2141 | } | |
2142 | stringbuffer.writeByte(c); | |
2143 | p++; | |
2144 | continue; | |
2145 | } | |
2146 | break; | |
2147 | } | |
2148 | continue; | |
2149 | ||
2150 | default: | |
2151 | if (*p & 0x80) | |
2152 | { unsigned u = decodeUTF(); | |
2153 | if (u == PS || u == LS) | |
2154 | goto Lnewline; | |
2155 | } | |
2156 | goto Lerr; | |
2157 | } | |
2158 | } | |
2159 | ||
2160 | Lerr: | |
2161 | error(loc, "#line integer [\"filespec\"]\\n expected"); | |
2162 | } | |
2163 | ||
2164 | ||
2165 | /******************************************** | |
2166 | * Decode UTF character. | |
2167 | * Issue error messages for invalid sequences. | |
2168 | * Return decoded character, advance p to last character in UTF sequence. | |
2169 | */ | |
2170 | ||
2171 | unsigned Lexer::decodeUTF() | |
2172 | { | |
2173 | dchar_t u; | |
2174 | utf8_t c; | |
2175 | const utf8_t *s = p; | |
2176 | size_t len; | |
2177 | size_t idx; | |
2178 | const char *msg; | |
2179 | ||
2180 | c = *s; | |
2181 | assert(c & 0x80); | |
2182 | ||
2183 | // Check length of remaining string up to 6 UTF-8 characters | |
2184 | for (len = 1; len < 6 && s[len]; len++) | |
2185 | ; | |
2186 | ||
2187 | idx = 0; | |
2188 | msg = utf_decodeChar(s, len, &idx, &u); | |
2189 | p += idx - 1; | |
2190 | if (msg) | |
2191 | { | |
2192 | error("%s", msg); | |
2193 | } | |
2194 | return u; | |
2195 | } | |
2196 | ||
2197 | ||
2198 | /*************************************************** | |
2199 | * Parse doc comment embedded between t->ptr and p. | |
2200 | * Remove trailing blanks and tabs from lines. | |
2201 | * Replace all newlines with \n. | |
2202 | * Remove leading comment character from each line. | |
2203 | * Decide if it's a lineComment or a blockComment. | |
2204 | * Append to previous one for this token. | |
2205 | */ | |
2206 | ||
2207 | void Lexer::getDocComment(Token *t, unsigned lineComment) | |
2208 | { | |
2209 | /* ct tells us which kind of comment it is: '/', '*', or '+' | |
2210 | */ | |
2211 | utf8_t ct = t->ptr[2]; | |
2212 | ||
2213 | /* Start of comment text skips over / * *, / + +, or / / / | |
2214 | */ | |
2215 | const utf8_t *q = t->ptr + 3; // start of comment text | |
2216 | ||
2217 | const utf8_t *qend = p; | |
2218 | if (ct == '*' || ct == '+') | |
2219 | qend -= 2; | |
2220 | ||
2221 | /* Scan over initial row of ****'s or ++++'s or ////'s | |
2222 | */ | |
2223 | for (; q < qend; q++) | |
2224 | { | |
2225 | if (*q != ct) | |
2226 | break; | |
2227 | } | |
2228 | ||
2229 | /* Remove leading spaces until start of the comment | |
2230 | */ | |
2231 | int linestart = 0; | |
2232 | if (ct == '/') | |
2233 | { | |
2234 | while (q < qend && (*q == ' ' || *q == '\t')) | |
2235 | ++q; | |
2236 | } | |
2237 | else if (q < qend) | |
2238 | { | |
2239 | if (*q == '\r') | |
2240 | { | |
2241 | ++q; | |
2242 | if (q < qend && *q == '\n') | |
2243 | ++q; | |
2244 | linestart = 1; | |
2245 | } | |
2246 | else if (*q == '\n') | |
2247 | { | |
2248 | ++q; | |
2249 | linestart = 1; | |
2250 | } | |
2251 | } | |
2252 | ||
2253 | /* Remove trailing row of ****'s or ++++'s | |
2254 | */ | |
2255 | if (ct != '/') | |
2256 | { | |
2257 | for (; q < qend; qend--) | |
2258 | { | |
2259 | if (qend[-1] != ct) | |
2260 | break; | |
2261 | } | |
2262 | } | |
2263 | ||
2264 | /* Comment is now [q .. qend]. | |
2265 | * Canonicalize it into buf[]. | |
2266 | */ | |
2267 | OutBuffer buf; | |
2268 | ||
2269 | for (; q < qend; q++) | |
2270 | { | |
2271 | utf8_t c = *q; | |
2272 | ||
2273 | switch (c) | |
2274 | { | |
2275 | case '*': | |
2276 | case '+': | |
2277 | if (linestart && c == ct) | |
2278 | { linestart = 0; | |
2279 | /* Trim preceding whitespace up to preceding \n | |
2280 | */ | |
2281 | while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) | |
2282 | buf.offset--; | |
2283 | continue; | |
2284 | } | |
2285 | break; | |
2286 | ||
2287 | case ' ': | |
2288 | case '\t': | |
2289 | break; | |
2290 | ||
2291 | case '\r': | |
2292 | if (q[1] == '\n') | |
2293 | continue; // skip the \r | |
2294 | goto Lnewline; | |
2295 | ||
2296 | default: | |
2297 | if (c == 226) | |
2298 | { | |
2299 | // If LS or PS | |
2300 | if (q[1] == 128 && | |
2301 | (q[2] == 168 || q[2] == 169)) | |
2302 | { | |
2303 | q += 2; | |
2304 | goto Lnewline; | |
2305 | } | |
2306 | } | |
2307 | linestart = 0; | |
2308 | break; | |
2309 | ||
2310 | Lnewline: | |
2311 | c = '\n'; // replace all newlines with \n | |
2312 | /* fall through */ | |
2313 | case '\n': | |
2314 | linestart = 1; | |
2315 | ||
2316 | /* Trim trailing whitespace | |
2317 | */ | |
2318 | while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) | |
2319 | buf.offset--; | |
2320 | ||
2321 | break; | |
2322 | } | |
2323 | buf.writeByte(c); | |
2324 | } | |
2325 | ||
2326 | /* Trim trailing whitespace (if the last line does not have newline) | |
2327 | */ | |
2328 | if (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) | |
2329 | { | |
2330 | while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) | |
2331 | buf.offset--; | |
2332 | } | |
2333 | ||
2334 | // Always end with a newline | |
2335 | if (!buf.offset || buf.data[buf.offset - 1] != '\n') | |
2336 | buf.writeByte('\n'); | |
2337 | ||
2338 | buf.writeByte(0); | |
2339 | ||
2340 | // It's a line comment if the start of the doc comment comes | |
2341 | // after other non-whitespace on the same line. | |
2342 | const utf8_t** dc = (lineComment && anyToken) | |
2343 | ? &t->lineComment | |
2344 | : &t->blockComment; | |
2345 | ||
2346 | // Combine with previous doc comment, if any | |
2347 | if (*dc) | |
2348 | *dc = combineComments(*dc, (utf8_t *)buf.data); | |
2349 | else | |
2350 | *dc = (utf8_t *)buf.extractData(); | |
2351 | } | |
2352 | ||
2353 | /******************************************** | |
2354 | * Combine two document comments into one, | |
2355 | * separated by a newline. | |
2356 | */ | |
2357 | ||
2358 | const utf8_t *Lexer::combineComments(const utf8_t *c1, const utf8_t *c2) | |
2359 | { | |
2360 | //printf("Lexer::combineComments('%s', '%s')\n", c1, c2); | |
2361 | ||
2362 | const utf8_t *c = c2; | |
2363 | ||
2364 | if (c1) | |
2365 | { | |
2366 | c = c1; | |
2367 | if (c2) | |
2368 | { | |
2369 | size_t len1 = strlen((const char *)c1); | |
2370 | size_t len2 = strlen((const char *)c2); | |
2371 | ||
2372 | int insertNewLine = 0; | |
2373 | if (len1 && c1[len1 - 1] != '\n') | |
2374 | { | |
2375 | ++len1; | |
2376 | insertNewLine = 1; | |
2377 | } | |
2378 | ||
2379 | utf8_t *p = (utf8_t *)mem.xmalloc(len1 + 1 + len2 + 1); | |
2380 | memcpy(p, c1, len1 - insertNewLine); | |
2381 | if (insertNewLine) | |
2382 | p[len1 - 1] = '\n'; | |
2383 | ||
2384 | p[len1] = '\n'; | |
2385 | ||
2386 | memcpy(p + len1 + 1, c2, len2); | |
2387 | p[len1 + 1 + len2] = 0; | |
2388 | c = p; | |
2389 | } | |
2390 | } | |
2391 | return c; | |
2392 | } |