]> git.ipfire.org Git - thirdparty/gcc.git/blame - libphobos/src/std/regex/internal/parser.d
d: Import dmd b8384668f, druntime e6caaab9, phobos 5ab9ad256 (v2.098.0-beta.1)
[thirdparty/gcc.git] / libphobos / src / std / regex / internal / parser.d
CommitLineData
b4c522fa
IB
1//Written in the D programming language
2/*
3 Regular expression pattern parser.
4*/
5module std.regex.internal.parser;
6
5fee5ec3 7import std.regex.internal.ir;
b4c522fa
IB
8import std.range.primitives, std.uni, std.meta,
9 std.traits, std.typecons, std.exception;
5fee5ec3 10static import std.ascii;
b4c522fa
IB
11
12// package relevant info from parser into a regex object
13auto makeRegex(S, CG)(Parser!(S, CG) p)
14{
5fee5ec3
IB
15 import std.regex.internal.backtracking : BacktrackingMatcher;
16 import std.regex.internal.thompson : ThompsonMatcher;
17 import std.algorithm.searching : canFind;
18 alias Char = BasicElementOf!S;
19 Regex!Char re;
b4c522fa
IB
20 auto g = p.g;
21 with(re)
22 {
23 ir = g.ir;
24 dict = g.dict;
25 ngroup = g.ngroup;
26 maxCounterDepth = g.counterDepth;
27 flags = p.re_flags;
28 charsets = g.charsets;
29 matchers = g.matchers;
30 backrefed = g.backrefed;
5fee5ec3 31 re.pattern = p.origin.idup;
b4c522fa 32 re.postprocess();
5fee5ec3
IB
33 // check if we have backreferences, if so - use backtracking
34 if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...);
35 else if (re.backrefed.canFind!"a != 0")
36 factory = new RuntimeFactory!(BacktrackingMatcher, Char);
37 else
38 factory = new RuntimeFactory!(ThompsonMatcher, Char);
b4c522fa
IB
39 debug(std_regex_parser)
40 {
41 __ctfe || print();
42 }
43 //@@@BUG@@@ (not reduced)
44 //somehow just using validate _collides_ with std.utf.validate (!)
45 version (assert) re.validateRe();
46 }
47 return re;
48}
49
50// helper for unittest
51auto makeRegex(S)(S arg)
52if (isSomeString!S)
53{
54 return makeRegex(Parser!(S, CodeGen)(arg, ""));
55}
56
57@system unittest
58{
59 import std.algorithm.comparison : equal;
60 auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`);
61 auto nc = re.namedCaptures;
62 static assert(isRandomAccessRange!(typeof(nc)));
63 assert(!nc.empty);
64 assert(nc.length == 2);
65 assert(nc.equal(["name", "var"]));
66 assert(nc[0] == "name");
67 assert(nc[1..$].equal(["var"]));
68
69 re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`);
70 nc = re.namedCaptures;
71 assert(nc.length == 1);
72 assert(nc[0] == "named");
73 assert(nc.front == "named");
74 assert(nc.back == "named");
75
76 re = makeRegex(`(\w+) (\w+)`);
77 nc = re.namedCaptures;
78 assert(nc.empty);
79
80 re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`);
81 nc = re.namedCaptures;
82 auto cp = nc.save;
83 assert(nc.equal(cp));
84 nc.popFront();
85 assert(nc.equal(cp[1..$]));
86 nc.popBack();
87 assert(nc.equal(cp[1 .. $ - 1]));
88}
89
90
91@trusted void reverseBytecode()(Bytecode[] code)
92{
93 Bytecode[] rev = new Bytecode[code.length];
94 uint revPc = cast(uint) rev.length;
95 Stack!(Tuple!(uint, uint, uint)) stack;
96 uint start = 0;
97 uint end = cast(uint) code.length;
98 for (;;)
99 {
100 for (uint pc = start; pc < end; )
101 {
102 immutable len = code[pc].length;
103 if (code[pc].code == IR.GotoEndOr)
104 break; //pick next alternation branch
105 if (code[pc].isAtom)
106 {
107 rev[revPc - len .. revPc] = code[pc .. pc + len];
108 revPc -= len;
109 pc += len;
110 }
111 else if (code[pc].isStart || code[pc].isEnd)
112 {
113 //skip over other embedded lookbehinds they are reversed
114 if (code[pc].code == IR.LookbehindStart
115 || code[pc].code == IR.NeglookbehindStart)
116 {
117 immutable blockLen = len + code[pc].data
118 + code[pc].pairedLength;
119 rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen];
120 pc += blockLen;
121 revPc -= blockLen;
122 continue;
123 }
124 immutable second = code[pc].indexOfPair(pc);
125 immutable secLen = code[second].length;
126 rev[revPc - secLen .. revPc] = code[second .. second + secLen];
127 revPc -= secLen;
128 if (code[pc].code == IR.OrStart)
129 {
130 //we pass len bytes forward, but secLen in reverse
131 immutable revStart = revPc - (second + len - secLen - pc);
132 uint r = revStart;
133 uint i = pc + IRL!(IR.OrStart);
134 while (code[i].code == IR.Option)
135 {
136 if (code[i - 1].code != IR.OrStart)
137 {
138 assert(code[i - 1].code == IR.GotoEndOr);
139 rev[r - 1] = code[i - 1];
140 }
141 rev[r] = code[i];
142 auto newStart = i + IRL!(IR.Option);
143 auto newEnd = newStart + code[i].data;
144 auto newRpc = r + code[i].data + IRL!(IR.Option);
145 if (code[newEnd].code != IR.OrEnd)
146 {
147 newRpc--;
148 }
149 stack.push(tuple(newStart, newEnd, newRpc));
150 r += code[i].data + IRL!(IR.Option);
151 i += code[i].data + IRL!(IR.Option);
152 }
153 pc = i;
154 revPc = revStart;
155 assert(code[pc].code == IR.OrEnd);
156 }
157 else
158 pc += len;
159 }
160 }
161 if (stack.empty)
162 break;
163 start = stack.top[0];
164 end = stack.top[1];
165 revPc = stack.top[2];
166 stack.pop();
167 }
168 code[] = rev[];
169}
170
b4c522fa
IB
171struct CodeGen
172{
173 Bytecode[] ir; // resulting bytecode
174 Stack!(uint) fixupStack; // stack of opened start instructions
175 NamedGroup[] dict; // maps name -> user group number
176 Stack!(uint) groupStack; // stack of current number of group
177 uint nesting = 0; // group nesting level and repetitions step
178 uint lookaroundNest = 0; // nesting of lookaround
179 uint counterDepth = 0; // current depth of nested counted repetitions
180 CodepointSet[] charsets; // sets for char classes
181 const(CharMatcher)[] matchers; // matchers for char classes
182 uint[] backrefed; // bitarray for groups refered by backref
183 uint ngroup; // final number of groups (of all patterns)
184
185 void start(uint length)
186 {
187 if (!__ctfe)
188 ir.reserve((length*5+2)/4);
189 fixupStack.push(0);
190 groupStack.push(1);//0 - whole match
191 }
192
193 //mark referenced groups for latter processing
194 void markBackref(uint n)
195 {
196 if (n/32 >= backrefed.length)
197 backrefed.length = n/32 + 1;
198 backrefed[n / 32] |= 1 << (n & 31);
199 }
200
201 bool isOpenGroup(uint n)
202 {
203 import std.algorithm.searching : canFind;
204 // walk the fixup stack and see if there are groups labeled 'n'
205 // fixup '0' is reserved for alternations
206 return fixupStack.data[1..$].
207 canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)();
208 }
209
210 void put(Bytecode code)
211 {
212 enforce(ir.length < maxCompiledLength,
213 "maximum compiled pattern length is exceeded");
214 ir ~= code;
215 }
216
217 void putRaw(uint number)
218 {
219 enforce(ir.length < maxCompiledLength,
220 "maximum compiled pattern length is exceeded");
221 ir ~= Bytecode.fromRaw(number);
222 }
223
224 //try to generate optimal IR code for this CodepointSet
225 @trusted void charsetToIr(CodepointSet set)
226 {//@@@BUG@@@ writeln is @system
227 uint chars = cast(uint) set.length;
228 if (chars < Bytecode.maxSequence)
229 {
230 switch (chars)
231 {
232 case 1:
233 put(Bytecode(IR.Char, set.byCodepoint.front));
234 break;
235 case 0:
236 throw new RegexException("empty CodepointSet not allowed");
237 default:
238 foreach (ch; set.byCodepoint)
239 put(Bytecode(IR.OrChar, ch, chars));
240 }
241 }
242 else
243 {
244 import std.algorithm.searching : countUntil;
245 const ivals = set.byInterval;
246 immutable n = charsets.countUntil(set);
247 if (n >= 0)
248 {
249 if (ivals.length*2 > maxCharsetUsed)
250 put(Bytecode(IR.Trie, cast(uint) n));
251 else
252 put(Bytecode(IR.CodepointSet, cast(uint) n));
253 return;
254 }
255 if (ivals.length*2 > maxCharsetUsed)
256 {
257 auto t = getMatcher(set);
258 put(Bytecode(IR.Trie, cast(uint) matchers.length));
259 matchers ~= t;
260 debug(std_regex_allocation) writeln("Trie generated");
261 }
262 else
263 {
264 put(Bytecode(IR.CodepointSet, cast(uint) charsets.length));
265 matchers ~= CharMatcher.init;
266 }
267 charsets ~= set;
268 assert(charsets.length == matchers.length);
269 }
270 }
271
272 void genLogicGroup()
273 {
274 nesting++;
275 pushFixup(length);
276 put(Bytecode(IR.Nop, 0));
277 }
278
279 void genGroup()
280 {
281 nesting++;
282 pushFixup(length);
283 immutable nglob = groupStack.top++;
284 enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded");
285 put(Bytecode(IR.GroupStart, nglob));
286 }
287
288 void genNamedGroup(string name)
289 {
290 import std.array : insertInPlace;
291 import std.range : assumeSorted;
292 nesting++;
293 pushFixup(length);
294 immutable nglob = groupStack.top++;
295 enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded");
296 auto t = NamedGroup(name, nglob);
297 auto d = assumeSorted!"a.name < b.name"(dict);
298 immutable ind = d.lowerBound(t).length;
299 insertInPlace(dict, ind, t);
300 put(Bytecode(IR.GroupStart, nglob));
301 }
302
303 //generate code for start of lookaround: (?= (?! (?<= (?<!
304 void genLookaround(IR opcode)
305 {
306 nesting++;
307 pushFixup(length);
308 put(Bytecode(opcode, 0));
309 put(Bytecode.fromRaw(0));
310 put(Bytecode.fromRaw(0));
311 groupStack.push(0);
312 lookaroundNest++;
313 enforce(lookaroundNest <= maxLookaroundDepth,
314 "maximum lookaround depth is exceeded");
315 }
316
317 void endPattern(uint num)
318 {
319 import std.algorithm.comparison : max;
320 put(Bytecode(IR.End, num));
321 ngroup = max(ngroup, groupStack.top);
322 groupStack.top = 1; // reset group counter
323 }
324
325 //fixup lookaround with start at offset fix and append a proper *-End opcode
326 void fixLookaround(uint fix)
327 {
328 lookaroundNest--;
329 ir[fix] = Bytecode(ir[fix].code,
330 cast(uint) ir.length - fix - IRL!(IR.LookaheadStart));
331 auto g = groupStack.pop();
332 assert(!groupStack.empty);
333 ir[fix+1] = Bytecode.fromRaw(groupStack.top);
334 //groups are cumulative across lookarounds
335 ir[fix+2] = Bytecode.fromRaw(groupStack.top+g);
336 groupStack.top += g;
337 if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart)
338 {
339 reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]);
340 }
341 put(ir[fix].paired);
342 }
343
344 // repetition of {1,1}
345 void fixRepetition(uint offset)
346 {
347 import std.algorithm.mutation : copy;
348 immutable replace = ir[offset].code == IR.Nop;
349 if (replace)
350 {
351 copy(ir[offset + 1 .. $], ir[offset .. $ - 1]);
352 ir.length -= 1;
353 }
354 }
355
356 // repetition of {x,y}
357 void fixRepetition(uint offset, uint min, uint max, bool greedy)
358 {
359 static import std.algorithm.comparison;
360 import std.algorithm.mutation : copy;
361 import std.array : insertInPlace;
362 immutable replace = ir[offset].code == IR.Nop;
363 immutable len = cast(uint) ir.length - offset - replace;
364 if (max != infinite)
365 {
366 if (min != 1 || max != 1)
367 {
368 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
369 if (replace)
370 ir[offset] = op;
371 else
372 insertInPlace(ir, offset, op);
373 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
374 put(Bytecode.init); //hotspot
375 putRaw(1);
376 putRaw(min);
377 putRaw(max);
378 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
379 }
380 }
381 else if (min) //&& max is infinite
382 {
383 if (min != 1)
384 {
385 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
386 if (replace)
387 ir[offset] = op;
388 else
389 insertInPlace(ir, offset, op);
390 offset += 1;//so it still points to the repeated block
391 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
392 put(Bytecode.init); //hotspot
393 putRaw(1);
394 putRaw(min);
395 putRaw(min);
396 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
397 }
398 else if (replace)
399 {
400 copy(ir[offset+1 .. $], ir[offset .. $-1]);
401 ir.length -= 1;
402 }
403 put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
404 enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded");
405 ir ~= ir[offset .. offset+len];
406 //IR.InfinteX is always a hotspot
407 put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
408 put(Bytecode.init); //merge index
409 }
410 else//vanila {0,inf}
411 {
412 Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len);
413 if (replace)
414 ir[offset] = op;
415 else
416 insertInPlace(ir, offset, op);
417 //IR.InfinteX is always a hotspot
418 put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
419 put(Bytecode.init); //merge index
420 }
421 }
422
423 void fixAlternation()
424 {
425 import std.array : insertInPlace;
426 uint fix = fixupStack.top;
427 if (ir.length > fix && ir[fix].code == IR.Option)
428 {
429 ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix);
430 put(Bytecode(IR.GotoEndOr, 0));
431 fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option
432 put(Bytecode(IR.Option, 0));
433 return;
434 }
435 uint len, orStart;
436 //start a new option
437 if (fixupStack.length == 1)
438 {//only root entry, effectively no fixup
439 len = cast(uint) ir.length + IRL!(IR.GotoEndOr);
440 orStart = 0;
441 }
442 else
443 {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length
444 len = cast(uint) ir.length - fix - (ir[fix].length - 1);
445 orStart = fix + ir[fix].length;
446 }
447 insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len));
448 assert(ir[orStart].code == IR.OrStart);
449 put(Bytecode(IR.GotoEndOr, 0));
450 fixupStack.push(orStart); //fixup for StartOR
451 fixupStack.push(cast(uint) ir.length); //for second Option
452 put(Bytecode(IR.Option, 0));
453 }
454
455 // finalizes IR.Option, fix points to the first option of sequence
456 void finishAlternation(uint fix)
457 {
458 enforce(ir[fix].code == IR.Option, "no matching ')'");
459 ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart));
460 fix = fixupStack.pop();
461 enforce(ir[fix].code == IR.OrStart, "no matching ')'");
462 ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart));
463 put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart)));
464 uint pc = fix + IRL!(IR.OrStart);
465 while (ir[pc].code == IR.Option)
466 {
467 pc = pc + ir[pc].data;
468 if (ir[pc].code != IR.GotoEndOr)
469 break;
470 ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd)));
471 pc += IRL!(IR.GotoEndOr);
472 }
473 put(Bytecode.fromRaw(0));
474 }
475
476 // returns: (flag - repetition possible?, fixup of the start of this "group")
477 Tuple!(bool, uint) onClose()
478 {
479 nesting--;
480 uint fix = popFixup();
481 switch (ir[fix].code)
482 {
483 case IR.GroupStart:
484 put(Bytecode(IR.GroupEnd, ir[fix].data));
485 return tuple(true, fix);
486 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
487 assert(lookaroundNest);
488 fixLookaround(fix);
489 return tuple(false, 0u);
490 case IR.Option: //| xxx )
491 //two fixups: last option + full OR
492 finishAlternation(fix);
493 fix = topFixup;
494 switch (ir[fix].code)
495 {
496 case IR.GroupStart:
497 popFixup();
498 put(Bytecode(IR.GroupEnd, ir[fix].data));
499 return tuple(true, fix);
500 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
501 assert(lookaroundNest);
502 fix = popFixup();
503 fixLookaround(fix);
504 return tuple(false, 0u);
505 default://(?:xxx)
506 popFixup();
507 return tuple(true, fix);
508 }
509 default://(?:xxx)
510 return tuple(true, fix);
511 }
512 }
513
514 uint popFixup(){ return fixupStack.pop(); }
515
516 void pushFixup(uint val){ return fixupStack.push(val); }
517
518 @property uint topFixup(){ return fixupStack.top; }
519
520 @property size_t fixupLength(){ return fixupStack.data.length; }
521
522 @property uint length(){ return cast(uint) ir.length; }
523}
524
525// safety limits
526enum maxGroupNumber = 2^^19;
527enum maxLookaroundDepth = 16;
528// *Bytecode.sizeof, i.e. 1Mb of bytecode alone
529enum maxCompiledLength = 2^^18;
530// amounts to up to 4 Mb of auxilary table for matching
531enum maxCumulativeRepetitionLength = 2^^20;
532// marker to indicate infinite repetition
533enum infinite = ~0u;
534
535struct Parser(R, Generator)
536if (isForwardRange!R && is(ElementType!R : dchar))
537{
5fee5ec3 538 dchar front;
b4c522fa
IB
539 bool empty;
540 R pat, origin; //keep full pattern for pretty printing error messages
541 uint re_flags = 0; //global flags e.g. multiline + internal ones
542 Generator g;
543
544 @trusted this(S)(R pattern, S flags)
545 if (isSomeString!S)
546 {
547 pat = origin = pattern;
548 //reserve slightly more then avg as sampled from unittests
549 parseFlags(flags);
5fee5ec3
IB
550 front = ' ';//a safe default for freeform parsing
551 popFront();
b4c522fa
IB
552 g.start(cast(uint) pat.length);
553 try
554 {
555 parseRegex();
556 }
557 catch (Exception e)
558 {
559 error(e.msg);//also adds pattern location
560 }
561 g.endPattern(1);
562 }
563
5fee5ec3 564 void _popFront()
b4c522fa
IB
565 {
566 if (pat.empty)
567 {
568 empty = true;
b4c522fa 569 }
5fee5ec3
IB
570 else
571 {
572 front = pat.front;
573 pat.popFront();
574 }
b4c522fa
IB
575 }
576
577 void skipSpace()
578 {
5fee5ec3 579 while (!empty && isWhite(front)) _popFront();
b4c522fa
IB
580 }
581
5fee5ec3 582 void popFront()
b4c522fa 583 {
5fee5ec3
IB
584 _popFront();
585 if (re_flags & RegexOption.freeform) skipSpace();
b4c522fa
IB
586 }
587
5fee5ec3
IB
588 auto save(){ return this; }
589
b4c522fa
IB
590 //parsing number with basic overflow check
591 uint parseDecimal()
592 {
593 uint r = 0;
5fee5ec3 594 while (std.ascii.isDigit(front))
b4c522fa
IB
595 {
596 if (r >= (uint.max/10))
597 error("Overflow in decimal number");
5fee5ec3
IB
598 r = 10*r + cast(uint)(front-'0');
599 popFront();
600 if (empty) break;
b4c522fa
IB
601 }
602 return r;
603 }
604
b4c522fa
IB
605 //
606 @trusted void parseFlags(S)(S flags)
607 {//@@@BUG@@@ text is @system
608 import std.conv : text;
609 foreach (ch; flags)//flags are ASCII anyway
610 {
611 L_FlagSwitch:
612 switch (ch)
613 {
614
615 foreach (i, op; __traits(allMembers, RegexOption))
616 {
617 case RegexOptionNames[i]:
618 if (re_flags & mixin("RegexOption."~op))
619 throw new RegexException(text("redundant flag specified: ",ch));
620 re_flags |= mixin("RegexOption."~op);
621 break L_FlagSwitch;
622 }
623 default:
624 throw new RegexException(text("unknown regex flag '",ch,"'"));
625 }
626 }
627 }
628
629 //parse and store IR for regex pattern
630 @trusted void parseRegex()
631 {
632 uint fix;//fixup pointer
633
634 while (!empty)
635 {
636 debug(std_regex_parser)
637 __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data);
5fee5ec3 638 switch (front)
b4c522fa
IB
639 {
640 case '(':
5fee5ec3
IB
641 popFront();
642 if (front == '?')
b4c522fa 643 {
5fee5ec3
IB
644 popFront();
645 switch (front)
b4c522fa
IB
646 {
647 case '#':
648 for (;;)
649 {
5fee5ec3
IB
650 popFront();
651 enforce(!empty, "Unexpected end of pattern");
652 if (front == ')')
b4c522fa 653 {
5fee5ec3 654 popFront();
b4c522fa
IB
655 break;
656 }
657 }
658 break;
659 case ':':
660 g.genLogicGroup();
5fee5ec3 661 popFront();
b4c522fa
IB
662 break;
663 case '=':
664 g.genLookaround(IR.LookaheadStart);
5fee5ec3 665 popFront();
b4c522fa
IB
666 break;
667 case '!':
668 g.genLookaround(IR.NeglookaheadStart);
5fee5ec3 669 popFront();
b4c522fa
IB
670 break;
671 case 'P':
5fee5ec3
IB
672 popFront();
673 enforce(front == '<', "Expected '<' in named group");
b4c522fa 674 string name;
5fee5ec3
IB
675 popFront();
676 if (empty || !(isAlpha(front) || front == '_'))
b4c522fa 677 error("Expected alpha starting a named group");
5fee5ec3
IB
678 name ~= front;
679 popFront();
680 while (!empty && (isAlpha(front) ||
681 front == '_' || std.ascii.isDigit(front)))
b4c522fa 682 {
5fee5ec3
IB
683 name ~= front;
684 popFront();
b4c522fa 685 }
5fee5ec3
IB
686 enforce(front == '>', "Expected '>' closing named group");
687 popFront();
b4c522fa
IB
688 g.genNamedGroup(name);
689 break;
690 case '<':
5fee5ec3
IB
691 popFront();
692 if (front == '=')
b4c522fa 693 g.genLookaround(IR.LookbehindStart);
5fee5ec3 694 else if (front == '!')
b4c522fa
IB
695 g.genLookaround(IR.NeglookbehindStart);
696 else
697 error("'!' or '=' expected after '<'");
5fee5ec3 698 popFront();
b4c522fa
IB
699 break;
700 default:
701 uint enableFlags, disableFlags;
702 bool enable = true;
703 do
704 {
5fee5ec3 705 switch (front)
b4c522fa
IB
706 {
707 case 's':
708 if (enable)
709 enableFlags |= RegexOption.singleline;
710 else
711 disableFlags |= RegexOption.singleline;
712 break;
713 case 'x':
714 if (enable)
715 enableFlags |= RegexOption.freeform;
716 else
717 disableFlags |= RegexOption.freeform;
718 break;
719 case 'i':
720 if (enable)
721 enableFlags |= RegexOption.casefold;
722 else
723 disableFlags |= RegexOption.casefold;
724 break;
725 case 'm':
726 if (enable)
727 enableFlags |= RegexOption.multiline;
728 else
729 disableFlags |= RegexOption.multiline;
730 break;
731 case '-':
732 if (!enable)
733 error(" unexpected second '-' in flags");
734 enable = false;
735 break;
736 default:
737 error(" 's', 'x', 'i', 'm' or '-' expected after '(?' ");
738 }
5fee5ec3
IB
739 popFront();
740 }while (front != ')');
741 popFront();
b4c522fa
IB
742 re_flags |= enableFlags;
743 re_flags &= ~disableFlags;
744 }
745 }
746 else
747 {
748 g.genGroup();
749 }
750 break;
751 case ')':
752 enforce(g.nesting, "Unmatched ')'");
5fee5ec3 753 popFront();
b4c522fa
IB
754 auto pair = g.onClose();
755 if (pair[0])
756 parseQuantifier(pair[1]);
757 break;
758 case '|':
5fee5ec3 759 popFront();
b4c522fa
IB
760 g.fixAlternation();
761 break;
762 default://no groups or whatever
763 immutable start = g.length;
764 parseAtom();
765 parseQuantifier(start);
766 }
767 }
768
769 if (g.fixupLength != 1)
770 {
771 fix = g.popFixup();
772 g.finishAlternation(fix);
773 enforce(g.fixupLength == 1, "no matching ')'");
774 }
775 }
776
777
778 //parse and store IR for atom-quantifier pair
779 @trusted void parseQuantifier(uint offset)
780 {//copy is @system
781 if (empty)
782 return g.fixRepetition(offset);
783 uint min, max;
5fee5ec3 784 switch (front)
b4c522fa
IB
785 {
786 case '*':
787 min = 0;
788 max = infinite;
789 break;
790 case '?':
791 min = 0;
792 max = 1;
793 break;
794 case '+':
795 min = 1;
796 max = infinite;
797 break;
798 case '{':
5fee5ec3
IB
799 popFront();
800 enforce(!empty, "Unexpected end of regex pattern");
801 enforce(std.ascii.isDigit(front), "First number required in repetition");
b4c522fa 802 min = parseDecimal();
5fee5ec3 803 if (front == '}')
b4c522fa 804 max = min;
5fee5ec3 805 else if (front == ',')
b4c522fa 806 {
5fee5ec3
IB
807 popFront();
808 if (std.ascii.isDigit(front))
b4c522fa 809 max = parseDecimal();
5fee5ec3 810 else if (front == '}')
b4c522fa
IB
811 max = infinite;
812 else
813 error("Unexpected symbol in regex pattern");
814 skipSpace();
5fee5ec3 815 enforce(front == '}', "Unmatched '{' in regex pattern");
b4c522fa
IB
816 }
817 else
818 error("Unexpected symbol in regex pattern");
5fee5ec3 819 enforce(min <= max, "Illegal {n,m} quantifier");
b4c522fa
IB
820 break;
821 default:
822 g.fixRepetition(offset);
823 return;
824 }
825 bool greedy = true;
826 //check only if we managed to get new symbol
5fee5ec3
IB
827 popFront();
828 if (!empty && front == '?')
b4c522fa
IB
829 {
830 greedy = false;
5fee5ec3 831 popFront();
b4c522fa
IB
832 }
833 g.fixRepetition(offset, min, max, greedy);
834 }
835
836 //parse and store IR for atom
837 void parseAtom()
838 {
839 if (empty)
840 return;
5fee5ec3 841 switch (front)
b4c522fa
IB
842 {
843 case '*', '?', '+', '|', '{', '}':
844 error("'*', '+', '?', '{', '}' not allowed in atom");
b4c522fa
IB
845 case '.':
846 if (re_flags & RegexOption.singleline)
847 g.put(Bytecode(IR.Any, 0));
848 else
849 {
850 CodepointSet set;
851 g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted);
852 }
5fee5ec3 853 popFront();
b4c522fa
IB
854 break;
855 case '[':
856 parseCharset();
857 break;
858 case '\\':
5fee5ec3
IB
859 _popFront();
860 enforce(!empty, "Unfinished escape sequence");
b4c522fa
IB
861 parseEscape();
862 break;
863 case '^':
864 if (re_flags & RegexOption.multiline)
865 g.put(Bytecode(IR.Bol, 0));
866 else
867 g.put(Bytecode(IR.Bof, 0));
5fee5ec3 868 popFront();
b4c522fa
IB
869 break;
870 case '$':
871 if (re_flags & RegexOption.multiline)
872 g.put(Bytecode(IR.Eol, 0));
873 else
874 g.put(Bytecode(IR.Eof, 0));
5fee5ec3 875 popFront();
b4c522fa
IB
876 break;
877 default:
b4c522fa
IB
878 if (re_flags & RegexOption.casefold)
879 {
5fee5ec3 880 auto range = simpleCaseFoldings(front);
b4c522fa
IB
881 assert(range.length <= 5);
882 if (range.length == 1)
883 g.put(Bytecode(IR.Char, range.front));
884 else
885 foreach (v; range)
886 g.put(Bytecode(IR.OrChar, v, cast(uint) range.length));
887 }
888 else
5fee5ec3
IB
889 g.put(Bytecode(IR.Char, front));
890 popFront();
b4c522fa 891 }
b4c522fa
IB
892 }
893
b4c522fa
IB
894 //parse and store IR for CodepointSet
895 void parseCharset()
896 {
897 const save = re_flags;
898 re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did
5fee5ec3
IB
899 bool casefold = cast(bool)(re_flags & RegexOption.casefold);
900 g.charsetToIr(unicode.parseSet(this, casefold));
b4c522fa 901 re_flags = save;
5fee5ec3 902 // Last next() in parseCharset is executed w/o freeform flag
b4c522fa
IB
903 if (re_flags & RegexOption.freeform) skipSpace();
904 }
905
b4c522fa
IB
906 //parse and generate IR for escape stand alone escape sequence
907 @trusted void parseEscape()
908 {//accesses array of appender
909 import std.algorithm.iteration : sum;
5fee5ec3 910 switch (front)
b4c522fa 911 {
5fee5ec3
IB
912 case 'f': popFront(); g.put(Bytecode(IR.Char, '\f')); break;
913 case 'n': popFront(); g.put(Bytecode(IR.Char, '\n')); break;
914 case 'r': popFront(); g.put(Bytecode(IR.Char, '\r')); break;
915 case 't': popFront(); g.put(Bytecode(IR.Char, '\t')); break;
916 case 'v': popFront(); g.put(Bytecode(IR.Char, '\v')); break;
b4c522fa
IB
917
918 case 'd':
5fee5ec3 919 popFront();
b4c522fa
IB
920 g.charsetToIr(unicode.Nd);
921 break;
922 case 'D':
5fee5ec3 923 popFront();
b4c522fa
IB
924 g.charsetToIr(unicode.Nd.inverted);
925 break;
5fee5ec3
IB
926 case 'b': popFront(); g.put(Bytecode(IR.Wordboundary, 0)); break;
927 case 'B': popFront(); g.put(Bytecode(IR.Notwordboundary, 0)); break;
b4c522fa 928 case 's':
5fee5ec3 929 popFront();
b4c522fa
IB
930 g.charsetToIr(unicode.White_Space);
931 break;
932 case 'S':
5fee5ec3 933 popFront();
b4c522fa
IB
934 g.charsetToIr(unicode.White_Space.inverted);
935 break;
936 case 'w':
5fee5ec3 937 popFront();
b4c522fa
IB
938 g.charsetToIr(wordCharacter);
939 break;
940 case 'W':
5fee5ec3 941 popFront();
b4c522fa
IB
942 g.charsetToIr(wordCharacter.inverted);
943 break;
944 case 'p': case 'P':
5fee5ec3
IB
945 bool casefold = cast(bool)(re_flags & RegexOption.casefold);
946 auto set = unicode.parsePropertySpec(this, front == 'P', casefold);
947 g.charsetToIr(set);
b4c522fa
IB
948 break;
949 case 'x':
950 immutable code = parseUniHex(pat, 2);
5fee5ec3 951 popFront();
b4c522fa
IB
952 g.put(Bytecode(IR.Char,code));
953 break;
954 case 'u': case 'U':
5fee5ec3
IB
955 immutable code = parseUniHex(pat, front == 'u' ? 4 : 8);
956 popFront();
b4c522fa
IB
957 g.put(Bytecode(IR.Char, code));
958 break;
959 case 'c': //control codes
5fee5ec3
IB
960 Bytecode code = Bytecode(IR.Char, unicode.parseControlCode(this));
961 popFront();
b4c522fa
IB
962 g.put(code);
963 break;
964 case '0':
5fee5ec3 965 popFront();
b4c522fa
IB
966 g.put(Bytecode(IR.Char, 0));//NUL character
967 break;
968 case '1': .. case '9':
5fee5ec3 969 uint nref = cast(uint) front - '0';
b4c522fa
IB
970 immutable maxBackref = sum(g.groupStack.data);
971 enforce(nref < maxBackref, "Backref to unseen group");
972 //perl's disambiguation rule i.e.
973 //get next digit only if there is such group number
5fee5ec3
IB
974 popFront();
975 while (nref < maxBackref && !empty && std.ascii.isDigit(front))
b4c522fa 976 {
5fee5ec3
IB
977 nref = nref * 10 + front - '0';
978 popFront();
b4c522fa
IB
979 }
980 if (nref >= maxBackref)
981 nref /= 10;
982 enforce(!g.isOpenGroup(nref), "Backref to open group");
983 uint localLimit = maxBackref - g.groupStack.top;
984 if (nref >= localLimit)
985 {
986 g.put(Bytecode(IR.Backref, nref-localLimit));
987 g.ir[$-1].setLocalRef();
988 }
989 else
990 g.put(Bytecode(IR.Backref, nref));
991 g.markBackref(nref);
992 break;
993 default:
5fee5ec3 994 if (front == '\\' && !pat.empty)
b4c522fa 995 {
5fee5ec3 996 if (pat.front >= privateUseStart && pat.front <= privateUseEnd)
b4c522fa
IB
997 enforce(false, "invalid escape sequence");
998 }
5fee5ec3 999 if (front >= privateUseStart && front <= privateUseEnd)
b4c522fa 1000 {
5fee5ec3 1001 g.endPattern(front - privateUseStart + 1);
b4c522fa
IB
1002 break;
1003 }
5fee5ec3
IB
1004 auto op = Bytecode(IR.Char, front);
1005 popFront();
b4c522fa
IB
1006 g.put(op);
1007 }
1008 }
1009
b4c522fa
IB
1010 //
1011 @trusted void error(string msg)
1012 {
1013 import std.array : appender;
5fee5ec3 1014 import std.format.write : formattedWrite;
b4c522fa
IB
1015 auto app = appender!string();
1016 formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`",
1017 msg, origin[0..$-pat.length], pat);
1018 throw new RegexException(app.data);
1019 }
1020
1021 alias Char = BasicElementOf!R;
1022
1023 @property program()
1024 {
1025 return makeRegex(this);
1026 }
1027}
1028
1029/+
1030 Postproces the IR, then optimize.
1031+/
1032@trusted void postprocess(Char)(ref Regex!Char zis)
1033{//@@@BUG@@@ write is @system
1034 with(zis)
1035 {
1036 struct FixedStack(T)
1037 {
1038 T[] arr;
1039 uint _top;
1040 //this(T[] storage){ arr = storage; _top = -1; }
1041 @property ref T top(){ assert(!empty); return arr[_top]; }
1042 void push(T x){ arr[++_top] = x; }
1043 T pop() { assert(!empty); return arr[_top--]; }
1044 @property bool empty(){ return _top == -1; }
1045 }
1046 auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1);
1047 counterRange.push(1);
1048 ulong cumRange = 0;
1049 for (uint i = 0; i < ir.length; i += ir[i].length)
1050 {
1051 if (ir[i].hotspot)
1052 {
1053 assert(i + 1 < ir.length,
1054 "unexpected end of IR while looking for hotspot");
1055 ir[i+1] = Bytecode.fromRaw(hotspotTableSize);
1056 hotspotTableSize += counterRange.top;
1057 }
1058 switch (ir[i].code)
1059 {
1060 case IR.RepeatStart, IR.RepeatQStart:
1061 uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart));
1062 assert(ir[repEnd].code == ir[i].paired.code);
1063 immutable max = ir[repEnd + 4].raw;
1064 ir[repEnd+2].raw = counterRange.top;
1065 ir[repEnd+3].raw *= counterRange.top;
1066 ir[repEnd+4].raw *= counterRange.top;
1067 ulong cntRange = cast(ulong)(max)*counterRange.top;
1068 cumRange += cntRange;
1069 enforce(cumRange < maxCumulativeRepetitionLength,
1070 "repetition length limit is exceeded");
1071 counterRange.push(cast(uint) cntRange + counterRange.top);
1072 threadCount += counterRange.top;
1073 break;
1074 case IR.RepeatEnd, IR.RepeatQEnd:
1075 threadCount += counterRange.top;
1076 counterRange.pop();
1077 break;
1078 case IR.GroupStart:
1079 if (isBackref(ir[i].data))
1080 ir[i].setBackrefence();
1081 threadCount += counterRange.top;
1082 break;
1083 case IR.GroupEnd:
1084 if (isBackref(ir[i].data))
1085 ir[i].setBackrefence();
1086 threadCount += counterRange.top;
1087 break;
1088 default:
1089 threadCount += counterRange.top;
1090 }
1091 }
1092 checkIfOneShot();
1093 if (!(flags & RegexInfo.oneShot))
1094 kickstart = Kickstart!Char(zis, new uint[](256));
1095 debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount);
1096 optimize(zis);
1097 }
1098}
1099
1100void fixupBytecode()(Bytecode[] ir)
1101{
1102 Stack!uint fixups;
1103
1104 with(IR) for (uint i=0; i<ir.length; i+= ir[i].length)
1105 {
1106 if (ir[i].isStart || ir[i].code == Option)
1107 fixups.push(i);
1108 else if (ir[i].code == OrEnd)
1109 {
1110 // Alternatives need more care
1111 auto j = fixups.pop(); // last Option
1112 ir[j].data = i - j - ir[j].length;
1113 j = fixups.pop(); // OrStart
1114 ir[j].data = i - j - ir[j].length;
1115 ir[i].data = ir[j].data;
1116
1117 // fixup all GotoEndOrs
1118 j = j + IRL!(OrStart);
1119 assert(ir[j].code == Option);
1120 for (;;)
1121 {
1122 auto next = j + ir[j].data + IRL!(Option);
1123 if (ir[next].code == IR.OrEnd)
1124 break;
1125 ir[next - IRL!(GotoEndOr)].data = i - next;
1126 j = next;
1127 }
1128 }
1129 else if (ir[i].code == GotoEndOr)
1130 {
1131 auto j = fixups.pop(); // Option
1132 ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option
1133 }
1134 else if (ir[i].isEnd)
1135 {
1136 auto j = fixups.pop();
1137 ir[i].data = i - j - ir[j].length;
1138 ir[j].data = ir[i].data;
1139 }
1140 }
1141 assert(fixups.empty);
1142}
1143
1144void optimize(Char)(ref Regex!Char zis)
1145{
1146 import std.array : insertInPlace;
1147 CodepointSet nextSet(uint idx)
1148 {
1149 CodepointSet set;
1150 with(zis) with(IR)
1151 Outer:
1152 for (uint i = idx; i < ir.length; i += ir[i].length)
1153 {
1154 switch (ir[i].code)
1155 {
1156 case Char:
1157 set.add(ir[i].data, ir[i].data+1);
1158 goto default;
1159 //TODO: OrChar
1160 case Trie, CodepointSet:
1161 set = zis.charsets[ir[i].data];
1162 goto default;
1163 case GroupStart,GroupEnd:
1164 break;
1165 default:
1166 break Outer;
1167 }
1168 }
1169 return set;
1170 }
1171
1172 with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length)
1173 {
1174 if (ir[i].code == InfiniteEnd)
1175 {
1176 auto set = nextSet(i+IRL!(InfiniteEnd));
1177 if (!set.empty && set.length < 10_000)
1178 {
1179 ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data);
1180 ir[i - ir[i].data - IRL!(InfiniteStart)] =
1181 Bytecode(InfiniteBloomStart, ir[i].data);
1182 ir.insertInPlace(i+IRL!(InfiniteEnd),
1183 Bytecode.fromRaw(cast(uint) zis.filters.length));
1184 zis.filters ~= BitTable(set);
1185 fixupBytecode(ir);
1186 }
1187 }
1188 }
1189}
1190
1191//IR code validator - proper nesting, illegal instructions, etc.
1192@trusted void validateRe(Char)(ref Regex!Char zis)
1193{//@@@BUG@@@ text is @system
1194 import std.conv : text;
1195 with(zis)
1196 {
1197 for (uint pc = 0; pc < ir.length; pc += ir[pc].length)
1198 {
1199 if (ir[pc].isStart || ir[pc].isEnd)
1200 {
1201 immutable dest = ir[pc].indexOfPair(pc);
1202 assert(dest < ir.length, text("Wrong length in opcode at pc=",
1203 pc, " ", dest, " vs ", ir.length));
1204 assert(ir[dest].paired == ir[pc],
1205 text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest));
1206 }
1207 else if (ir[pc].isAtom)
1208 {
1209
1210 }
1211 else
1212 assert(0, text("Unknown type of instruction at pc=", pc));
1213 }
1214 }
1215}