]>
Commit | Line | Data |
---|---|---|
b4c522fa IB |
1 | //Written in the D programming language |
2 | /* | |
3 | Regular expression pattern parser. | |
4 | */ | |
5 | module std.regex.internal.parser; | |
6 | ||
5fee5ec3 | 7 | import std.regex.internal.ir; |
b4c522fa IB |
8 | import std.range.primitives, std.uni, std.meta, |
9 | std.traits, std.typecons, std.exception; | |
5fee5ec3 | 10 | static import std.ascii; |
b4c522fa IB |
11 | |
12 | // package relevant info from parser into a regex object | |
13 | auto makeRegex(S, CG)(Parser!(S, CG) p) | |
14 | { | |
5fee5ec3 IB |
15 | import std.regex.internal.backtracking : BacktrackingMatcher; |
16 | import std.regex.internal.thompson : ThompsonMatcher; | |
17 | import std.algorithm.searching : canFind; | |
18 | alias Char = BasicElementOf!S; | |
19 | Regex!Char re; | |
b4c522fa IB |
20 | auto g = p.g; |
21 | with(re) | |
22 | { | |
23 | ir = g.ir; | |
24 | dict = g.dict; | |
25 | ngroup = g.ngroup; | |
26 | maxCounterDepth = g.counterDepth; | |
27 | flags = p.re_flags; | |
28 | charsets = g.charsets; | |
29 | matchers = g.matchers; | |
30 | backrefed = g.backrefed; | |
5fee5ec3 | 31 | re.pattern = p.origin.idup; |
b4c522fa | 32 | re.postprocess(); |
5fee5ec3 IB |
33 | // check if we have backreferences, if so - use backtracking |
34 | if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...); | |
35 | else if (re.backrefed.canFind!"a != 0") | |
36 | factory = new RuntimeFactory!(BacktrackingMatcher, Char); | |
37 | else | |
38 | factory = new RuntimeFactory!(ThompsonMatcher, Char); | |
b4c522fa IB |
39 | debug(std_regex_parser) |
40 | { | |
41 | __ctfe || print(); | |
42 | } | |
43 | //@@@BUG@@@ (not reduced) | |
44 | //somehow just using validate _collides_ with std.utf.validate (!) | |
45 | version (assert) re.validateRe(); | |
46 | } | |
47 | return re; | |
48 | } | |
49 | ||
50 | // helper for unittest | |
51 | auto makeRegex(S)(S arg) | |
52 | if (isSomeString!S) | |
53 | { | |
54 | return makeRegex(Parser!(S, CodeGen)(arg, "")); | |
55 | } | |
56 | ||
57 | @system unittest | |
58 | { | |
59 | import std.algorithm.comparison : equal; | |
60 | auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`); | |
61 | auto nc = re.namedCaptures; | |
62 | static assert(isRandomAccessRange!(typeof(nc))); | |
63 | assert(!nc.empty); | |
64 | assert(nc.length == 2); | |
65 | assert(nc.equal(["name", "var"])); | |
66 | assert(nc[0] == "name"); | |
67 | assert(nc[1..$].equal(["var"])); | |
68 | ||
69 | re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`); | |
70 | nc = re.namedCaptures; | |
71 | assert(nc.length == 1); | |
72 | assert(nc[0] == "named"); | |
73 | assert(nc.front == "named"); | |
74 | assert(nc.back == "named"); | |
75 | ||
76 | re = makeRegex(`(\w+) (\w+)`); | |
77 | nc = re.namedCaptures; | |
78 | assert(nc.empty); | |
79 | ||
80 | re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`); | |
81 | nc = re.namedCaptures; | |
82 | auto cp = nc.save; | |
83 | assert(nc.equal(cp)); | |
84 | nc.popFront(); | |
85 | assert(nc.equal(cp[1..$])); | |
86 | nc.popBack(); | |
87 | assert(nc.equal(cp[1 .. $ - 1])); | |
88 | } | |
89 | ||
90 | ||
91 | @trusted void reverseBytecode()(Bytecode[] code) | |
92 | { | |
93 | Bytecode[] rev = new Bytecode[code.length]; | |
94 | uint revPc = cast(uint) rev.length; | |
95 | Stack!(Tuple!(uint, uint, uint)) stack; | |
96 | uint start = 0; | |
97 | uint end = cast(uint) code.length; | |
98 | for (;;) | |
99 | { | |
100 | for (uint pc = start; pc < end; ) | |
101 | { | |
102 | immutable len = code[pc].length; | |
103 | if (code[pc].code == IR.GotoEndOr) | |
104 | break; //pick next alternation branch | |
105 | if (code[pc].isAtom) | |
106 | { | |
107 | rev[revPc - len .. revPc] = code[pc .. pc + len]; | |
108 | revPc -= len; | |
109 | pc += len; | |
110 | } | |
111 | else if (code[pc].isStart || code[pc].isEnd) | |
112 | { | |
113 | //skip over other embedded lookbehinds they are reversed | |
114 | if (code[pc].code == IR.LookbehindStart | |
115 | || code[pc].code == IR.NeglookbehindStart) | |
116 | { | |
117 | immutable blockLen = len + code[pc].data | |
118 | + code[pc].pairedLength; | |
119 | rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen]; | |
120 | pc += blockLen; | |
121 | revPc -= blockLen; | |
122 | continue; | |
123 | } | |
124 | immutable second = code[pc].indexOfPair(pc); | |
125 | immutable secLen = code[second].length; | |
126 | rev[revPc - secLen .. revPc] = code[second .. second + secLen]; | |
127 | revPc -= secLen; | |
128 | if (code[pc].code == IR.OrStart) | |
129 | { | |
130 | //we pass len bytes forward, but secLen in reverse | |
131 | immutable revStart = revPc - (second + len - secLen - pc); | |
132 | uint r = revStart; | |
133 | uint i = pc + IRL!(IR.OrStart); | |
134 | while (code[i].code == IR.Option) | |
135 | { | |
136 | if (code[i - 1].code != IR.OrStart) | |
137 | { | |
138 | assert(code[i - 1].code == IR.GotoEndOr); | |
139 | rev[r - 1] = code[i - 1]; | |
140 | } | |
141 | rev[r] = code[i]; | |
142 | auto newStart = i + IRL!(IR.Option); | |
143 | auto newEnd = newStart + code[i].data; | |
144 | auto newRpc = r + code[i].data + IRL!(IR.Option); | |
145 | if (code[newEnd].code != IR.OrEnd) | |
146 | { | |
147 | newRpc--; | |
148 | } | |
149 | stack.push(tuple(newStart, newEnd, newRpc)); | |
150 | r += code[i].data + IRL!(IR.Option); | |
151 | i += code[i].data + IRL!(IR.Option); | |
152 | } | |
153 | pc = i; | |
154 | revPc = revStart; | |
155 | assert(code[pc].code == IR.OrEnd); | |
156 | } | |
157 | else | |
158 | pc += len; | |
159 | } | |
160 | } | |
161 | if (stack.empty) | |
162 | break; | |
163 | start = stack.top[0]; | |
164 | end = stack.top[1]; | |
165 | revPc = stack.top[2]; | |
166 | stack.pop(); | |
167 | } | |
168 | code[] = rev[]; | |
169 | } | |
170 | ||
b4c522fa IB |
171 | struct CodeGen |
172 | { | |
173 | Bytecode[] ir; // resulting bytecode | |
174 | Stack!(uint) fixupStack; // stack of opened start instructions | |
175 | NamedGroup[] dict; // maps name -> user group number | |
176 | Stack!(uint) groupStack; // stack of current number of group | |
177 | uint nesting = 0; // group nesting level and repetitions step | |
178 | uint lookaroundNest = 0; // nesting of lookaround | |
179 | uint counterDepth = 0; // current depth of nested counted repetitions | |
180 | CodepointSet[] charsets; // sets for char classes | |
181 | const(CharMatcher)[] matchers; // matchers for char classes | |
182 | uint[] backrefed; // bitarray for groups refered by backref | |
183 | uint ngroup; // final number of groups (of all patterns) | |
184 | ||
185 | void start(uint length) | |
186 | { | |
187 | if (!__ctfe) | |
188 | ir.reserve((length*5+2)/4); | |
189 | fixupStack.push(0); | |
190 | groupStack.push(1);//0 - whole match | |
191 | } | |
192 | ||
193 | //mark referenced groups for latter processing | |
194 | void markBackref(uint n) | |
195 | { | |
196 | if (n/32 >= backrefed.length) | |
197 | backrefed.length = n/32 + 1; | |
198 | backrefed[n / 32] |= 1 << (n & 31); | |
199 | } | |
200 | ||
201 | bool isOpenGroup(uint n) | |
202 | { | |
203 | import std.algorithm.searching : canFind; | |
204 | // walk the fixup stack and see if there are groups labeled 'n' | |
205 | // fixup '0' is reserved for alternations | |
206 | return fixupStack.data[1..$]. | |
207 | canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)(); | |
208 | } | |
209 | ||
210 | void put(Bytecode code) | |
211 | { | |
212 | enforce(ir.length < maxCompiledLength, | |
213 | "maximum compiled pattern length is exceeded"); | |
214 | ir ~= code; | |
215 | } | |
216 | ||
217 | void putRaw(uint number) | |
218 | { | |
219 | enforce(ir.length < maxCompiledLength, | |
220 | "maximum compiled pattern length is exceeded"); | |
221 | ir ~= Bytecode.fromRaw(number); | |
222 | } | |
223 | ||
224 | //try to generate optimal IR code for this CodepointSet | |
225 | @trusted void charsetToIr(CodepointSet set) | |
226 | {//@@@BUG@@@ writeln is @system | |
227 | uint chars = cast(uint) set.length; | |
228 | if (chars < Bytecode.maxSequence) | |
229 | { | |
230 | switch (chars) | |
231 | { | |
232 | case 1: | |
233 | put(Bytecode(IR.Char, set.byCodepoint.front)); | |
234 | break; | |
235 | case 0: | |
236 | throw new RegexException("empty CodepointSet not allowed"); | |
237 | default: | |
238 | foreach (ch; set.byCodepoint) | |
239 | put(Bytecode(IR.OrChar, ch, chars)); | |
240 | } | |
241 | } | |
242 | else | |
243 | { | |
244 | import std.algorithm.searching : countUntil; | |
245 | const ivals = set.byInterval; | |
246 | immutable n = charsets.countUntil(set); | |
247 | if (n >= 0) | |
248 | { | |
249 | if (ivals.length*2 > maxCharsetUsed) | |
250 | put(Bytecode(IR.Trie, cast(uint) n)); | |
251 | else | |
252 | put(Bytecode(IR.CodepointSet, cast(uint) n)); | |
253 | return; | |
254 | } | |
255 | if (ivals.length*2 > maxCharsetUsed) | |
256 | { | |
257 | auto t = getMatcher(set); | |
258 | put(Bytecode(IR.Trie, cast(uint) matchers.length)); | |
259 | matchers ~= t; | |
260 | debug(std_regex_allocation) writeln("Trie generated"); | |
261 | } | |
262 | else | |
263 | { | |
264 | put(Bytecode(IR.CodepointSet, cast(uint) charsets.length)); | |
265 | matchers ~= CharMatcher.init; | |
266 | } | |
267 | charsets ~= set; | |
268 | assert(charsets.length == matchers.length); | |
269 | } | |
270 | } | |
271 | ||
272 | void genLogicGroup() | |
273 | { | |
274 | nesting++; | |
275 | pushFixup(length); | |
276 | put(Bytecode(IR.Nop, 0)); | |
277 | } | |
278 | ||
279 | void genGroup() | |
280 | { | |
281 | nesting++; | |
282 | pushFixup(length); | |
283 | immutable nglob = groupStack.top++; | |
284 | enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded"); | |
285 | put(Bytecode(IR.GroupStart, nglob)); | |
286 | } | |
287 | ||
288 | void genNamedGroup(string name) | |
289 | { | |
290 | import std.array : insertInPlace; | |
291 | import std.range : assumeSorted; | |
292 | nesting++; | |
293 | pushFixup(length); | |
294 | immutable nglob = groupStack.top++; | |
295 | enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded"); | |
296 | auto t = NamedGroup(name, nglob); | |
297 | auto d = assumeSorted!"a.name < b.name"(dict); | |
298 | immutable ind = d.lowerBound(t).length; | |
299 | insertInPlace(dict, ind, t); | |
300 | put(Bytecode(IR.GroupStart, nglob)); | |
301 | } | |
302 | ||
303 | //generate code for start of lookaround: (?= (?! (?<= (?<! | |
304 | void genLookaround(IR opcode) | |
305 | { | |
306 | nesting++; | |
307 | pushFixup(length); | |
308 | put(Bytecode(opcode, 0)); | |
309 | put(Bytecode.fromRaw(0)); | |
310 | put(Bytecode.fromRaw(0)); | |
311 | groupStack.push(0); | |
312 | lookaroundNest++; | |
313 | enforce(lookaroundNest <= maxLookaroundDepth, | |
314 | "maximum lookaround depth is exceeded"); | |
315 | } | |
316 | ||
317 | void endPattern(uint num) | |
318 | { | |
319 | import std.algorithm.comparison : max; | |
320 | put(Bytecode(IR.End, num)); | |
321 | ngroup = max(ngroup, groupStack.top); | |
322 | groupStack.top = 1; // reset group counter | |
323 | } | |
324 | ||
325 | //fixup lookaround with start at offset fix and append a proper *-End opcode | |
326 | void fixLookaround(uint fix) | |
327 | { | |
328 | lookaroundNest--; | |
329 | ir[fix] = Bytecode(ir[fix].code, | |
330 | cast(uint) ir.length - fix - IRL!(IR.LookaheadStart)); | |
331 | auto g = groupStack.pop(); | |
332 | assert(!groupStack.empty); | |
333 | ir[fix+1] = Bytecode.fromRaw(groupStack.top); | |
334 | //groups are cumulative across lookarounds | |
335 | ir[fix+2] = Bytecode.fromRaw(groupStack.top+g); | |
336 | groupStack.top += g; | |
337 | if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart) | |
338 | { | |
339 | reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]); | |
340 | } | |
341 | put(ir[fix].paired); | |
342 | } | |
343 | ||
344 | // repetition of {1,1} | |
345 | void fixRepetition(uint offset) | |
346 | { | |
347 | import std.algorithm.mutation : copy; | |
348 | immutable replace = ir[offset].code == IR.Nop; | |
349 | if (replace) | |
350 | { | |
351 | copy(ir[offset + 1 .. $], ir[offset .. $ - 1]); | |
352 | ir.length -= 1; | |
353 | } | |
354 | } | |
355 | ||
356 | // repetition of {x,y} | |
357 | void fixRepetition(uint offset, uint min, uint max, bool greedy) | |
358 | { | |
359 | static import std.algorithm.comparison; | |
360 | import std.algorithm.mutation : copy; | |
361 | import std.array : insertInPlace; | |
362 | immutable replace = ir[offset].code == IR.Nop; | |
363 | immutable len = cast(uint) ir.length - offset - replace; | |
364 | if (max != infinite) | |
365 | { | |
366 | if (min != 1 || max != 1) | |
367 | { | |
368 | Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); | |
369 | if (replace) | |
370 | ir[offset] = op; | |
371 | else | |
372 | insertInPlace(ir, offset, op); | |
373 | put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); | |
374 | put(Bytecode.init); //hotspot | |
375 | putRaw(1); | |
376 | putRaw(min); | |
377 | putRaw(max); | |
378 | counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1); | |
379 | } | |
380 | } | |
381 | else if (min) //&& max is infinite | |
382 | { | |
383 | if (min != 1) | |
384 | { | |
385 | Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); | |
386 | if (replace) | |
387 | ir[offset] = op; | |
388 | else | |
389 | insertInPlace(ir, offset, op); | |
390 | offset += 1;//so it still points to the repeated block | |
391 | put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); | |
392 | put(Bytecode.init); //hotspot | |
393 | putRaw(1); | |
394 | putRaw(min); | |
395 | putRaw(min); | |
396 | counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1); | |
397 | } | |
398 | else if (replace) | |
399 | { | |
400 | copy(ir[offset+1 .. $], ir[offset .. $-1]); | |
401 | ir.length -= 1; | |
402 | } | |
403 | put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len)); | |
404 | enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded"); | |
405 | ir ~= ir[offset .. offset+len]; | |
406 | //IR.InfinteX is always a hotspot | |
407 | put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); | |
408 | put(Bytecode.init); //merge index | |
409 | } | |
410 | else//vanila {0,inf} | |
411 | { | |
412 | Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len); | |
413 | if (replace) | |
414 | ir[offset] = op; | |
415 | else | |
416 | insertInPlace(ir, offset, op); | |
417 | //IR.InfinteX is always a hotspot | |
418 | put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); | |
419 | put(Bytecode.init); //merge index | |
420 | } | |
421 | } | |
422 | ||
423 | void fixAlternation() | |
424 | { | |
425 | import std.array : insertInPlace; | |
426 | uint fix = fixupStack.top; | |
427 | if (ir.length > fix && ir[fix].code == IR.Option) | |
428 | { | |
429 | ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix); | |
430 | put(Bytecode(IR.GotoEndOr, 0)); | |
431 | fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option | |
432 | put(Bytecode(IR.Option, 0)); | |
433 | return; | |
434 | } | |
435 | uint len, orStart; | |
436 | //start a new option | |
437 | if (fixupStack.length == 1) | |
438 | {//only root entry, effectively no fixup | |
439 | len = cast(uint) ir.length + IRL!(IR.GotoEndOr); | |
440 | orStart = 0; | |
441 | } | |
442 | else | |
443 | {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length | |
444 | len = cast(uint) ir.length - fix - (ir[fix].length - 1); | |
445 | orStart = fix + ir[fix].length; | |
446 | } | |
447 | insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len)); | |
448 | assert(ir[orStart].code == IR.OrStart); | |
449 | put(Bytecode(IR.GotoEndOr, 0)); | |
450 | fixupStack.push(orStart); //fixup for StartOR | |
451 | fixupStack.push(cast(uint) ir.length); //for second Option | |
452 | put(Bytecode(IR.Option, 0)); | |
453 | } | |
454 | ||
455 | // finalizes IR.Option, fix points to the first option of sequence | |
456 | void finishAlternation(uint fix) | |
457 | { | |
458 | enforce(ir[fix].code == IR.Option, "no matching ')'"); | |
459 | ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart)); | |
460 | fix = fixupStack.pop(); | |
461 | enforce(ir[fix].code == IR.OrStart, "no matching ')'"); | |
462 | ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart)); | |
463 | put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart))); | |
464 | uint pc = fix + IRL!(IR.OrStart); | |
465 | while (ir[pc].code == IR.Option) | |
466 | { | |
467 | pc = pc + ir[pc].data; | |
468 | if (ir[pc].code != IR.GotoEndOr) | |
469 | break; | |
470 | ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd))); | |
471 | pc += IRL!(IR.GotoEndOr); | |
472 | } | |
473 | put(Bytecode.fromRaw(0)); | |
474 | } | |
475 | ||
476 | // returns: (flag - repetition possible?, fixup of the start of this "group") | |
477 | Tuple!(bool, uint) onClose() | |
478 | { | |
479 | nesting--; | |
480 | uint fix = popFixup(); | |
481 | switch (ir[fix].code) | |
482 | { | |
483 | case IR.GroupStart: | |
484 | put(Bytecode(IR.GroupEnd, ir[fix].data)); | |
485 | return tuple(true, fix); | |
486 | case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: | |
487 | assert(lookaroundNest); | |
488 | fixLookaround(fix); | |
489 | return tuple(false, 0u); | |
490 | case IR.Option: //| xxx ) | |
491 | //two fixups: last option + full OR | |
492 | finishAlternation(fix); | |
493 | fix = topFixup; | |
494 | switch (ir[fix].code) | |
495 | { | |
496 | case IR.GroupStart: | |
497 | popFixup(); | |
498 | put(Bytecode(IR.GroupEnd, ir[fix].data)); | |
499 | return tuple(true, fix); | |
500 | case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: | |
501 | assert(lookaroundNest); | |
502 | fix = popFixup(); | |
503 | fixLookaround(fix); | |
504 | return tuple(false, 0u); | |
505 | default://(?:xxx) | |
506 | popFixup(); | |
507 | return tuple(true, fix); | |
508 | } | |
509 | default://(?:xxx) | |
510 | return tuple(true, fix); | |
511 | } | |
512 | } | |
513 | ||
514 | uint popFixup(){ return fixupStack.pop(); } | |
515 | ||
516 | void pushFixup(uint val){ return fixupStack.push(val); } | |
517 | ||
518 | @property uint topFixup(){ return fixupStack.top; } | |
519 | ||
520 | @property size_t fixupLength(){ return fixupStack.data.length; } | |
521 | ||
522 | @property uint length(){ return cast(uint) ir.length; } | |
523 | } | |
524 | ||
525 | // safety limits | |
526 | enum maxGroupNumber = 2^^19; | |
527 | enum maxLookaroundDepth = 16; | |
528 | // *Bytecode.sizeof, i.e. 1Mb of bytecode alone | |
529 | enum maxCompiledLength = 2^^18; | |
530 | // amounts to up to 4 Mb of auxilary table for matching | |
531 | enum maxCumulativeRepetitionLength = 2^^20; | |
532 | // marker to indicate infinite repetition | |
533 | enum infinite = ~0u; | |
534 | ||
535 | struct Parser(R, Generator) | |
536 | if (isForwardRange!R && is(ElementType!R : dchar)) | |
537 | { | |
5fee5ec3 | 538 | dchar front; |
b4c522fa IB |
539 | bool empty; |
540 | R pat, origin; //keep full pattern for pretty printing error messages | |
541 | uint re_flags = 0; //global flags e.g. multiline + internal ones | |
542 | Generator g; | |
543 | ||
544 | @trusted this(S)(R pattern, S flags) | |
545 | if (isSomeString!S) | |
546 | { | |
547 | pat = origin = pattern; | |
548 | //reserve slightly more then avg as sampled from unittests | |
549 | parseFlags(flags); | |
5fee5ec3 IB |
550 | front = ' ';//a safe default for freeform parsing |
551 | popFront(); | |
b4c522fa IB |
552 | g.start(cast(uint) pat.length); |
553 | try | |
554 | { | |
555 | parseRegex(); | |
556 | } | |
557 | catch (Exception e) | |
558 | { | |
559 | error(e.msg);//also adds pattern location | |
560 | } | |
561 | g.endPattern(1); | |
562 | } | |
563 | ||
5fee5ec3 | 564 | void _popFront() |
b4c522fa IB |
565 | { |
566 | if (pat.empty) | |
567 | { | |
568 | empty = true; | |
b4c522fa | 569 | } |
5fee5ec3 IB |
570 | else |
571 | { | |
572 | front = pat.front; | |
573 | pat.popFront(); | |
574 | } | |
b4c522fa IB |
575 | } |
576 | ||
577 | void skipSpace() | |
578 | { | |
5fee5ec3 | 579 | while (!empty && isWhite(front)) _popFront(); |
b4c522fa IB |
580 | } |
581 | ||
5fee5ec3 | 582 | void popFront() |
b4c522fa | 583 | { |
5fee5ec3 IB |
584 | _popFront(); |
585 | if (re_flags & RegexOption.freeform) skipSpace(); | |
b4c522fa IB |
586 | } |
587 | ||
5fee5ec3 IB |
588 | auto save(){ return this; } |
589 | ||
b4c522fa IB |
590 | //parsing number with basic overflow check |
591 | uint parseDecimal() | |
592 | { | |
593 | uint r = 0; | |
5fee5ec3 | 594 | while (std.ascii.isDigit(front)) |
b4c522fa IB |
595 | { |
596 | if (r >= (uint.max/10)) | |
597 | error("Overflow in decimal number"); | |
5fee5ec3 IB |
598 | r = 10*r + cast(uint)(front-'0'); |
599 | popFront(); | |
600 | if (empty) break; | |
b4c522fa IB |
601 | } |
602 | return r; | |
603 | } | |
604 | ||
b4c522fa IB |
605 | // |
606 | @trusted void parseFlags(S)(S flags) | |
607 | {//@@@BUG@@@ text is @system | |
608 | import std.conv : text; | |
609 | foreach (ch; flags)//flags are ASCII anyway | |
610 | { | |
611 | L_FlagSwitch: | |
612 | switch (ch) | |
613 | { | |
614 | ||
615 | foreach (i, op; __traits(allMembers, RegexOption)) | |
616 | { | |
617 | case RegexOptionNames[i]: | |
618 | if (re_flags & mixin("RegexOption."~op)) | |
619 | throw new RegexException(text("redundant flag specified: ",ch)); | |
620 | re_flags |= mixin("RegexOption."~op); | |
621 | break L_FlagSwitch; | |
622 | } | |
623 | default: | |
624 | throw new RegexException(text("unknown regex flag '",ch,"'")); | |
625 | } | |
626 | } | |
627 | } | |
628 | ||
629 | //parse and store IR for regex pattern | |
630 | @trusted void parseRegex() | |
631 | { | |
632 | uint fix;//fixup pointer | |
633 | ||
634 | while (!empty) | |
635 | { | |
636 | debug(std_regex_parser) | |
637 | __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data); | |
5fee5ec3 | 638 | switch (front) |
b4c522fa IB |
639 | { |
640 | case '(': | |
5fee5ec3 IB |
641 | popFront(); |
642 | if (front == '?') | |
b4c522fa | 643 | { |
5fee5ec3 IB |
644 | popFront(); |
645 | switch (front) | |
b4c522fa IB |
646 | { |
647 | case '#': | |
648 | for (;;) | |
649 | { | |
5fee5ec3 IB |
650 | popFront(); |
651 | enforce(!empty, "Unexpected end of pattern"); | |
652 | if (front == ')') | |
b4c522fa | 653 | { |
5fee5ec3 | 654 | popFront(); |
b4c522fa IB |
655 | break; |
656 | } | |
657 | } | |
658 | break; | |
659 | case ':': | |
660 | g.genLogicGroup(); | |
5fee5ec3 | 661 | popFront(); |
b4c522fa IB |
662 | break; |
663 | case '=': | |
664 | g.genLookaround(IR.LookaheadStart); | |
5fee5ec3 | 665 | popFront(); |
b4c522fa IB |
666 | break; |
667 | case '!': | |
668 | g.genLookaround(IR.NeglookaheadStart); | |
5fee5ec3 | 669 | popFront(); |
b4c522fa IB |
670 | break; |
671 | case 'P': | |
5fee5ec3 IB |
672 | popFront(); |
673 | enforce(front == '<', "Expected '<' in named group"); | |
b4c522fa | 674 | string name; |
5fee5ec3 IB |
675 | popFront(); |
676 | if (empty || !(isAlpha(front) || front == '_')) | |
b4c522fa | 677 | error("Expected alpha starting a named group"); |
5fee5ec3 IB |
678 | name ~= front; |
679 | popFront(); | |
680 | while (!empty && (isAlpha(front) || | |
681 | front == '_' || std.ascii.isDigit(front))) | |
b4c522fa | 682 | { |
5fee5ec3 IB |
683 | name ~= front; |
684 | popFront(); | |
b4c522fa | 685 | } |
5fee5ec3 IB |
686 | enforce(front == '>', "Expected '>' closing named group"); |
687 | popFront(); | |
b4c522fa IB |
688 | g.genNamedGroup(name); |
689 | break; | |
690 | case '<': | |
5fee5ec3 IB |
691 | popFront(); |
692 | if (front == '=') | |
b4c522fa | 693 | g.genLookaround(IR.LookbehindStart); |
5fee5ec3 | 694 | else if (front == '!') |
b4c522fa IB |
695 | g.genLookaround(IR.NeglookbehindStart); |
696 | else | |
697 | error("'!' or '=' expected after '<'"); | |
5fee5ec3 | 698 | popFront(); |
b4c522fa IB |
699 | break; |
700 | default: | |
701 | uint enableFlags, disableFlags; | |
702 | bool enable = true; | |
703 | do | |
704 | { | |
5fee5ec3 | 705 | switch (front) |
b4c522fa IB |
706 | { |
707 | case 's': | |
708 | if (enable) | |
709 | enableFlags |= RegexOption.singleline; | |
710 | else | |
711 | disableFlags |= RegexOption.singleline; | |
712 | break; | |
713 | case 'x': | |
714 | if (enable) | |
715 | enableFlags |= RegexOption.freeform; | |
716 | else | |
717 | disableFlags |= RegexOption.freeform; | |
718 | break; | |
719 | case 'i': | |
720 | if (enable) | |
721 | enableFlags |= RegexOption.casefold; | |
722 | else | |
723 | disableFlags |= RegexOption.casefold; | |
724 | break; | |
725 | case 'm': | |
726 | if (enable) | |
727 | enableFlags |= RegexOption.multiline; | |
728 | else | |
729 | disableFlags |= RegexOption.multiline; | |
730 | break; | |
731 | case '-': | |
732 | if (!enable) | |
733 | error(" unexpected second '-' in flags"); | |
734 | enable = false; | |
735 | break; | |
736 | default: | |
737 | error(" 's', 'x', 'i', 'm' or '-' expected after '(?' "); | |
738 | } | |
5fee5ec3 IB |
739 | popFront(); |
740 | }while (front != ')'); | |
741 | popFront(); | |
b4c522fa IB |
742 | re_flags |= enableFlags; |
743 | re_flags &= ~disableFlags; | |
744 | } | |
745 | } | |
746 | else | |
747 | { | |
748 | g.genGroup(); | |
749 | } | |
750 | break; | |
751 | case ')': | |
752 | enforce(g.nesting, "Unmatched ')'"); | |
5fee5ec3 | 753 | popFront(); |
b4c522fa IB |
754 | auto pair = g.onClose(); |
755 | if (pair[0]) | |
756 | parseQuantifier(pair[1]); | |
757 | break; | |
758 | case '|': | |
5fee5ec3 | 759 | popFront(); |
b4c522fa IB |
760 | g.fixAlternation(); |
761 | break; | |
762 | default://no groups or whatever | |
763 | immutable start = g.length; | |
764 | parseAtom(); | |
765 | parseQuantifier(start); | |
766 | } | |
767 | } | |
768 | ||
769 | if (g.fixupLength != 1) | |
770 | { | |
771 | fix = g.popFixup(); | |
772 | g.finishAlternation(fix); | |
773 | enforce(g.fixupLength == 1, "no matching ')'"); | |
774 | } | |
775 | } | |
776 | ||
777 | ||
778 | //parse and store IR for atom-quantifier pair | |
779 | @trusted void parseQuantifier(uint offset) | |
780 | {//copy is @system | |
781 | if (empty) | |
782 | return g.fixRepetition(offset); | |
783 | uint min, max; | |
5fee5ec3 | 784 | switch (front) |
b4c522fa IB |
785 | { |
786 | case '*': | |
787 | min = 0; | |
788 | max = infinite; | |
789 | break; | |
790 | case '?': | |
791 | min = 0; | |
792 | max = 1; | |
793 | break; | |
794 | case '+': | |
795 | min = 1; | |
796 | max = infinite; | |
797 | break; | |
798 | case '{': | |
5fee5ec3 IB |
799 | popFront(); |
800 | enforce(!empty, "Unexpected end of regex pattern"); | |
801 | enforce(std.ascii.isDigit(front), "First number required in repetition"); | |
b4c522fa | 802 | min = parseDecimal(); |
5fee5ec3 | 803 | if (front == '}') |
b4c522fa | 804 | max = min; |
5fee5ec3 | 805 | else if (front == ',') |
b4c522fa | 806 | { |
5fee5ec3 IB |
807 | popFront(); |
808 | if (std.ascii.isDigit(front)) | |
b4c522fa | 809 | max = parseDecimal(); |
5fee5ec3 | 810 | else if (front == '}') |
b4c522fa IB |
811 | max = infinite; |
812 | else | |
813 | error("Unexpected symbol in regex pattern"); | |
814 | skipSpace(); | |
5fee5ec3 | 815 | enforce(front == '}', "Unmatched '{' in regex pattern"); |
b4c522fa IB |
816 | } |
817 | else | |
818 | error("Unexpected symbol in regex pattern"); | |
5fee5ec3 | 819 | enforce(min <= max, "Illegal {n,m} quantifier"); |
b4c522fa IB |
820 | break; |
821 | default: | |
822 | g.fixRepetition(offset); | |
823 | return; | |
824 | } | |
825 | bool greedy = true; | |
826 | //check only if we managed to get new symbol | |
5fee5ec3 IB |
827 | popFront(); |
828 | if (!empty && front == '?') | |
b4c522fa IB |
829 | { |
830 | greedy = false; | |
5fee5ec3 | 831 | popFront(); |
b4c522fa IB |
832 | } |
833 | g.fixRepetition(offset, min, max, greedy); | |
834 | } | |
835 | ||
836 | //parse and store IR for atom | |
837 | void parseAtom() | |
838 | { | |
839 | if (empty) | |
840 | return; | |
5fee5ec3 | 841 | switch (front) |
b4c522fa IB |
842 | { |
843 | case '*', '?', '+', '|', '{', '}': | |
844 | error("'*', '+', '?', '{', '}' not allowed in atom"); | |
b4c522fa IB |
845 | case '.': |
846 | if (re_flags & RegexOption.singleline) | |
847 | g.put(Bytecode(IR.Any, 0)); | |
848 | else | |
849 | { | |
850 | CodepointSet set; | |
851 | g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted); | |
852 | } | |
5fee5ec3 | 853 | popFront(); |
b4c522fa IB |
854 | break; |
855 | case '[': | |
856 | parseCharset(); | |
857 | break; | |
858 | case '\\': | |
5fee5ec3 IB |
859 | _popFront(); |
860 | enforce(!empty, "Unfinished escape sequence"); | |
b4c522fa IB |
861 | parseEscape(); |
862 | break; | |
863 | case '^': | |
864 | if (re_flags & RegexOption.multiline) | |
865 | g.put(Bytecode(IR.Bol, 0)); | |
866 | else | |
867 | g.put(Bytecode(IR.Bof, 0)); | |
5fee5ec3 | 868 | popFront(); |
b4c522fa IB |
869 | break; |
870 | case '$': | |
871 | if (re_flags & RegexOption.multiline) | |
872 | g.put(Bytecode(IR.Eol, 0)); | |
873 | else | |
874 | g.put(Bytecode(IR.Eof, 0)); | |
5fee5ec3 | 875 | popFront(); |
b4c522fa IB |
876 | break; |
877 | default: | |
b4c522fa IB |
878 | if (re_flags & RegexOption.casefold) |
879 | { | |
5fee5ec3 | 880 | auto range = simpleCaseFoldings(front); |
b4c522fa IB |
881 | assert(range.length <= 5); |
882 | if (range.length == 1) | |
883 | g.put(Bytecode(IR.Char, range.front)); | |
884 | else | |
885 | foreach (v; range) | |
886 | g.put(Bytecode(IR.OrChar, v, cast(uint) range.length)); | |
887 | } | |
888 | else | |
5fee5ec3 IB |
889 | g.put(Bytecode(IR.Char, front)); |
890 | popFront(); | |
b4c522fa | 891 | } |
b4c522fa IB |
892 | } |
893 | ||
b4c522fa IB |
894 | //parse and store IR for CodepointSet |
895 | void parseCharset() | |
896 | { | |
897 | const save = re_flags; | |
898 | re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did | |
5fee5ec3 IB |
899 | bool casefold = cast(bool)(re_flags & RegexOption.casefold); |
900 | g.charsetToIr(unicode.parseSet(this, casefold)); | |
b4c522fa | 901 | re_flags = save; |
5fee5ec3 | 902 | // Last next() in parseCharset is executed w/o freeform flag |
b4c522fa IB |
903 | if (re_flags & RegexOption.freeform) skipSpace(); |
904 | } | |
905 | ||
b4c522fa IB |
906 | //parse and generate IR for escape stand alone escape sequence |
907 | @trusted void parseEscape() | |
908 | {//accesses array of appender | |
909 | import std.algorithm.iteration : sum; | |
5fee5ec3 | 910 | switch (front) |
b4c522fa | 911 | { |
5fee5ec3 IB |
912 | case 'f': popFront(); g.put(Bytecode(IR.Char, '\f')); break; |
913 | case 'n': popFront(); g.put(Bytecode(IR.Char, '\n')); break; | |
914 | case 'r': popFront(); g.put(Bytecode(IR.Char, '\r')); break; | |
915 | case 't': popFront(); g.put(Bytecode(IR.Char, '\t')); break; | |
916 | case 'v': popFront(); g.put(Bytecode(IR.Char, '\v')); break; | |
b4c522fa IB |
917 | |
918 | case 'd': | |
5fee5ec3 | 919 | popFront(); |
b4c522fa IB |
920 | g.charsetToIr(unicode.Nd); |
921 | break; | |
922 | case 'D': | |
5fee5ec3 | 923 | popFront(); |
b4c522fa IB |
924 | g.charsetToIr(unicode.Nd.inverted); |
925 | break; | |
5fee5ec3 IB |
926 | case 'b': popFront(); g.put(Bytecode(IR.Wordboundary, 0)); break; |
927 | case 'B': popFront(); g.put(Bytecode(IR.Notwordboundary, 0)); break; | |
b4c522fa | 928 | case 's': |
5fee5ec3 | 929 | popFront(); |
b4c522fa IB |
930 | g.charsetToIr(unicode.White_Space); |
931 | break; | |
932 | case 'S': | |
5fee5ec3 | 933 | popFront(); |
b4c522fa IB |
934 | g.charsetToIr(unicode.White_Space.inverted); |
935 | break; | |
936 | case 'w': | |
5fee5ec3 | 937 | popFront(); |
b4c522fa IB |
938 | g.charsetToIr(wordCharacter); |
939 | break; | |
940 | case 'W': | |
5fee5ec3 | 941 | popFront(); |
b4c522fa IB |
942 | g.charsetToIr(wordCharacter.inverted); |
943 | break; | |
944 | case 'p': case 'P': | |
5fee5ec3 IB |
945 | bool casefold = cast(bool)(re_flags & RegexOption.casefold); |
946 | auto set = unicode.parsePropertySpec(this, front == 'P', casefold); | |
947 | g.charsetToIr(set); | |
b4c522fa IB |
948 | break; |
949 | case 'x': | |
950 | immutable code = parseUniHex(pat, 2); | |
5fee5ec3 | 951 | popFront(); |
b4c522fa IB |
952 | g.put(Bytecode(IR.Char,code)); |
953 | break; | |
954 | case 'u': case 'U': | |
5fee5ec3 IB |
955 | immutable code = parseUniHex(pat, front == 'u' ? 4 : 8); |
956 | popFront(); | |
b4c522fa IB |
957 | g.put(Bytecode(IR.Char, code)); |
958 | break; | |
959 | case 'c': //control codes | |
5fee5ec3 IB |
960 | Bytecode code = Bytecode(IR.Char, unicode.parseControlCode(this)); |
961 | popFront(); | |
b4c522fa IB |
962 | g.put(code); |
963 | break; | |
964 | case '0': | |
5fee5ec3 | 965 | popFront(); |
b4c522fa IB |
966 | g.put(Bytecode(IR.Char, 0));//NUL character |
967 | break; | |
968 | case '1': .. case '9': | |
5fee5ec3 | 969 | uint nref = cast(uint) front - '0'; |
b4c522fa IB |
970 | immutable maxBackref = sum(g.groupStack.data); |
971 | enforce(nref < maxBackref, "Backref to unseen group"); | |
972 | //perl's disambiguation rule i.e. | |
973 | //get next digit only if there is such group number | |
5fee5ec3 IB |
974 | popFront(); |
975 | while (nref < maxBackref && !empty && std.ascii.isDigit(front)) | |
b4c522fa | 976 | { |
5fee5ec3 IB |
977 | nref = nref * 10 + front - '0'; |
978 | popFront(); | |
b4c522fa IB |
979 | } |
980 | if (nref >= maxBackref) | |
981 | nref /= 10; | |
982 | enforce(!g.isOpenGroup(nref), "Backref to open group"); | |
983 | uint localLimit = maxBackref - g.groupStack.top; | |
984 | if (nref >= localLimit) | |
985 | { | |
986 | g.put(Bytecode(IR.Backref, nref-localLimit)); | |
987 | g.ir[$-1].setLocalRef(); | |
988 | } | |
989 | else | |
990 | g.put(Bytecode(IR.Backref, nref)); | |
991 | g.markBackref(nref); | |
992 | break; | |
993 | default: | |
5fee5ec3 | 994 | if (front == '\\' && !pat.empty) |
b4c522fa | 995 | { |
5fee5ec3 | 996 | if (pat.front >= privateUseStart && pat.front <= privateUseEnd) |
b4c522fa IB |
997 | enforce(false, "invalid escape sequence"); |
998 | } | |
5fee5ec3 | 999 | if (front >= privateUseStart && front <= privateUseEnd) |
b4c522fa | 1000 | { |
5fee5ec3 | 1001 | g.endPattern(front - privateUseStart + 1); |
b4c522fa IB |
1002 | break; |
1003 | } | |
5fee5ec3 IB |
1004 | auto op = Bytecode(IR.Char, front); |
1005 | popFront(); | |
b4c522fa IB |
1006 | g.put(op); |
1007 | } | |
1008 | } | |
1009 | ||
b4c522fa IB |
1010 | // |
1011 | @trusted void error(string msg) | |
1012 | { | |
1013 | import std.array : appender; | |
5fee5ec3 | 1014 | import std.format.write : formattedWrite; |
b4c522fa IB |
1015 | auto app = appender!string(); |
1016 | formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`", | |
1017 | msg, origin[0..$-pat.length], pat); | |
1018 | throw new RegexException(app.data); | |
1019 | } | |
1020 | ||
1021 | alias Char = BasicElementOf!R; | |
1022 | ||
1023 | @property program() | |
1024 | { | |
1025 | return makeRegex(this); | |
1026 | } | |
1027 | } | |
1028 | ||
1029 | /+ | |
1030 | Postproces the IR, then optimize. | |
1031 | +/ | |
1032 | @trusted void postprocess(Char)(ref Regex!Char zis) | |
1033 | {//@@@BUG@@@ write is @system | |
1034 | with(zis) | |
1035 | { | |
1036 | struct FixedStack(T) | |
1037 | { | |
1038 | T[] arr; | |
1039 | uint _top; | |
1040 | //this(T[] storage){ arr = storage; _top = -1; } | |
1041 | @property ref T top(){ assert(!empty); return arr[_top]; } | |
1042 | void push(T x){ arr[++_top] = x; } | |
1043 | T pop() { assert(!empty); return arr[_top--]; } | |
1044 | @property bool empty(){ return _top == -1; } | |
1045 | } | |
1046 | auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1); | |
1047 | counterRange.push(1); | |
1048 | ulong cumRange = 0; | |
1049 | for (uint i = 0; i < ir.length; i += ir[i].length) | |
1050 | { | |
1051 | if (ir[i].hotspot) | |
1052 | { | |
1053 | assert(i + 1 < ir.length, | |
1054 | "unexpected end of IR while looking for hotspot"); | |
1055 | ir[i+1] = Bytecode.fromRaw(hotspotTableSize); | |
1056 | hotspotTableSize += counterRange.top; | |
1057 | } | |
1058 | switch (ir[i].code) | |
1059 | { | |
1060 | case IR.RepeatStart, IR.RepeatQStart: | |
1061 | uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart)); | |
1062 | assert(ir[repEnd].code == ir[i].paired.code); | |
1063 | immutable max = ir[repEnd + 4].raw; | |
1064 | ir[repEnd+2].raw = counterRange.top; | |
1065 | ir[repEnd+3].raw *= counterRange.top; | |
1066 | ir[repEnd+4].raw *= counterRange.top; | |
1067 | ulong cntRange = cast(ulong)(max)*counterRange.top; | |
1068 | cumRange += cntRange; | |
1069 | enforce(cumRange < maxCumulativeRepetitionLength, | |
1070 | "repetition length limit is exceeded"); | |
1071 | counterRange.push(cast(uint) cntRange + counterRange.top); | |
1072 | threadCount += counterRange.top; | |
1073 | break; | |
1074 | case IR.RepeatEnd, IR.RepeatQEnd: | |
1075 | threadCount += counterRange.top; | |
1076 | counterRange.pop(); | |
1077 | break; | |
1078 | case IR.GroupStart: | |
1079 | if (isBackref(ir[i].data)) | |
1080 | ir[i].setBackrefence(); | |
1081 | threadCount += counterRange.top; | |
1082 | break; | |
1083 | case IR.GroupEnd: | |
1084 | if (isBackref(ir[i].data)) | |
1085 | ir[i].setBackrefence(); | |
1086 | threadCount += counterRange.top; | |
1087 | break; | |
1088 | default: | |
1089 | threadCount += counterRange.top; | |
1090 | } | |
1091 | } | |
1092 | checkIfOneShot(); | |
1093 | if (!(flags & RegexInfo.oneShot)) | |
1094 | kickstart = Kickstart!Char(zis, new uint[](256)); | |
1095 | debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount); | |
1096 | optimize(zis); | |
1097 | } | |
1098 | } | |
1099 | ||
1100 | void fixupBytecode()(Bytecode[] ir) | |
1101 | { | |
1102 | Stack!uint fixups; | |
1103 | ||
1104 | with(IR) for (uint i=0; i<ir.length; i+= ir[i].length) | |
1105 | { | |
1106 | if (ir[i].isStart || ir[i].code == Option) | |
1107 | fixups.push(i); | |
1108 | else if (ir[i].code == OrEnd) | |
1109 | { | |
1110 | // Alternatives need more care | |
1111 | auto j = fixups.pop(); // last Option | |
1112 | ir[j].data = i - j - ir[j].length; | |
1113 | j = fixups.pop(); // OrStart | |
1114 | ir[j].data = i - j - ir[j].length; | |
1115 | ir[i].data = ir[j].data; | |
1116 | ||
1117 | // fixup all GotoEndOrs | |
1118 | j = j + IRL!(OrStart); | |
1119 | assert(ir[j].code == Option); | |
1120 | for (;;) | |
1121 | { | |
1122 | auto next = j + ir[j].data + IRL!(Option); | |
1123 | if (ir[next].code == IR.OrEnd) | |
1124 | break; | |
1125 | ir[next - IRL!(GotoEndOr)].data = i - next; | |
1126 | j = next; | |
1127 | } | |
1128 | } | |
1129 | else if (ir[i].code == GotoEndOr) | |
1130 | { | |
1131 | auto j = fixups.pop(); // Option | |
1132 | ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option | |
1133 | } | |
1134 | else if (ir[i].isEnd) | |
1135 | { | |
1136 | auto j = fixups.pop(); | |
1137 | ir[i].data = i - j - ir[j].length; | |
1138 | ir[j].data = ir[i].data; | |
1139 | } | |
1140 | } | |
1141 | assert(fixups.empty); | |
1142 | } | |
1143 | ||
1144 | void optimize(Char)(ref Regex!Char zis) | |
1145 | { | |
1146 | import std.array : insertInPlace; | |
1147 | CodepointSet nextSet(uint idx) | |
1148 | { | |
1149 | CodepointSet set; | |
1150 | with(zis) with(IR) | |
1151 | Outer: | |
1152 | for (uint i = idx; i < ir.length; i += ir[i].length) | |
1153 | { | |
1154 | switch (ir[i].code) | |
1155 | { | |
1156 | case Char: | |
1157 | set.add(ir[i].data, ir[i].data+1); | |
1158 | goto default; | |
1159 | //TODO: OrChar | |
1160 | case Trie, CodepointSet: | |
1161 | set = zis.charsets[ir[i].data]; | |
1162 | goto default; | |
1163 | case GroupStart,GroupEnd: | |
1164 | break; | |
1165 | default: | |
1166 | break Outer; | |
1167 | } | |
1168 | } | |
1169 | return set; | |
1170 | } | |
1171 | ||
1172 | with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length) | |
1173 | { | |
1174 | if (ir[i].code == InfiniteEnd) | |
1175 | { | |
1176 | auto set = nextSet(i+IRL!(InfiniteEnd)); | |
1177 | if (!set.empty && set.length < 10_000) | |
1178 | { | |
1179 | ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data); | |
1180 | ir[i - ir[i].data - IRL!(InfiniteStart)] = | |
1181 | Bytecode(InfiniteBloomStart, ir[i].data); | |
1182 | ir.insertInPlace(i+IRL!(InfiniteEnd), | |
1183 | Bytecode.fromRaw(cast(uint) zis.filters.length)); | |
1184 | zis.filters ~= BitTable(set); | |
1185 | fixupBytecode(ir); | |
1186 | } | |
1187 | } | |
1188 | } | |
1189 | } | |
1190 | ||
1191 | //IR code validator - proper nesting, illegal instructions, etc. | |
1192 | @trusted void validateRe(Char)(ref Regex!Char zis) | |
1193 | {//@@@BUG@@@ text is @system | |
1194 | import std.conv : text; | |
1195 | with(zis) | |
1196 | { | |
1197 | for (uint pc = 0; pc < ir.length; pc += ir[pc].length) | |
1198 | { | |
1199 | if (ir[pc].isStart || ir[pc].isEnd) | |
1200 | { | |
1201 | immutable dest = ir[pc].indexOfPair(pc); | |
1202 | assert(dest < ir.length, text("Wrong length in opcode at pc=", | |
1203 | pc, " ", dest, " vs ", ir.length)); | |
1204 | assert(ir[dest].paired == ir[pc], | |
1205 | text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest)); | |
1206 | } | |
1207 | else if (ir[pc].isAtom) | |
1208 | { | |
1209 | ||
1210 | } | |
1211 | else | |
1212 | assert(0, text("Unknown type of instruction at pc=", pc)); | |
1213 | } | |
1214 | } | |
1215 | } |