]>
Commit | Line | Data |
---|---|---|
30681738 RD |
1 | ------------------------------------------------------------------------------ |
2 | -- -- | |
3 | -- GNAT LIBRARY COMPONENTS -- | |
4 | -- -- | |
5 | -- S Y S T E M . R E G P A T -- | |
6 | -- -- | |
7 | -- S p e c -- | |
8 | -- -- | |
9 | -- Copyright (C) 1986 by University of Toronto. -- | |
4b490c1e | 10 | -- Copyright (C) 1996-2020, AdaCore -- |
30681738 RD |
11 | -- -- |
12 | -- GNAT is free software; you can redistribute it and/or modify it under -- | |
13 | -- terms of the GNU General Public License as published by the Free Soft- -- | |
607d0635 | 14 | -- ware Foundation; either version 3, or (at your option) any later ver- -- |
30681738 RD |
15 | -- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- |
16 | -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- | |
607d0635 AC |
17 | -- or FITNESS FOR A PARTICULAR PURPOSE. -- |
18 | -- -- | |
19 | -- As a special exception under Section 7 of GPL version 3, you are granted -- | |
20 | -- additional permissions described in the GCC Runtime Library Exception, -- | |
21 | -- version 3.1, as published by the Free Software Foundation. -- | |
22 | -- -- | |
23 | -- You should have received a copy of the GNU General Public License and -- | |
24 | -- a copy of the GCC Runtime Library Exception along with this program; -- | |
25 | -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- | |
26 | -- <http://www.gnu.org/licenses/>. -- | |
30681738 RD |
27 | -- -- |
28 | -- GNAT was originally developed by the GNAT team at New York University. -- | |
29 | -- Extensive contributions were provided by Ada Core Technologies Inc. -- | |
30 | -- -- | |
31 | ------------------------------------------------------------------------------ | |
32 | ||
33 | -- This package implements roughly the same set of regular expressions as | |
34 | -- are available in the Perl or Python programming languages. | |
35 | ||
36 | -- This is an extension of the original V7 style regular expression library | |
37 | -- written in C by Henry Spencer. Apart from the translation to Ada, the | |
38 | -- interface has been considerably changed to use the Ada String type | |
39 | -- instead of C-style nul-terminated strings. | |
40 | ||
41 | -- Note: this package is in the System hierarchy so that it can be directly | |
42 | -- used by other predefined packages. User access to this package is via | |
43 | -- a renaming of this package in GNAT.Regpat (file g-regpat.ads). | |
44 | ||
45 | package System.Regpat is | |
46 | pragma Preelaborate; | |
47 | ||
48 | -- The grammar is the following: | |
49 | ||
50 | -- regexp ::= expr | |
51 | -- ::= ^ expr -- anchor at the beginning of string | |
52 | -- ::= expr $ -- anchor at the end of string | |
53 | ||
54 | -- expr ::= term | |
55 | -- ::= term | term -- alternation (term or term ...) | |
56 | ||
57 | -- term ::= item | |
58 | -- ::= item item ... -- concatenation (item then item) | |
59 | ||
60 | -- item ::= elmt -- match elmt | |
61 | -- ::= elmt * -- zero or more elmt's | |
62 | -- ::= elmt + -- one or more elmt's | |
63 | -- ::= elmt ? -- matches elmt or nothing | |
64 | -- ::= elmt *? -- zero or more times, minimum number | |
65 | -- ::= elmt +? -- one or more times, minimum number | |
66 | -- ::= elmt ?? -- zero or one time, minimum number | |
67 | -- ::= elmt { num } -- matches elmt exactly num times | |
68 | -- ::= elmt { num , } -- matches elmt at least num times | |
69 | -- ::= elmt { num , num2 } -- matches between num and num2 times | |
70 | -- ::= elmt { num }? -- matches elmt exactly num times | |
71 | -- ::= elmt { num , }? -- matches elmt at least num times | |
72 | -- non-greedy version | |
73 | -- ::= elmt { num , num2 }? -- matches between num and num2 times | |
74 | -- non-greedy version | |
75 | ||
76 | -- elmt ::= nchr -- matches given character | |
77 | -- ::= [range range ...] -- matches any character listed | |
78 | -- ::= [^ range range ...] -- matches any character not listed | |
79 | -- ::= . -- matches any single character | |
80 | -- -- except newlines | |
8894aa20 AC |
81 | -- ::= ( expr ) -- parenthesis used for grouping |
82 | -- ::= (?: expr ) -- non-capturing parenthesis | |
83 | -- ::= \ num -- reference to num-th capturing | |
84 | -- parenthesis | |
30681738 RD |
85 | |
86 | -- range ::= char - char -- matches chars in given range | |
87 | -- ::= nchr | |
88 | -- ::= [: posix :] -- any character in the POSIX range | |
89 | -- ::= [:^ posix :] -- not in the POSIX range | |
90 | ||
91 | -- posix ::= alnum -- alphanumeric characters | |
92 | -- ::= alpha -- alphabetic characters | |
93 | -- ::= ascii -- ascii characters (0 .. 127) | |
94 | -- ::= cntrl -- control chars (0..31, 127..159) | |
95 | -- ::= digit -- digits ('0' .. '9') | |
96 | -- ::= graph -- graphic chars (32..126, 160..255) | |
97 | -- ::= lower -- lower case characters | |
98 | -- ::= print -- printable characters (32..127) | |
99 | -- -- and whitespaces (9 .. 13) | |
100 | -- ::= punct -- printable, except alphanumeric | |
101 | -- ::= space -- space characters | |
102 | -- ::= upper -- upper case characters | |
103 | -- ::= word -- alphanumeric characters | |
104 | -- ::= xdigit -- hexadecimal chars (0..9, a..f) | |
105 | ||
106 | -- char ::= any character, including special characters | |
107 | -- ASCII.NUL is not supported. | |
108 | ||
109 | -- nchr ::= any character except \()[].*+?^ or \char to match char | |
110 | -- \n means a newline (ASCII.LF) | |
111 | -- \t means a tab (ASCII.HT) | |
112 | -- \r means a return (ASCII.CR) | |
113 | -- \b matches the empty string at the beginning or end of a | |
114 | -- word. A word is defined as a set of alphanumerical | |
115 | -- characters (see \w below). | |
116 | -- \B matches the empty string only when *not* at the | |
117 | -- beginning or end of a word. | |
118 | -- \d matches any digit character ([0-9]) | |
119 | -- \D matches any non digit character ([^0-9]) | |
120 | -- \s matches any white space character. This is equivalent | |
121 | -- to [ \t\n\r\f\v] (space, tab, newline, return, form-feed, vertical-tab). | |
122 | -- \S matches any non-white space character. | |
123 | -- \w matches any alphanumeric character or underscore. | |
124 | -- This includes accented letters, as defined in the | |
125 | -- package Ada.Characters.Handling. | |
126 | -- \W matches any non-alphanumeric character. | |
127 | -- \A matches the empty string only at the beginning of the | |
128 | -- string, whatever flags are used for Compile (the | |
129 | -- behavior of ^ can change, see Regexp_Flags below). | |
130 | -- \G matches the empty string only at the end of the | |
131 | -- string, whatever flags are used for Compile (the | |
132 | -- behavior of $ can change, see Regexp_Flags below). | |
133 | -- ... ::= is used to indicate repetition (one or more terms) | |
134 | ||
135 | -- Embedded newlines are not matched by the ^ operator. | |
136 | -- It is possible to retrieve the substring matched by a parenthesis | |
137 | -- expression. Although the depth of parenthesis is not limited in the | |
138 | -- regexp, only the first Max_Paren_Count substrings can be retrieved. | |
139 | ||
140 | -- The highest value possible for the arguments to the curly operator ({}) | |
141 | -- is given by the constant Max_Curly_Repeat below. | |
142 | ||
143 | -- The operators '*', '+', '?' and '{}' always match the longest possible | |
144 | -- substring. They all have a non-greedy version (with an extra ? after the | |
145 | -- operator), which matches the shortest possible substring. | |
146 | ||
147 | -- For instance: | |
148 | -- regexp="<.*>" string="<h1>title</h1>" matches="<h1>title</h1>" | |
149 | -- regexp="<.*?>" string="<h1>title</h1>" matches="<h1>" | |
150 | -- | |
151 | -- '{' and '}' are only considered as special characters if they appear | |
152 | -- in a substring that looks exactly like '{n}', '{n,m}' or '{n,}', where | |
153 | -- n and m are digits. No space is allowed. In other contexts, the curly | |
154 | -- braces will simply be treated as normal characters. | |
155 | ||
156 | -- Compiling Regular Expressions | |
157 | -- ============================= | |
158 | ||
159 | -- To use this package, you first need to compile the regular expression | |
160 | -- (a string) into a byte-code program, in a Pattern_Matcher structure. | |
161 | -- This first step checks that the regexp is valid, and optimizes the | |
162 | -- matching algorithms of the second step. | |
163 | ||
164 | -- Two versions of the Compile subprogram are given: one in which this | |
165 | -- package will compute itself the best possible size to allocate for the | |
166 | -- byte code; the other where you must allocate enough memory yourself. An | |
167 | -- exception is raised if there is not enough memory. | |
168 | ||
169 | -- declare | |
170 | -- Regexp : String := "a|b"; | |
171 | ||
172 | -- Matcher : Pattern_Matcher := Compile (Regexp); | |
173 | -- -- The size for matcher is automatically allocated | |
174 | ||
175 | -- Matcher2 : Pattern_Matcher (1000); | |
176 | -- -- Some space is allocated directly. | |
177 | ||
178 | -- begin | |
179 | -- Compile (Matcher2, Regexp); | |
180 | -- ... | |
181 | -- end; | |
182 | ||
183 | -- Note that the second version is significantly faster, since with the | |
184 | -- first version the regular expression has in fact to be compiled twice | |
185 | -- (first to compute the size, then to generate the byte code). | |
186 | ||
187 | -- Note also that you cannot use the function version of Compile if you | |
188 | -- specify the size of the Pattern_Matcher, since the discriminants will | |
189 | -- most probably be different and you will get a Constraint_Error. | |
190 | ||
191 | -- Matching Strings | |
192 | -- ================ | |
193 | ||
194 | -- Once the regular expression has been compiled, you can use it as often | |
195 | -- as needed to match strings. | |
196 | ||
197 | -- Several versions of the Match subprogram are provided, with different | |
198 | -- parameters and return results. | |
199 | ||
200 | -- See the description under each of these subprograms | |
201 | ||
202 | -- Here is a short example showing how to get the substring matched by | |
203 | -- the first parenthesis pair. | |
204 | ||
205 | -- declare | |
206 | -- Matches : Match_Array (0 .. 1); | |
207 | -- Regexp : String := "a(b|c)d"; | |
208 | -- Str : String := "gacdg"; | |
209 | ||
210 | -- begin | |
211 | -- Match (Compile (Regexp), Str, Matches); | |
212 | -- return Str (Matches (1).First .. Matches (1).Last); | |
213 | -- -- returns 'c' | |
214 | -- end; | |
215 | ||
216 | -- Finding all occurrences | |
217 | -- ======================= | |
218 | ||
219 | -- Finding all the occurrences of a regular expression in a string cannot | |
220 | -- be done by simply passing a slice of the string. This wouldn't work for | |
221 | -- anchored regular expressions (the ones starting with "^" or ending with | |
222 | -- "$"). | |
223 | -- Instead, you need to use the last parameter to Match (Data_First), as in | |
224 | -- the following loop: | |
225 | ||
226 | -- declare | |
227 | -- Str : String := | |
228 | -- "-- first line" & ASCII.LF & "-- second line"; | |
229 | -- Matches : Match_Array (0 .. 0); | |
230 | -- Regexp : Pattern_Matcher := Compile ("^--", Multiple_Lines); | |
231 | -- Current : Natural := Str'First; | |
232 | -- begin | |
233 | -- loop | |
234 | -- Match (Regexp, Str, Matches, Current); | |
235 | -- exit when Matches (0) = No_Match; | |
236 | -- | |
237 | -- -- Process the match at position Matches (0).First | |
238 | -- | |
239 | -- Current := Matches (0).Last + 1; | |
240 | -- end loop; | |
241 | -- end; | |
242 | ||
243 | -- String Substitution | |
244 | -- =================== | |
245 | ||
246 | -- No subprogram is currently provided for string substitution. | |
247 | -- However, this is easy to simulate with the parenthesis groups, as | |
248 | -- shown below. | |
249 | ||
250 | -- This example swaps the first two words of the string: | |
251 | ||
252 | -- declare | |
253 | -- Regexp : String := "([a-z]+) +([a-z]+)"; | |
254 | -- Str : String := " first second third "; | |
255 | -- Matches : Match_Array (0 .. 2); | |
256 | ||
257 | -- begin | |
258 | -- Match (Compile (Regexp), Str, Matches); | |
259 | -- return Str (Str'First .. Matches (1).First - 1) | |
260 | -- & Str (Matches (2).First .. Matches (2).Last) | |
261 | -- & " " | |
262 | -- & Str (Matches (1).First .. Matches (1).Last) | |
263 | -- & Str (Matches (2).Last + 1 .. Str'Last); | |
264 | -- -- returns " second first third " | |
265 | -- end; | |
266 | ||
267 | --------------- | |
268 | -- Constants -- | |
269 | --------------- | |
270 | ||
271 | Expression_Error : exception; | |
272 | -- This exception is raised when trying to compile an invalid regular | |
273 | -- expression. All subprograms taking an expression as parameter may raise | |
274 | -- Expression_Error. | |
275 | ||
276 | Max_Paren_Count : constant := 255; | |
277 | -- Maximum number of parenthesis in a regular expression. This is limited | |
278 | -- by the size of a Character, as found in the byte-compiled version of | |
279 | -- regular expressions. | |
280 | ||
281 | Max_Curly_Repeat : constant := 32767; | |
282 | -- Maximum number of repetition for the curly operator. The digits in the | |
283 | -- {n}, {n,} and {n,m } operators cannot be higher than this constant, | |
284 | -- since they have to fit on two characters in the byte-compiled version of | |
285 | -- regular expressions. | |
286 | ||
287 | Max_Program_Size : constant := 2**15 - 1; | |
288 | -- Maximum size that can be allocated for a program | |
289 | ||
290 | type Program_Size is range 0 .. Max_Program_Size; | |
291 | for Program_Size'Size use 16; | |
292 | -- Number of bytes allocated for the byte-compiled version of a regular | |
293 | -- expression. The size required depends on the complexity of the regular | |
294 | -- expression in a complex manner that is undocumented (other than in the | |
295 | -- body of the Compile procedure). Normally the size is automatically set | |
296 | -- and the programmer need not be concerned about it. There are two | |
297 | -- exceptions to this. First in the calls to Match, it is possible to | |
298 | -- specify a non-zero size that is known to be large enough. This can | |
299 | -- slightly increase the efficiency by avoiding a copy. Second, in the case | |
300 | -- of calling compile, it is possible using the procedural form of Compile | |
301 | -- to use a single Pattern_Matcher variable for several different | |
302 | -- expressions by setting its size sufficiently large. | |
303 | ||
304 | Auto_Size : constant := 0; | |
305 | -- Used in calls to Match to indicate that the Size should be set to | |
306 | -- a value appropriate to the expression being used automatically. | |
307 | ||
308 | type Regexp_Flags is mod 256; | |
309 | for Regexp_Flags'Size use 8; | |
310 | -- Flags that can be given at compile time to specify default | |
311 | -- properties for the regular expression. | |
312 | ||
313 | No_Flags : constant Regexp_Flags; | |
314 | Case_Insensitive : constant Regexp_Flags; | |
315 | -- The automaton is optimized so that the matching is done in a case | |
316 | -- insensitive manner (upper case characters and lower case characters | |
317 | -- are all treated the same way). | |
318 | ||
319 | Single_Line : constant Regexp_Flags; | |
320 | -- Treat the Data we are matching as a single line. This means that | |
321 | -- ^ and $ will ignore \n (unless Multiple_Lines is also specified), | |
322 | -- and that '.' will match \n. | |
323 | ||
324 | Multiple_Lines : constant Regexp_Flags; | |
325 | -- Treat the Data as multiple lines. This means that ^ and $ will also | |
326 | -- match on internal newlines (ASCII.LF), in addition to the beginning | |
327 | -- and end of the string. | |
328 | -- | |
329 | -- This can be combined with Single_Line. | |
330 | ||
331 | ----------------- | |
332 | -- Match_Array -- | |
333 | ----------------- | |
334 | ||
335 | subtype Match_Count is Natural range 0 .. Max_Paren_Count; | |
336 | ||
337 | type Match_Location is record | |
338 | First : Natural := 0; | |
339 | Last : Natural := 0; | |
340 | end record; | |
341 | ||
342 | type Match_Array is array (Match_Count range <>) of Match_Location; | |
343 | -- Used for regular expressions that can contain parenthesized | |
344 | -- subexpressions. Certain Match subprograms below produce Matches of type | |
345 | -- Match_Array. Each component of Matches is set to the subrange of the | |
346 | -- matched substring, or to No_Match if no match. Matches (N) is for the | |
347 | -- N'th parenthesized subexpression; Matches (0) is for the whole | |
348 | -- expression. | |
349 | -- | |
67914693 | 350 | -- Non-capturing parenthesis (introduced with (?:...)) cannot be |
8894aa20 AC |
351 | -- retrieved and do not count in the match array index. |
352 | -- | |
30681738 RD |
353 | -- For instance, if your regular expression is: "a((b*)c+)(d+)", then |
354 | -- 12 3 | |
355 | -- Matches (0) is for "a((b*)c+)(d+)" (the entire expression) | |
356 | -- Matches (1) is for "(b*)c+" | |
90878b12 | 357 | -- Matches (2) is for "b*" |
30681738 RD |
358 | -- Matches (3) is for "d+" |
359 | -- | |
360 | -- The number of parenthesis groups that can be retrieved is limited only | |
361 | -- by Max_Paren_Count. | |
362 | -- | |
363 | -- Normally, the bounds of the Matches actual parameter will be | |
364 | -- 0 .. Paren_Count (Regexp), to get all the matches. However, it is fine | |
365 | -- if Matches is shorter than that on either end; missing components will | |
366 | -- be ignored. Thus, in the above example, you could use 2 .. 2 if all you | |
367 | -- care about is the second parenthesis pair "b*". Likewise, if | |
368 | -- Matches'Last > Paren_Count (Regexp), the extra components will be set to | |
369 | -- No_Match. | |
370 | ||
371 | No_Match : constant Match_Location := (First => 0, Last => 0); | |
372 | -- The No_Match constant is (0, 0) to differentiate between matching a null | |
373 | -- string at position 1, which uses (1, 0) and no match at all. | |
374 | ||
375 | --------------------------------- | |
376 | -- Pattern_Matcher Compilation -- | |
377 | --------------------------------- | |
378 | ||
379 | -- The subprograms here are used to precompile regular expressions for use | |
380 | -- in subsequent Match calls. Precompilation improves efficiency if the | |
381 | -- same regular expression is to be used in more than one Match call. | |
382 | ||
383 | type Pattern_Matcher (Size : Program_Size) is private; | |
384 | -- Type used to represent a regular expression compiled into byte code | |
385 | ||
386 | Never_Match : constant Pattern_Matcher; | |
387 | -- A regular expression that never matches anything | |
388 | ||
389 | function Compile | |
390 | (Expression : String; | |
391 | Flags : Regexp_Flags := No_Flags) return Pattern_Matcher; | |
392 | -- Compile a regular expression into internal code | |
393 | -- | |
394 | -- Raises Expression_Error if Expression is not a legal regular expression | |
395 | -- | |
396 | -- The appropriate size is calculated automatically to correspond to the | |
397 | -- provided expression. This is the normal default method of compilation. | |
398 | -- Note that it is generally not possible to assign the result of two | |
399 | -- different calls to this Compile function to the same Pattern_Matcher | |
400 | -- variable, since the sizes will differ. | |
401 | -- | |
402 | -- Flags is the default value to use to set properties for Expression | |
403 | -- (e.g. case sensitivity,...). | |
404 | ||
405 | procedure Compile | |
406 | (Matcher : out Pattern_Matcher; | |
407 | Expression : String; | |
408 | Final_Code_Size : out Program_Size; | |
409 | Flags : Regexp_Flags := No_Flags); | |
16b05213 | 410 | -- Compile a regular expression into internal code |
30681738 RD |
411 | |
412 | -- This procedure is significantly faster than the Compile function since | |
413 | -- it avoids the extra step of precomputing the required size. | |
414 | -- | |
415 | -- However, it requires the user to provide a Pattern_Matcher variable | |
416 | -- whose size is preset to a large enough value. One advantage of this | |
417 | -- approach, in addition to the improved efficiency, is that the same | |
418 | -- Pattern_Matcher variable can be used to hold the compiled code for | |
419 | -- several different regular expressions by setting a size that is large | |
276e95ca | 420 | -- enough to accommodate all possibilities. |
30681738 RD |
421 | -- |
422 | -- In this version of the procedure call, the actual required code size is | |
423 | -- returned. Also if Matcher.Size is zero on entry, then the resulting code | |
424 | -- is not stored. A call with Matcher.Size set to Auto_Size can thus be | |
425 | -- used to determine the space required for compiling the given regular | |
426 | -- expression. | |
427 | -- | |
428 | -- This procedure raises Storage_Error if Matcher is too small to hold | |
429 | -- the resulting code (i.e. Matcher.Size has too small a value). | |
430 | -- | |
431 | -- Expression_Error is raised if the string Expression does not contain | |
432 | -- a valid regular expression. | |
433 | -- | |
434 | -- Flags is the default value to use to set properties for Expression (case | |
435 | -- sensitivity,...). | |
436 | ||
437 | procedure Compile | |
438 | (Matcher : out Pattern_Matcher; | |
439 | Expression : String; | |
440 | Flags : Regexp_Flags := No_Flags); | |
441 | -- Same procedure as above, except it does not return the final | |
442 | -- program size, and Matcher.Size cannot be Auto_Size. | |
443 | ||
444 | function Paren_Count (Regexp : Pattern_Matcher) return Match_Count; | |
445 | pragma Inline (Paren_Count); | |
446 | -- Return the number of parenthesis pairs in Regexp. | |
447 | -- | |
448 | -- This is the maximum index that will be filled if a Match_Array is | |
449 | -- used as an argument to Match. | |
450 | -- | |
451 | -- Thus, if you want to be sure to get all the parenthesis, you should | |
452 | -- do something like: | |
453 | -- | |
454 | -- declare | |
455 | -- Regexp : Pattern_Matcher := Compile ("a(b*)(c+)"); | |
456 | -- Matched : Match_Array (0 .. Paren_Count (Regexp)); | |
457 | -- begin | |
458 | -- Match (Regexp, "a string", Matched); | |
459 | -- end; | |
460 | ||
461 | ------------- | |
462 | -- Quoting -- | |
463 | ------------- | |
464 | ||
465 | function Quote (Str : String) return String; | |
466 | -- Return a version of Str so that every special character is quoted. | |
467 | -- The resulting string can be used in a regular expression to match | |
468 | -- exactly Str, whatever character was present in Str. | |
469 | ||
470 | -------------- | |
471 | -- Matching -- | |
472 | -------------- | |
473 | ||
474 | -- The Match subprograms are given a regular expression in string | |
475 | -- form, and perform the corresponding match. The following parameters | |
476 | -- are present in all forms of the Match call. | |
477 | ||
478 | -- Expression contains the regular expression to be matched as a string | |
479 | ||
480 | -- Data contains the string to be matched | |
481 | ||
482 | -- Data_First is the lower bound for the match, i.e. Data (Data_First) | |
483 | -- will be the first character to be examined. If Data_First is set to | |
484 | -- the special value of -1 (the default), then the first character to | |
485 | -- be examined is Data (Data'First). However, the regular expression | |
486 | -- character ^ (start of string) still refers to the first character | |
487 | -- of the full string (Data (Data'First)), which is why there is a | |
488 | -- separate mechanism for specifying Data_First. | |
489 | ||
490 | -- Data_Last is the upper bound for the match, i.e. Data (Data_Last) | |
491 | -- will be the last character to be examined. If Data_Last is set to | |
492 | -- the special value of Positive'Last (the default), then the last | |
493 | -- character to be examined is Data (Data'Last). However, the regular | |
494 | -- expression character $ (end of string) still refers to the last | |
495 | -- character of the full string (Data (Data'Last)), which is why there | |
496 | -- is a separate mechanism for specifying Data_Last. | |
497 | ||
498 | -- Note: the use of Data_First and Data_Last is not equivalent to | |
499 | -- simply passing a slice as Data because of the handling of | |
500 | -- regular expression characters ^ and $. | |
501 | ||
502 | -- Size is the size allocated for the compiled byte code. Normally | |
503 | -- this is defaulted to Auto_Size which means that the appropriate | |
504 | -- size is allocated automatically. It is possible to specify an | |
505 | -- explicit size, which must be sufficiently large. This slightly | |
506 | -- increases the efficiency by avoiding the extra step of computing | |
507 | -- the appropriate size. | |
508 | ||
509 | -- The following exceptions can be raised in calls to Match | |
510 | -- | |
511 | -- Storage_Error is raised if a non-zero value is given for Size | |
512 | -- and it is too small to hold the compiled byte code. | |
513 | -- | |
514 | -- Expression_Error is raised if the given expression is not a legal | |
515 | -- regular expression. | |
516 | ||
517 | procedure Match | |
518 | (Expression : String; | |
519 | Data : String; | |
520 | Matches : out Match_Array; | |
521 | Size : Program_Size := Auto_Size; | |
522 | Data_First : Integer := -1; | |
523 | Data_Last : Positive := Positive'Last); | |
524 | -- This version returns the result of the match stored in Match_Array; | |
525 | -- see comments under Match_Array above for details. | |
526 | ||
527 | function Match | |
528 | (Expression : String; | |
529 | Data : String; | |
530 | Size : Program_Size := Auto_Size; | |
531 | Data_First : Integer := -1; | |
532 | Data_Last : Positive := Positive'Last) return Natural; | |
533 | -- This version returns the position where Data matches, or if there is | |
534 | -- no match, then the value Data'First - 1. | |
535 | ||
536 | function Match | |
537 | (Expression : String; | |
538 | Data : String; | |
539 | Size : Program_Size := Auto_Size; | |
540 | Data_First : Integer := -1; | |
541 | Data_Last : Positive := Positive'Last) return Boolean; | |
542 | -- This version returns True if the match succeeds, False otherwise | |
543 | ||
544 | ------------------------------------------------ | |
545 | -- Matching a Pre-Compiled Regular Expression -- | |
546 | ------------------------------------------------ | |
547 | ||
548 | -- The following functions are significantly faster if you need to reuse | |
549 | -- the same regular expression multiple times, since you only have to | |
550 | -- compile it once. For these functions you must first compile the | |
551 | -- expression with a call to Compile as previously described. | |
552 | ||
553 | -- The parameters Data, Data_First and Data_Last are as described | |
554 | -- in the previous section. | |
555 | ||
556 | function Match | |
557 | (Self : Pattern_Matcher; | |
558 | Data : String; | |
559 | Data_First : Integer := -1; | |
560 | Data_Last : Positive := Positive'Last) return Natural; | |
561 | -- Match Data using the given pattern matcher. Returns the position | |
562 | -- where Data matches, or (Data'First - 1) if there is no match. | |
563 | ||
564 | function Match | |
565 | (Self : Pattern_Matcher; | |
566 | Data : String; | |
567 | Data_First : Integer := -1; | |
568 | Data_Last : Positive := Positive'Last) return Boolean; | |
569 | -- Return True if Data matches using the given pattern matcher | |
570 | ||
571 | pragma Inline (Match); | |
572 | -- All except the last one below | |
573 | ||
574 | procedure Match | |
575 | (Self : Pattern_Matcher; | |
576 | Data : String; | |
577 | Matches : out Match_Array; | |
578 | Data_First : Integer := -1; | |
579 | Data_Last : Positive := Positive'Last); | |
580 | -- Match Data using the given pattern matcher and store result in Matches; | |
581 | -- see comments under Match_Array above for details. | |
582 | ||
583 | ----------- | |
584 | -- Debug -- | |
585 | ----------- | |
586 | ||
587 | procedure Dump (Self : Pattern_Matcher); | |
588 | -- Dump the compiled version of the regular expression matched by Self | |
589 | ||
590 | -------------------------- | |
591 | -- Private Declarations -- | |
592 | -------------------------- | |
593 | ||
594 | private | |
595 | ||
596 | subtype Pointer is Program_Size; | |
597 | -- The Pointer type is used to point into Program_Data | |
598 | ||
599 | -- Note that the pointer type is not necessarily 2 bytes | |
600 | -- although it is stored in the program using 2 bytes | |
601 | ||
602 | type Program_Data is array (Pointer range <>) of Character; | |
603 | ||
604 | Program_First : constant := 1; | |
605 | ||
606 | -- The "internal use only" fields in regexp are present to pass info from | |
607 | -- compile to execute that permits the execute phase to run lots faster on | |
608 | -- simple cases. They are: | |
609 | ||
43c6e0cb | 610 | -- First character that must begin a match or ASCII.NUL |
30681738 RD |
611 | -- Anchored true iff match must start at beginning of line |
612 | -- Must_Have pointer to string that match must include or null | |
613 | -- Must_Have_Length length of Must_Have string | |
614 | ||
615 | -- First and Anchored permit very fast decisions on suitable starting | |
616 | -- points for a match, cutting down the work a lot. Must_Have permits fast | |
617 | -- rejection of lines that cannot possibly match. | |
618 | ||
619 | -- The Must_Have tests are costly enough that Optimize supplies a Must_Have | |
620 | -- only if the r.e. contains something potentially expensive (at present, | |
621 | -- the only such thing detected is * or + at the start of the r.e., which can | |
622 | -- involve a lot of backup). The length is supplied because the test in | |
623 | -- Execute needs it and Optimize is computing it anyway. | |
624 | ||
625 | -- The initialization is meant to fail-safe in case the user of this | |
626 | -- package tries to use an uninitialized matcher. This takes advantage | |
43c6e0cb | 627 | -- of the knowledge that ASCII.NUL translates to the end-of-program (EOP) |
30681738 RD |
628 | -- instruction code of the state machine. |
629 | ||
630 | No_Flags : constant Regexp_Flags := 0; | |
631 | Case_Insensitive : constant Regexp_Flags := 1; | |
632 | Single_Line : constant Regexp_Flags := 2; | |
633 | Multiple_Lines : constant Regexp_Flags := 4; | |
634 | ||
635 | type Pattern_Matcher (Size : Pointer) is record | |
636 | First : Character := ASCII.NUL; -- internal use only | |
637 | Anchored : Boolean := False; -- internal use only | |
638 | Must_Have : Pointer := 0; -- internal use only | |
639 | Must_Have_Length : Natural := 0; -- internal use only | |
640 | Paren_Count : Natural := 0; -- # paren groups | |
641 | Flags : Regexp_Flags := No_Flags; | |
642 | Program : Program_Data (Program_First .. Size) := | |
643 | (others => ASCII.NUL); | |
644 | end record; | |
645 | ||
646 | Never_Match : constant Pattern_Matcher := | |
647 | (0, ASCII.NUL, False, 0, 0, 0, No_Flags, (others => ASCII.NUL)); | |
648 | ||
649 | end System.Regpat; |