libgo/go/regexp/regexp.go

   1 // Use of this source code is governed by a BSD-style
   2 // license that can be found in the LICENSE file.
   3
   4 // Package regexp implements a simple regular expression library.
   5 //
   6 // The syntax of the regular expressions accepted is:
   7 //
   8 //      regexp:
   9 //              concatenation { '|' concatenation }
  10 //      concatenation:
  11 //              { closure }
  12 //      closure:
  13 //              term [ '*' | '+' | '?' ]
  14 //      term:
  15 //              '^'
  16 //              '$'
  17 //              '.'
  18 //              character
  19 //              '[' [ '^' ] { character-range } ']'
  20 //              '(' regexp ')'
  21 //      character-range:
  22 //              character [ '-' character ]
  23 //
  24 // All characters are UTF-8-encoded code points.  Backslashes escape special
  25 // characters, including inside character classes.  The standard Go character
  26 // escapes are also recognized: \a \b \f \n \r \t \v.
  27 //
  28 // There are 16 methods of Regexp that match a regular expression and identify
  29 // the matched text.  Their names are matched by this regular expression:
  30 //
  31 //      Find(All)?(String)?(Submatch)?(Index)?
  32 //
  33 // If 'All' is present, the routine matches successive non-overlapping
  34 // matches of the entire expression.  Empty matches abutting a preceding
  35 // match are ignored.  The return value is a slice containing the successive
  36 // return values of the corresponding non-'All' routine.  These routines take
  37 // an extra integer argument, n; if n >= 0, the function returns at most n
  38 // matches/submatches.
  39 //
  40 // If 'String' is present, the argument is a string; otherwise it is a slice
  41 // of bytes; return values are adjusted as appropriate.
  42 //
  43 // If 'Submatch' is present, the return value is a slice identifying the
  44 // successive submatches of the expression.  Submatches are matches of
  45 // parenthesized subexpressions within the regular expression, numbered from
  46 // left to right in order of opening parenthesis.  Submatch 0 is the match of
  47 // the entire expression, submatch 1 the match of the first parenthesized
  48 // subexpression, and so on.
  49 //
  50 // If 'Index' is present, matches and submatches are identified by byte index
  51 // pairs within the input string: result[2*n:2*n+2] identifies the indexes of
  52 // the nth submatch.  The pair for n==0 identifies the match of the entire
  53 // expression.  If 'Index' is not present, the match is identified by the
  54 // text of the match/submatch.  If an index is negative, it means that
  55 // subexpression did not match any string in the input.
  56 //
  57 // (There are a few other methods that do not match this pattern.)
  58 //
  59 package regexp
  60
  61 import (
  62         "bytes"
  63         "io"
  64         "os"
  65         "strings"
  66         "utf8"
  67 )
  68
  69 var debug = false
  70
  71 // Error is the local type for a parsing error.
  72 type Error string
  73
  74 func (e Error) String() string {
  75         return string(e)
  76 }
  77
  78 // Error codes returned by failures to parse an expression.
  79 var (
  80         ErrInternal            = Error("internal error")
  81         ErrUnmatchedLpar       = Error("unmatched '('")
  82         ErrUnmatchedRpar       = Error("unmatched ')'")
  83         ErrUnmatchedLbkt       = Error("unmatched '['")
  84         ErrUnmatchedRbkt       = Error("unmatched ']'")
  85         ErrBadRange            = Error("bad range in character class")
  86         ErrExtraneousBackslash = Error("extraneous backslash")
  87         ErrBadClosure          = Error("repeated closure (**, ++, etc.)")
  88         ErrBareClosure         = Error("closure applies to nothing")
  89         ErrBadBackslash        = Error("illegal backslash escape")
  90 )
  91
  92 // An instruction executed by the NFA
  93 type instr interface {
  94         kind() int   // the type of this instruction: _CHAR, _ANY, etc.
  95         next() instr // the instruction to execute after this one
  96         setNext(i instr)
  97         index() int
  98         setIndex(i int)
  99         print()
 100 }
 101
 102 // Fields and methods common to all instructions
 103 type common struct {
 104         _next  instr
 105         _index int
 106 }
 107
 108 func (c *common) next() instr     { return c._next }
 109 func (c *common) setNext(i instr) { c._next = i }
 110 func (c *common) index() int      { return c._index }
 111 func (c *common) setIndex(i int)  { c._index = i }
 112
 113 // Regexp is the representation of a compiled regular expression.
 114 // The public interface is entirely through methods.
 115 type Regexp struct {
 116         expr        string // the original expression
 117         prefix      string // initial plain text string
 118         prefixBytes []byte // initial plain text bytes
 119         inst        []instr
 120         start       instr // first instruction of machine
 121         prefixStart instr // where to start if there is a prefix
 122         nbra        int   // number of brackets in expression, for subexpressions
 123 }
 124
// Instruction kinds, as reported by the kind method of each instruction.
const (
	_START     = iota // beginning of program
	_END              // end of program: success
	_BOT              // '^' beginning of text
	_EOT              // '$' end of text
	_CHAR             // 'a' regular character
	_CHARCLASS        // [a-z] character class
	_ANY              // '.' any character including newline
	_NOTNL            // [^\n] special case: any character but newline
	_BRA              // '(' start of parenthesized expression
	_EBRA             // ')'; end of '(' parenthesized expression
	_ALT              // '|' alternation; also built by closure for '*', '+' and '?'
	_NOP              // do nothing; makes it easy to link without patching
)
 139
 140 // --- START start of program
 141 type _Start struct {
 142         common
 143 }
 144
 145 func (start *_Start) kind() int { return _START }
 146 func (start *_Start) print()    { print("start") }
 147
 148 // --- END end of program
 149 type _End struct {
 150         common
 151 }
 152
 153 func (end *_End) kind() int { return _END }
 154 func (end *_End) print()    { print("end") }
 155
 156 // --- BOT beginning of text
 157 type _Bot struct {
 158         common
 159 }
 160
 161 func (bot *_Bot) kind() int { return _BOT }
 162 func (bot *_Bot) print()    { print("bot") }
 163
 164 // --- EOT end of text
 165 type _Eot struct {
 166         common
 167 }
 168
 169 func (eot *_Eot) kind() int { return _EOT }
 170 func (eot *_Eot) print()    { print("eot") }
 171
 172 // --- CHAR a regular character
 173 type _Char struct {
 174         common
 175         char int
 176 }
 177
 178 func (char *_Char) kind() int { return _CHAR }
 179 func (char *_Char) print()    { print("char ", string(char.char)) }
 180
 181 func newChar(char int) *_Char {
 182         c := new(_Char)
 183         c.char = char
 184         return c
 185 }
 186
 187 // --- CHARCLASS [a-z]
 188
 189 type _CharClass struct {
 190         common
 191         negate bool // is character class negated? ([^a-z])
 192         // slice of int, stored pairwise: [a-z] is (a,z); x is (x,x):
 193         ranges     []int
 194         cmin, cmax int
 195 }
 196
 197 func (cclass *_CharClass) kind() int { return _CHARCLASS }
 198
 199 func (cclass *_CharClass) print() {
 200         print("charclass")
 201         if cclass.negate {
 202                 print(" (negated)")
 203         }
 204         for i := 0; i < len(cclass.ranges); i += 2 {
 205                 l := cclass.ranges[i]
 206                 r := cclass.ranges[i+1]
 207                 if l == r {
 208                         print(" [", string(l), "]")
 209                 } else {
 210                         print(" [", string(l), "-", string(r), "]")
 211                 }
 212         }
 213 }
 214
 215 func (cclass *_CharClass) addRange(a, b int) {
 216         // range is a through b inclusive
 217         cclass.ranges = append(cclass.ranges, a, b)
 218         if a < cclass.cmin {
 219                 cclass.cmin = a
 220         }
 221         if b > cclass.cmax {
 222                 cclass.cmax = b
 223         }
 224 }
 225
 226 func (cclass *_CharClass) matches(c int) bool {
 227         if c < cclass.cmin || c > cclass.cmax {
 228                 return cclass.negate
 229         }
 230         ranges := cclass.ranges
 231         for i := 0; i < len(ranges); i = i + 2 {
 232                 if ranges[i] <= c && c <= ranges[i+1] {
 233                         return !cclass.negate
 234                 }
 235         }
 236         return cclass.negate
 237 }
 238
 239 func newCharClass() *_CharClass {
 240         c := new(_CharClass)
 241         c.ranges = make([]int, 0, 4)
 242         c.cmin = 0x10FFFF + 1 // MaxRune + 1
 243         c.cmax = -1
 244         return c
 245 }
 246
 247 // --- ANY any character
 248 type _Any struct {
 249         common
 250 }
 251
 252 func (any *_Any) kind() int { return _ANY }
 253 func (any *_Any) print()    { print("any") }
 254
 255 // --- NOTNL any character but newline
 256 type _NotNl struct {
 257         common
 258 }
 259
 260 func (notnl *_NotNl) kind() int { return _NOTNL }
 261 func (notnl *_NotNl) print()    { print("notnl") }
 262
 263 // --- BRA parenthesized expression
 264 type _Bra struct {
 265         common
 266         n int // subexpression number
 267 }
 268
 269 func (bra *_Bra) kind() int { return _BRA }
 270 func (bra *_Bra) print()    { print("bra", bra.n) }
 271
 272 // --- EBRA end of parenthesized expression
 273 type _Ebra struct {
 274         common
 275         n int // subexpression number
 276 }
 277
 278 func (ebra *_Ebra) kind() int { return _EBRA }
 279 func (ebra *_Ebra) print()    { print("ebra ", ebra.n) }
 280
 281 // --- ALT alternation
 282 type _Alt struct {
 283         common
 284         left instr // other branch
 285 }
 286
 287 func (alt *_Alt) kind() int { return _ALT }
 288 func (alt *_Alt) print()    { print("alt(", alt.left.index(), ")") }
 289
 290 // --- NOP no operation
 291 type _Nop struct {
 292         common
 293 }
 294
 295 func (nop *_Nop) kind() int { return _NOP }
 296 func (nop *_Nop) print()    { print("nop") }
 297
 298 func (re *Regexp) add(i instr) instr {
 299         i.setIndex(len(re.inst))
 300         re.inst = append(re.inst, i)
 301         return i
 302 }
 303
 304 type parser struct {
 305         re    *Regexp
 306         nlpar int // number of unclosed lpars
 307         pos   int
 308         ch    int
 309 }
 310
 311 func (p *parser) error(err Error) {
 312         panic(err)
 313 }
 314
// endOfFile is the pseudo-character that nextc stores in parser.ch once
// the whole expression has been consumed. doExecute uses the same value
// for the position past the end of its input.
const endOfFile = -1
 316
 317 func (p *parser) c() int { return p.ch }
 318
 319 func (p *parser) nextc() int {
 320         if p.pos >= len(p.re.expr) {
 321                 p.ch = endOfFile
 322         } else {
 323                 c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:])
 324                 p.ch = c
 325                 p.pos += w
 326         }
 327         return p.ch
 328 }
 329
 330 func newParser(re *Regexp) *parser {
 331         p := new(parser)
 332         p.re = re
 333         p.nextc() // load p.ch
 334         return p
 335 }
 336
 337 func special(c int) bool {
 338         for _, r := range `\.+*?()|[]^$` {
 339                 if c == r {
 340                         return true
 341                 }
 342         }
 343         return false
 344 }
 345
 346 func ispunct(c int) bool {
 347         for _, r := range "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" {
 348                 if c == r {
 349                         return true
 350                 }
 351         }
 352         return false
 353 }
 354
 355 var escapes = []byte("abfnrtv")
 356 var escaped = []byte("\a\b\f\n\r\t\v")
 357
 358 func escape(c int) int {
 359         for i, b := range escapes {
 360                 if int(b) == c {
 361                         return i
 362                 }
 363         }
 364         return -1
 365 }
 366
 367 func (p *parser) charClass() instr {
 368         cc := newCharClass()
 369         if p.c() == '^' {
 370                 cc.negate = true
 371                 p.nextc()
 372         }
 373         left := -1
 374         for {
 375                 switch c := p.c(); c {
 376                 case ']', endOfFile:
 377                         if left >= 0 {
 378                                 p.error(ErrBadRange)
 379                         }
 380                         // Is it [^\n]?
 381                         if cc.negate && len(cc.ranges) == 2 &&
 382                                 cc.ranges[0] == '\n' && cc.ranges[1] == '\n' {
 383                                 nl := new(_NotNl)
 384                                 p.re.add(nl)
 385                                 return nl
 386                         }
 387                         // Special common case: "[a]" -> "a"
 388                         if !cc.negate && len(cc.ranges) == 2 && cc.ranges[0] == cc.ranges[1] {
 389                                 c := newChar(cc.ranges[0])
 390                                 p.re.add(c)
 391                                 return c
 392                         }
 393                         p.re.add(cc)
 394                         return cc
 395                 case '-': // do this before backslash processing
 396                         p.error(ErrBadRange)
 397                 case '\\':
 398                         c = p.nextc()
 399                         switch {
 400                         case c == endOfFile:
 401                                 p.error(ErrExtraneousBackslash)
 402                         case ispunct(c):
 403                                 // c is as delivered
 404                         case escape(c) >= 0:
 405                                 c = int(escaped[escape(c)])
 406                         default:
 407                                 p.error(ErrBadBackslash)
 408                         }
 409                         fallthrough
 410                 default:
 411                         p.nextc()
 412                         switch {
 413                         case left < 0: // first of pair
 414                                 if p.c() == '-' { // range
 415                                         p.nextc()
 416                                         left = c
 417                                 } else { // single char
 418                                         cc.addRange(c, c)
 419                                 }
 420                         case left <= c: // second of pair
 421                                 cc.addRange(left, c)
 422                                 left = -1
 423                         default:
 424                                 p.error(ErrBadRange)
 425                         }
 426                 }
 427         }
 428         return nil
 429 }
 430
 431 func (p *parser) term() (start, end instr) {
 432         switch c := p.c(); c {
 433         case '|', endOfFile:
 434                 return nil, nil
 435         case '*', '+':
 436                 p.error(ErrBareClosure)
 437         case ')':
 438                 if p.nlpar == 0 {
 439                         p.error(ErrUnmatchedRpar)
 440                 }
 441                 return nil, nil
 442         case ']':
 443                 p.error(ErrUnmatchedRbkt)
 444         case '^':
 445                 p.nextc()
 446                 start = p.re.add(new(_Bot))
 447                 return start, start
 448         case '$':
 449                 p.nextc()
 450                 start = p.re.add(new(_Eot))
 451                 return start, start
 452         case '.':
 453                 p.nextc()
 454                 start = p.re.add(new(_Any))
 455                 return start, start
 456         case '[':
 457                 p.nextc()
 458                 start = p.charClass()
 459                 if p.c() != ']' {
 460                         p.error(ErrUnmatchedLbkt)
 461                 }
 462                 p.nextc()
 463                 return start, start
 464         case '(':
 465                 p.nextc()
 466                 p.nlpar++
 467                 p.re.nbra++ // increment first so first subexpr is \1
 468                 nbra := p.re.nbra
 469                 start, end = p.regexp()
 470                 if p.c() != ')' {
 471                         p.error(ErrUnmatchedLpar)
 472                 }
 473                 p.nlpar--
 474                 p.nextc()
 475                 bra := new(_Bra)
 476                 p.re.add(bra)
 477                 ebra := new(_Ebra)
 478                 p.re.add(ebra)
 479                 bra.n = nbra
 480                 ebra.n = nbra
 481                 if start == nil {
 482                         if end == nil {
 483                                 p.error(ErrInternal)
 484                                 return
 485                         }
 486                         start = ebra
 487                 } else {
 488                         end.setNext(ebra)
 489                 }
 490                 bra.setNext(start)
 491                 return bra, ebra
 492         case '\\':
 493                 c = p.nextc()
 494                 switch {
 495                 case c == endOfFile:
 496                         p.error(ErrExtraneousBackslash)
 497                 case ispunct(c):
 498                         // c is as delivered
 499                 case escape(c) >= 0:
 500                         c = int(escaped[escape(c)])
 501                 default:
 502                         p.error(ErrBadBackslash)
 503                 }
 504                 fallthrough
 505         default:
 506                 p.nextc()
 507                 start = newChar(c)
 508                 p.re.add(start)
 509                 return start, start
 510         }
 511         panic("unreachable")
 512 }
 513
 514 func (p *parser) closure() (start, end instr) {
 515         start, end = p.term()
 516         if start == nil {
 517                 return
 518         }
 519         switch p.c() {
 520         case '*':
 521                 // (start,end)*:
 522                 alt := new(_Alt)
 523                 p.re.add(alt)
 524                 end.setNext(alt) // after end, do alt
 525                 alt.left = start // alternate brach: return to start
 526                 start = alt      // alt becomes new (start, end)
 527                 end = alt
 528         case '+':
 529                 // (start,end)+:
 530                 alt := new(_Alt)
 531                 p.re.add(alt)
 532                 end.setNext(alt) // after end, do alt
 533                 alt.left = start // alternate brach: return to start
 534                 end = alt        // start is unchanged; end is alt
 535         case '?':
 536                 // (start,end)?:
 537                 alt := new(_Alt)
 538                 p.re.add(alt)
 539                 nop := new(_Nop)
 540                 p.re.add(nop)
 541                 alt.left = start // alternate branch is start
 542                 alt.setNext(nop) // follow on to nop
 543                 end.setNext(nop) // after end, go to nop
 544                 start = alt      // start is now alt
 545                 end = nop        // end is nop pointed to by both branches
 546         default:
 547                 return
 548         }
 549         switch p.nextc() {
 550         case '*', '+', '?':
 551                 p.error(ErrBadClosure)
 552         }
 553         return
 554 }
 555
 556 func (p *parser) concatenation() (start, end instr) {
 557         for {
 558                 nstart, nend := p.closure()
 559                 switch {
 560                 case nstart == nil: // end of this concatenation
 561                         if start == nil { // this is the empty string
 562                                 nop := p.re.add(new(_Nop))
 563                                 return nop, nop
 564                         }
 565                         return
 566                 case start == nil: // this is first element of concatenation
 567                         start, end = nstart, nend
 568                 default:
 569                         end.setNext(nstart)
 570                         end = nend
 571                 }
 572         }
 573         panic("unreachable")
 574 }
 575
 576 func (p *parser) regexp() (start, end instr) {
 577         start, end = p.concatenation()
 578         for {
 579                 switch p.c() {
 580                 default:
 581                         return
 582                 case '|':
 583                         p.nextc()
 584                         nstart, nend := p.concatenation()
 585                         alt := new(_Alt)
 586                         p.re.add(alt)
 587                         alt.left = start
 588                         alt.setNext(nstart)
 589                         nop := new(_Nop)
 590                         p.re.add(nop)
 591                         end.setNext(nop)
 592                         nend.setNext(nop)
 593                         start, end = alt, nop
 594                 }
 595         }
 596         panic("unreachable")
 597 }
 598
 599 func unNop(i instr) instr {
 600         for i.kind() == _NOP {
 601                 i = i.next()
 602         }
 603         return i
 604 }
 605
 606 func (re *Regexp) eliminateNops() {
 607         for _, inst := range re.inst {
 608                 if inst.kind() == _END {
 609                         continue
 610                 }
 611                 inst.setNext(unNop(inst.next()))
 612                 if inst.kind() == _ALT {
 613                         alt := inst.(*_Alt)
 614                         alt.left = unNop(alt.left)
 615                 }
 616         }
 617 }
 618
 619 func (re *Regexp) dump() {
 620         print("prefix <", re.prefix, ">\n")
 621         for _, inst := range re.inst {
 622                 print(inst.index(), ": ")
 623                 inst.print()
 624                 if inst.kind() != _END {
 625                         print(" -> ", inst.next().index())
 626                 }
 627                 print("\n")
 628         }
 629 }
 630
 631 func (re *Regexp) doParse() {
 632         p := newParser(re)
 633         start := new(_Start)
 634         re.add(start)
 635         s, e := p.regexp()
 636         start.setNext(s)
 637         re.start = start
 638         e.setNext(re.add(new(_End)))
 639
 640         if debug {
 641                 re.dump()
 642                 println()
 643         }
 644
 645         re.eliminateNops()
 646         if debug {
 647                 re.dump()
 648                 println()
 649         }
 650         re.setPrefix()
 651         if debug {
 652                 re.dump()
 653                 println()
 654         }
 655 }
 656
 657 // Extract regular text from the beginning of the pattern.
 658 // That text can be used by doExecute to speed up matching.
 659 func (re *Regexp) setPrefix() {
 660         var b []byte
 661         var utf = make([]byte, utf8.UTFMax)
 662         // First instruction is start; skip that.
 663         i := re.inst[0].next().index()
 664 Loop:
 665         for i < len(re.inst) {
 666                 inst := re.inst[i]
 667                 // stop if this is not a char
 668                 if inst.kind() != _CHAR {
 669                         break
 670                 }
 671                 // stop if this char can be followed by a match for an empty string,
 672                 // which includes closures, ^, and $.
 673                 switch re.inst[inst.next().index()].kind() {
 674                 case _BOT, _EOT, _ALT:
 675                         break Loop
 676                 }
 677                 n := utf8.EncodeRune(inst.(*_Char).char, utf)
 678                 b = bytes.Add(b, utf[0:n])
 679                 i = inst.next().index()
 680         }
 681         // point prefixStart instruction to first non-CHAR after prefix
 682         re.prefixStart = re.inst[i]
 683         re.prefixBytes = b
 684         re.prefix = string(b)
 685 }
 686
 687 // Compile parses a regular expression and returns, if successful, a Regexp
 688 // object that can be used to match against text.
 689 func Compile(str string) (regexp *Regexp, error os.Error) {
 690         regexp = new(Regexp)
 691         // doParse will panic if there is a parse error.
 692         defer func() {
 693                 if e := recover(); e != nil {
 694                         regexp = nil
 695                         error = e.(Error) // Will re-panic if error was not an Error, e.g. nil-pointer exception
 696                 }
 697         }()
 698         regexp.expr = str
 699         regexp.inst = make([]instr, 0, 10)
 700         regexp.doParse()
 701         return
 702 }
 703
 704 // MustCompile is like Compile but panics if the expression cannot be parsed.
 705 // It simplifies safe initialization of global variables holding compiled regular
 706 // expressions.
 707 func MustCompile(str string) *Regexp {
 708         regexp, error := Compile(str)
 709         if error != nil {
 710                 panic(`regexp: compiling "` + str + `": ` + error.String())
 711         }
 712         return regexp
 713 }
 714
 715 // NumSubexp returns the number of parenthesized subexpressions in this Regexp.
 716 func (re *Regexp) NumSubexp() int { return re.nbra }
 717
 718 // The match arena allows us to reduce the garbage generated by tossing
 719 // match vectors away as we execute.  Matches are ref counted and returned
 720 // to a free list when no longer active.  Increases a simple benchmark by 22X.
 721 type matchArena struct {
 722         head *matchVec
 723         len  int // length of match vector
 724 }
 725
 726 type matchVec struct {
 727         m    []int // pairs of bracketing submatches. 0th is start,end
 728         ref  int
 729         next *matchVec
 730 }
 731
 732 func (a *matchArena) new() *matchVec {
 733         if a.head == nil {
 734                 const N = 10
 735                 block := make([]matchVec, N)
 736                 for i := 0; i < N; i++ {
 737                         b := &block[i]
 738                         b.next = a.head
 739                         a.head = b
 740                 }
 741         }
 742         m := a.head
 743         a.head = m.next
 744         m.ref = 0
 745         if m.m == nil {
 746                 m.m = make([]int, a.len)
 747         }
 748         return m
 749 }
 750
 751 func (a *matchArena) free(m *matchVec) {
 752         m.ref--
 753         if m.ref == 0 {
 754                 m.next = a.head
 755                 a.head = m
 756         }
 757 }
 758
 759 func (a *matchArena) copy(m *matchVec) *matchVec {
 760         m1 := a.new()
 761         copy(m1.m, m.m)
 762         return m1
 763 }
 764
 765 func (a *matchArena) noMatch() *matchVec {
 766         m := a.new()
 767         for i := range m.m {
 768                 m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
 769         }
 770         m.ref = 1
 771         return m
 772 }
 773
// state is one entry of the to-do list built by addState: an instruction
// waiting to be executed, paired with the match vector that goes with it.
type state struct {
	inst     instr     // next instruction to execute
	prefixed bool      // this match began with a fixed prefix
	match    *matchVec // submatch positions for this state; ref-counted by addState
}
 779
 780 // Append new state to to-do list.  Leftmost-longest wins so avoid
 781 // adding a state that's already active.  The matchVec will be inc-ref'ed
 782 // if it is assigned to a state.
 783 func (a *matchArena) addState(s []state, inst instr, prefixed bool, match *matchVec, pos, end int) []state {
 784         switch inst.kind() {
 785         case _BOT:
 786                 if pos == 0 {
 787                         s = a.addState(s, inst.next(), prefixed, match, pos, end)
 788                 }
 789                 return s
 790         case _EOT:
 791                 if pos == end {
 792                         s = a.addState(s, inst.next(), prefixed, match, pos, end)
 793                 }
 794                 return s
 795         case _BRA:
 796                 n := inst.(*_Bra).n
 797                 match.m[2*n] = pos
 798                 s = a.addState(s, inst.next(), prefixed, match, pos, end)
 799                 return s
 800         case _EBRA:
 801                 n := inst.(*_Ebra).n
 802                 match.m[2*n+1] = pos
 803                 s = a.addState(s, inst.next(), prefixed, match, pos, end)
 804                 return s
 805         }
 806         index := inst.index()
 807         l := len(s)
 808         // States are inserted in order so it's sufficient to see if we have the same
 809         // instruction; no need to see if existing match is earlier (it is).
 810         for i := 0; i < l; i++ {
 811                 if s[i].inst.index() == index {
 812                         return s
 813                 }
 814         }
 815         s = append(s, state{inst, prefixed, match})
 816         match.ref++
 817         if inst.kind() == _ALT {
 818                 s = a.addState(s, inst.(*_Alt).left, prefixed, a.copy(match), pos, end)
 819                 // give other branch a copy of this match vector
 820                 s = a.addState(s, inst.next(), prefixed, a.copy(match), pos, end)
 821         }
 822         return s
 823 }
 824
 825 // Accepts either string or bytes - the logic is identical either way.
 826 // If bytes == nil, scan str.
 827 func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
 828         var s [2][]state
 829         s[0] = make([]state, 0, 10)
 830         s[1] = make([]state, 0, 10)
 831         in, out := 0, 1
 832         var final state
 833         found := false
 834         end := len(str)
 835         if bytestr != nil {
 836                 end = len(bytestr)
 837         }
 838         // fast check for initial plain substring
 839         prefixed := false // has this iteration begun by skipping a prefix?
 840         if re.prefix != "" {
 841                 var advance int
 842                 if bytestr == nil {
 843                         advance = strings.Index(str[pos:], re.prefix)
 844                 } else {
 845                         advance = bytes.Index(bytestr[pos:], re.prefixBytes)
 846                 }
 847                 if advance == -1 {
 848                         return nil
 849                 }
 850                 pos += advance + len(re.prefix)
 851                 prefixed = true
 852         }
 853         arena := &matchArena{nil, 2 * (re.nbra + 1)}
 854         for pos <= end {
 855                 if !found {
 856                         // prime the pump if we haven't seen a match yet
 857                         match := arena.noMatch()
 858                         match.m[0] = pos
 859                         if prefixed {
 860                                 s[out] = arena.addState(s[out], re.prefixStart, true, match, pos, end)
 861                                 prefixed = false // next iteration should start at beginning of machine.
 862                         } else {
 863                                 s[out] = arena.addState(s[out], re.start.next(), false, match, pos, end)
 864                         }
 865                         arena.free(match) // if addState saved it, ref was incremented
 866                 }
 867                 in, out = out, in // old out state is new in state
 868                 // clear out old state
 869                 old := s[out]
 870                 for _, state := range old {
 871                         arena.free(state.match)
 872                 }
 873                 s[out] = old[0:0] // truncate state vector
 874                 if found && len(s[in]) == 0 {
 875                         // machine has completed
 876                         break
 877                 }
 878                 charwidth := 1
 879                 c := endOfFile
 880                 if pos < end {
 881                         if bytestr == nil {
 882                                 c, charwidth = utf8.DecodeRuneInString(str[pos:end])
 883                         } else {
 884                                 c, charwidth = utf8.DecodeRune(bytestr[pos:end])
 885                         }
 886                 }
 887                 pos += charwidth
 888                 for _, st := range s[in] {
 889                         switch st.inst.kind() {
 890                         case _BOT:
 891                         case _EOT:
 892                         case _CHAR:
 893                                 if c == st.inst.(*_Char).char {
 894                                         s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end)
 895                                 }
 896                         case _CHARCLASS:
 897                                 if st.inst.(*_CharClass).matches(c) {
 898                                         s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end)
 899                                 }
 900                         case _ANY:
 901                                 if c != endOfFile {
 902                                         s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end)
 903                                 }
 904                         case _NOTNL:
 905                                 if c != endOfFile && c != '\n' {
 906                                         s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end)
 907                                 }
 908                         case _BRA:
 909                         case _EBRA:
 910                         case _ALT:
 911                         case _END:
 912                                 // choose leftmost longest
 913                                 if !found || // first
 914                                         st.match.m[0] < final.match.m[0] || // leftmost
 915                                         (st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest
 916                                         if final.match != nil {
 917                                                 arena.free(final.match)
 918                                         }
 919                                         final = st
 920                                         final.match.ref++
 921                                         final.match.m[1] = pos - charwidth
 922                                 }
 923                                 found = true
 924                         default:
 925                                 st.inst.print()
 926                                 panic("unknown instruction in execute")
 927                         }
 928                 }
 929         }
 930         if final.match == nil {
 931                 return nil
 932         }
 933         // if match found, back up start of match by width of prefix.
 934         if final.prefixed && len(final.match.m) > 0 {
 935                 final.match.m[0] -= len(re.prefix)
 936         }
 937         return final.match.m
 938 }
 939
 940 // MatchString returns whether the Regexp matches the string s.
 941 // The return value is a boolean: true for match, false for no match.
 942 func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 }
 943
 944 // Match returns whether the Regexp matches the byte slice b.
 945 // The return value is a boolean: true for match, false for no match.
 946 func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0 }
 947
 948
 949 // MatchString checks whether a textual regular expression
 950 // matches a string.  More complicated queries need
 951 // to use Compile and the full Regexp interface.
 952 func MatchString(pattern string, s string) (matched bool, error os.Error) {
 953         re, err := Compile(pattern)
 954         if err != nil {
 955                 return false, err
 956         }
 957         return re.MatchString(s), nil
 958 }
 959
 960 // Match checks whether a textual regular expression
 961 // matches a byte slice.  More complicated queries need
 962 // to use Compile and the full Regexp interface.
 963 func Match(pattern string, b []byte) (matched bool, error os.Error) {
 964         re, err := Compile(pattern)
 965         if err != nil {
 966                 return false, err
 967         }
 968         return re.Match(b), nil
 969 }
 970
 971 // ReplaceAllString returns a copy of src in which all matches for the Regexp
 972 // have been replaced by repl.  No support is provided for expressions
 973 // (e.g. \1 or $1) in the replacement string.
 974 func (re *Regexp) ReplaceAllString(src, repl string) string {
 975         return re.ReplaceAllStringFunc(src, func(string) string { return repl })
 976 }
 977
 978 // ReplaceAllStringFunc returns a copy of src in which all matches for the
 979 // Regexp have been replaced by the return value of of function repl (whose
 980 // first argument is the matched string).  No support is provided for
 981 // expressions (e.g. \1 or $1) in the replacement string.
 982 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
 983         lastMatchEnd := 0 // end position of the most recent match
 984         searchPos := 0    // position where we next look for a match
 985         buf := new(bytes.Buffer)
 986         for searchPos <= len(src) {
 987                 a := re.doExecute(src, nil, searchPos)
 988                 if len(a) == 0 {
 989                         break // no more matches
 990                 }
 991
 992                 // Copy the unmatched characters before this match.
 993                 io.WriteString(buf, src[lastMatchEnd:a[0]])
 994
 995                 // Now insert a copy of the replacement string, but not for a
 996                 // match of the empty string immediately after another match.
 997                 // (Otherwise, we get double replacement for patterns that
 998                 // match both empty and nonempty strings.)
 999                 if a[1] > lastMatchEnd || a[0] == 0 {
1000                         io.WriteString(buf, repl(src[a[0]:a[1]]))
1001                 }
1002                 lastMatchEnd = a[1]
1003
1004                 // Advance past this match; always advance at least one character.
1005                 _, width := utf8.DecodeRuneInString(src[searchPos:])
1006                 if searchPos+width > a[1] {
1007                         searchPos += width
1008                 } else if searchPos+1 > a[1] {
1009                         // This clause is only needed at the end of the input
1010                         // string.  In that case, DecodeRuneInString returns width=0.
1011                         searchPos++
1012                 } else {
1013                         searchPos = a[1]
1014                 }
1015         }
1016
1017         // Copy the unmatched characters after the last match.
1018         io.WriteString(buf, src[lastMatchEnd:])
1019
1020         return buf.String()
1021 }
1022
1023 // ReplaceAll returns a copy of src in which all matches for the Regexp
1024 // have been replaced by repl.  No support is provided for expressions
1025 // (e.g. \1 or $1) in the replacement text.
1026 func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
1027         return re.ReplaceAllFunc(src, func([]byte) []byte { return repl })
1028 }
1029
1030 // ReplaceAllFunc returns a copy of src in which all matches for the
1031 // Regexp have been replaced by the return value of of function repl (whose
1032 // first argument is the matched []byte).  No support is provided for
1033 // expressions (e.g. \1 or $1) in the replacement string.
1034 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
1035         lastMatchEnd := 0 // end position of the most recent match
1036         searchPos := 0    // position where we next look for a match
1037         buf := new(bytes.Buffer)
1038         for searchPos <= len(src) {
1039                 a := re.doExecute("", src, searchPos)
1040                 if len(a) == 0 {
1041                         break // no more matches
1042                 }
1043
1044                 // Copy the unmatched characters before this match.
1045                 buf.Write(src[lastMatchEnd:a[0]])
1046
1047                 // Now insert a copy of the replacement string, but not for a
1048                 // match of the empty string immediately after another match.
1049                 // (Otherwise, we get double replacement for patterns that
1050                 // match both empty and nonempty strings.)
1051                 if a[1] > lastMatchEnd || a[0] == 0 {
1052                         buf.Write(repl(src[a[0]:a[1]]))
1053                 }
1054                 lastMatchEnd = a[1]
1055
1056                 // Advance past this match; always advance at least one character.
1057                 _, width := utf8.DecodeRune(src[searchPos:])
1058                 if searchPos+width > a[1] {
1059                         searchPos += width
1060                 } else if searchPos+1 > a[1] {
1061                         // This clause is only needed at the end of the input
1062                         // string.  In that case, DecodeRuneInString returns width=0.
1063                         searchPos++
1064                 } else {
1065                         searchPos = a[1]
1066                 }
1067         }
1068
1069         // Copy the unmatched characters after the last match.
1070         buf.Write(src[lastMatchEnd:])
1071
1072         return buf.Bytes()
1073 }
1074
1075 // QuoteMeta returns a string that quotes all regular expression metacharacters
1076 // inside the argument text; the returned string is a regular expression matching
1077 // the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
1078 func QuoteMeta(s string) string {
1079         b := make([]byte, 2*len(s))
1080
1081         // A byte loop is correct because all metacharacters are ASCII.
1082         j := 0
1083         for i := 0; i < len(s); i++ {
1084                 if special(int(s[i])) {
1085                         b[j] = '\\'
1086                         j++
1087                 }
1088                 b[j] = s[i]
1089                 j++
1090         }
1091         return string(b[0:j])
1092 }
1093
1094 // Find matches in slice b if b is non-nil, otherwise find matches in string s.
1095 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
1096         var end int
1097         if b == nil {
1098                 end = len(s)
1099         } else {
1100                 end = len(b)
1101         }
1102
1103         for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
1104                 matches := re.doExecute(s, b, pos)
1105                 if len(matches) == 0 {
1106                         break
1107                 }
1108
1109                 accept := true
1110                 if matches[1] == pos {
1111                         // We've found an empty match.
1112                         if matches[0] == prevMatchEnd {
1113                                 // We don't allow an empty match right
1114                                 // after a previous match, so ignore it.
1115                                 accept = false
1116                         }
1117                         var width int
1118                         if b == nil {
1119                                 _, width = utf8.DecodeRuneInString(s[pos:end])
1120                         } else {
1121                                 _, width = utf8.DecodeRune(b[pos:end])
1122                         }
1123                         if width > 0 {
1124                                 pos += width
1125                         } else {
1126                                 pos = end + 1
1127                         }
1128                 } else {
1129                         pos = matches[1]
1130                 }
1131                 prevMatchEnd = matches[1]
1132
1133                 if accept {
1134                         deliver(matches)
1135                         i++
1136                 }
1137         }
1138 }
1139
1140 // Find returns a slice holding the text of the leftmost match in b of the regular expression.
1141 // A return value of nil indicates no match.
1142 func (re *Regexp) Find(b []byte) []byte {
1143         a := re.doExecute("", b, 0)
1144         if a == nil {
1145                 return nil
1146         }
1147         return b[a[0]:a[1]]
1148 }
1149
1150 // FindIndex returns a two-element slice of integers defining the location of
1151 // the leftmost match in b of the regular expression.  The match itself is at
1152 // b[loc[0]:loc[1]].
1153 // A return value of nil indicates no match.
1154 func (re *Regexp) FindIndex(b []byte) (loc []int) {
1155         a := re.doExecute("", b, 0)
1156         if a == nil {
1157                 return nil
1158         }
1159         return a[0:2]
1160 }
1161
1162 // FindString returns a string holding the text of the leftmost match in s of the regular
1163 // expression.  If there is no match, the return value is an empty string,
1164 // but it will also be empty if the regular expression successfully matches
1165 // an empty string.  Use FindStringIndex or FindStringSubmatch if it is
1166 // necessary to distinguish these cases.
1167 func (re *Regexp) FindString(s string) string {
1168         a := re.doExecute(s, nil, 0)
1169         if a == nil {
1170                 return ""
1171         }
1172         return s[a[0]:a[1]]
1173 }
1174
1175 // FindStringIndex returns a two-element slice of integers defining the
1176 // location of the leftmost match in s of the regular expression.  The match
1177 // itself is at s[loc[0]:loc[1]].
1178 // A return value of nil indicates no match.
1179 func (re *Regexp) FindStringIndex(s string) []int {
1180         a := re.doExecute(s, nil, 0)
1181         if a == nil {
1182                 return nil
1183         }
1184         return a[0:2]
1185 }
1186
1187 // FindSubmatch returns a slice of slices holding the text of the leftmost
1188 // match of the regular expression in b and the matches, if any, of its
1189 // subexpressions, as defined by the 'Submatch' descriptions in the package
1190 // comment.
1191 // A return value of nil indicates no match.
1192 func (re *Regexp) FindSubmatch(b []byte) [][]byte {
1193         a := re.doExecute("", b, 0)
1194         if a == nil {
1195                 return nil
1196         }
1197         ret := make([][]byte, len(a)/2)
1198         for i := range ret {
1199                 if a[2*i] >= 0 {
1200                         ret[i] = b[a[2*i]:a[2*i+1]]
1201                 }
1202         }
1203         return ret
1204 }
1205
1206 // FindSubmatchIndex returns a slice holding the index pairs identifying the
1207 // leftmost match of the regular expression in b and the matches, if any, of
1208 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
1209 // in the package comment.
1210 // A return value of nil indicates no match.
1211 func (re *Regexp) FindSubmatchIndex(b []byte) []int {
1212         return re.doExecute("", b, 0)
1213 }
1214
1215 // FindStringSubmatch returns a slice of strings holding the text of the
1216 // leftmost match of the regular expression in s and the matches, if any, of
1217 // its subexpressions, as defined by the 'Submatch' description in the
1218 // package comment.
1219 // A return value of nil indicates no match.
1220 func (re *Regexp) FindStringSubmatch(s string) []string {
1221         a := re.doExecute(s, nil, 0)
1222         if a == nil {
1223                 return nil
1224         }
1225         ret := make([]string, len(a)/2)
1226         for i := range ret {
1227                 if a[2*i] >= 0 {
1228                         ret[i] = s[a[2*i]:a[2*i+1]]
1229                 }
1230         }
1231         return ret
1232 }
1233
1234 // FindStringSubmatchIndex returns a slice holding the index pairs
1235 // identifying the leftmost match of the regular expression in s and the
1236 // matches, if any, of its subexpressions, as defined by the 'Submatch' and
1237 // 'Index' descriptions in the package comment.
1238 // A return value of nil indicates no match.
1239 func (re *Regexp) FindStringSubmatchIndex(s string) []int {
1240         return re.doExecute(s, nil, 0)
1241 }
1242
// startSize is the initial capacity given to the result slice in the 'All'
// routines.
const startSize = 10
1244
1245 // FindAll is the 'All' version of Find; it returns a slice of all successive
1246 // matches of the expression, as defined by the 'All' description in the
1247 // package comment.
1248 // A return value of nil indicates no match.
1249 func (re *Regexp) FindAll(b []byte, n int) [][]byte {
1250         if n < 0 {
1251                 n = len(b) + 1
1252         }
1253         result := make([][]byte, 0, startSize)
1254         re.allMatches("", b, n, func(match []int) {
1255                 result = append(result, b[match[0]:match[1]])
1256         })
1257         if len(result) == 0 {
1258                 return nil
1259         }
1260         return result
1261 }
1262
1263 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
1264 // successive matches of the expression, as defined by the 'All' description
1265 // in the package comment.
1266 // A return value of nil indicates no match.
1267 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
1268         if n < 0 {
1269                 n = len(b) + 1
1270         }
1271         result := make([][]int, 0, startSize)
1272         re.allMatches("", b, n, func(match []int) {
1273                 result = append(result, match[0:2])
1274         })
1275         if len(result) == 0 {
1276                 return nil
1277         }
1278         return result
1279 }
1280
1281 // FindAllString is the 'All' version of FindString; it returns a slice of all
1282 // successive matches of the expression, as defined by the 'All' description
1283 // in the package comment.
1284 // A return value of nil indicates no match.
1285 func (re *Regexp) FindAllString(s string, n int) []string {
1286         if n < 0 {
1287                 n = len(s) + 1
1288         }
1289         result := make([]string, 0, startSize)
1290         re.allMatches(s, nil, n, func(match []int) {
1291                 result = append(result, s[match[0]:match[1]])
1292         })
1293         if len(result) == 0 {
1294                 return nil
1295         }
1296         return result
1297 }
1298
1299 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
1300 // slice of all successive matches of the expression, as defined by the 'All'
1301 // description in the package comment.
1302 // A return value of nil indicates no match.
1303 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
1304         if n < 0 {
1305                 n = len(s) + 1
1306         }
1307         result := make([][]int, 0, startSize)
1308         re.allMatches(s, nil, n, func(match []int) {
1309                 result = append(result, match[0:2])
1310         })
1311         if len(result) == 0 {
1312                 return nil
1313         }
1314         return result
1315 }
1316
1317 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
1318 // of all successive matches of the expression, as defined by the 'All'
1319 // description in the package comment.
1320 // A return value of nil indicates no match.
1321 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
1322         if n < 0 {
1323                 n = len(b) + 1
1324         }
1325         result := make([][][]byte, 0, startSize)
1326         re.allMatches("", b, n, func(match []int) {
1327                 slice := make([][]byte, len(match)/2)
1328                 for j := range slice {
1329                         if match[2*j] >= 0 {
1330                                 slice[j] = b[match[2*j]:match[2*j+1]]
1331                         }
1332                 }
1333                 result = append(result, slice)
1334         })
1335         if len(result) == 0 {
1336                 return nil
1337         }
1338         return result
1339 }
1340
1341 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
1342 // a slice of all successive matches of the expression, as defined by the
1343 // 'All' description in the package comment.
1344 // A return value of nil indicates no match.
1345 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
1346         if n < 0 {
1347                 n = len(b) + 1
1348         }
1349         result := make([][]int, 0, startSize)
1350         re.allMatches("", b, n, func(match []int) {
1351                 result = append(result, match)
1352         })
1353         if len(result) == 0 {
1354                 return nil
1355         }
1356         return result
1357 }
1358
1359 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
1360 // returns a slice of all successive matches of the expression, as defined by
1361 // the 'All' description in the package comment.
1362 // A return value of nil indicates no match.
1363 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
1364         if n < 0 {
1365                 n = len(s) + 1
1366         }
1367         result := make([][]string, 0, startSize)
1368         re.allMatches(s, nil, n, func(match []int) {
1369                 slice := make([]string, len(match)/2)
1370                 for j := range slice {
1371                         if match[2*j] >= 0 {
1372                                 slice[j] = s[match[2*j]:match[2*j+1]]
1373                         }
1374                 }
1375                 result = append(result, slice)
1376         })
1377         if len(result) == 0 {
1378                 return nil
1379         }
1380         return result
1381 }
1382
1383 // FindAllStringSubmatchIndex is the 'All' version of
1384 // FindStringSubmatchIndex; it returns a slice of all successive matches of
1385 // the expression, as defined by the 'All' description in the package
1386 // comment.
1387 // A return value of nil indicates no match.
1388 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
1389         if n < 0 {
1390                 n = len(s) + 1
1391         }
1392         result := make([][]int, 0, startSize)
1393         re.allMatches(s, nil, n, func(match []int) {
1394                 result = append(result, match)
1395         })
1396         if len(result) == 0 {
1397                 return nil
1398         }
1399         return result
1400 }