...
Run Format

Source file src/regexp/syntax/parse.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package syntax
     6	
     7	import (
     8		"sort"
     9		"strings"
    10		"unicode"
    11		"unicode/utf8"
    12	)
    13	
    14	// An Error describes a failure to parse a regular expression
    15	// and gives the offending expression.
    16	type Error struct {
    17		Code ErrorCode
    18		Expr string
    19	}
    20	
    21	func (e *Error) Error() string {
    22		return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
    23	}
    24	
    25	// An ErrorCode describes a failure to parse a regular expression.
    26	type ErrorCode string
    27	
    28	const (
    29		// Unexpected error
    30		ErrInternalError ErrorCode = "regexp/syntax: internal error"
    31	
    32		// Parse errors
    33		ErrInvalidCharClass      ErrorCode = "invalid character class"
    34		ErrInvalidCharRange      ErrorCode = "invalid character class range"
    35		ErrInvalidEscape         ErrorCode = "invalid escape sequence"
    36		ErrInvalidNamedCapture   ErrorCode = "invalid named capture"
    37		ErrInvalidPerlOp         ErrorCode = "invalid or unsupported Perl syntax"
    38		ErrInvalidRepeatOp       ErrorCode = "invalid nested repetition operator"
    39		ErrInvalidRepeatSize     ErrorCode = "invalid repeat count"
    40		ErrInvalidUTF8           ErrorCode = "invalid UTF-8"
    41		ErrMissingBracket        ErrorCode = "missing closing ]"
    42		ErrMissingParen          ErrorCode = "missing closing )"
    43		ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
    44		ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression"
    45		ErrUnexpectedParen       ErrorCode = "unexpected )"
    46	)
    47	
    48	func (e ErrorCode) String() string {
    49		return string(e)
    50	}
    51	
    52	// Flags control the behavior of the parser and record information about regexp context.
    53	type Flags uint16
    54	
    55	const (
    56		FoldCase      Flags = 1 << iota // case-insensitive match
    57		Literal                         // treat pattern as literal string
    58		ClassNL                         // allow character classes like [^a-z] and [[:space:]] to match newline
    59		DotNL                           // allow . to match newline
    60		OneLine                         // treat ^ and $ as only matching at beginning and end of text
    61		NonGreedy                       // make repetition operators default to non-greedy
    62		PerlX                           // allow Perl extensions
    63		UnicodeGroups                   // allow \p{Han}, \P{Han} for Unicode group and negation
    64		WasDollar                       // regexp OpEndText was $, not \z
    65		Simple                          // regexp contains no counted repetition
    66	
    67		MatchNL = ClassNL | DotNL
    68	
    69		Perl        = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
    70		POSIX Flags = 0                                         // POSIX syntax
    71	)
    72	
    73	// Pseudo-ops for parsing stack.
    74	const (
    75		opLeftParen = opPseudo + iota
    76		opVerticalBar
    77	)
    78	
    79	type parser struct {
    80		flags       Flags     // parse mode flags
    81		stack       []*Regexp // stack of parsed expressions
    82		free        *Regexp
    83		numCap      int // number of capturing groups seen
    84		wholeRegexp string
    85		tmpClass    []rune // temporary char class work space
    86	}
    87	
    88	func (p *parser) newRegexp(op Op) *Regexp {
    89		re := p.free
    90		if re != nil {
    91			p.free = re.Sub0[0]
    92			*re = Regexp{}
    93		} else {
    94			re = new(Regexp)
    95		}
    96		re.Op = op
    97		return re
    98	}
    99	
   100	func (p *parser) reuse(re *Regexp) {
   101		re.Sub0[0] = p.free
   102		p.free = re
   103	}
   104	
   105	// Parse stack manipulation.
   106	
   107	// push pushes the regexp re onto the parse stack and returns the regexp.
   108	func (p *parser) push(re *Regexp) *Regexp {
   109		if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
   110			// Single rune.
   111			if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
   112				return nil
   113			}
   114			re.Op = OpLiteral
   115			re.Rune = re.Rune[:1]
   116			re.Flags = p.flags &^ FoldCase
   117		} else if re.Op == OpCharClass && len(re.Rune) == 4 &&
   118			re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
   119			unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
   120			unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
   121			re.Op == OpCharClass && len(re.Rune) == 2 &&
   122				re.Rune[0]+1 == re.Rune[1] &&
   123				unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
   124				unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
   125			// Case-insensitive rune like [Aa] or [Δδ].
   126			if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
   127				return nil
   128			}
   129	
   130			// Rewrite as (case-insensitive) literal.
   131			re.Op = OpLiteral
   132			re.Rune = re.Rune[:1]
   133			re.Flags = p.flags | FoldCase
   134		} else {
   135			// Incremental concatenation.
   136			p.maybeConcat(-1, 0)
   137		}
   138	
   139		p.stack = append(p.stack, re)
   140		return re
   141	}
   142	
   143	// maybeConcat implements incremental concatenation
   144	// of literal runes into string nodes. The parser calls this
   145	// before each push, so only the top fragment of the stack
   146	// might need processing. Since this is called before a push,
   147	// the topmost literal is no longer subject to operators like *
   148	// (Otherwise ab* would turn into (ab)*.)
   149	// If r >= 0 and there's a node left over, maybeConcat uses it
   150	// to push r with the given flags.
   151	// maybeConcat reports whether r was pushed.
   152	func (p *parser) maybeConcat(r rune, flags Flags) bool {
   153		n := len(p.stack)
   154		if n < 2 {
   155			return false
   156		}
   157	
   158		re1 := p.stack[n-1]
   159		re2 := p.stack[n-2]
   160		if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
   161			return false
   162		}
   163	
   164		// Push re1 into re2.
   165		re2.Rune = append(re2.Rune, re1.Rune...)
   166	
   167		// Reuse re1 if possible.
   168		if r >= 0 {
   169			re1.Rune = re1.Rune0[:1]
   170			re1.Rune[0] = r
   171			re1.Flags = flags
   172			return true
   173		}
   174	
   175		p.stack = p.stack[:n-1]
   176		p.reuse(re1)
   177		return false // did not push r
   178	}
   179	
   180	// newLiteral returns a new OpLiteral Regexp with the given flags
   181	func (p *parser) newLiteral(r rune, flags Flags) *Regexp {
   182		re := p.newRegexp(OpLiteral)
   183		re.Flags = flags
   184		if flags&FoldCase != 0 {
   185			r = minFoldRune(r)
   186		}
   187		re.Rune0[0] = r
   188		re.Rune = re.Rune0[:1]
   189		return re
   190	}
   191	
   192	// minFoldRune returns the minimum rune fold-equivalent to r.
   193	func minFoldRune(r rune) rune {
   194		if r < minFold || r > maxFold {
   195			return r
   196		}
   197		min := r
   198		r0 := r
   199		for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
   200			if min > r {
   201				min = r
   202			}
   203		}
   204		return min
   205	}
   206	
   207	// literal pushes a literal regexp for the rune r on the stack
   208	// and returns that regexp.
   209	func (p *parser) literal(r rune) {
   210		p.push(p.newLiteral(r, p.flags))
   211	}
   212	
   213	// op pushes a regexp with the given op onto the stack
   214	// and returns that regexp.
   215	func (p *parser) op(op Op) *Regexp {
   216		re := p.newRegexp(op)
   217		re.Flags = p.flags
   218		return p.push(re)
   219	}
   220	
   221	// repeat replaces the top stack element with itself repeated according to op, min, max.
   222	// before is the regexp suffix starting at the repetition operator.
   223	// after is the regexp suffix following after the repetition operator.
   224	// repeat returns an updated 'after' and an error, if any.
   225	func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
   226		flags := p.flags
   227		if p.flags&PerlX != 0 {
   228			if len(after) > 0 && after[0] == '?' {
   229				after = after[1:]
   230				flags ^= NonGreedy
   231			}
   232			if lastRepeat != "" {
   233				// In Perl it is not allowed to stack repetition operators:
   234				// a** is a syntax error, not a doubled star, and a++ means
   235				// something else entirely, which we don't support!
   236				return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
   237			}
   238		}
   239		n := len(p.stack)
   240		if n == 0 {
   241			return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
   242		}
   243		sub := p.stack[n-1]
   244		if sub.Op >= opPseudo {
   245			return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
   246		}
   247	
   248		re := p.newRegexp(op)
   249		re.Min = min
   250		re.Max = max
   251		re.Flags = flags
   252		re.Sub = re.Sub0[:1]
   253		re.Sub[0] = sub
   254		p.stack[n-1] = re
   255	
   256		if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) {
   257			return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
   258		}
   259	
   260		return after, nil
   261	}
   262	
   263	// repeatIsValid reports whether the repetition re is valid.
   264	// Valid means that the combination of the top-level repetition
   265	// and any inner repetitions does not exceed n copies of the
   266	// innermost thing.
   267	// This function rewalks the regexp tree and is called for every repetition,
   268	// so we have to worry about inducing quadratic behavior in the parser.
   269	// We avoid this by only calling repeatIsValid when min or max >= 2.
   270	// In that case the depth of any >= 2 nesting can only get to 9 without
   271	// triggering a parse error, so each subtree can only be rewalked 9 times.
   272	func repeatIsValid(re *Regexp, n int) bool {
   273		if re.Op == OpRepeat {
   274			m := re.Max
   275			if m == 0 {
   276				return true
   277			}
   278			if m < 0 {
   279				m = re.Min
   280			}
   281			if m > n {
   282				return false
   283			}
   284			if m > 0 {
   285				n /= m
   286			}
   287		}
   288		for _, sub := range re.Sub {
   289			if !repeatIsValid(sub, n) {
   290				return false
   291			}
   292		}
   293		return true
   294	}
   295	
   296	// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
   297	func (p *parser) concat() *Regexp {
   298		p.maybeConcat(-1, 0)
   299	
   300		// Scan down to find pseudo-operator | or (.
   301		i := len(p.stack)
   302		for i > 0 && p.stack[i-1].Op < opPseudo {
   303			i--
   304		}
   305		subs := p.stack[i:]
   306		p.stack = p.stack[:i]
   307	
   308		// Empty concatenation is special case.
   309		if len(subs) == 0 {
   310			return p.push(p.newRegexp(OpEmptyMatch))
   311		}
   312	
   313		return p.push(p.collapse(subs, OpConcat))
   314	}
   315	
   316	// alternate replaces the top of the stack (above the topmost '(') with its alternation.
   317	func (p *parser) alternate() *Regexp {
   318		// Scan down to find pseudo-operator (.
   319		// There are no | above (.
   320		i := len(p.stack)
   321		for i > 0 && p.stack[i-1].Op < opPseudo {
   322			i--
   323		}
   324		subs := p.stack[i:]
   325		p.stack = p.stack[:i]
   326	
   327		// Make sure top class is clean.
   328		// All the others already are (see swapVerticalBar).
   329		if len(subs) > 0 {
   330			cleanAlt(subs[len(subs)-1])
   331		}
   332	
   333		// Empty alternate is special case
   334		// (shouldn't happen but easy to handle).
   335		if len(subs) == 0 {
   336			return p.push(p.newRegexp(OpNoMatch))
   337		}
   338	
   339		return p.push(p.collapse(subs, OpAlternate))
   340	}
   341	
   342	// cleanAlt cleans re for eventual inclusion in an alternation.
   343	func cleanAlt(re *Regexp) {
   344		switch re.Op {
   345		case OpCharClass:
   346			re.Rune = cleanClass(&re.Rune)
   347			if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
   348				re.Rune = nil
   349				re.Op = OpAnyChar
   350				return
   351			}
   352			if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
   353				re.Rune = nil
   354				re.Op = OpAnyCharNotNL
   355				return
   356			}
   357			if cap(re.Rune)-len(re.Rune) > 100 {
   358				// re.Rune will not grow any more.
   359				// Make a copy or inline to reclaim storage.
   360				re.Rune = append(re.Rune0[:0], re.Rune...)
   361			}
   362		}
   363	}
   364	
   365	// collapse returns the result of applying op to sub.
   366	// If sub contains op nodes, they all get hoisted up
   367	// so that there is never a concat of a concat or an
   368	// alternate of an alternate.
   369	func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
   370		if len(subs) == 1 {
   371			return subs[0]
   372		}
   373		re := p.newRegexp(op)
   374		re.Sub = re.Sub0[:0]
   375		for _, sub := range subs {
   376			if sub.Op == op {
   377				re.Sub = append(re.Sub, sub.Sub...)
   378				p.reuse(sub)
   379			} else {
   380				re.Sub = append(re.Sub, sub)
   381			}
   382		}
   383		if op == OpAlternate {
   384			re.Sub = p.factor(re.Sub, re.Flags)
   385			if len(re.Sub) == 1 {
   386				old := re
   387				re = re.Sub[0]
   388				p.reuse(old)
   389			}
   390		}
   391		return re
   392	}
   393	
   394	// factor factors common prefixes from the alternation list sub.
   395	// It returns a replacement list that reuses the same storage and
   396	// frees (passes to p.reuse) any removed *Regexps.
   397	//
   398	// For example,
   399	//     ABC|ABD|AEF|BCX|BCY
   400	// simplifies by literal prefix extraction to
   401	//     A(B(C|D)|EF)|BC(X|Y)
   402	// which simplifies by character class introduction to
   403	//     A(B[CD]|EF)|BC[XY]
   404	//
   405	func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
   406		if len(sub) < 2 {
   407			return sub
   408		}
   409	
   410		// Round 1: Factor out common literal prefixes.
   411		var str []rune
   412		var strflags Flags
   413		start := 0
   414		out := sub[:0]
   415		for i := 0; i <= len(sub); i++ {
   416			// Invariant: the Regexps that were in sub[0:start] have been
   417			// used or marked for reuse, and the slice space has been reused
   418			// for out (len(out) <= start).
   419			//
   420			// Invariant: sub[start:i] consists of regexps that all begin
   421			// with str as modified by strflags.
   422			var istr []rune
   423			var iflags Flags
   424			if i < len(sub) {
   425				istr, iflags = p.leadingString(sub[i])
   426				if iflags == strflags {
   427					same := 0
   428					for same < len(str) && same < len(istr) && str[same] == istr[same] {
   429						same++
   430					}
   431					if same > 0 {
   432						// Matches at least one rune in current range.
   433						// Keep going around.
   434						str = str[:same]
   435						continue
   436					}
   437				}
   438			}
   439	
   440			// Found end of a run with common leading literal string:
   441			// sub[start:i] all begin with str[0:len(str)], but sub[i]
   442			// does not even begin with str[0].
   443			//
   444			// Factor out common string and append factored expression to out.
   445			if i == start {
   446				// Nothing to do - run of length 0.
   447			} else if i == start+1 {
   448				// Just one: don't bother factoring.
   449				out = append(out, sub[start])
   450			} else {
   451				// Construct factored form: prefix(suffix1|suffix2|...)
   452				prefix := p.newRegexp(OpLiteral)
   453				prefix.Flags = strflags
   454				prefix.Rune = append(prefix.Rune[:0], str...)
   455	
   456				for j := start; j < i; j++ {
   457					sub[j] = p.removeLeadingString(sub[j], len(str))
   458				}
   459				suffix := p.collapse(sub[start:i], OpAlternate) // recurse
   460	
   461				re := p.newRegexp(OpConcat)
   462				re.Sub = append(re.Sub[:0], prefix, suffix)
   463				out = append(out, re)
   464			}
   465	
   466			// Prepare for next iteration.
   467			start = i
   468			str = istr
   469			strflags = iflags
   470		}
   471		sub = out
   472	
   473		// Round 2: Factor out common simple prefixes,
   474		// just the first piece of each concatenation.
   475		// This will be good enough a lot of the time.
   476		//
   477		// Complex subexpressions (e.g. involving quantifiers)
   478		// are not safe to factor because that collapses their
   479		// distinct paths through the automaton, which affects
   480		// correctness in some cases.
   481		start = 0
   482		out = sub[:0]
   483		var first *Regexp
   484		for i := 0; i <= len(sub); i++ {
   485			// Invariant: the Regexps that were in sub[0:start] have been
   486			// used or marked for reuse, and the slice space has been reused
   487			// for out (len(out) <= start).
   488			//
   489			// Invariant: sub[start:i] consists of regexps that all begin with ifirst.
   490			var ifirst *Regexp
   491			if i < len(sub) {
   492				ifirst = p.leadingRegexp(sub[i])
   493				if first != nil && first.Equal(ifirst) &&
   494					// first must be a character class OR a fixed repeat of a character class.
   495					(isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) {
   496					continue
   497				}
   498			}
   499	
   500			// Found end of a run with common leading regexp:
   501			// sub[start:i] all begin with first but sub[i] does not.
   502			//
   503			// Factor out common regexp and append factored expression to out.
   504			if i == start {
   505				// Nothing to do - run of length 0.
   506			} else if i == start+1 {
   507				// Just one: don't bother factoring.
   508				out = append(out, sub[start])
   509			} else {
   510				// Construct factored form: prefix(suffix1|suffix2|...)
   511				prefix := first
   512				for j := start; j < i; j++ {
   513					reuse := j != start // prefix came from sub[start]
   514					sub[j] = p.removeLeadingRegexp(sub[j], reuse)
   515				}
   516				suffix := p.collapse(sub[start:i], OpAlternate) // recurse
   517	
   518				re := p.newRegexp(OpConcat)
   519				re.Sub = append(re.Sub[:0], prefix, suffix)
   520				out = append(out, re)
   521			}
   522	
   523			// Prepare for next iteration.
   524			start = i
   525			first = ifirst
   526		}
   527		sub = out
   528	
   529		// Round 3: Collapse runs of single literals into character classes.
   530		start = 0
   531		out = sub[:0]
   532		for i := 0; i <= len(sub); i++ {
   533			// Invariant: the Regexps that were in sub[0:start] have been
   534			// used or marked for reuse, and the slice space has been reused
   535			// for out (len(out) <= start).
   536			//
   537			// Invariant: sub[start:i] consists of regexps that are either
   538			// literal runes or character classes.
   539			if i < len(sub) && isCharClass(sub[i]) {
   540				continue
   541			}
   542	
   543			// sub[i] is not a char or char class;
   544			// emit char class for sub[start:i]...
   545			if i == start {
   546				// Nothing to do - run of length 0.
   547			} else if i == start+1 {
   548				out = append(out, sub[start])
   549			} else {
   550				// Make new char class.
   551				// Start with most complex regexp in sub[start].
   552				max := start
   553				for j := start + 1; j < i; j++ {
   554					if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
   555						max = j
   556					}
   557				}
   558				sub[start], sub[max] = sub[max], sub[start]
   559	
   560				for j := start + 1; j < i; j++ {
   561					mergeCharClass(sub[start], sub[j])
   562					p.reuse(sub[j])
   563				}
   564				cleanAlt(sub[start])
   565				out = append(out, sub[start])
   566			}
   567	
   568			// ... and then emit sub[i].
   569			if i < len(sub) {
   570				out = append(out, sub[i])
   571			}
   572			start = i + 1
   573		}
   574		sub = out
   575	
   576		// Round 4: Collapse runs of empty matches into a single empty match.
   577		start = 0
   578		out = sub[:0]
   579		for i := range sub {
   580			if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
   581				continue
   582			}
   583			out = append(out, sub[i])
   584		}
   585		sub = out
   586	
   587		return sub
   588	}
   589	
   590	// leadingString returns the leading literal string that re begins with.
   591	// The string refers to storage in re or its children.
   592	func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
   593		if re.Op == OpConcat && len(re.Sub) > 0 {
   594			re = re.Sub[0]
   595		}
   596		if re.Op != OpLiteral {
   597			return nil, 0
   598		}
   599		return re.Rune, re.Flags & FoldCase
   600	}
   601	
   602	// removeLeadingString removes the first n leading runes
   603	// from the beginning of re. It returns the replacement for re.
   604	func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
   605		if re.Op == OpConcat && len(re.Sub) > 0 {
   606			// Removing a leading string in a concatenation
   607			// might simplify the concatenation.
   608			sub := re.Sub[0]
   609			sub = p.removeLeadingString(sub, n)
   610			re.Sub[0] = sub
   611			if sub.Op == OpEmptyMatch {
   612				p.reuse(sub)
   613				switch len(re.Sub) {
   614				case 0, 1:
   615					// Impossible but handle.
   616					re.Op = OpEmptyMatch
   617					re.Sub = nil
   618				case 2:
   619					old := re
   620					re = re.Sub[1]
   621					p.reuse(old)
   622				default:
   623					copy(re.Sub, re.Sub[1:])
   624					re.Sub = re.Sub[:len(re.Sub)-1]
   625				}
   626			}
   627			return re
   628		}
   629	
   630		if re.Op == OpLiteral {
   631			re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
   632			if len(re.Rune) == 0 {
   633				re.Op = OpEmptyMatch
   634			}
   635		}
   636		return re
   637	}
   638	
   639	// leadingRegexp returns the leading regexp that re begins with.
   640	// The regexp refers to storage in re or its children.
   641	func (p *parser) leadingRegexp(re *Regexp) *Regexp {
   642		if re.Op == OpEmptyMatch {
   643			return nil
   644		}
   645		if re.Op == OpConcat && len(re.Sub) > 0 {
   646			sub := re.Sub[0]
   647			if sub.Op == OpEmptyMatch {
   648				return nil
   649			}
   650			return sub
   651		}
   652		return re
   653	}
   654	
   655	// removeLeadingRegexp removes the leading regexp in re.
   656	// It returns the replacement for re.
   657	// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
   658	func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
   659		if re.Op == OpConcat && len(re.Sub) > 0 {
   660			if reuse {
   661				p.reuse(re.Sub[0])
   662			}
   663			re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
   664			switch len(re.Sub) {
   665			case 0:
   666				re.Op = OpEmptyMatch
   667				re.Sub = nil
   668			case 1:
   669				old := re
   670				re = re.Sub[0]
   671				p.reuse(old)
   672			}
   673			return re
   674		}
   675		if reuse {
   676			p.reuse(re)
   677		}
   678		return p.newRegexp(OpEmptyMatch)
   679	}
   680	
   681	func literalRegexp(s string, flags Flags) *Regexp {
   682		re := &Regexp{Op: OpLiteral}
   683		re.Flags = flags
   684		re.Rune = re.Rune0[:0] // use local storage for small strings
   685		for _, c := range s {
   686			if len(re.Rune) >= cap(re.Rune) {
   687				// string is too long to fit in Rune0.  let Go handle it
   688				re.Rune = []rune(s)
   689				break
   690			}
   691			re.Rune = append(re.Rune, c)
   692		}
   693		return re
   694	}
   695	
   696	// Parsing.
   697	
   698	// Parse parses a regular expression string s, controlled by the specified
   699	// Flags, and returns a regular expression parse tree. The syntax is
   700	// described in the top-level comment.
   701	func Parse(s string, flags Flags) (*Regexp, error) {
   702		if flags&Literal != 0 {
   703			// Trivial parser for literal string.
   704			if err := checkUTF8(s); err != nil {
   705				return nil, err
   706			}
   707			return literalRegexp(s, flags), nil
   708		}
   709	
   710		// Otherwise, must do real work.
   711		var (
   712			p          parser
   713			err        error
   714			c          rune
   715			op         Op
   716			lastRepeat string
   717		)
   718		p.flags = flags
   719		p.wholeRegexp = s
   720		t := s
   721		for t != "" {
   722			repeat := ""
   723		BigSwitch:
   724			switch t[0] {
   725			default:
   726				if c, t, err = nextRune(t); err != nil {
   727					return nil, err
   728				}
   729				p.literal(c)
   730	
   731			case '(':
   732				if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
   733					// Flag changes and non-capturing groups.
   734					if t, err = p.parsePerlFlags(t); err != nil {
   735						return nil, err
   736					}
   737					break
   738				}
   739				p.numCap++
   740				p.op(opLeftParen).Cap = p.numCap
   741				t = t[1:]
   742			case '|':
   743				if err = p.parseVerticalBar(); err != nil {
   744					return nil, err
   745				}
   746				t = t[1:]
   747			case ')':
   748				if err = p.parseRightParen(); err != nil {
   749					return nil, err
   750				}
   751				t = t[1:]
   752			case '^':
   753				if p.flags&OneLine != 0 {
   754					p.op(OpBeginText)
   755				} else {
   756					p.op(OpBeginLine)
   757				}
   758				t = t[1:]
   759			case '$':
   760				if p.flags&OneLine != 0 {
   761					p.op(OpEndText).Flags |= WasDollar
   762				} else {
   763					p.op(OpEndLine)
   764				}
   765				t = t[1:]
   766			case '.':
   767				if p.flags&DotNL != 0 {
   768					p.op(OpAnyChar)
   769				} else {
   770					p.op(OpAnyCharNotNL)
   771				}
   772				t = t[1:]
   773			case '[':
   774				if t, err = p.parseClass(t); err != nil {
   775					return nil, err
   776				}
   777			case '*', '+', '?':
   778				before := t
   779				switch t[0] {
   780				case '*':
   781					op = OpStar
   782				case '+':
   783					op = OpPlus
   784				case '?':
   785					op = OpQuest
   786				}
   787				after := t[1:]
   788				if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil {
   789					return nil, err
   790				}
   791				repeat = before
   792				t = after
   793			case '{':
   794				op = OpRepeat
   795				before := t
   796				min, max, after, ok := p.parseRepeat(t)
   797				if !ok {
   798					// If the repeat cannot be parsed, { is a literal.
   799					p.literal('{')
   800					t = t[1:]
   801					break
   802				}
   803				if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
   804					// Numbers were too big, or max is present and min > max.
   805					return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
   806				}
   807				if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
   808					return nil, err
   809				}
   810				repeat = before
   811				t = after
   812			case '\\':
   813				if p.flags&PerlX != 0 && len(t) >= 2 {
   814					switch t[1] {
   815					case 'A':
   816						p.op(OpBeginText)
   817						t = t[2:]
   818						break BigSwitch
   819					case 'b':
   820						p.op(OpWordBoundary)
   821						t = t[2:]
   822						break BigSwitch
   823					case 'B':
   824						p.op(OpNoWordBoundary)
   825						t = t[2:]
   826						break BigSwitch
   827					case 'C':
   828						// any byte; not supported
   829						return nil, &Error{ErrInvalidEscape, t[:2]}
   830					case 'Q':
   831						// \Q ... \E: the ... is always literals
   832						var lit string
   833						if i := strings.Index(t, `\E`); i < 0 {
   834							lit = t[2:]
   835							t = ""
   836						} else {
   837							lit = t[2:i]
   838							t = t[i+2:]
   839						}
   840						for lit != "" {
   841							c, rest, err := nextRune(lit)
   842							if err != nil {
   843								return nil, err
   844							}
   845							p.literal(c)
   846							lit = rest
   847						}
   848						break BigSwitch
   849					case 'z':
   850						p.op(OpEndText)
   851						t = t[2:]
   852						break BigSwitch
   853					}
   854				}
   855	
   856				re := p.newRegexp(OpCharClass)
   857				re.Flags = p.flags
   858	
   859				// Look for Unicode character group like \p{Han}
   860				if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
   861					r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
   862					if err != nil {
   863						return nil, err
   864					}
   865					if r != nil {
   866						re.Rune = r
   867						t = rest
   868						p.push(re)
   869						break BigSwitch
   870					}
   871				}
   872	
   873				// Perl character class escape.
   874				if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
   875					re.Rune = r
   876					t = rest
   877					p.push(re)
   878					break BigSwitch
   879				}
   880				p.reuse(re)
   881	
   882				// Ordinary single-character escape.
   883				if c, t, err = p.parseEscape(t); err != nil {
   884					return nil, err
   885				}
   886				p.literal(c)
   887			}
   888			lastRepeat = repeat
   889		}
   890	
   891		p.concat()
   892		if p.swapVerticalBar() {
   893			// pop vertical bar
   894			p.stack = p.stack[:len(p.stack)-1]
   895		}
   896		p.alternate()
   897	
   898		n := len(p.stack)
   899		if n != 1 {
   900			return nil, &Error{ErrMissingParen, s}
   901		}
   902		return p.stack[0], nil
   903	}
   904	
   905	// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
   906	// If s is not of that form, it returns ok == false.
   907	// If s has the right form but the values are too big, it returns min == -1, ok == true.
   908	func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
   909		if s == "" || s[0] != '{' {
   910			return
   911		}
   912		s = s[1:]
   913		var ok1 bool
   914		if min, s, ok1 = p.parseInt(s); !ok1 {
   915			return
   916		}
   917		if s == "" {
   918			return
   919		}
   920		if s[0] != ',' {
   921			max = min
   922		} else {
   923			s = s[1:]
   924			if s == "" {
   925				return
   926			}
   927			if s[0] == '}' {
   928				max = -1
   929			} else if max, s, ok1 = p.parseInt(s); !ok1 {
   930				return
   931			} else if max < 0 {
   932				// parseInt found too big a number
   933				min = -1
   934			}
   935		}
   936		if s == "" || s[0] != '}' {
   937			return
   938		}
   939		rest = s[1:]
   940		ok = true
   941		return
   942	}
   943	
   944	// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
   945	// like (?i) or (?: or (?i:.  It removes the prefix from s and updates the parse state.
   946	// The caller must have ensured that s begins with "(?".
   947	func (p *parser) parsePerlFlags(s string) (rest string, err error) {
   948		t := s
   949	
   950		// Check for named captures, first introduced in Python's regexp library.
   951		// As usual, there are three slightly different syntaxes:
   952		//
   953		//   (?P<name>expr)   the original, introduced by Python
   954		//   (?<name>expr)    the .NET alteration, adopted by Perl 5.10
   955		//   (?'name'expr)    another .NET alteration, adopted by Perl 5.10
   956		//
   957		// Perl 5.10 gave in and implemented the Python version too,
   958		// but they claim that the last two are the preferred forms.
   959		// PCRE and languages based on it (specifically, PHP and Ruby)
   960		// support all three as well. EcmaScript 4 uses only the Python form.
   961		//
   962		// In both the open source world (via Code Search) and the
   963		// Google source tree, (?P<expr>name) is the dominant form,
   964		// so that's the one we implement. One is enough.
   965		if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
   966			// Pull out name.
   967			end := strings.IndexRune(t, '>')
   968			if end < 0 {
   969				if err = checkUTF8(t); err != nil {
   970					return "", err
   971				}
   972				return "", &Error{ErrInvalidNamedCapture, s}
   973			}
   974	
   975			capture := t[:end+1] // "(?P<name>"
   976			name := t[4:end]     // "name"
   977			if err = checkUTF8(name); err != nil {
   978				return "", err
   979			}
   980			if !isValidCaptureName(name) {
   981				return "", &Error{ErrInvalidNamedCapture, capture}
   982			}
   983	
   984			// Like ordinary capture, but named.
   985			p.numCap++
   986			re := p.op(opLeftParen)
   987			re.Cap = p.numCap
   988			re.Name = name
   989			return t[end+1:], nil
   990		}
   991	
   992		// Non-capturing group. Might also twiddle Perl flags.
   993		var c rune
   994		t = t[2:] // skip (?
   995		flags := p.flags
   996		sign := +1
   997		sawFlag := false
   998	Loop:
   999		for t != "" {
  1000			if c, t, err = nextRune(t); err != nil {
  1001				return "", err
  1002			}
  1003			switch c {
  1004			default:
  1005				break Loop
  1006	
  1007			// Flags.
  1008			case 'i':
  1009				flags |= FoldCase
  1010				sawFlag = true
  1011			case 'm':
  1012				flags &^= OneLine
  1013				sawFlag = true
  1014			case 's':
  1015				flags |= DotNL
  1016				sawFlag = true
  1017			case 'U':
  1018				flags |= NonGreedy
  1019				sawFlag = true
  1020	
  1021			// Switch to negation.
  1022			case '-':
  1023				if sign < 0 {
  1024					break Loop
  1025				}
  1026				sign = -1
  1027				// Invert flags so that | above turn into &^ and vice versa.
  1028				// We'll invert flags again before using it below.
  1029				flags = ^flags
  1030				sawFlag = false
  1031	
  1032			// End of flags, starting group or not.
  1033			case ':', ')':
  1034				if sign < 0 {
  1035					if !sawFlag {
  1036						break Loop
  1037					}
  1038					flags = ^flags
  1039				}
  1040				if c == ':' {
  1041					// Open new group
  1042					p.op(opLeftParen)
  1043				}
  1044				p.flags = flags
  1045				return t, nil
  1046			}
  1047		}
  1048	
  1049		return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
  1050	}
  1051	
  1052	// isValidCaptureName reports whether name
  1053	// is a valid capture name: [A-Za-z0-9_]+.
  1054	// PCRE limits names to 32 bytes.
  1055	// Python rejects names starting with digits.
  1056	// We don't enforce either of those.
  1057	func isValidCaptureName(name string) bool {
  1058		if name == "" {
  1059			return false
  1060		}
  1061		for _, c := range name {
  1062			if c != '_' && !isalnum(c) {
  1063				return false
  1064			}
  1065		}
  1066		return true
  1067	}
  1068	
  1069	// parseInt parses a decimal integer.
  1070	func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
  1071		if s == "" || s[0] < '0' || '9' < s[0] {
  1072			return
  1073		}
  1074		// Disallow leading zeros.
  1075		if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
  1076			return
  1077		}
  1078		t := s
  1079		for s != "" && '0' <= s[0] && s[0] <= '9' {
  1080			s = s[1:]
  1081		}
  1082		rest = s
  1083		ok = true
  1084		// Have digits, compute value.
  1085		t = t[:len(t)-len(s)]
  1086		for i := 0; i < len(t); i++ {
  1087			// Avoid overflow.
  1088			if n >= 1e8 {
  1089				n = -1
  1090				break
  1091			}
  1092			n = n*10 + int(t[i]) - '0'
  1093		}
  1094		return
  1095	}
  1096	
  1097	// can this be represented as a character class?
  1098	// single-rune literal string, char class, ., and .|\n.
  1099	func isCharClass(re *Regexp) bool {
  1100		return re.Op == OpLiteral && len(re.Rune) == 1 ||
  1101			re.Op == OpCharClass ||
  1102			re.Op == OpAnyCharNotNL ||
  1103			re.Op == OpAnyChar
  1104	}
  1105	
  1106	// does re match r?
  1107	func matchRune(re *Regexp, r rune) bool {
  1108		switch re.Op {
  1109		case OpLiteral:
  1110			return len(re.Rune) == 1 && re.Rune[0] == r
  1111		case OpCharClass:
  1112			for i := 0; i < len(re.Rune); i += 2 {
  1113				if re.Rune[i] <= r && r <= re.Rune[i+1] {
  1114					return true
  1115				}
  1116			}
  1117			return false
  1118		case OpAnyCharNotNL:
  1119			return r != '\n'
  1120		case OpAnyChar:
  1121			return true
  1122		}
  1123		return false
  1124	}
  1125	
  1126	// parseVerticalBar handles a | in the input.
  1127	func (p *parser) parseVerticalBar() error {
  1128		p.concat()
  1129	
  1130		// The concatenation we just parsed is on top of the stack.
  1131		// If it sits above an opVerticalBar, swap it below
  1132		// (things below an opVerticalBar become an alternation).
  1133		// Otherwise, push a new vertical bar.
  1134		if !p.swapVerticalBar() {
  1135			p.op(opVerticalBar)
  1136		}
  1137	
  1138		return nil
  1139	}
  1140	
  1141	// mergeCharClass makes dst = dst|src.
  1142	// The caller must ensure that dst.Op >= src.Op,
  1143	// to reduce the amount of copying.
  1144	func mergeCharClass(dst, src *Regexp) {
  1145		switch dst.Op {
  1146		case OpAnyChar:
  1147			// src doesn't add anything.
  1148		case OpAnyCharNotNL:
  1149			// src might add \n
  1150			if matchRune(src, '\n') {
  1151				dst.Op = OpAnyChar
  1152			}
  1153		case OpCharClass:
  1154			// src is simpler, so either literal or char class
  1155			if src.Op == OpLiteral {
  1156				dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
  1157			} else {
  1158				dst.Rune = appendClass(dst.Rune, src.Rune)
  1159			}
  1160		case OpLiteral:
  1161			// both literal
  1162			if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
  1163				break
  1164			}
  1165			dst.Op = OpCharClass
  1166			dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
  1167			dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
  1168		}
  1169	}
  1170	
  1171	// If the top of the stack is an element followed by an opVerticalBar
  1172	// swapVerticalBar swaps the two and returns true.
  1173	// Otherwise it returns false.
  1174	func (p *parser) swapVerticalBar() bool {
  1175		// If above and below vertical bar are literal or char class,
  1176		// can merge into a single char class.
  1177		n := len(p.stack)
  1178		if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
  1179			re1 := p.stack[n-1]
  1180			re3 := p.stack[n-3]
  1181			// Make re3 the more complex of the two.
  1182			if re1.Op > re3.Op {
  1183				re1, re3 = re3, re1
  1184				p.stack[n-3] = re3
  1185			}
  1186			mergeCharClass(re3, re1)
  1187			p.reuse(re1)
  1188			p.stack = p.stack[:n-1]
  1189			return true
  1190		}
  1191	
  1192		if n >= 2 {
  1193			re1 := p.stack[n-1]
  1194			re2 := p.stack[n-2]
  1195			if re2.Op == opVerticalBar {
  1196				if n >= 3 {
  1197					// Now out of reach.
  1198					// Clean opportunistically.
  1199					cleanAlt(p.stack[n-3])
  1200				}
  1201				p.stack[n-2] = re1
  1202				p.stack[n-1] = re2
  1203				return true
  1204			}
  1205		}
  1206		return false
  1207	}
  1208	
  1209	// parseRightParen handles a ) in the input.
  1210	func (p *parser) parseRightParen() error {
  1211		p.concat()
  1212		if p.swapVerticalBar() {
  1213			// pop vertical bar
  1214			p.stack = p.stack[:len(p.stack)-1]
  1215		}
  1216		p.alternate()
  1217	
  1218		n := len(p.stack)
  1219		if n < 2 {
  1220			return &Error{ErrUnexpectedParen, p.wholeRegexp}
  1221		}
  1222		re1 := p.stack[n-1]
  1223		re2 := p.stack[n-2]
  1224		p.stack = p.stack[:n-2]
  1225		if re2.Op != opLeftParen {
  1226			return &Error{ErrUnexpectedParen, p.wholeRegexp}
  1227		}
  1228		// Restore flags at time of paren.
  1229		p.flags = re2.Flags
  1230		if re2.Cap == 0 {
  1231			// Just for grouping.
  1232			p.push(re1)
  1233		} else {
  1234			re2.Op = OpCapture
  1235			re2.Sub = re2.Sub0[:1]
  1236			re2.Sub[0] = re1
  1237			p.push(re2)
  1238		}
  1239		return nil
  1240	}
  1241	
  1242	// parseEscape parses an escape sequence at the beginning of s
  1243	// and returns the rune.
  1244	func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
  1245		t := s[1:]
  1246		if t == "" {
  1247			return 0, "", &Error{ErrTrailingBackslash, ""}
  1248		}
  1249		c, t, err := nextRune(t)
  1250		if err != nil {
  1251			return 0, "", err
  1252		}
  1253	
  1254	Switch:
  1255		switch c {
  1256		default:
  1257			if c < utf8.RuneSelf && !isalnum(c) {
  1258				// Escaped non-word characters are always themselves.
  1259				// PCRE is not quite so rigorous: it accepts things like
  1260				// \q, but we don't. We once rejected \_, but too many
  1261				// programs and people insist on using it, so allow \_.
  1262				return c, t, nil
  1263			}
  1264	
  1265		// Octal escapes.
  1266		case '1', '2', '3', '4', '5', '6', '7':
  1267			// Single non-zero digit is a backreference; not supported
  1268			if t == "" || t[0] < '0' || t[0] > '7' {
  1269				break
  1270			}
  1271			fallthrough
  1272		case '0':
  1273			// Consume up to three octal digits; already have one.
  1274			r = c - '0'
  1275			for i := 1; i < 3; i++ {
  1276				if t == "" || t[0] < '0' || t[0] > '7' {
  1277					break
  1278				}
  1279				r = r*8 + rune(t[0]) - '0'
  1280				t = t[1:]
  1281			}
  1282			return r, t, nil
  1283	
  1284		// Hexadecimal escapes.
  1285		case 'x':
  1286			if t == "" {
  1287				break
  1288			}
  1289			if c, t, err = nextRune(t); err != nil {
  1290				return 0, "", err
  1291			}
  1292			if c == '{' {
  1293				// Any number of digits in braces.
  1294				// Perl accepts any text at all; it ignores all text
  1295				// after the first non-hex digit. We require only hex digits,
  1296				// and at least one.
  1297				nhex := 0
  1298				r = 0
  1299				for {
  1300					if t == "" {
  1301						break Switch
  1302					}
  1303					if c, t, err = nextRune(t); err != nil {
  1304						return 0, "", err
  1305					}
  1306					if c == '}' {
  1307						break
  1308					}
  1309					v := unhex(c)
  1310					if v < 0 {
  1311						break Switch
  1312					}
  1313					r = r*16 + v
  1314					if r > unicode.MaxRune {
  1315						break Switch
  1316					}
  1317					nhex++
  1318				}
  1319				if nhex == 0 {
  1320					break Switch
  1321				}
  1322				return r, t, nil
  1323			}
  1324	
  1325			// Easy case: two hex digits.
  1326			x := unhex(c)
  1327			if c, t, err = nextRune(t); err != nil {
  1328				return 0, "", err
  1329			}
  1330			y := unhex(c)
  1331			if x < 0 || y < 0 {
  1332				break
  1333			}
  1334			return x*16 + y, t, nil
  1335	
  1336		// C escapes. There is no case 'b', to avoid misparsing
  1337		// the Perl word-boundary \b as the C backspace \b
  1338		// when in POSIX mode. In Perl, /\b/ means word-boundary
  1339		// but /[\b]/ means backspace. We don't support that.
  1340		// If you want a backspace, embed a literal backspace
  1341		// character or use \x08.
  1342		case 'a':
  1343			return '\a', t, err
  1344		case 'f':
  1345			return '\f', t, err
  1346		case 'n':
  1347			return '\n', t, err
  1348		case 'r':
  1349			return '\r', t, err
  1350		case 't':
  1351			return '\t', t, err
  1352		case 'v':
  1353			return '\v', t, err
  1354		}
  1355		return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
  1356	}
  1357	
  1358	// parseClassChar parses a character class character at the beginning of s
  1359	// and returns it.
  1360	func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
  1361		if s == "" {
  1362			return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
  1363		}
  1364	
  1365		// Allow regular escape sequences even though
  1366		// many need not be escaped in this context.
  1367		if s[0] == '\\' {
  1368			return p.parseEscape(s)
  1369		}
  1370	
  1371		return nextRune(s)
  1372	}
  1373	
  1374	type charGroup struct {
  1375		sign  int
  1376		class []rune
  1377	}
  1378	
  1379	// parsePerlClassEscape parses a leading Perl character class escape like \d
  1380	// from the beginning of s. If one is present, it appends the characters to r
  1381	// and returns the new slice r and the remainder of the string.
  1382	func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
  1383		if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
  1384			return
  1385		}
  1386		g := perlGroup[s[0:2]]
  1387		if g.sign == 0 {
  1388			return
  1389		}
  1390		return p.appendGroup(r, g), s[2:]
  1391	}
  1392	
  1393	// parseNamedClass parses a leading POSIX named character class like [:alnum:]
  1394	// from the beginning of s. If one is present, it appends the characters to r
  1395	// and returns the new slice r and the remainder of the string.
  1396	func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
  1397		if len(s) < 2 || s[0] != '[' || s[1] != ':' {
  1398			return
  1399		}
  1400	
  1401		i := strings.Index(s[2:], ":]")
  1402		if i < 0 {
  1403			return
  1404		}
  1405		i += 2
  1406		name, s := s[0:i+2], s[i+2:]
  1407		g := posixGroup[name]
  1408		if g.sign == 0 {
  1409			return nil, "", &Error{ErrInvalidCharRange, name}
  1410		}
  1411		return p.appendGroup(r, g), s, nil
  1412	}
  1413	
  1414	func (p *parser) appendGroup(r []rune, g charGroup) []rune {
  1415		if p.flags&FoldCase == 0 {
  1416			if g.sign < 0 {
  1417				r = appendNegatedClass(r, g.class)
  1418			} else {
  1419				r = appendClass(r, g.class)
  1420			}
  1421		} else {
  1422			tmp := p.tmpClass[:0]
  1423			tmp = appendFoldedClass(tmp, g.class)
  1424			p.tmpClass = tmp
  1425			tmp = cleanClass(&p.tmpClass)
  1426			if g.sign < 0 {
  1427				r = appendNegatedClass(r, tmp)
  1428			} else {
  1429				r = appendClass(r, tmp)
  1430			}
  1431		}
  1432		return r
  1433	}
  1434	
  1435	var anyTable = &unicode.RangeTable{
  1436		R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
  1437		R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
  1438	}
  1439	
  1440	// unicodeTable returns the unicode.RangeTable identified by name
  1441	// and the table of additional fold-equivalent code points.
  1442	func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
  1443		// Special case: "Any" means any.
  1444		if name == "Any" {
  1445			return anyTable, anyTable
  1446		}
  1447		if t := unicode.Categories[name]; t != nil {
  1448			return t, unicode.FoldCategory[name]
  1449		}
  1450		if t := unicode.Scripts[name]; t != nil {
  1451			return t, unicode.FoldScript[name]
  1452		}
  1453		return nil, nil
  1454	}
  1455	
  1456	// parseUnicodeClass parses a leading Unicode character class like \p{Han}
  1457	// from the beginning of s. If one is present, it appends the characters to r
  1458	// and returns the new slice r and the remainder of the string.
  1459	func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
  1460		if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
  1461			return
  1462		}
  1463	
  1464		// Committed to parse or return error.
  1465		sign := +1
  1466		if s[1] == 'P' {
  1467			sign = -1
  1468		}
  1469		t := s[2:]
  1470		c, t, err := nextRune(t)
  1471		if err != nil {
  1472			return
  1473		}
  1474		var seq, name string
  1475		if c != '{' {
  1476			// Single-letter name.
  1477			seq = s[:len(s)-len(t)]
  1478			name = seq[2:]
  1479		} else {
  1480			// Name is in braces.
  1481			end := strings.IndexRune(s, '}')
  1482			if end < 0 {
  1483				if err = checkUTF8(s); err != nil {
  1484					return
  1485				}
  1486				return nil, "", &Error{ErrInvalidCharRange, s}
  1487			}
  1488			seq, t = s[:end+1], s[end+1:]
  1489			name = s[3:end]
  1490			if err = checkUTF8(name); err != nil {
  1491				return
  1492			}
  1493		}
  1494	
  1495		// Group can have leading negation too.  \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
  1496		if name != "" && name[0] == '^' {
  1497			sign = -sign
  1498			name = name[1:]
  1499		}
  1500	
  1501		tab, fold := unicodeTable(name)
  1502		if tab == nil {
  1503			return nil, "", &Error{ErrInvalidCharRange, seq}
  1504		}
  1505	
  1506		if p.flags&FoldCase == 0 || fold == nil {
  1507			if sign > 0 {
  1508				r = appendTable(r, tab)
  1509			} else {
  1510				r = appendNegatedTable(r, tab)
  1511			}
  1512		} else {
  1513			// Merge and clean tab and fold in a temporary buffer.
  1514			// This is necessary for the negative case and just tidy
  1515			// for the positive case.
  1516			tmp := p.tmpClass[:0]
  1517			tmp = appendTable(tmp, tab)
  1518			tmp = appendTable(tmp, fold)
  1519			p.tmpClass = tmp
  1520			tmp = cleanClass(&p.tmpClass)
  1521			if sign > 0 {
  1522				r = appendClass(r, tmp)
  1523			} else {
  1524				r = appendNegatedClass(r, tmp)
  1525			}
  1526		}
  1527		return r, t, nil
  1528	}
  1529	
  1530	// parseClass parses a character class at the beginning of s
  1531	// and pushes it onto the parse stack.
  1532	func (p *parser) parseClass(s string) (rest string, err error) {
  1533		t := s[1:] // chop [
  1534		re := p.newRegexp(OpCharClass)
  1535		re.Flags = p.flags
  1536		re.Rune = re.Rune0[:0]
  1537	
  1538		sign := +1
  1539		if t != "" && t[0] == '^' {
  1540			sign = -1
  1541			t = t[1:]
  1542	
  1543			// If character class does not match \n, add it here,
  1544			// so that negation later will do the right thing.
  1545			if p.flags&ClassNL == 0 {
  1546				re.Rune = append(re.Rune, '\n', '\n')
  1547			}
  1548		}
  1549	
  1550		class := re.Rune
  1551		first := true // ] and - are okay as first char in class
  1552		for t == "" || t[0] != ']' || first {
  1553			// POSIX: - is only okay unescaped as first or last in class.
  1554			// Perl: - is okay anywhere.
  1555			if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
  1556				_, size := utf8.DecodeRuneInString(t[1:])
  1557				return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
  1558			}
  1559			first = false
  1560	
  1561			// Look for POSIX [:alnum:] etc.
  1562			if len(t) > 2 && t[0] == '[' && t[1] == ':' {
  1563				nclass, nt, err := p.parseNamedClass(t, class)
  1564				if err != nil {
  1565					return "", err
  1566				}
  1567				if nclass != nil {
  1568					class, t = nclass, nt
  1569					continue
  1570				}
  1571			}
  1572	
  1573			// Look for Unicode character group like \p{Han}.
  1574			nclass, nt, err := p.parseUnicodeClass(t, class)
  1575			if err != nil {
  1576				return "", err
  1577			}
  1578			if nclass != nil {
  1579				class, t = nclass, nt
  1580				continue
  1581			}
  1582	
  1583			// Look for Perl character class symbols (extension).
  1584			if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
  1585				class, t = nclass, nt
  1586				continue
  1587			}
  1588	
  1589			// Single character or simple range.
  1590			rng := t
  1591			var lo, hi rune
  1592			if lo, t, err = p.parseClassChar(t, s); err != nil {
  1593				return "", err
  1594			}
  1595			hi = lo
  1596			// [a-] means (a|-) so check for final ].
  1597			if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
  1598				t = t[1:]
  1599				if hi, t, err = p.parseClassChar(t, s); err != nil {
  1600					return "", err
  1601				}
  1602				if hi < lo {
  1603					rng = rng[:len(rng)-len(t)]
  1604					return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
  1605				}
  1606			}
  1607			if p.flags&FoldCase == 0 {
  1608				class = appendRange(class, lo, hi)
  1609			} else {
  1610				class = appendFoldedRange(class, lo, hi)
  1611			}
  1612		}
  1613		t = t[1:] // chop ]
  1614	
  1615		// Use &re.Rune instead of &class to avoid allocation.
  1616		re.Rune = class
  1617		class = cleanClass(&re.Rune)
  1618		if sign < 0 {
  1619			class = negateClass(class)
  1620		}
  1621		re.Rune = class
  1622		p.push(re)
  1623		return t, nil
  1624	}
  1625	
  1626	// cleanClass sorts the ranges (pairs of elements of r),
  1627	// merges them, and eliminates duplicates.
  1628	func cleanClass(rp *[]rune) []rune {
  1629	
  1630		// Sort by lo increasing, hi decreasing to break ties.
  1631		sort.Sort(ranges{rp})
  1632	
  1633		r := *rp
  1634		if len(r) < 2 {
  1635			return r
  1636		}
  1637	
  1638		// Merge abutting, overlapping.
  1639		w := 2 // write index
  1640		for i := 2; i < len(r); i += 2 {
  1641			lo, hi := r[i], r[i+1]
  1642			if lo <= r[w-1]+1 {
  1643				// merge with previous range
  1644				if hi > r[w-1] {
  1645					r[w-1] = hi
  1646				}
  1647				continue
  1648			}
  1649			// new disjoint range
  1650			r[w] = lo
  1651			r[w+1] = hi
  1652			w += 2
  1653		}
  1654	
  1655		return r[:w]
  1656	}
  1657	
  1658	// appendLiteral returns the result of appending the literal x to the class r.
  1659	func appendLiteral(r []rune, x rune, flags Flags) []rune {
  1660		if flags&FoldCase != 0 {
  1661			return appendFoldedRange(r, x, x)
  1662		}
  1663		return appendRange(r, x, x)
  1664	}
  1665	
  1666	// appendRange returns the result of appending the range lo-hi to the class r.
  1667	func appendRange(r []rune, lo, hi rune) []rune {
  1668		// Expand last range or next to last range if it overlaps or abuts.
  1669		// Checking two ranges helps when appending case-folded
  1670		// alphabets, so that one range can be expanding A-Z and the
  1671		// other expanding a-z.
  1672		n := len(r)
  1673		for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
  1674			if n >= i {
  1675				rlo, rhi := r[n-i], r[n-i+1]
  1676				if lo <= rhi+1 && rlo <= hi+1 {
  1677					if lo < rlo {
  1678						r[n-i] = lo
  1679					}
  1680					if hi > rhi {
  1681						r[n-i+1] = hi
  1682					}
  1683					return r
  1684				}
  1685			}
  1686		}
  1687	
  1688		return append(r, lo, hi)
  1689	}
  1690	
  1691	const (
  1692		// minimum and maximum runes involved in folding.
  1693		// checked during test.
  1694		minFold = 0x0041
  1695		maxFold = 0x1e943
  1696	)
  1697	
  1698	// appendFoldedRange returns the result of appending the range lo-hi
  1699	// and its case folding-equivalent runes to the class r.
  1700	func appendFoldedRange(r []rune, lo, hi rune) []rune {
  1701		// Optimizations.
  1702		if lo <= minFold && hi >= maxFold {
  1703			// Range is full: folding can't add more.
  1704			return appendRange(r, lo, hi)
  1705		}
  1706		if hi < minFold || lo > maxFold {
  1707			// Range is outside folding possibilities.
  1708			return appendRange(r, lo, hi)
  1709		}
  1710		if lo < minFold {
  1711			// [lo, minFold-1] needs no folding.
  1712			r = appendRange(r, lo, minFold-1)
  1713			lo = minFold
  1714		}
  1715		if hi > maxFold {
  1716			// [maxFold+1, hi] needs no folding.
  1717			r = appendRange(r, maxFold+1, hi)
  1718			hi = maxFold
  1719		}
  1720	
  1721		// Brute force. Depend on appendRange to coalesce ranges on the fly.
  1722		for c := lo; c <= hi; c++ {
  1723			r = appendRange(r, c, c)
  1724			f := unicode.SimpleFold(c)
  1725			for f != c {
  1726				r = appendRange(r, f, f)
  1727				f = unicode.SimpleFold(f)
  1728			}
  1729		}
  1730		return r
  1731	}
  1732	
  1733	// appendClass returns the result of appending the class x to the class r.
  1734	// It assume x is clean.
  1735	func appendClass(r []rune, x []rune) []rune {
  1736		for i := 0; i < len(x); i += 2 {
  1737			r = appendRange(r, x[i], x[i+1])
  1738		}
  1739		return r
  1740	}
  1741	
  1742	// appendFolded returns the result of appending the case folding of the class x to the class r.
  1743	func appendFoldedClass(r []rune, x []rune) []rune {
  1744		for i := 0; i < len(x); i += 2 {
  1745			r = appendFoldedRange(r, x[i], x[i+1])
  1746		}
  1747		return r
  1748	}
  1749	
  1750	// appendNegatedClass returns the result of appending the negation of the class x to the class r.
  1751	// It assumes x is clean.
  1752	func appendNegatedClass(r []rune, x []rune) []rune {
  1753		nextLo := '\u0000'
  1754		for i := 0; i < len(x); i += 2 {
  1755			lo, hi := x[i], x[i+1]
  1756			if nextLo <= lo-1 {
  1757				r = appendRange(r, nextLo, lo-1)
  1758			}
  1759			nextLo = hi + 1
  1760		}
  1761		if nextLo <= unicode.MaxRune {
  1762			r = appendRange(r, nextLo, unicode.MaxRune)
  1763		}
  1764		return r
  1765	}
  1766	
  1767	// appendTable returns the result of appending x to the class r.
  1768	func appendTable(r []rune, x *unicode.RangeTable) []rune {
  1769		for _, xr := range x.R16 {
  1770			lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1771			if stride == 1 {
  1772				r = appendRange(r, lo, hi)
  1773				continue
  1774			}
  1775			for c := lo; c <= hi; c += stride {
  1776				r = appendRange(r, c, c)
  1777			}
  1778		}
  1779		for _, xr := range x.R32 {
  1780			lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1781			if stride == 1 {
  1782				r = appendRange(r, lo, hi)
  1783				continue
  1784			}
  1785			for c := lo; c <= hi; c += stride {
  1786				r = appendRange(r, c, c)
  1787			}
  1788		}
  1789		return r
  1790	}
  1791	
  1792	// appendNegatedTable returns the result of appending the negation of x to the class r.
  1793	func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
  1794		nextLo := '\u0000' // lo end of next class to add
  1795		for _, xr := range x.R16 {
  1796			lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1797			if stride == 1 {
  1798				if nextLo <= lo-1 {
  1799					r = appendRange(r, nextLo, lo-1)
  1800				}
  1801				nextLo = hi + 1
  1802				continue
  1803			}
  1804			for c := lo; c <= hi; c += stride {
  1805				if nextLo <= c-1 {
  1806					r = appendRange(r, nextLo, c-1)
  1807				}
  1808				nextLo = c + 1
  1809			}
  1810		}
  1811		for _, xr := range x.R32 {
  1812			lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1813			if stride == 1 {
  1814				if nextLo <= lo-1 {
  1815					r = appendRange(r, nextLo, lo-1)
  1816				}
  1817				nextLo = hi + 1
  1818				continue
  1819			}
  1820			for c := lo; c <= hi; c += stride {
  1821				if nextLo <= c-1 {
  1822					r = appendRange(r, nextLo, c-1)
  1823				}
  1824				nextLo = c + 1
  1825			}
  1826		}
  1827		if nextLo <= unicode.MaxRune {
  1828			r = appendRange(r, nextLo, unicode.MaxRune)
  1829		}
  1830		return r
  1831	}
  1832	
  1833	// negateClass overwrites r and returns r's negation.
  1834	// It assumes the class r is already clean.
  1835	func negateClass(r []rune) []rune {
  1836		nextLo := '\u0000' // lo end of next class to add
  1837		w := 0             // write index
  1838		for i := 0; i < len(r); i += 2 {
  1839			lo, hi := r[i], r[i+1]
  1840			if nextLo <= lo-1 {
  1841				r[w] = nextLo
  1842				r[w+1] = lo - 1
  1843				w += 2
  1844			}
  1845			nextLo = hi + 1
  1846		}
  1847		r = r[:w]
  1848		if nextLo <= unicode.MaxRune {
  1849			// It's possible for the negation to have one more
  1850			// range - this one - than the original class, so use append.
  1851			r = append(r, nextLo, unicode.MaxRune)
  1852		}
  1853		return r
  1854	}
  1855	
  1856	// ranges implements sort.Interface on a []rune.
  1857	// The choice of receiver type definition is strange
  1858	// but avoids an allocation since we already have
  1859	// a *[]rune.
  1860	type ranges struct {
  1861		p *[]rune
  1862	}
  1863	
  1864	func (ra ranges) Less(i, j int) bool {
  1865		p := *ra.p
  1866		i *= 2
  1867		j *= 2
  1868		return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
  1869	}
  1870	
  1871	func (ra ranges) Len() int {
  1872		return len(*ra.p) / 2
  1873	}
  1874	
  1875	func (ra ranges) Swap(i, j int) {
  1876		p := *ra.p
  1877		i *= 2
  1878		j *= 2
  1879		p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
  1880	}
  1881	
  1882	func checkUTF8(s string) error {
  1883		for s != "" {
  1884			rune, size := utf8.DecodeRuneInString(s)
  1885			if rune == utf8.RuneError && size == 1 {
  1886				return &Error{Code: ErrInvalidUTF8, Expr: s}
  1887			}
  1888			s = s[size:]
  1889		}
  1890		return nil
  1891	}
  1892	
  1893	func nextRune(s string) (c rune, t string, err error) {
  1894		c, size := utf8.DecodeRuneInString(s)
  1895		if c == utf8.RuneError && size == 1 {
  1896			return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
  1897		}
  1898		return c, s[size:], nil
  1899	}
  1900	
  1901	func isalnum(c rune) bool {
  1902		return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
  1903	}
  1904	
  1905	func unhex(c rune) rune {
  1906		if '0' <= c && c <= '9' {
  1907			return c - '0'
  1908		}
  1909		if 'a' <= c && c <= 'f' {
  1910			return c - 'a' + 10
  1911		}
  1912		if 'A' <= c && c <= 'F' {
  1913			return c - 'A' + 10
  1914		}
  1915		return -1
  1916	}
  1917	

View as plain text