package syntax

import (
	"bytes"
	"fmt"
	"math"
)

// Similar to prog.go in the Go regexp package, which also carries the
// comment 'may not belong in this package'.

// This file provides operator constants for use by the Builder and the Machine.

// Implementation notes:
//
// Regexps are built into RegexCodes, which contain an operation array,
// a string table, and some constants.
//
// Each operation is one of the codes below, followed by the integer
// operands specified for each op.
//
// Strings and sets are indices into a string table.

// InstOp is an instruction opcode as stored in Code.Codes. The low bits
// (see Mask) select the operator; the Rtl, Back, Back2 and Ci bits may be
// OR-ed in as modifiers.
type InstOp int

// Opcode table. Only Onerep is declared with the InstOp type; the remaining
// names are untyped integer constants, so they can be used both as InstOp
// values and as plain ints.
//
// Trailing comment columns: lef/back flags, operands, description.
const (
	Onerep      InstOp = 0  // lef,back char,min,max a {n}
	Notonerep          = 1  // lef,back char,min,max .{n}
	Setrep             = 2  // lef,back set,min,max  [\d]{n}
	Oneloop            = 3  // lef,back char,min,max a {,n}
	Notoneloop         = 4  // lef,back char,min,max .{,n}
	Setloop            = 5  // lef,back set,min,max  [\d]{,n}
	Onelazy            = 6  // lef,back char,min,max a {,n}?
	Notonelazy         = 7  // lef,back char,min,max .{,n}?
	Setlazy            = 8  // lef,back set,min,max  [\d]{,n}?
	One                = 9  // lef      char         a
	Notone             = 10 // lef      char         [^a]
	Set                = 11 // lef      set          [a-z\s] \w \s \d
	Multi              = 12 // lef      string       abcd
	Ref                = 13 // lef      group        \#
	Bol                = 14 //                       ^
	Eol                = 15 //                       $
	Boundary           = 16 //                       \b
	Nonboundary        = 17 //                       \B
	Beginning          = 18 //                       \A
	Start              = 19 //                       \G
	EndZ               = 20 //                       \Z
	End                = 21 //                       \z
	Nothing            = 22 //                       Reject!

	// Primitive control structures
	Lazybranch      = 23 // back jump       straight first
	Branchmark      = 24 // back jump       branch first for loop
	Lazybranchmark  = 25 // back jump       straight first for loop
	Nullcount       = 26 // back val        set counter, null mark
	Setcount        = 27 // back val        set counter, make mark
	Branchcount     = 28 // back jump,limit branch++ if zero<=c<limit
	Lazybranchcount = 29 // back jump,limit same, but straight first
	Nullmark        = 30 // back            save position
	Setmark         = 31 // back            save position
	Capturemark     = 32 // back group      define group
	Getmark         = 33 // back            recall position
	Setjump         = 34 // back            save backtrack state
	Backjump        = 35 //                 zap back to saved state
	Forejump        = 36 //                 zap backtracking state
	Testref         = 37 //                 backtrack if ref undefined
	Goto            = 38 //      jump       just go
	Prune           = 39 //                 prune it baby
	Stop            = 40 //                 done!
	ECMABoundary    = 41 //                 \b
	NonECMABoundary = 42 //                 \B

	// Modifiers for alternate modes
	Mask  = 63  // Mask to get unmodified ordinary operator
	Rtl   = 64  // bit to indicate that we're reverse scanning.
	Back  = 128 // bit to indicate that we're backtracking.
	Back2 = 256 // bit to indicate that we're backtracking on a second branch.
	Ci    = 512 // bit to indicate that we're case-insensitive.
)

// Code is a compiled regex program: the instruction stream plus the tables
// and metadata the instructions refer to.
type Code struct {
	Codes       []int       // the code
	Strings     [][]rune    // string table
	Sets        []*CharSet  // character set table
	TrackCount  int         // how many instructions use backtracking
	Caps        map[int]int // mapping of user group numbers -> impl group slots
	Capsize     int         // number of impl group slots
	FcPrefix    *Prefix     // the set of candidate first characters (may be nil)
	BmPrefix    *BmPrefix   // the fixed prefix string as a Boyer-Moore machine (may be nil)
	Anchors     AnchorLoc   // the set of zero-length start anchors (RegexFCD.Bol, etc)
	RightToLeft bool        // true if right to left
}

// opcodeBacktracks reports whether op, with its modifier bits stripped, is
// one of the opcodes treated as backtracking. OpcodeDescription uses it to
// flag such instructions with a '*'.
func opcodeBacktracks(op InstOp) bool {
	op &= Mask

	switch op {
	case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy,
		Lazybranch, Branchmark, Lazybranchmark, Nullcount, Setcount,
		Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark,
		Setjump, Backjump, Forejump, Goto:
		return true

	default:
		return false
	}
}

// opcodeSize returns how many ints the instruction occupies in Code.Codes:
// the opcode itself plus its operands. Modifier bits on op are ignored.
// It panics if the masked opcode is not one of the known operators.
func opcodeSize(op InstOp) int {
	op &= Mask

	switch op {
	// Opcode only, no operands.
	case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary,
		Beginning, Start, EndZ, End, Nullmark, Setmark, Getmark, Setjump,
		Backjump, Forejump, Stop:
		return 1

	// Opcode plus one operand.
	case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount,
		Lazybranch, Branchmark, Lazybranchmark, Prune, Set:
		return 2

	// Opcode plus two operands.
	case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep,
		Oneloop, Notoneloop, Onelazy, Notonelazy, Setlazy, Setrep, Setloop:
		return 3

	default:
		panic(fmt.Errorf("Unexpected op code: %v", op))
	}
}

// codeStr holds the display name of each operator, indexed by its opcode
// value (Onerep = 0 through NonECMABoundary = 42).
var codeStr = []string{
	"Onerep", "Notonerep", "Setrep",
	"Oneloop", "Notoneloop", "Setloop",
	"Onelazy", "Notonelazy", "Setlazy",
	"One", "Notone", "Set",
	"Multi", "Ref",
	"Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End",
	"Nothing",
	"Lazybranch", "Branchmark", "Lazybranchmark",
	"Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
	"Nullmark", "Setmark", "Capturemark", "Getmark",
	"Setjump", "Backjump", "Forejump", "Testref", "Goto",
	"Prune", "Stop",
	"ECMABoundary", "NonECMABoundary",
}

// operatorDescription returns the operator name for op followed by a
// "-Ci", "-Rtl", "-Back" and/or "-Back2" suffix for each modifier bit set.
// A masked opcode with no entry in codeStr is rendered as "Unknown(n)"
// rather than indexing past the end of the table.
func operatorDescription(op InstOp) string {
	var desc string
	// Mask admits values up to 63 but codeStr only names 0..42.
	if base := int(op & Mask); base < len(codeStr) {
		desc = codeStr[base]
	} else {
		desc = fmt.Sprintf("Unknown(%d)", base)
	}

	if (op & Ci) != 0 {
		desc += "-Ci"
	}
	if (op & Rtl) != 0 {
		desc += "-Rtl"
	}
	if (op & Back) != 0 {
		desc += "-Back"
	}
	if (op & Back2) != 0 {
		desc += "-Back2"
	}

	return desc
}

// OpcodeDescription returns a human readable string for the instruction
// that starts at the given offset in c.Codes: the zero-padded offset, a '*'
// if the opcode backtracks, the operator name with modifiers, and its
// operands in parentheses. offset must index the first int of an
// instruction whose operands are all present in c.Codes.
func (c *Code) OpcodeDescription(offset int) string {
	buf := &bytes.Buffer{}

	op := InstOp(c.Codes[offset])
	fmt.Fprintf(buf, "%06d ", offset)

	if opcodeBacktracks(op & Mask) {
		buf.WriteString("*")
	} else {
		buf.WriteString(" ")
	}
	buf.WriteString(operatorDescription(op))
	buf.WriteString("(")

	// From here on only the bare operator matters.
	op &= Mask

	// First operand.
	switch op {
	case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy:
		buf.WriteString("Ch = ")
		buf.WriteString(CharDescription(rune(c.Codes[offset+1])))

	case Set, Setrep, Setloop, Setlazy:
		buf.WriteString("Set = ")
		buf.WriteString(c.Sets[c.Codes[offset+1]].String())

	case Multi:
		fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]]))

	case Ref, Testref:
		fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])

	case Capturemark:
		fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
		// -1 in the second operand means there is no Unindex to show.
		if c.Codes[offset+2] != -1 {
			fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2])
		}

	case Nullcount, Setcount:
		fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1])

	case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount:
		fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1])
	}

	// writeCount prints label followed by the second operand, rendering
	// math.MaxInt32 as "inf".
	writeCount := func(label string) {
		buf.WriteString(label)
		if v := c.Codes[offset+2]; v == math.MaxInt32 {
			buf.WriteString("inf")
		} else {
			fmt.Fprintf(buf, "%d", v)
		}
	}

	// Second operand, for the opcodes that carry a count or limit.
	switch op {
	case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy:
		writeCount(", Rep = ")

	case Branchcount, Lazybranchcount:
		writeCount(", Limit = ")
	}

	buf.WriteString(")")

	return buf.String()
}

// Dump returns a multi-line, human readable listing of the program: scan
// direction, first-character set, prefix, anchors, the Boyer-Moore table
// when BmPrefix is set, and then one OpcodeDescription line per instruction.
// It panics (via opcodeSize) if c.Codes contains an unknown opcode.
func (c *Code) Dump() string {
	buf := &bytes.Buffer{}

	if c.RightToLeft {
		fmt.Fprintln(buf, "Direction: right-to-left")
	} else {
		fmt.Fprintln(buf, "Direction: left-to-right")
	}

	if c.FcPrefix == nil {
		fmt.Fprintln(buf, "Firstchars: n/a")
	} else {
		fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String())
	}

	if c.BmPrefix == nil {
		fmt.Fprintln(buf, "Prefix: n/a")
	} else {
		fmt.Fprintf(buf, "Prefix: %v\n", Escape(c.BmPrefix.String()))
	}

	fmt.Fprintf(buf, "Anchors: %v\n", c.Anchors)
	fmt.Fprintln(buf)

	if c.BmPrefix != nil {
		fmt.Fprintln(buf, "BoyerMoore:")
		fmt.Fprintln(buf, c.BmPrefix.Dump(" "))
	}

	// Instructions are variable length; step by each opcode's own size.
	for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) {
		fmt.Fprintln(buf, c.OpcodeDescription(i))
	}

	return buf.String()
}