This module is rated as beta, and is ready for widespread use. It is still new and should be used with some caution to ensure the results are as expected. |
This is a port to Scribunto lua of lpeglabel, which itself is a minor extension of the lua lpeg parser generator. The API should be identical to that of lpeglabel, except that the regular expressions are "natively unicode"; that is, the .
pattern matches a single unicode codepoint, not a single byte as in the original.
These files have been packaged using a script to turn them into a single lua file.
Usage
edit{{#invoke:User:Cscott/llpeg|function_name}}
return (function()
local builders = {}
local function register(name, f)
builders[name] = f
end
register('advent.compat', function() return require [[Module:User:Cscott/compat]] end)
register('llpeg.types', function(myrequire)
myrequire('strict')
local compat = myrequire('advent.compat')
local CHARMAX = 0x7F -- maximum codepoint for charsets
-- metatable for pattern objects; will be filled in later
local metareg = {}
local enum = function(keys)
local Enum = {}
Enum.__index = Enum
function Enum:__tostring() return self.name end
function Enum:pairs() return keys end
function Enum:type() return Enum end
for name, value in pairs(keys) do
Enum[name] = setmetatable({ name = name, value = value }, Enum)
end
return Enum
end
local CapKind = enum{
close = "close", -- not used in trees */
position = "position",
const = "constant", -- ktable[key] is Lua constant
backref = "backref", -- ktable[key] is "name" of group to get capture
arg = "argument", -- 'key' is arg's number
simple = "simple", -- next node is pattern
table = "table", -- next node is pattern
["function"] = "function", -- ktable[key] is function; next node is pattern
acc = "acc", -- ktable[key] is function; next node is pattern
query = "query", -- ktable[key] is table; next node is pattern
string = "string", -- ktable[key] is string; next node is pattern
num = "num", -- numbered capture; 'key' is number of value to return
subst = "substitution", -- substitution capture; next node is pattern
fold = "fold", -- ktable[key] is function; next node is pattern
runtime = "runtime", -- not used in trees (is uses another type for tree)
group = "group", -- ktable[key] is group's "name"
}
local TTag = enum{
Char = "char", -- 'n' has unicode codepoint
Set = "set", -- 'set' has sparse array codepoint->true for codepoint <=CHARMAX
-- 'rest' indicates whether all codepoints > CHARMAX should be
-- part of the set (true) or not (false)
Any = "any",
True = "true",
False = "false",
UTFR = "utf8.range", --[[ range of UTF-8 codepoints;
'from' has initial codepoint; 'to' has final codepoint ]]--
Rep = "rep", -- 'sib1' *
Seq = "seq", -- 'sib1' 'sib2'
Choice = "choice", -- 'sib1' / 'sib2'
Not = "not", -- !'sib1'
And = "and", -- &'sib1'
Call = "call", -- 'sib2' is rule being called; otherwise same as TOpenCall
OpenCall = "opencall", -- 'key' is rule name
Rule = "rule", --[[ 'key' is rule name (but key == nil for unused rules);
'sib1' is rule's pattern pre-rule; 'sib2' is next rule;
'n' is rule's sequential number, 'name' is rule name (even
for unused rules) ]]--
XInfo = "xinfo", -- extra info (not used)
Grammar = "grammar", -- 'sib1' is initial (and first) rule, 'n' is # rules
Behind = "behind", -- 'sib1' is pattern, 'n' is how much to go back
Capture = "capture", --[[ captures: 'cap' is kind of capture (enum 'CapKind');
'key' is Lua value associated with capture;
'sib1' is capture body ]]--
RunTime = "run-time", --[[ run-time capture: 'key' is Lua function;
'sib1' is capture body ]]--
Throw = "throw", -- labeled failure: 'key' is label's name,
-- sib2 is associated recovery rule
}
local PE = enum{
nullable = "nullable",
nofail = "nofail",
}
-- virtual machine instructions
local Opcode = enum{
Any = "any", -- if no char, fail
Char = "char", -- if char != aux, fail
Set = "set", -- if char not in buff, fail
TestAny = "testany", -- in no char, jump to 'offset'
TestChar = "testchar", -- if char != aux, jump to 'offset'
TestSet = "testset", -- if char not in buff, jump to 'offset'
Span = "span", -- read a span of chars in buff
UTFR = "utf-range", -- if codepoint not in range [offset, utf_to], fail
Behind = "behind", -- walk back 'aux' characters (fail if not possible)
Ret = "ret", -- return from a rule
End = "end", -- end of pattern
Choice = "choice", -- stack a choice; next fail will jump to 'offset'
PredChoice = "pred_choice", -- labeled failure: stack a choice; changes label env next fail will jump to 'offset'
Jmp = "jmp", -- jump to 'offset'
Call = "call", -- call rule at 'offset'
OpenCall = "open_call", -- call rule number 'key' (must be closed to a ICall)
Commit = "commit", -- pop choice and jump to 'offset'
PartialCommit = "partial_commit", -- update top choice to current position and jump
BackCommit = "back_commit", -- backtrack like "fail" but jump to its own 'offset'
FailTwice = "failtwice", -- pop one choice and then fail
Fail = "fail", -- go back to saved state on choice and jump to saved offset
Giveup = "giveup", -- internal use
FullCapture = "fullcapture", -- complete capture of last 'off' chars
OpenCapture = "opencapture", -- start a capture
CloseCapture = "closecapture",
CloseRunTime = "closeruntime",
Throw = "throw", -- fails with a given label --labeled failure
ThrowRec = "throw_rec", -- fails with a given label and call rule at 'offset' --labeled failure
Empty = "--", -- to fill empty slots left by optimizations
}
-- helper for visitor pattern definitions
function define(dispatch, which, f)
for _,v in pairs(which) do
assert(v ~= nil) -- catch typos
dispatch[v] = f
end
end
local numsiblings = {}
define(numsiblings, {
TTag.Char, TTag.Set, TTag.Any,
TTag.True, TTag.False, TTag.UTFR,
TTag.Call, TTag.OpenCall,
TTag.Throw,
}, 0)
define(numsiblings, {
TTag.Rep, TTag.Not, TTag.And, TTag.Grammar,
TTag.Behind, TTag.Capture, TTag.RunTime,
}, 1)
define(numsiblings, {
TTag.Seq, TTag.Choice, TTag.Rule,
}, 2)
-- more help for visitor functions
local function_name_registry = {}
function register_fname(name, f)
assert(type(name) == "string")
assert(type(f) == "function")
function_name_registry[f] = name
end
function report_ferror(f, msg)
local fname = function_name_registry[f]
if fname ~= nil then
msg = fname .. ": " .. msg
end
error(msg)
end
function define_type_visitor(tbl)
local dispatch = {}
for keys,func in pairs(tbl) do
if type(keys) ~= "table" then
keys = { keys }
end
define(dispatch, keys, func)
end
local visit
visit = function(val, ...)
local a = dispatch["assert"]
if a ~= nil then a(val, ...) end -- assert preconditions
local ty = type(val)
if ty == 'table' and getmetatable(val) == metareg then
ty = 'pattern'
end
local f = dispatch[ty]
if f ~= nil then return f(val, ...) end
f = dispatch.default
if f == nil then
report_ferror(visit, "no default for " .. ty)
end
return f(val, ...)
end
return visit
end
function define_tree_visitor(tbl, opt_name)
local dispatch = {}
for keys,func in pairs(tbl) do
if type(keys) ~= "table" or getmetatable(keys) == TTag then
keys = { keys }
end
define(dispatch, keys, func)
end
local visit
visit = function(tree, ...)
if tree == nil then report_ferror(visit, "nil tree") end
local a = dispatch["assert"]
if a ~= nil then a(tree, ...) end -- assert preconditions
local f = dispatch[tree.tag]
if f ~= nil then return f(tree, ...) end
f = dispatch.default
if f == nil then
report_ferror(visit, "no default for " .. tree.tag)
end
return f(tree, ...)
end
return visit
end
function define_opcode_visitor(tbl)
local dispatch = {}
for keys,func in pairs(tbl) do
if type(keys) ~= "table" or getmetatable(keys) == Opcode then
keys = { keys }
end
define(dispatch, keys, func)
end
local visit
visit = function(op, ...)
if op == nil then report_ferror(visit, "nil op") end
local a = dispatch["assert"]
if a ~= nil then a(op, ...) end -- assert preconditions
local f = dispatch[op.code]
if f ~= nil then return f(op, ...) end
f = dispatch.default
if f == nil then
report_ferror(visit, "no default for " .. op.code)
end
return f(op, ...)
end
return visit
end
-- helper for module imports
function from(mod, list)
local result = {}
for _,v in ipairs(list) do
table.insert(result, mod[v])
end
return compat.unpack(result)
end
function newcharset()
return setmetatable({
tag = TTag.Set,
code = nil,
rest = false,
set = {}
}, metareg)
end
local fullset = newcharset()
for i = 0,CHARMAX do
fullset.set[i] = true
end
fullset.rest = true -- make sure non-ascii unicode chars are included!
assert(fullset.tag == TTag.Set)
return {
CHARMAX = CHARMAX,
CapKind = CapKind,
Opcode = Opcode,
PE = PE,
TTag = TTag,
define = define,
define_tree_visitor = define_tree_visitor,
enum = enum,
from = from,
fullset = fullset,
metareg = metareg,
newcharset = newcharset,
numsiblings = numsiblings,
register_fname = register_fname,
}
end)
register('llpeg.print', function(myrequire)
myrequire('strict')
local compat = myrequire('advent.compat')
local from = myrequire('llpeg.types').from
local
CHARMAX,
CapKind,
Opcode,
TTag,
define,
define_tree_visitor,
numsiblings,
_ = from(myrequire('llpeg.types'), {
'CHARMAX',
'CapKind',
'Opcode',
'TTag',
'define',
'define_tree_visitor',
'numsiblings',
})
function printcharset(tree)
local result = "["
local i = 0
while i <= CHARMAX do
local first = i
while tree.set[i] and i <= CHARMAX do
i = i + 1
end
if first == (i - 1) then -- unary range
result = result .. string.format("(%02x)", first)
elseif first < (i-1) then -- non-empty range
result = result .. string.format("(%02x-%02x)", first, i - 1)
end
i = i + 1
end
if tree.rest then
result = result .. "(80-FFFF)"
end
return result .. "]"
end
function printjmp(op, pc)
return "-> " .. op.target
end
local printinst_helper = define_opcode_visitor{
[Opcode.Char] = function(op, pc)
return string.format("'%c' (%02x)", op.aux, op.aux)
end,
[Opcode.TestChar] = function(op, pc)
return string.format("'%c' (%02x)", op.aux, op.aux) .. printjmp(op, pc)
end,
[Opcode.UTFR] = function(op, pc)
return string.format("%d - %d", op.from, op.to)
end,
[Opcode.FullCapture] = function(op, pc)
return string.format("%s (size = %s) (idx = %s)",
op.cap.value, op.aux, op.key)
end,
[Opcode.OpenCapture] = function(op, pc)
return string.format("%s (idx = %s)",
op.cap.value, op.key)
end,
[Opcode.Set] = function(op, pc)
return printcharset(op)
end,
[Opcode.TestSet] = function(op, pc)
return printcharset(op) .. printjmp(op, pc)
end,
[Opcode.Span] = function(op, pc)
return printcharset(op)
end,
[Opcode.OpenCall] = function(op, pc)
return string.format("-> %d", op.target) -- rule number
end,
[Opcode.Behind] = function(op, pc)
return string.format("%d", op.aux)
end,
[{Opcode.Jmp, Opcode.Call, Opcode.Commit, Opcode.Choice,
Opcode.PartialCommit, Opcode.BackCommit, Opcode.TestAny,
Opcode.PredChoice}] = function(op, pc)
return printjmp(op, pc)
end,
[Opcode.Throw] = function(op, pc) -- labeled failure
return string.format("(idx = %s)", op.key)
end,
[Opcode.ThrowRec] = function(op, pc)
return printjmp(op, pc) .. string.format("(idx = %s)", op.key)
end,
default = function() return '' end,
}
function printinst(pc, op, accum)
table.insert(accum, string.format("%02d: %s ", pc, op.code.value))
table.insert(accum, printinst_helper(op, pc))
table.insert(accum, "\n")
return accum
end
function printpatt(code, accum)
for pc,op in ipairs(code) do
printinst(pc, op, accum)
end
return accum
end
function printcap(cap, indent)
print(string.format("%s%s", string.rep(' ', indent), cap))
end
function printcap2close(captures, ncaptures, i, indent)
local head = captures[i]
i = i + 1
printcap(head, indent) -- print head capture
while i <= ncaptures and head:inside(captures[i]) do
i = printcap2close(captures, ncaptures, i, indent + 2) -- print nested captures
end
if i <= ncaptures and head:isopencap() then
assert(captures[i]:isclosecap())
printcap(captures[i], indent) -- print and skip close capture
i = i + 1
end
return i
end
function printcaplist(captures, ncaptures)
-- for debugging, first print a raw list of captures
if ncaptures == nil then ncaptures = #captures end
for i=1,ncaptures do
printcap(captures[i], 0)
end
print(">======");
local i=1
while i <= ncaptures and not captures[i]:isclosecap() do
i = printcap2close(captures, ncaptures, i, 0)
end
if i > ncaptures then
print("<unmatched>")
end
print("=======");
end
local printtree_helper = define_tree_visitor{
[TTag.Char] = function(tree)
local c = compat.utf8char(tree.n)
if c:find("%C") ~= nil then -- printable?
return " '" .. c .. "'"
else
return string.format(" (%02X)", tree.n)
end
end,
[TTag.Set] = function(tree)
return printcharset(tree)
end,
[TTag.UTFR] = function(tree)
return " " .. tree.from .. " - " .. tree.to
end,
[{TTag.OpenCall, TTag.Call}] = function(tree)
local ret = string.format(" key: %s", tree.key)
local rule = tree.sib2
if rule ~= nil then
ret = ret .. " (rule: " .. rule.n .. ")"
end
return ret
end,
[TTag.Behind] = function(tree)
return " " .. tree.n
end,
[TTag.Capture] = function(tree)
return string.format(" kind: '%s' key: %s", tree.cap.value, tree.key)
end,
[TTag.Rule] = function(tree)
return string.format(" key: %s", tree.key)
end,
[TTag.XInfo] = function(tree)
return " n: " .. tree.n
end,
[TTag.Grammar] = function(tree)
return " " .. tree.n -- number of rules
end,
[TTag.Throw] = function(tree)
return string.format(" key: %s", tree.key)
end,
default = function(tree) return '' end
}
function printtree(tree, indent, accum)
local sibs = numsiblings[tree.tag]
table.insert(accum, string.rep(' ', indent))
table.insert(accum, tree.tag.value)
table.insert(accum, printtree_helper(tree))
table.insert(accum, "\n")
if tree.tag == TTag.Rule then
sibs = 1 -- don't print sib2
elseif tree.tag == TTag.Grammar then
local rule = tree.sib1
for i=1,tree.n do
printtree(rule, indent + 2, accum)
rule = rule.sib2
end
sibs = 0 -- siblings already handled
end
if sibs >= 1 then
printtree(tree.sib1, indent + 2, accum)
if sibs >= 2 then
printtree(tree.sib2, indent + 2, accum)
end
end
return accum
end
local PREFIX = "" -- could also be "l." or "lpeg." etc
local printrepl_helper
printrepl_helper = define_tree_visitor{
[TTag.True] = function(tree, buf)
table.insert(buf, PREFIX .. 'P""')
end,
[TTag.Any] = function(tree, buf)
table.insert(buf, PREFIX .. 'P(1)')
end,
[TTag.Char] = function(tree, buf)
table.insert(buf, PREFIX .. 'P"')
local c = compat.utf8char(tree.n)
if c:find("%C") ~= nil then -- printable?
table.insert(buf, c)
else
table.insert(buf, string.format('\\%02X', tree.n))
end
table.insert(buf, '"')
end,
[TTag.Set] = function(tree, buf)
local nbuf = {}
local insertchar = function(cp)
local c = compat.utf8char(cp)
if string.find(c, "^[^%w%p ]") ~= nil then
table.insert(nbuf, string.format('\\x%02X', cp))
else
table.insert(nbuf, c)
end
end
local nargs = 0
local inserttwo = function(cp1, cp2)
if nargs > 0 then table.insert(nbuf, ',') end
nargs = nargs + 1
table.insert(nbuf, '"')
insertchar(cp1)
insertchar(cp2)
table.insert(nbuf, '"')
end
local i = 0
while i <= CHARMAX do
local first = i
while tree.set[i] and i <= CHARMAX do
i = i + 1
end
if first == (i - 1) then -- unary range
inserttwo(first, first)
elseif first < (i-1) then -- non-empty range
inserttwo(first, i-1)
end
i = i + 1
end
local r = table.concat(nbuf)
if nargs == 1 then
r = PREFIX .. 'S' .. r
else
r = PREFIX .. 'S(' .. r .. ')'
end
if tree.rest then
table.insert(buf, '(')
table.insert(buf, r)
table.insert(buf, ' + ')
table.insert(buf, PREFIX)
table.insert(buf, 'utfR(0x80, 0x10FFFF))')
else
table.insert(buf, r)
end
end,
[TTag.UTFR] = function(tree, buf)
table.insert(buf, string.format("%sutfR(0x%04X, 0x%04X)", PREFIX, tree.from, tree.to))
end,
[{TTag.OpenCall, TTag.Call}] = function(tree, buf)
table.insert(buf, string.format('%sV"%s"', PREFIX, tree.key))
end,
[TTag.Not] = function(tree, buf)
table.insert(buf, '-(')
printrepl_helper(tree.sib1, buf)
table.insert(buf, ')')
end,
[TTag.Seq] = function(tree, buf)
table.insert(buf, "(")
printrepl_helper(tree.sib1, buf)
table.insert(buf, " * ")
printrepl_helper(tree.sib2, buf)
table.insert(buf, ")")
end,
[TTag.Choice] = function(tree, buf)
table.insert(buf, "(")
printrepl_helper(tree.sib1, buf)
table.insert(buf, " + ")
printrepl_helper(tree.sib2, buf)
table.insert(buf, ")")
end,
[TTag.Rep] = function(tree, buf)
printrepl_helper(tree.sib1, buf)
table.insert(buf, "^0")
end,
--[[
[TTag.Behind] = function(tree)
return " " .. tree.n
end,
]]--
[TTag.Capture] = function(tree, buf)
local repl = define_type_visitor{
string = function(v)
return '"' .. v .. '"' -- xxx should handle escapes
end,
default = tostring,
}
local name = nil
local show_patt = false
local show_key = false
if tree.cap == CapKind.simple then
name = 'C'
show_patt = true
elseif tree.cap == CapKind.subst then
name = 'Cs'
show_patt = true
elseif tree.cap == CapKind.table then
name = 'Ct'
show_patt = true
elseif tree.cap == CapKind.pos then
name = 'Cp'
elseif tree.cap == CapKind.arg then
name = 'Carg'
show_key = true
elseif tree.cap == CapKind.backref then
name = 'Cb'
show_key = true
elseif tree.cap == CapKind.group then
name = 'Cg'
show_patt = true
show_key = (tree.key ~= nil)
end
if name ~= nil then
table.insert(buf, PREFIX)
table.insert(buf, name)
table.insert(buf, '(')
if show_patt then
printrepl_helper(tree.sib1, buf)
if show_key then
table.insert(buf, ', ')
end
end
if show_key then
table.insert(buf, repl(tree.key))
end
table.insert(buf, ')')
return
end
if tree.cap == CapKind.string or
tree.cap == CapKind.num or
tree.cap == CapKind.query or
tree.cap == CapKind['function'] then
printrepl_helper(tree.sib1, buf)
table.insert(buf, ' / ')
table.insert(buf, repl(tree.key))
return
end
-- fallback
table.insert(buf, string.format("<pattern %s>", tostring(tree.tag)))
end,
[TTag.Rule] = function(tree, buf)
local key = tree.name
if type(key) == 'number' then key = string.format("[%d]", key) end
table.insert(buf, key)
table.insert(buf, " = ")
printrepl_helper(tree.sib1, buf)
end,
[TTag.Grammar] = function(tree, buf)
table.insert(buf, PREFIX .. "P{")
local rule = tree.sib1
local r = {}
local first_rule_name = rule.name
r[first_rule_name] = rule
rule = rule.sib2
local names = {}
for i=2,tree.n do
table.insert(names, rule.name)
r[rule.name] = rule
rule = rule.sib2
end
-- sort rule names
table.sort(names)
table.insert(names, 1, first_rule_name)
-- now print in order
for _,name in ipairs(names) do
printrepl_helper(r[name], buf)
table.insert(buf, ", ")
end
table.insert(buf, "}")
end,
--[[
[TTag.Throw] = function(tree)
return " key: " .. tree.key .. " (rule: " .. tree.sib2.cap .. ")"
end,
]]--
default = function(tree, buf)
table.insert(buf, string.format("<pattern %s>", tostring(tree.tag)))
end,
}
function printrepl(tree)
local buf = {}
printrepl_helper(tree, buf)
return table.concat(buf)
end
return {
printcaplist = printcaplist,
printcharset = printcharset,
printinst = printinst,
printpatt = printpatt,
printrepl = printrepl,
printtree = printtree,
}
end)
register('llpeg.code', function(myrequire)
myrequire('strict')
local compat = myrequire('advent.compat')
local from = myrequire('llpeg.types').from
local
CHARMAX,
CapKind,
Opcode,
PE,
TTag,
define,
define_tree_visitor,
fullset,
newcharset,
numsiblings,
register_fname,
_ = from(myrequire('llpeg.types'), {
'CHARMAX',
'CapKind',
'Opcode',
'PE',
'TTag',
'define',
'define_tree_visitor',
'fullset',
'newcharset',
'numsiblings',
'register_fname',
})
local printinst = myrequire('llpeg.print').printinst
local TRACE_INSTRUCTIONS = false
-- signals a "no-instruction"
local NOINST = nil
-- don't optimize captures longer than this
local MAXOFF = 15
-- forward declarations
local codegen
local CompileState = {}
CompileState.__index = CompileState
--[[
** {======================================================
** Analysis and some optimizations
** =======================================================
]]--
--[[
** Check whether a charset is empty (returns IFail), singleton (IChar),
** full (IAny), or none of those (ISet). When singleton, '*c' returns
** which character it is. (When generic set, the set was the input,
** so there is no need to return it.)
]]--
function charsettype(cs)
local count = 0
local candidate
for i,_ in pairs(cs.set) do
candidate = i
count = count + 1
end
if cs.rest then
if count == (CHARMAX + 1) then
return Opcode.Any -- full set
end
elseif count == 0 then
return Opcode.Fail -- empty set
elseif count == 1 then
return Opcode.Char, candidate -- single char
end
return Opcode.Set -- neither full nor empty nor singleton
end
-- A few basic operations on charsets; returns new object
function cs_clone(cs)
local result = newcharset()
for i,_ in pairs(cs.set) do
result.set[i] = true
end
result.rest = cs.rest
return result
end
function cs_complement(cs)
local result = newcharset()
for i=0,CHARMAX do
if not cs.set[i] then
result.set[i] = true
end
end
result.rest = not cs.rest
return result
end
function cs_intersection(a, b)
local result = newcharset()
for i,_ in pairs(a.set) do
if a.set[i] and b.set[i] then
result.set[i] = true
end
end
result.rest = a.rest and b.rest
return result
end
function cs_union(a, b)
local result = newcharset()
for i=0,CHARMAX do
if a.set[i] or b.set[i] then
result.set[i] = true
end
end
result.rest = a.rest or b.rest
return result
end
function cs_diff(a, b)
local result = newcharset()
for i=0,CHARMAX do
if a.set[i] and not b.set[i] then
result.set[i] = true
end
end
result.rest = a.rest and not b.rest
return result
end
function cs_disjoint(a, b)
if a.rest == b.rest then return false end
for i,_ in pairs(a.set) do
if b.set[i] then return false end
end
for i,_ in pairs(b.set) do
if a.set[i] then return false end
end
return true
end
function cs_equal(a, b)
if a.rest ~= b.rest then return false end
for i,_ in pairs(a.set) do
if not b.set[i] then return false end
end
for i,_ in pairs(b.set) do
if not a.set[i] then return false end
end
return true
end
--[[
** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a
** charset and return it; else return nil.
]]--
local tocharset = define_tree_visitor{
[TTag.Set] = function(v)
return v -- copy set
end,
[TTag.Char] = function(v)
-- only one char
if v.n <= CHARMAX then
local t = newcharset()
t.set[v.n] = true
return t
else
return nil
end
end,
[TTag.Any] = function(v)
return fullset
end,
[TTag.False] = function(v)
return newcharset()
end,
default = function(v)
return nil
end,
}
register_fname("tocharset", tocharset)
--[[
** Visit a TCall node taking care to stop recursion. If node not yet
** visited, return 'f(rule for call)', otherwise return 'def' (default
** value)
]]--
function CompileState:callrecursive(tree, f, default_value, ...)
if tree.tag ~= TTag.Call then
error("unexpected tree tag")
end
local rule = self.grammar.ruletab[tree.key]
if rule.tag ~= TTag.Rule then
error("unexpected tree sibling")
end
if tree.seen == true then
return default_value -- node already visited
else
-- first visit
local oldseen = tree.seen
tree.seen = true
local result = f(rule, ...)
tree.seen = oldseen -- restore tree
return result
end
end
--[[
** Check whether a pattern tree has captures
]]--
local hascaptures
hascaptures = define_tree_visitor{
[{TTag.Capture, TTag.RunTime}] = function(tree, cs)
return true
end,
[TTag.Call] = function(tree, cs)
assert(cs ~= nil)
return cs:callrecursive(tree, hascaptures, false, cs)
end,
[TTag.Rule] = function(tree, cs)
-- do not follow siblings
return hascaptures(tree.sib1, cs)
end,
[TTag.OpenCall] = function(tree, cs)
error("should not happen")
end,
[TTag.Grammar] = function(tree, cs)
-- make a fake compile state to hold the grammar, if necessary
if cs == nil then cs = CompileState:new(nil) end
return cs:withGrammar(tree, hascaptures, tree.sib1, cs)
end,
default = function(tree, cs)
local n = numsiblings[tree.tag]
if n == 1 then
return hascaptures(tree.sib1, cs) -- tail call
elseif n == 2 then
if hascaptures(tree.sib1, cs) then return true end
return hascaptures(tree.sib2, cs) -- tail call
elseif n == 0 then
return false
else
error("how many siblings does this have?")
end
end,
}
function CompileState:hascaptures(t) return hascaptures(t, self) end
register_fname("hascaptures", hascaptures)
--[[
** Checks how a pattern behaves regarding the empty string,
** in one of two different ways:
** A pattern is *nullable* if it can match without consuming any character;
** A pattern is *nofail* if it never fails for any string
** (including the empty string).
** The difference is only for predicates and run-time captures;
** for other patterns, the two properties are equivalent.
** (With predicates, &'a' is nullable but not nofail. Of course,
** nofail => nullable.)
** These functions are all convervative in the following way:
** p is nullable => nullable(p)
** nofail(p) => p cannot fail
** The function assumes that TOpenCall is not nullable;
** this will be checked again when the grammar is fixed.
** Run-time captures can do whatever they want, so the result
** is conservative.
]]--
local checkaux
checkaux = define_tree_visitor{
[{
TTag.Char, TTag.Set, TTag.Any, TTag.UTFR, TTag.False,
TTag.OpenCall, TTag.Throw,
}] = function(tree, pred, cs)
return false -- not nullable
end,
[{TTag.Rep,TTag.True}] = function(tree, pred, cs)
return true -- no fail
end,
[{TTag.Not,TTag.Behind}] = function(tree, pred, cs)
-- can match empty, but can fail
if pred == PE.nofail then
return false
else
return true
end
end,
[TTag.And] = function(tree, pred, cs)
-- can match empty; fail iff body does
if pred == PE.nullable then
return true
end
return checkaux(tree.sib1, pred, cs) -- tail call
end,
[TTag.RunTime] = function(tree, pred, cs)
-- can fail; match empty iff body does
if pred == PE.nofail then
return false
end
return checkaux(tree.sib1, pred, cs) -- tail call
end,
[TTag.Seq] = function(tree, pred, cs)
if not checkaux(tree.sib1, pred, cs) then
return false
end
return checkaux(tree.sib2, pred, cs) -- tail call
end,
[TTag.Choice] = function(tree, pred, cs)
if checkaux(tree.sib2, pred, cs) then
return true
end
return checkaux(tree.sib1, pred, cs) -- tail call
end,
[{ TTag.Capture, TTag.Rule, TTag.XInfo, }] = function(tree, pred, cs)
return checkaux(tree.sib1, pred, cs)
end,
[TTag.Grammar] = function(tree, pred, cs)
-- make a fake compile state to hold the grammar, if necessary
if cs == nil then cs = CompileState:new(nil) end
return cs:withGrammar(tree, checkaux, tree.sib1, pred, cs)
end,
[TTag.Call] = function(tree, pred, cs)
-- open calls are assumed not nullable; checked again after grammar
-- is fixed
if cs == nil then return false end
return checkaux(cs.grammar.ruletab[tree.key], pred, cs)
end,
}
register_fname("checkaux", checkaux)
function nofail(t, cs) return checkaux(t, PE.nofail, cs) end
function CompileState:nofail(t) return nofail(t, self) end
function nullable(t, cs) return checkaux(t, PE.nullable, cs) end
function CompileState:nullable(t) return nullable(t, self) end
function nullable_with_grammar(t, grm)
local cs = CompileState:new(nil)
return cs:withGrammar(grm, nullable, t, cs)
end
-- Note that we are counting characters, not bytes
local fixedlen, fixedlen_helper
fixedlen_helper = define_tree_visitor{
[{TTag.Char, TTag.Set, TTag.Any, TTag.UTFR}] = function(tree, len)
return len + 1
end,
[{TTag.False, TTag.True, TTag.Not, TTag.And, TTag.Behind}] = function(tree, len)
return len
end,
[{TTag.Rep, TTag.RunTime, TTag.OpenCall, TTag.Throw,}] = function(tree, len)
return -1 -- variable
end,
[{TTag.Capture, TTag.Rule, TTag.XInfo,}] = function(tree, len, cs)
return fixedlen_helper(tree.sib1, len, cs)
end,
[TTag.Grammar] = function(tree, len, cs)
-- make a fake compile state to hold the grammar, if necessary
if cs == nil then cs = CompileState:new(nil) end
return cs:withGrammar(tree, fixedlen_helper, tree.sib1, len, cs)
end,
[TTag.Call] = function(tree, len, cs)
-- If evaluating outside the context of a grammar, conservatively
-- return "variable"
if cs == nil then return -1 end
-- otherwise, carefully recurse
local n1 = cs:callrecursive(tree, fixedlen, -1, cs)
if n1 < 0 then return -1 end -- variable
return len + n1
end,
[TTag.Seq] = function(tree, len, cs)
local n1 = fixedlen_helper(tree.sib1, len, cs)
if n1 < 0 then return -1 end -- variable
return fixedlen_helper(tree.sib2, n1, cs)
end,
[TTag.Choice] = function(tree, len, cs)
local n1 = fixedlen_helper(tree.sib1, len, cs)
local n2 = fixedlen_helper(tree.sib2, len, cs)
if n1 ~= n2 or n1 < 0 then
return -1
else
return n1
end
end,
}
function fixedlen(tree, cs)
return fixedlen_helper(tree, 0, cs) -- supply default 0 for 'len'
end
function CompileState:fixedlen(t) return fixedlen(t, self) end
register_fname("fixedlen_helper", fixedlen_helper)
--[[
** Computes the 'first set' of a pattern.
** The result is a conservative aproximation:
** match p ax -> x (for some x) ==> a belongs to first(p)
** or
** a not in first(p) ==> match p ax -> fail (for all x)
**
** The set 'follow' is the first set of what follows the
** pattern (full set if nothing follows it).
**
** The function returns 0 when this resulting set can be used for
** test instructions that avoid the pattern altogether.
** A non-zero return can happen for two reasons:
** 1) match p '' -> '' ==> return has bit 1 set
** (tests cannot be used because they would always fail for an empty input);
** 2) there is a match-time capture ==> return has bit 2 set
** (optimizations should not bypass match-time captures).
]]--
local getfirst
getfirst = define_tree_visitor{
[TTag.Char] = function(t, follow, cs)
if t.n <= CHARMAX then return 0, tocharset(t) end
-- conservative approximation!
local s = newcharset()
s.rest = true
return 0, s
end,
[{ TTag.Set, TTag.Any, TTag.False }] = function(t, follow, cs)
return 0, tocharset(t)
end,
[TTag.UTFR] = function(t, follow, cs)
-- conservative approximation!
local firstset = newcharset()
if t.from <= CHARMAX then
for i=t.from, math.min(CHARMAX, t.to) do
firstset.set[i] = true
end
end
if t.to > CHARMAX then
-- conservative approximation of non-ascii unicode range
firstset.rest = true
end
return 0, firstset
end,
[TTag.True] = function(t, follow, cs)
return 1, follow -- 1 because this accepts the empty string
end,
[TTag.Throw] = function(t, follow, cs)
-- labeled failure: must always throw the label
return 1, fullset
end,
[TTag.Choice] = function(t, follow, cs)
local firstset = newcharset()
local e1,e1set = getfirst(t.sib1, follow, cs)
local e2,e2set = getfirst(t.sib2, follow, cs)
local firstset = cs_union(e1set, e2set)
local ret = 0 -- awkward lua5.1 way to say "e1 | e2"
if (e1 % 2) == 1 or (e2 % 2) == 1 then
ret = ret + 1
end
e1,e2 = compat.rshift(e1, 1), compat.rshift(e2, 1)
if (e1 % 2) == 1 or (e2 % 2) == 1 then
ret = ret + 2
end
return ret, firstset
end,
[TTag.Seq] = function(t, follow, cs)
if not nullable(t.sib1, cs) then
-- when p1 is not nullable, p2 has nothing to contribute
return getfirst(t.sib1, fullset, cs) -- tail call
else -- FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl))
local e2,csaux = getfirst(t.sib2, follow, cs)
local e1,firstset = getfirst(t.sib1, csaux, cs)
if e1 == 0 then
return 0, firstset -- 'e1' ensures that first can be used
elseif compat.rshift(e1, 1) % 2 == 1 or compat.rshift(e2, 1) % 2 == 1 then
-- one of the children has a matchtime?
return 2, firstset -- pattern has a matchtime capture
else
return e2, firstset -- else depends on e2
end
end
end,
[TTag.Rep] = function(t, follow, cs)
local _,firstset = getfirst(t.sib1, follow, cs)
return 1, cs_union(firstset, follow, cs) -- accepts the empty string
end,
[{ TTag.Capture,TTag.Rule,TTag.XInfo }] = function(t, follow, cs)
return getfirst(t.sib1, follow, cs) -- tail call
end,
[TTag.Grammar] = function(t, follow, cs)
return cs:withGrammar(t, getfirst, t.sib1, follow, cs)
end,
[TTag.RunTime] = function(t, follow, cs)
-- function invalidates any follow info
local e,firstset = getfirst(t.sib1, fullset, cs)
if e ~= 0 then
-- function is not "protected"?
return 2,firstset
else
-- pattern inside capture ensures first can be used
return 0,firstset
end
end,
[TTag.Call] = function(t, follow, cs)
local rule = cs.grammar.ruletab[t.key]
return getfirst(rule, follow, cs) -- tail call
end,
[TTag.And] = function(t, follow, cs)
local e,firstset = getfirst(t.sib1, follow, cs)
return e, cs_intersection(firstset, follow, cs)
end,
[{ TTag.Not, TTag.Behind }] = function(t, follow, cs)
if t.tag == TTag.Not then
local firstset = tocharset(t.sib1)
if firstset ~= nil then
return 1,cs_complement(firstset) -- could match empty input
end
end
-- the TNot or TBehind gives no new information
-- call getfirst only to check for math-time captures
local e,_ = getfirst(t.sib1, follow, cs)
if e%2 == 0 then e = e + 1 end -- set the lsb; could match empty input
return e, follow -- uses follow
end,
}
function CompileState:getfirst(t, follow) return getfirst(t, follow, self) end
register_fname("getfirst", getfirst)
--[[
** If 'headfail(tree)' true, then 'tree' can fail only depending on the
** next character of the subject.
-- ie, a single character of lookahead is enough to evaluate the pattern
-- rooted at this node
]]--
local headfail
headfail = define_tree_visitor{
[{TTag.Char, TTag.Set, TTag.Any,
TTag.False}] = function(t, cs)
return true
end,
[{TTag.True, TTag.Rep, TTag.RunTime, TTag.Not,
-- even though we are codepoint-based, we don't have a TestUTFR instruction
-- so we can't use a Test instruction on the first character, which is
-- implicitly what headfail is checking for.
TTag.UTFR,
TTag.Behind, TTag.Throw}] = function(t, cs)
return false
end,
[{TTag.Capture, TTag.Rule,
TTag.XInfo, TTag.And}] = function(t, cs)
return headfail(t.sib1, cs) -- tail call
end,
[TTag.Grammar] = function(t, cs)
return cs:withGrammar(t, headfail, t.sib1, cs)
end,
[TTag.Call] = function(t, cs)
local rule = cs.grammar.ruletab[t.key]
return headfail(rule, cs) -- tail call
end,
[TTag.Seq] = function(t, cs)
if not nofail(t.sib2, cs) then
-- if the second child could possibly fail, then we can't
-- evaluate the entire seq based just on the first child
return false
end
return headfail(t.sib1, cs) -- tail call
end,
[TTag.Choice] = function(t, cs)
-- both children need to be headfail for this to be headfail
if not headfail(t.sib1, cs) then
return false
end
return headfail(t.sib2, cs) -- tail call
end,
}
function CompileState:headfail(t) return headfail(t, self) end
register_fname("headfail", headfail)
--[[
** Check whether the code generation for the given tree can benefit
** from a follow set (to avoid computing the follow set when it is
** not needed)
]]--
local needfollow
needfollow = define_tree_visitor{
[{TTag.Char, TTag.Set, TTag.Any, TTag.UTFR,
TTag.False, TTag.True, TTag.And, TTag.Not,
TTag.RunTime, TTag.Grammar, TTag.Call, TTag.Behind,
TTag.Throw, }] = function(tree) return false end,
[{TTag.Choice, TTag.Rep}] = function(tree) return true end,
[TTag.Capture] = function(tree) return needfollow(tree.sib1) end,
[TTag.Seq] = function(tree) return needfollow(tree.sib2) end,
}
register_fname("needfollow", needfollow)
--[[
** ======================================================
** Code generation
** ======================================================
]]--
local Instruction = {}
Instruction.__index = Instruction
function Instruction:new(arg)
local opcode = arg[1]
if opcode == nil then error("no opcode") end
-- target is rule # for open calls before correction, and absolute pc after
local instr = {
code = opcode,
exec = opcode.exec, -- copy the exec function from the opcode!
aux = arg.aux, -- used for the "primary argument"
key = arg.key, -- used for string-valued arguments
target = arg.target, -- used for jmp-like instructions
from = arg.from, -- inclusive start, for ranges
to = arg.to, -- inclusive end, for ranges
set = arg.set, -- charset <= CHARMAX
rest = arg.rest, -- include characters above CHARMAX?
cap = arg.cap, -- used for "capture kind"
}
setmetatable(instr, self)
instr:setCode(opcode) -- opportunity to add tracing logic!
return instr
end
function Instruction:setCode(opcode)
self.code = opcode
local exec = opcode.exec
if TRACE_INSTRUCTIONS then
local str
self.exec = function(self, state, ...)
if str == nil then
str = table.concat(printinst(0, self, { "Executing " })):gsub("\n","")
end
print(state.bytePos, state.codepoint, str)
return exec(self, state, ...)
end
else
self.exec = exec
end
end
-- state for the compiler
function CompileState:new(p)
local cs = {
p = p,
}
setmetatable(cs, self)
return cs
end
function CompileState:withGrammar(g, f, ...)
local lastGrammar = self.grammar
self.grammar = g
local result = compat.pack(f(...))
self.grammar = lastGrammar
return compat.unpack(result)
end
function CompileState:codegen(tree, opt, tt, fl)
assert(fl.tag == TTag.Set)
-- just a little helper
return codegen(tree, self, opt, tt, fl)
end
function CompileState:getinstr(i)
return self.p.code[i]
end
function CompileState:addinstruction(arg)
local code = self.p.code
table.insert(code, Instruction:new(arg))
return #code
end
function CompileState:gethere()
local code = self.p.code
return 1 + #code
end
function CompileState:jumptothere(pc, where)
if pc ~= NOINST then
local code = self.p.code
code[pc].target = where
end
end
function CompileState:jumptohere(pc)
self:jumptothere(pc, self:gethere())
end
function codethrow(cs, throw)
local rule = nil
if cs.grammar ~= nil then
-- we only lookup/match *string* rule names, not numeric indices
rule = cs.grammar.ruletab[tostring(throw.key)]
end
if rule ~= nil then
return cs:addinstruction{
Opcode.ThrowRec,
key=throw.key, -- rule name / error label
target=rule.n -- recovery rule number
}
else
return cs:addinstruction{
Opcode.Throw,
key=throw.key, -- rule name / error label
-- no recovery rule
}
end
end
function codeutfr(cs, tree)
return cs:addinstruction{
Opcode.UTFR,
from = tree.from,
to = tree.to,
}
end
--[[
** Code an IChar instruction, or IAny if there is an equivalent
** test dominating it
]]--
function codechar(cs, codepoint, tt)
if tt ~= NOINST and
cs:getinstr(tt).code == Opcode.TestChar and
cs:getinstr(tt).aux == codepoint then
cs:addinstruction{Opcode.Any}
else
cs:addinstruction{Opcode.Char, aux=codepoint,}
end
end
--[[
** Add a charset posfix to an instruction
]]--
function addcharset(cs, codepoint)
--[[
static void addcharset (CompileState *compst, const byte *cs) {
int p = gethere(compst);
int i;
for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++)
nextinstruction(compst); /* space for buffer */
/* fill buffer with charset */
loopset(j, getinstr(compst, p).buff[j] = cs[j]);
]]--
end
--[[
** code a char set, optimizing unit sets for IChar, "complete"
** sets for IAny, and empty sets for IFail; also use an IAny
** when instruction is dominated by an equivalent test.
]]--
function codecharset(cs, tree, tt)
local op,codepoint = charsettype(tree)
if op == Opcode.Char then
return codechar(cs, codepoint, tt)
elseif op == Opcode.Set then
-- non-trivial set?
if tt ~= NOINST and
cs:getinstr(tt).code == Opcode.TestSet and
cs_equal(tree, cs:getinstr(tt)) then
return cs:addinstruction{Opcode.Any}
else
return cs:addinstruction{
Opcode.Set,
set = tree.set, -- XXX ensure immutable
rest = tree.rest,
}
end
else
return cs:addinstruction{op} -- Any or Fail
end
end
--[[
** code a test set, optimizing unit sets for ITestChar, "complete"
** sets for ITestAny, and empty sets for IJmp (always fails).
** 'e' is nonzero iff test should accept the empty string. (Test
** instructions in the current VM never accept the empty string.)
]]--
function codetestset(cs, tree, e)
if e ~= 0 then return NOINST end
local op,codepoint = charsettype(tree)
if op == Opcode.Fail then
return cs:addinstruction{Opcode.Jmp, target = NOINST} -- always jump
elseif op == Opcode.Any then
return cs:addinstruction{Opcode.TestAny, target = NOINST}
elseif op == Opcode.Char then
return cs:addinstruction{
Opcode.TestChar,
target = NOINST,
aux = codepoint,
}
elseif op == Opcode.Set then
return cs:addinstruction{
Opcode.TestSet,
target = NOINST,
set = tree.set, -- XXX ensure immutable
rest = tree.rest,
}
else
error("unreachable")
end
end
--[[
** <behind(p)> == behind n; <p> (where n = fixedlen(p))
]]--
function codebehind(cs, tree)
if tree.n > 0 then
cs:addinstruction{ Opcode.Behind, aux = tree.n }
end
return cs:codegen(tree.sib1, false, NOINST, fullset)
end
--[[
** Choice; optimizations:
** - when p1 is headfail or when first(p1) and first(p2) are disjoint,
** than a character not in first(p1) cannot go to p1 and a character
** in first(p1) cannot go to p2, either because p1 will accept
** (headfail) or because it is not in first(p2) (disjoint).
** (The second case is not valid if p1 accepts the empty string,
** as then there is no character at all...)
** - when p2 is empty and opt is true; a IPartialCommit can reuse
** the Choice already active in the stack.
]]--
function codechoice(cs, p1, p2, opt, fl)
local emptyp2 = (p2.tag == TTag.True)
local e1, cs1 = cs:getfirst(p1, fullset)
local headfailp1 = cs:headfail(p1)
local e2, cs2
if not headfailp1 and e1 == 0 then
e2, cs2 = cs:getfirst(p2, fl) -- avoid computing unless necessary
end
if headfailp1 or (e1 == 0 and cs_disjoint(cs1, cs2)) then
-- <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2:
local test = codetestset(cs, cs1, 0)
local jmp = NOINST
cs:codegen(p1, false, test, fl)
if not emptyp2 then
jmp = cs:addinstruction{Opcode.Jmp, target = NOINST }
end
cs:jumptohere(test)
cs:codegen(p2, opt, NOINST, fl)
cs:jumptohere(jmp)
elseif opt and emptyp2 then
-- p1? == IPartialCommit; p1
cs:jumptohere(cs:addinstruction{Opcode.PartialCommit, target = NOINST})
cs:codegen(p1, true, NOINST, fullset)
else
-- <p1 / p2> ==
-- test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2:
local test = codetestset(cs, cs1, e1)
local pchoice = cs:addinstruction{Opcode.Choice, target = NOINST}
cs:codegen(p1, emptyp2, test, fullset)
local pcommit = cs:addinstruction{Opcode.Commit, target = NOINST}
cs:jumptohere(pchoice)
cs:jumptohere(test)
cs:codegen(p2, opt, NOINST, fl)
cs:jumptohere(pcommit)
end
end
--[[
** And predicate
** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n
** (valid only when 'p' has no captures)
]]--
function codeand(cs, tree, tt)
--[[ labeled failure: optimization disabled because in case of a failure it
does not report the expected error position (the current subject position
when begin the matching of <&p>) ]]--
local pchoice = cs:addinstruction{Opcode.PredChoice, target = NOINST}
cs:codegen(tree, false, tt, fullset)
local pcommit = cs:addinstruction{Opcode.BackCommit, target = NOINST}
cs:jumptohere(pchoice)
cs:addinstruction{Opcode.Fail}
cs:jumptohere(pcommit)
end
--[[
** Captures: if pattern has fixed (and not too big) length, and it
** has no nested captures, use a single IFullCapture instruction
** after the match; otherwise, enclose the pattern with OpenCapture -
** CloseCapture.
]]--
function codecapture(cs, tree, tt, fl)
local len = cs:fixedlen(tree.sib1)
if len >= 0 and len <= MAXOFF and not cs:hascaptures(tree.sib1) then
cs:codegen(tree.sib1, false, tt, fl)
cs:addinstruction{
Opcode.FullCapture,
cap = tree.cap,
key = tree.key, -- capture name
aux = len,
}
else
assert(tree.cap ~= nil)
cs:addinstruction({
Opcode.OpenCapture,
cap = tree.cap,
key = tree.key, -- capture name
})
cs:codegen(tree.sib1, false, tt, fl)
cs:addinstruction({
Opcode.CloseCapture,
cap = CapKind.close,
})
end
end
function coderuntime(cs, tree, tt)
cs:addinstruction({
Opcode.OpenCapture,
cap = CapKind.group,
key = tree.key, -- capture *function*
})
cs:codegen(tree.sib1, false, tt, fullset)
cs:addinstruction({
Opcode.CloseRunTime,
cap = CapKind.close,
})
end
--[[
** Repetition; optimizations:
** When pattern is a charset, can use special instruction ISpan.
** When pattern is head fail, or if it starts with characters that
** are disjoint from what follows the repetions, a simple test
** is enough (a fail inside the repetition would backtrack to fail
** again in the following pattern, so there is no need for a choice).
** When 'opt' is true, the repetion can reuse the Choice already
** active in the stack.
]]--
function coderep(cs, tree, opt, fl)
local st = tocharset(tree)
if st ~= nil then
return cs:addinstruction{
Opcode.Span,
set = st.set,
rest = st.rest,
}
end
local e1,st = cs:getfirst(tree, fullset)
if cs:headfail(tree) or (e1 == 0 and cs_disjoint(st, fl)) then
-- L1: test (fail(p1)) -> L2; <p>; jmp L1; L2:
local test = codetestset(cs, st, 0)
cs:codegen(tree, false, test, fullset)
local jmp = cs:addinstruction{Opcode.Jmp, target = NOINST}
cs:jumptohere(test)
cs:jumptothere(jmp, test)
else
-- test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2:
-- or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1;
local test = codetestset(cs, st, e1)
local pchoice = NOINST
if opt then
cs:jumptohere(cs:addinstruction{Opcode.PartialCommit, target = NOINST})
else
pchoice = cs:addinstruction{Opcode.Choice, target = NOINST}
end
local l2 = cs:gethere()
cs:codegen(tree, false, NOINST, fullset)
local commit = cs:addinstruction{Opcode.PartialCommit, target = NOINST}
cs:jumptothere(commit, l2)
cs:jumptohere(pchoice)
cs:jumptohere(test)
end
end
--[[
** Not predicate; optimizations:
** In any case, if first test fails, 'not' succeeds, so it can jump to
** the end. If pattern is headfail, that is all (it cannot fail
** in other parts); this case includes 'not' of simple sets. Otherwise,
** use the default code (a choice plus a failtwice).
]]--
function codenot(cs, tree)
local e,st = cs:getfirst(tree, fullset)
local test = codetestset(cs, st, e)
if cs:headfail(tree) then
-- test (fail(p1)) -> L1; fail; L1:
cs:addinstruction{Opcode.Fail}
else
-- test(fail(p))-> L1; choice L1; <p>; failtwice; L1:
local pchoice = cs:addinstruction{Opcode.PredChoice, target = NOINST }
cs:codegen(tree, false, NOINST, fullset)
cs:addinstruction{Opcode.FailTwice}
cs:jumptohere(pchoice)
end
cs:jumptohere(test)
end
-- find the final destination of a sequence of jumps
function finaltarget(code, pc)
while code[pc].code == Opcode.Jmp do
pc = code[pc].target
end
return pc
end
-- final label (after traversing any jumps)
function finallabel(code, pc)
return finaltarget(code, code[pc].target)
end
--[[
** change open calls to calls, using list 'positions' to find
** correct offsets; also optimize tail calls
]]--
function correctcalls(cs, positions, from, to)
local code = cs.p.code
for i=from,(to-1) do
local op = code[i]
if op.code == Opcode.OpenCall or op.code == Opcode.ThrowRec then
local n = op.target -- rule number
local rule = positions[n] -- rule position
if rule == from or code[rule - 1].code == Opcode.Ret then
-- sanity check! ok!
else
error("bad rule position")
end
if op.code == Opcode.OpenCall then
if code[finaltarget(code, i+1)].code == Opcode.Ret then
-- call; ret => tail call
op:setCode(Opcode.Jmp)
else
op:setCode(Opcode.Call) -- open call no more
end
end
op.target = rule
end
end
end
--[[
** Code for a grammar:
** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2:
]]--
function codegrammar(cs, tree)
local firstcall = cs:addinstruction{Opcode.Call, target = NOINST} -- call initial rule
local jumptoend = cs:addinstruction{Opcode.Jmp, target = NOINST} -- jump to the end
local start = cs:gethere() -- here starts the initial rule
cs:jumptohere(firstcall)
local positions = {}
local rule = tree.sib1
for i=1,tree.n do
local pattern = rule.sib1
positions[i] = cs:gethere() -- save rule position
cs:codegen(rule.sib1, false, NOINST, fullset) -- code rule
cs:addinstruction{Opcode.Ret}
rule = rule.sib2
end
if rule.tag ~= TTag.True then error("impossible") end
cs:jumptohere(jumptoend)
correctcalls(cs, positions, start, cs:gethere())
end
function codecall(cs, tree)
local rule = cs.grammar.ruletab[tree.key]
assert(rule ~= nil)
assert(rule.n ~= nil)
return cs:addinstruction{
Opcode.OpenCall, -- to be corrected later
target = rule.n -- rule number
}
end
--[[
** Code first child of a sequence
** (second child is called in-place to allow tail call)
** Return 'tt' for second child
]]--
function codeseq1(cs, p1, p2, tt, fl)
assert(fl.tag == TTag.Set)
if needfollow(p1) then
local _, fl1 = cs:getfirst(p2, fl) -- p1 follow is p2 first
cs:codegen(p1, false, tt, fl1)
else
-- use fullset as follow
cs:codegen(p1, false, tt, fullset)
end
if cs:fixedlen(p1) ~= 0 then -- can 'p1' consume anything?
return NOINST -- invalidate test
else
return tt -- else 'tt' still protects sib2
end
end
--[[
** Main code-generation function: dispatch to auxiliar functions
** according to kind of tree. ('needfollow' should return true
** only for consructions that use 'fl'.)
]]--
--[[
** code generation is recursive; 'opt' indicates that the code is being
** generated as the last thing inside an optional pattern (so, if that
** code is optional too, it can reuse the 'IChoice' already in place for
** the outer pattern). 'tt' points to a previous test protecting this
** code (or NOINST). 'fl' is the follow set of the pattern.
]]--
codegen = define_tree_visitor{
[TTag.Char] = function(tree, cs, opt, tt, fl)
return codechar(cs, tree.n, tt)
end,
[TTag.Any] = function(tree, cs, opt, tt, fl)
return cs:addinstruction{Opcode.Any}
end,
[TTag.Set] = function(tree, cs, opt, tt, fl)
return codecharset(cs, tree, tt)
end,
[TTag.True] = function(tree, cs, opt, tt, fl)
return -- do nothing
end,
[TTag.False] = function(tree, cs, opt, tt, fl)
return cs:addinstruction{Opcode.Fail}
end,
[TTag.UTFR] = function(tree, cs, opt, tt, fl)
return codeutfr(cs, tree)
end,
[TTag.Choice] = function(tree, cs, opt, tt, fl)
return codechoice(cs, tree.sib1, tree.sib2, opt, fl)
end,
[TTag.Rep] = function(tree, cs, opt, tt, fl)
return coderep(cs, tree.sib1, opt, fl)
end,
[TTag.Behind] = function(tree, cs, opt, tt, fl)
return codebehind(cs, tree)
end,
[TTag.Not] = function(tree, cs, opt, tt, fl)
return codenot(cs, tree.sib1)
end,
[TTag.And] = function(tree, cs, opt, tt, fl)
return codeand(cs, tree.sib1, tt)
end,
[TTag.Capture] = function(tree, cs, opt, tt, fl)
return codecapture(cs, tree, tt, fl)
end,
[TTag.RunTime] = function(tree, cs, opt, tt, fl)
return coderuntime(cs, tree, tt)
end,
[TTag.Grammar] = function(tree, cs, opt, tt, fl)
return cs:withGrammar(tree, codegrammar, cs, tree)
end,
[TTag.Call] = function(tree, cs, opt, tt, fl)
return codecall(cs, tree)
end,
[TTag.Seq] = function(tree, cs, opt, tt, fl)
tt = codeseq1(cs, tree.sib1, tree.sib2, tt, fl) -- code 'p1'
return cs:codegen(tree.sib2, opt, tt, fl) -- tail call
end,
[TTag.Throw] = function(tree, cs, opt, tt, fl)
return codethrow(cs, tree)
end,
["assert"] = function(tree, cs, opt, tt, fl)
assert(fl.tag == TTag.Set)
assert(opt ~= 0)
end,
}
register_fname("codegen", codegen)
--[[
** Optimize jumps and other jump-like instructions.
** * Update labels of instructions with labels to their final
** destinations (e.g., choice L1; ... L1: jmp L2: becomes
** choice L2)
** * Jumps to other instructions that do jumps become those
** instructions (e.g., jump to return becomes a return; jump
** to commit becomes a commit)
]]--
function peephole(cs)
local code = cs.p.code
local jmpswitch
local switch = define_opcode_visitor{
-- instructions with labels
[{Opcode.Choice, Opcode.Call, Opcode.Commit, Opcode.PartialCommit,
Opcode.BackCommit, Opcode.TestChar, Opcode.TestSet,
Opcode.TestAny}] = function(op, i)
cs:jumptothere(i, finallabel(code, i))
end,
[Opcode.Jmp] = function(op, i)
local ft = finaltarget(code, i)
jmpswitch(code[ft], i, ft) -- jumping to what?
end,
default = function() end,
}
jmpswitch = define_opcode_visitor{
-- instructions with unconditional implicit jumps
[{Opcode.Ret,Opcode.Fail,Opcode.FailTwice,Opcode.End}] = function(op, i, ft)
code[i]:setCode(code[ft].code) -- jump becomes that instruction
end,
-- instructions with unconditional explicit jumps
[{Opcode.Commit, Opcode.PartialCommit, Opcode.BackCommit}] = function(op, i, ft)
local fft = finallabel(code, ft)
code[i]:setCode(code[ft].code) -- jump becomes that instruction
cs:jumptothere(i, fft) -- with an optimized target
switch(code[i], i) -- reoptimize the label
end,
default = function(op, i, ft)
cs:jumptothere(i, ft) -- optimize label
end,
}
for i=1,#code do
switch(code[i], i)
end
end
-- thread the instructions to speed up dispatch during execution
function thread(cs)
local code = cs.p.code
for i=1,#code-1 do
code[i].next = code[i+1]
if code[i].target ~= nil then
code[i].branch = code[code[i].target]
end
end
end
function compile(p)
local compst = CompileState:new(p)
p.code = {}
assert(fullset.tag == TTag.Set)
compst:codegen(p, false, NOINST, fullset)
compst:addinstruction{Opcode.End}
peephole(compst)
thread(compst)
return p.code
end
return {
Instruction = Instruction,
compile = compile,
cs_clone = cs_clone,
cs_complement = cs_complement,
cs_diff = cs_diff,
cs_intersection = cs_intersection,
cs_union = cs_union,
fixedlen = fixedlen,
hascaptures = hascaptures,
nofail = nofail,
nullable = nullable,
nullable_with_grammar = nullable_with_grammar,
tocharset = tocharset,
}
end)
register('llpeg.utf8util', function(myrequire)
myrequire('strict')
local utf8util = {}
function utf8util.codepointAt(s, pos)
local c1 = string.byte(s, pos)
if c1 <= 0x7F then
return c1, 1
end
local c2 = string.byte(s, pos + 1)
if c1 <= 0xDF then
return ((c1 % 0x20 ) * 0x40) + (c2 % 0x40), 2
end
local c3 = string.byte(s, pos + 2)
if c1 <= 0xEF then
return (((c1 % 0x10) * 0x40) + (c2 % 0x40)) * 0x40 + (c3 % 0x40), 3
end
local c4 = string.byte(s, pos + 3)
if c1 <= 0xF7 then
return ((((c1 % 0x08) * 0x40) + (c2 % 0x40)) * 0x40 + (c3 % 0x40)) * 0x40 + (c4 % 0x40), 4
end
error( "bad utf8" )
end
-- same as utf8.offset in Lua 5.3 standard library
function utf8util.offset(s, n, i)
if n > 0 then error("unimplemented") end
while n < 0 do
i = i - 1
if i < 1 then return nil end
local c = string.byte(s, i)
if c < 0x80 or c > 0xBF then
n = n + 1
end
end
return i
end
return utf8util
end)
register('llpeg.list', function(myrequire)
local List = {}
List.__index = List
function List:new()
return setmetatable({ n = 0 }, self)
end
function List:__len()
return self.n
end
function List:push(val)
local n = self.n + 1
self[n] = val
self.n = n
end
function List:pop()
local n = self.n
assert(n > 0)
local old = self[n]
self[n] = nil
self.n = n - 1
return old
end
function List:insert(pos, val)
for i=self.n,pos,-1 do
self[i+1] = self[i]
end
self[pos] = val
self.n = self.n + 1
end
return List
end)
register('llpeg.cap', function(myrequire)
myrequire('strict')
local compat = myrequire('advent.compat')
local from = myrequire('llpeg.types').from
local
CapKind,
_ = from(myrequire('llpeg.types'), {
'CapKind',
})
local
printcaplist,
_ = from(myrequire('llpeg.print'), {
'printcaplist',
})
local List = myrequire('llpeg.list')
local MAXRECLEVEL = 200
local Capture = {}
Capture.__index = Capture
-- kind is CapKind of the capture
-- bytePos is the subject position (in bytes)
-- byteLen is the length of the capture (in bytes)
-- extra is extra info (group name, arg index, etc)
function Capture:new(kind, bytePos, byteLen, extra)
assert(getmetatable(kind) == CapKind)
return setmetatable({
kind = kind, bytePos = bytePos, byteLen = byteLen, extra = extra,
}, self)
end
function Capture:__tostring()
return string.format("Capture{kind=%s, pos=%d, len=%s, extra=%s}",
self.kind, self.bytePos, self.byteLen, self.extra)
end
function Capture:isopencap()
return self.byteLen == nil
end
-- true if c2 is (any number of levels) inside self
function Capture:inside(c2)
if self:isopencap() then
return not c2:isclosecap()
else
return c2.bytePos < (self.bytePos + self.byteLen)
end
end
function Capture:isclosecap()
return self.kind == CapKind.close
end
--[[
** Return the size of capture 'cap'. If it is an open capture, 'close'
** must be its corresponding close.
]]--
function Capture:size(close)
if self:isopencap() then
assert(close:isclosecap())
return close.bytePos - self.bytePos
else
return self.byteLen
end
end
function CapKind:newCapture(bytePos, byteLen, extra)
return Capture:new(self, bytePos, byteLen, extra)
end
local CapState = {}
CapState.__index = CapState
-- Capture cap: current capture
-- Capture ocap: (original) capture list
-- int ptop: index of last argument to 'match'
-- string s: original string
-- int valuecached: value stored in cache slot
-- int reclevel: recursion level
function CapState:new(captures, source, extraArgs)
return setmetatable({
captures = captures,
index = 1,
source = source,
valuecached = {},
reclevel = 0,
extraArgs = extraArgs,
}, self)
end
function CapState:cap() -- helper
return self.captures[self.index]
end
function CapState:advance() -- helper
local i = self.index
local cap = self.captures[i]
self.index = i + 1
return cap, i
end
function CapState:substr(start, len) -- helper
return string.sub(self.source, start, start + len - 1)
end
function CapState:skipclose(head)
if head:isopencap() then
assert(self.captures[self.index]:isclosecap())
self.index = self.index + 1
end
end
function CapState:closesize(head)
return head:size(self:cap())
end
--[[
** Go to the next capture at the same level
]]--
function CapState:nextcap()
local cap = self:cap()
if cap:isopencap() then -- must look for a close
local n = 0 -- number of opens waiting a close
while true do -- look for corresponding close
self.index = self.index + 1
cap = self:cap()
if cap:isopencap() then
n = n + 1
elseif cap:isclosecap() then
if n == 0 then break end
n = n - 1
end
end
self.index = self.index + 1 -- skip last close (or entire single capture)
else
self.index = self.index + 1
while cap:inside(self:cap()) do
self.index = self.index + 1 -- skip captures inside the current one
end
end
end
--[[
** Goes back in a list of captures looking for an open capture
** corresponding to a close
]]--
function CapState:findopen(i) -- captures[i] is the close that we want to match
assert(self.captures[i]:isclosecap())
local n = 0 -- number of closes waiting an open
while i > 1 do
i = i - 1
local cap = self.captures[i]
if cap:isclosecap() then
n = n + 1 -- one more open to skip
elseif cap:isopencap() then
if n == 0 then return cap,i end
n = n - 1
end
end
error("couldn't find open")
end
--[[
** Checks whether group 'grp' is visible to 'ref', that is, 'grp' is
** not nested inside a full capture that does not contain 'ref'. (We
** only need to care for full captures because the search at 'findback'
** skips open-end blocks; so, if 'grp' is nested in a non-full capture,
** 'ref' is also inside it.) To check this, we search backward for the
** inner full capture enclosing 'grp'. A full capture cannot contain
** non-full captures, so a close capture means we cannot be inside a
** full capture anymore.
]]--
function CapState:capvisible(igrp, ref)
local i = igrp
local grp = self.captures[igrp]
while i > 1 do
i = i - 1
local cap = self.captures[i]
if cap:isclosecap() then
return true -- can stop the search
elseif cap:inside(grp) then -- is 'grp' inside cap?
return cap:inside(ref) -- ok iff cap also contains ref
end
end
return true -- 'grp' is not inside any capture
end
--[[
** Try to find a named group capture with the name given;
** goes backward from 'i'.
]]--
function CapState:findback(name, i)
if i == nil then i = self.index end
local ref = self.captures[i]
while i > 1 do
i = i - 1
local cap = self.captures[i]
if cap:isclosecap() or not cap:inside(ref) then
if cap:isclosecap() then
cap,i = self:findopen(i)
end
if cap.kind == CapKind.group and self:capvisible(i, ref) then
if cap.extra == name then
return cap,i
end
end
end
end
error("back reference '"..name.."' not found")
end
function CapState:getcaptures()
local result = List:new()
while not self:cap():isclosecap() do
self:pushcapture(result)
end
return result
end
function CapState:pushcapture(result)
self.reclevel = self.reclevel + 1
if self.reclevel > MAXRECLEVEL then
error("subcapture nesting too deep")
end
local cap = self.captures[self.index]
assert(cap.kind.push ~= nil)
local res = cap.kind.push(self, cap, result)
self.reclevel = self.reclevel - 1
return res
end
-- helper functions for pushcapture
--[[
** Push on the Lua stack all values generated by nested captures inside
** the current capture. Returns number of values pushed. 'addextra'
** makes it push the entire match after all captured values. The
** entire match is pushed also if there are no other nested values,
** so the function never returns zero.
]]--
function CapState:pushnestedvalues(result, addextra)
local head = self:advance()
local n = 0 -- number of pushed subvalues
-- repeat for all nested patterns
while head:inside(self:cap()) do
n = n + self:pushcapture(result)
end
if addextra or n == 0 then -- need extra?
result:push(self:substr(head.bytePos, self:closesize(head)))
n = n + 1
end
self:skipclose(head)
return n
end
--[[
** Push only the first value generated by nested captures
]]--
function CapState:pushonenestedvalue(result)
local n = self:pushnestedvalues(result, false)
if n == 0 then
result:push(nil) -- ensure there's exactly one value
return 1
end
while n > 1 do
result:pop() -- pop extra values
n = n - 1
end
return n
end
-- visitor patterns for pushcapture
function CapKind.position.push(capstate, cap, result)
result:push(cap.bytePos)
capstate.index = capstate.index + 1
return 1
end
function CapKind.const.push(capstate, cap, result)
result:push(cap.extra)
capstate.index = capstate.index + 1
return 1
end
function CapKind.arg.push(capstate, cap, result)
local n = cap.extra
if n > capstate.extraArgs.n then
error(string.format("reference to absent extra argument #%d", n))
end
result:push(capstate.extraArgs[n])
capstate.index = capstate.index + 1
return 1
end
function CapKind.simple.push(capstate, cap, result)
local k = capstate:pushnestedvalues(result, true)
-- reorder so that the whole match is the first result, not the last
local last = result:pop()
result:insert(2 + #result - k, last)
return k
end
-- missing a bunch
--[[
** Table capture: creates a new table and populates it with nested
** captures.
]]--
function CapKind.table.push(capstate, cap, result) -- aka tablecap
local t = {}
result:push(t)
local head = capstate:advance()
local n = 0
while head:inside(capstate:cap()) do
cap = capstate:cap()
if cap.kind == CapKind.group and cap.extra ~= nil then -- named group?
capstate:pushonenestedvalue(result)
t[cap.extra] = result:pop() -- move it into table
else -- not a named group
local k = capstate:pushcapture(result)
for i=k,1,-1 do
t[n + i] = result:pop() -- move it into table (indexed)
end
n = n + k
end
end
capstate:skipclose(head)
return 1 -- number of values pushed (only the table)
end
--[[
** Table-query capture
]]--
function CapKind.query.push(capstate, cap, result) -- aka querycap
capstate:pushonenestedvalue(result)
local key = result:pop()
local tbl = cap.extra
assert(type(tbl) == "table")
local val = tbl[key]
if val ~= nil then
result:push(val)
return 1
else
return 0
end
end
--[[
** Fold capture
]]--
function CapKind.fold.push(capstate, cap, result) -- aka foldcap
local f = cap.extra
assert(type(f) == "function")
local head = capstate:advance()
if capstate:cap():isclosecap() then
-- no nested captures? (large subject)
error("no initial value for fold capture")
end
local args = List:new()
local n = capstate:pushcapture(args)
if n == 0 then
-- nested captures with no values?
error("no initial value for fold capture")
end
local accum = args[1] -- leave only one result for accumulator
while head:inside(capstate:cap()) do
args = List:new()
args:push( accum ) -- put accumulator first
n = capstate:pushcapture(args) -- get next capture's values
accum = f(compat.unpack(args))
end
capstate:skipclose(head)
-- only accumulator left in result
result:push(accum)
return 1
end
--[[
** Function capture
]]--
CapKind["function"].push = function(capstate, cap, result)
local f = cap.extra
assert(type(f) == "function")
local args = List:new()
local n = capstate:pushnestedvalues(args, false)
local r = compat.pack(f(compat.unpack(args)))
for i=1,r.n do
result:push(r[i])
end
return r.n
end
--[[
** Accumulator capture
]]--
function CapKind.acc.push(capstate, cap, result) -- aka accumulatorcap
if #result == 0 then
error("no previous value for accumulator capture")
end
local f = cap.extra
assert(type(f) == "function")
local prev = #result
local args = List:new()
args:push(result[prev])
local n = capstate:pushnestedvalues(args, false)
result[prev] = f(compat.unpack(args))
return 0 -- did not add any extra value
end
--[[
** Select capture
]]--
function CapKind.num.push(capstate, cap, result) -- aka numcap
local idx = cap.extra -- value to select
if idx == 0 then -- no values?
capstate:nextcap() -- skip entire capture
return 0 -- no value produced
else
local vals = List:new()
local n = capstate:pushnestedvalues(vals, false)
if n < idx then -- invalid index?
error("no capture '"..idx.."'")
else
result:push(vals[idx])
return 1
end
end
end
function CapState:runtimecap(closePos)
local close = self.captures[closePos]
local open,openPos = self:findopen(closePos) -- get open group capture
assert(open.kind == CapKind.group)
self.index = openPos -- set state to the open capture
local args = List:new()
args:push( self.source) -- original subject
args:push( close.bytePos ) -- current position
local n = self:pushnestedvalues(args, false) -- push nested captures
local func = open.extra
local funcRet = compat.pack(func(compat.unpack(args)))
local res = closePos - openPos -- number of captures to be removed
return res, funcRet
end
function CapKind.runtime.push(capstate, cap, result) -- aka runtimecap
result:push(cap.extra)
capstate.index = capstate.index + 1
return 1
end
local MAXSTRCAPS = 10
--[[
** Collect values from current capture into array 'cps'. Current
** capture must be Cstring (first call) or Csimple (recursive calls).
** (In first call, fills %0 with whole match for Cstring.)
** Returns number of elements in the array that were filled.
]]--
function CapState:getstrcaps(cps, n)
local k = n
n = n + 1
cps[k] = {
isstring = true, -- get string value
bytePos = self:cap().bytePos, -- starts here
}
local head = self:advance()
while head:inside(self:cap()) do
if n > MAXSTRCAPS then -- too many captures?
self:nextcap() -- skip extra captures (will not need them)
elseif self:cap().kind == CapKind.Simple then -- string?
n = self:getstrcaps(cps, n) -- put info into array
else
cps[n] = {
isstring = false, -- not a string
cap = self.index, -- keep original capture
}
self:nextcap()
n = n + 1
end
end
cps[k].endPos = head.bytePos + self:closesize(head)
self:skipclose(head)
return n
end
function CapState:addonestring(buffer, what)
local cap = self:cap()
if cap.kind == CapKind.string then
-- add capture directly to buffer
return stringcap(self, buffer)
elseif cap.kind == CapKind.subst then
-- add capture directly to buffer
return substcap(self, buffer)
elseif cap.kind == CapKind.acc then
error("invalid context for an accumulator capture")
end
local result = List:new()
local n = self:pushcapture(result)
if n == 0 then return 0 end -- no values to add
local val = result[1] -- take only one result (the first)
if type(val) == "number" then
val = tostring(val)
elseif type(val) ~= "string" then
error("invalid "..what.." value (a "..type(val)..")")
end
table.insert(buffer, val)
return 1
end
--[[
** String capture: add result to buffer 'b' (instead of pushing
** it into the stack)
]]--
function stringcap(capstate, buffer)
local fmt = capstate:cap().extra
local cps = {}
local n = capstate:getstrcaps(cps, 1) - 1 -- collect nested captures
local sawEscape = false
for _,c in compat.utf8codes(fmt) do
if sawEscape then
sawEscape = false
if c < 48 or c > 57 then -- not followed by a digit
table.insert(buffer, compat.utf8char(c))
else
local l = 1 + c - 48 -- capture index (1-based)
if l > n then
error("invalid capture index ("..(l-1)..")")
elseif cps[l].isstring then
table.insert(buffer, capstate:substr(cps[l].bytePos, cps[l].endPos - cps[l].bytePos))
else
-- go back to evaluate that nested capture
local curr = capstate.index
capstate.index = cps[l].cap
if capstate:addonestring(buffer, "capture") == 0 then
error("no values in capture index "..l)
end
capstate.index = curr
end
end
elseif c ~= 37 then -- not a % escape?
table.insert(buffer, compat.utf8char(c))
else
sawEscape = true
end
end
return 1
end
--[[
** Substitution capture: add result to buffer 'b'
]]--
function substcap(capstate, buffer)
local head = capstate:advance()
local curr = head.bytePos
while head:inside(capstate:cap()) do
local cap = capstate:cap()
local nextPos = cap.bytePos
local s = capstate:substr(curr, nextPos - curr)
table.insert(buffer, s) -- add text up to capture
if capstate:addonestring(buffer, "replacement") == 0 then
-- no capture value, keep original text in final result
curr = nextPos
else
-- continue after match
local lastCap = capstate.captures[capstate.index - 1]
curr = nextPos + cap:size(lastCap)
end
end
-- add last piece of text
local s = capstate:substr(curr, head.bytePos + capstate:closesize(head) - curr)
table.insert(buffer, s)
capstate:skipclose(head)
end
function CapKind.subst.push(capstate, cap, result) -- aka substcap
local buffer = {}
substcap(capstate, buffer)
result:push(table.concat(buffer))
return 1
end
function CapKind.string.push(capstate, cap, result) -- aka stringcap
local buffer = {}
stringcap(capstate, buffer)
result:push(table.concat(buffer))
return 1
end
function CapKind.group.push(capstate, cap, result)
if cap.extra == nil then -- anonymous group?
return capstate:pushnestedvalues(result, false) -- add all nested values
else -- named group: add no values
capstate:nextcap()
return 0
end
end
--[[
** Back-reference capture. Return number of values pushed.
]]--
function CapKind.backref.push(capstate, cap, result)
local curr = capstate.index
local _,i = capstate:findback(cap.extra)
capstate.index = i
local n = capstate:pushnestedvalues(result, false)
capstate.index = curr + 1
return n
end
return {
CapState = CapState,
Capture = Capture,
}
end)
register('llpeg.vm', function(myrequire)
myrequire('strict')
local compat = myrequire('advent.compat')
local utf8util = myrequire('llpeg.utf8util')
local from = myrequire('llpeg.types').from
local
CHARMAX,
CapKind,
Opcode,
enum,
_ = from(myrequire('llpeg.types'), {
'CHARMAX',
'CapKind',
'Opcode',
'enum',
})
local
CapState,
Capture,
__ = from(myrequire('llpeg.cap'), {
'CapState',
'Capture',
})
local
Instruction,
___ = from(myrequire('llpeg.code'), {
'Instruction',
})
local
printcaplist,
___ = from(myrequire('llpeg.print'), {
'printcaplist',
})
local LFAIL = {}
local InsidePred = enum{
OUTPRED = 0,
INPRED = 1,
}
local Stack = {}
Stack.__index = Stack
-- Stack prev: previous entry in the stack
-- int bytePos: saved position, or NULL for calls
-- Instruction pc: saved instruction
-- int caplevel
-- int labenv -- for labeled failure
-- bool predchoice -- for labeled failure
function Stack:new(prev, bytePos, pc, caplevel, labenv, predchoice)
return setmetatable({
prev = prev,
bytePos = bytePos, pc = pc, caplevel = caplevel,
labenv = labenv, predchoice = predchoice,
}, self)
end
function Stack:__tostring()
return string.format(
"Stack{ bytePos=%d, caplevel=%d, labenv=%s, predchoice=%s }",
self.bytePos, self.caplevel, self.labenv, self.predchoice
)
end
function Stack:print()
local s = self
while s ~= nil do
print("Stack", s)
s = s.prev
end
end
local MatchResult = {}
MatchResult.__index = MatchResult
function MatchResult:new()
return setmetatable({
labelf = nil, -- failure label
sfail = -1, -- farthest failure
}, self)
end
local State = {}
State.__index = State
function State:new(source, bytePos, ...)
local giveup = Instruction:new{Opcode.Giveup}
local insidepred = InsidePred.OUTPRED -- label environment is off inside predicates
local stack = Stack:new(nil, bytePos, giveup, 0, insidepred, nil)
local cp,cpLen
if bytePos <= #source then
cp, cpLen = utf8util.codepointAt(source, bytePos)
else
cp, cpLen = nil, nil
end
assert(bytePos ~= nil)
return setmetatable({
source = source, -- the source string
bytePos = bytePos, -- current position in the string, in bytes
codepoint = cp, -- the codepoint at 'bytePos' in 'source'
codepointLen = cpLen, -- the length of the codepoint at 'bytePos'
stack = stack, -- top of stack
captures = {}, -- list of captures
captop = 1, -- point to first empty slot in captures (1-based)
extraArgs = compat.pack(...),
-- labeled failures:
insidepred = insidepred,
labelf = nil, -- failure label
sfail = -1, -- farthest failure
}, self)
end
function State:advance()
return self:resetPosTo(self.bytePos + self.codepointLen)
end
function State:resetPosTo(newPos)
assert(newPos ~= nil)
self.bytePos = newPos
local source = self.source
if newPos <= #source then
local cp, cpLen = utf8util.codepointAt(source, newPos)
self.codepoint = cp
self.codepointLen = cpLen
return cp
else
self.codepoint = nil
self.codepointLen = nil
return nil
end
end
function State:backtrack(n)
local off = utf8util.offset(self.source, -n, self.bytePos)
if off == nil then return false end -- can't backtrack that far!
self:resetPosTo(off)
return true
end
function State:updatefarthest(label)
self.labelf = label
if self.bytePos > self.sfail then
self.sfail = self.bytePos
end
end
function State:pushcapture(cap)
self.captures[self.captop] = cap
self.captop = self.captop + 1
end
function State:fail()
-- pattern failed, try to backtrack
local lastStack
repeat
lastStack = self.stack
self.stack = lastStack.prev
until lastStack.bytePos ~= nil
self:resetPosTo(lastStack.bytePos)
self.captop = lastStack.caplevel
self.insidepred = lastStack.labenv -- labeled failure
return lastStack.pc:exec(self)
end
function State:giveup()
local r = nil
local lab = "fail"
local errpos = self.sfail
if self.labelf ~= nil and self.labelf ~= LFAIL then
lab = self.labelf
end
return r, lab, errpos
end
function State:getcaptures()
local results = {}
if self.captures[1].kind == CapKind.close then -- are there any captures?
return results -- no captures
end
return CapState:new(self.captures, self.source, self.extraArgs):getcaptures()
end
function Opcode.End:exec(state)
state:pushcapture(CapKind.close:newCapture(state.bytePos, 0))
-- trim table to proper length
while #state.captures > state.captop - 1 do
table.remove(state.captures)
end
-- printcaplist(state.captures, #state.captures) -- for debugging
local results = state:getcaptures()
if #results == 0 then -- no captured values?
return state.bytePos -- return only end position
else
return compat.unpack(results)
end
end
function Opcode.Giveup:exec(state)
return state:giveup()
end
function Opcode.Ret:exec(state)
local lastStack = state.stack
state.stack = lastStack.prev
return lastStack.pc:exec(state)
end
function Opcode.Any:exec(state)
if state.codepoint ~= nil then
state:advance()
return self.next:exec(state)
end
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.TestAny:exec(state)
if state.codepoint ~= nil then
return self.next:exec(state)
else
return self.branch:exec(state)
end
end
function Opcode.UTFR:exec(state)
local cp = state.codepoint
if cp ~= nil and self.from <= cp and cp <= self.to then
state:advance()
return self.next:exec(state)
end
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.Char:exec(state)
if state.codepoint == self.aux then
state:advance()
return self.next:exec(state)
end
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.TestChar:exec(state)
if state.codepoint == self.aux then
return self.next:exec(state)
else
return self.branch:exec(state)
end
end
function Opcode.Set:exec(state)
local cp = state.codepoint
if cp ~= nil then
if cp <= CHARMAX then
if self.set[cp] then
state:advance()
return self.next:exec(state)
end
else
if self.rest then
state:advance()
return self.next:exec(state)
end
end
end
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.TestSet:exec(state)
local cp = state.codepoint
if cp ~= nil then
if cp <= CHARMAX then
if self.set[cp] then
return self.next:exec(state)
end
elseif self.rest then
return self.next:exec(state)
end
end
return self.branch:exec(state)
end
function Opcode.Behind:exec(state)
local n = self.aux
-- XXX SLOW self.aux is in *characters* not *bytes*
if state:backtrack(n) then
return self.next:exec(state)
end
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.Span:exec(state)
local cp = state.codepoint
while true do
local match = false
if cp ~= nil then
if cp <= CHARMAX then
if self.set[cp] then match = true end
else
if self.rest then match = true end
end
end
if not match then break end
cp = state:advance()
end
return self.next:exec(state)
end
function Opcode.Jmp:exec(state)
return self.branch:exec(state)
end
function Opcode.Choice:exec(state)
state.stack = Stack:new(
state.stack, state.bytePos, self.branch, state.captop, state.insidepred
)
return self.next:exec(state)
end
function Opcode.PredChoice:exec(state)
state.stack = Stack:new(
state.stack, state.bytePos, self.branch, state.captop, state.insidepred,
true -- predchoice
)
state.insidepred = InsidePred.INPRED
return self.next:exec(state)
end
function Opcode.Call:exec(state)
state.stack = Stack:new(
state.stack, nil, self.next
)
return self.branch:exec(state)
end
function Opcode.Commit:exec(state)
state.stack = state.stack.prev
return self.branch:exec(state)
end
function Opcode.PartialCommit:exec(state)
local stack = state.stack
stack.bytePos = state.bytePos
stack.caplevel = state.captop
return self.branch:exec(state)
end
function Opcode.BackCommit:exec(state)
local stack = state.stack
state.stack = stack.prev -- pop the stack
state:resetPosTo(stack.bytePos) -- but reset the position to that stored
state.insidepred = stack.labenv -- labeled failure
state.captop = stack.caplevel
return self.branch:exec(state)
end
function Opcode.Throw:exec(state)
if state.insidepred == InsidePred.OUTPRED then
state.labelf = self.key
-- pop entire stack
while state.stack.prev ~= nil do
state.stack = state.stack.prev
end
else
state.labelf = LFAIL
-- pop until you read a 'predchoice' state
while not state.stack.predchoice do
state.stack = state.stack.prev
end
end
state.sfail = state.bytePos
return state:fail()
end
function Opcode.ThrowRec:exec(state) -- with recovery
state.sfail = state.bytePos
if state.insidepred == InsidePred.OUTPRED then
state.labelf = self.key
state.stack = Stack:new(
state.stack, nil, self.next, state.captop
)
return self.branch:exec(state)
else
state.labelf = LFAIL
-- pop until you read a 'predchoice' state
while not state.stack.predchoice do
state.stack = state.stack.prev
end
return state:fail()
end
end
function Opcode.FailTwice:exec(state)
state.stack = state.stack.prev
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.Fail:exec(state)
state:updatefarthest(LFAIL)
return state:fail()
end
function Opcode.CloseRunTime:exec(state)
-- close the group
state:pushcapture(self.cap:newCapture(state.bytePos, 0))
-- trim table to proper length
while #state.captures > state.captop - 1 do
table.remove(state.captures)
end
local cs = CapState:new(state.captures, state.source, state.extraArgs)
local n, funcRet = cs:runtimecap(state.captop - 1)
state.captop = state.captop - n -- remove nested captures
-- resdyncaptures: resolve returned values in `funcRet`
-- first argument false=fail, true=keep current pos, number=next position
local firstArg = funcRet[1]
if funcRet.n == 0 then
firstArg = false -- returning void means we'll fail
end
if not firstArg then -- if it is falsey, discard rest of returned vals & fail
state:updatefarthest(LFAIL)
return state:fail() -- tail call
elseif type(firstArg) == "boolean" then
-- keep current position, nothing needs to be done
else
local npos = tonumber(firstArg)
if npos < state.bytePos or npos > (1 + #(state.source)) then
error("invalid position returned by match-time capture")
end
state:resetPosTo(npos)
end
-- push the rest of the funcRet values as new captures
local n = funcRet.n - 1 -- number of new captures
if n == 0 then -- no new captures?
state.captop = state.captop - 1 -- remove open group
else
-- new captures, keep original open group
-- add new captures + close group to 'capture' list
-- adddyncaptures:
assert(state.captures[state.captop - 1].kind == CapKind.group)
assert(state.captures[state.captop - 1]:isopencap())
-- make group capture an anonymous group (this used to hold match-time f)
state.captures[state.captop - 1].extra = nil
for i=2,funcRet.n do -- add runtime captures
state:pushcapture(CapKind.runtime:newCapture(state.bytePos, 0, funcRet[i]))
end
-- close group
state:pushcapture(CapKind.close:newCapture(state.bytePos, 0))
end
return self.next:exec(state)
end
local MAXLOP = 20
function findopen(captures, i, currPos)
i = i - 1 -- check last captop
local cap = captures[i]
if (not cap:isopencap()) and cap.bytePos == currPos then
return nil -- current one cannot be a full capture
end
-- else, look for an 'open' capture
for j=1,MAXLOP do
if cap:isopencap() then -- open capture?
return cap,i -- that's the one to be closed
elseif cap.kind == CapKind.close then
return nil -- a full capture should not nest a non-full one
end
i = i - 1
if i<1 then break end
cap = captures[i]
end
return nil -- not found within allowed search limit
end
function Opcode.CloseCapture:exec(state)
-- if possible, turn capture into a full capture
assert(state.captop > 1)
local open,_ = findopen(state.captures, state.captop, state.bytePos)
if open ~= nil then -- if possible, turn capture into a full capture
open.byteLen = state.bytePos - open.bytePos
else
-- non-nil length to mark entry as closed
state:pushcapture(self.cap:newCapture(state.bytePos, 0))
end
return self.next:exec(state)
end
function Opcode.OpenCapture:exec(state)
state:pushcapture(self.cap:newCapture(
-- byteLen = nil marks entry as open
state.bytePos, nil, self.key
))
return self.next:exec(state)
end
function Opcode.FullCapture:exec(state)
-- XXX SLOW: self.aux is in *characters* not *bytes*
local nPos = utf8util.offset(state.source, -self.aux, state.bytePos)
state:pushcapture(self.cap:newCapture(
nPos, state.bytePos - nPos, self.key
))
return self.next:exec(state)
end
function match(s, init, code, ...)
local state = State:new(s, init, ...)
return code[1]:exec(state)
end
return {
match = match,
}
end)
register('llpeg', function(myrequire)
local VERSION = '0.0.1'
local MAXSTACK = 400 -- maximum backtracking
local MAXBEHIND = 255 -- maximum look-behind
local MAXRULES = 1000 -- maximum number of rules
local LPEG_COMPAT = true
myrequire('strict')
local compat = myrequire('advent.compat')
local from = myrequire('llpeg.types').from
local
CHARMAX,
CapKind,
TTag,
define,
define_tree_visitor,
metareg,
newcharset,
numsiblings,
_ = from(myrequire('llpeg.types'), {
'CHARMAX',
'CapKind',
'TTag',
'define',
'define_tree_visitor',
'metareg',
'newcharset',
'numsiblings',
})
local
compile,
cs_diff,
cs_union,
fixedlen,
hascaptures,
nofail,
nullable,
nullable_with_grammar,
tocharset,
__ = from(myrequire('llpeg.code'), {
'compile',
'cs_diff',
'cs_union',
'fixedlen',
'hascaptures',
'nofail',
'nullable',
'nullable_with_grammar',
'tocharset',
})
local
match,
___ = from(myrequire('llpeg.vm'), {
'match',
})
local
printpatt,
printrepl,
printtree,
____ = from(myrequire('llpeg.print'), {
'printpatt',
'printrepl',
'printtree',
})
function checkint(v)
if type(v) == 'string' then
v = tonumber(v)
end
if type(v) ~= "number" then
error("not a number")
end
return math.floor(v)
end
local is_pattern = define_type_visitor{
pattern = function() return true end,
default = function() return false end,
}
local ptype = define_type_visitor{
pattern = function() return "pattern" end,
default = function(v) return type(v) end,
}
function val2str(v)
if type(v) == 'number' then return tostring(v) end
if type(v) == 'string' then return v end
return string.format("(a %s)", ptype(v))
end
--[[ lpltree.c ]]--
function newtree(tag)
local t = {
tag = tag,
code = nil,
}
setmetatable(t, metareg)
return t
end
function newleaf(tag, n)
return setmetatable({
tag = tag,
code = nil,
n = n,
}, metareg)
end
function newroot1sib(tag, sib1)
return setmetatable({
tag = tag,
code = nil,
sib1 = sib1,
}, metareg)
end
function newroot2sib(tag, sib1, sib2)
return setmetatable({
tag = tag,
code = nil,
sib1 = sib1,
sib2 = sib2,
}, metareg)
end
--[[ Build a sequence of #s nodes from the array 's' with the tag 'tag' ]]--
function fillseq(tag, s)
if type(s) == 'number' then
local len = checkint(s)
s = setmetatable({}, {__len = function() return len end})
end
if #s == 0 then
return newleaf(tag, 0)
end
local i = #s
local result = newleaf(tag, s[i])
while i > 1 do
i = i - 1
result = newroot2sib(
TTag.Seq,
newleaf(tag, s[i]),
result
)
end
return result
end
--[[ Numbers as patterns:
0 == true (always match); n == TAny repeated 'n' times;
-n == not (TAny repeated 'n' times)
]]--
function numtree(n)
n = checkint(n)
if n == 0 then
return newleaf(TTag.True)
elseif n > 0 then
return fillseq(TTag.Any, n) -- sequence of 'n' anys
else
return newroot1sib(TTag.Not, fillseq(TTag.Any, -n))
end
end
-- Convert value v to a pattern
local getpatt = define_type_visitor{
["string"] = function(s)
if #s == 0 then
return newleaf(TTag.True) -- always match if string is empty
end
local cp = {}
for _,c in compat.utf8codes(s) do
table.insert(cp, c)
end
return fillseq(TTag.Char, cp)
end,
["number"] = function(n)
return numtree(n)
end,
["boolean"] = function(b)
if b then
return newleaf(TTag.True)
else
return newleaf(TTag.False)
end
end,
["function"] = function(f)
return setmetatable({
tag = TTag.RunTime,
code = nil,
key = f,
sib1 = newleaf(TTag.True),
}, metareg)
end,
["pattern"] = function(v)
return v
end,
["table"] = function(v)
return newgrammar(v)
end,
default = function(v)
error("Not a pattern")
end,
}
-- labeled failure begin
function newthrowleaf(label)
return setmetatable({
tag = TTag.Throw,
code = nil,
sib2 = nil, -- no recovery rule associated (yet)
key = label,
}, metareg)
end
-- labeled failure end
function lp_P(v)
return getpatt(v)
end
--[[
** sequence operator; optimizations:
** false x => false, x true => x, true x => x
** (cannot do x . false => false because x may have runtime captures)
]]--
function lp_seq(tree1, tree2)
tree1 = getpatt(tree1)
tree2 = getpatt(tree2)
if tree1.tag == TTag.False or tree2.tag == TTag.True then
-- false . x = false, x . true = x
return tree1
elseif tree1.tag == TTag.True then
-- true . x = x
return tree2
else
return newroot2sib(TTag.Seq, tree1, tree2)
end
end
--[[
** choice operator; optimizations:
** charset / charset => charset
** true / x => true, x / false => x, false / x => x
** (x / true is not equivalent to true)
]]--
function lp_choice(t1, t2)
t1 = getpatt(t1)
t2 = getpatt(t2)
local t1c = tocharset(t1)
local t2c = tocharset(t2)
if t1c ~= nil and t2c ~= nil then
local t = cs_union(t1c, t2c)
return t
elseif nofail(t1) or t2.tag == TTag.False then
-- true / x => true, x / false => x
return t1
elseif t1.tag == TTag.False then
-- false / x => x
return t2
else
return newroot2sib(TTag.Choice, t1, t2)
end
end
--[[
p^n
]]--
function lp_star(p, n)
local tree1 = getpatt(p)
n = checkint(n)
if n >= 0 then -- seq tree1 (seq tree1 ... (seq tree1 (rep tree1)))
if nullable(tree1) then
error("loop body may accept empty string")
end
local tree = newroot1sib(TTag.Rep, tree1)
while n > 0 do
tree = newroot2sib(TTag.Seq, tree1, tree)
n = n - 1
end
return tree
else -- choice (seq tree1 ... choice tree1 true ...) true
n = -n
local tree = newroot2sib( -- at most 1
TTag.Choice,
tree1,
newleaf(TTag.True)
)
while n > 1 do
tree = newroot2sib( -- at most (n-1)
TTag.Seq,
tree1,
tree
)
tree = newroot2sib(TTag.Choice, tree, newleaf(TTag.True))
n = n - 1
end
return tree
end
end
--[[
** #p == &p
]]--
function lp_and(v)
return newroot1sib(TTag.And, getpatt(v))
end
--[[
** -p == !p
]]--
function lp_not(v)
return newroot1sib(TTag.Not, getpatt(v))
end
--[[
** [t1 - t2] == Seq (Not t2) t1
** If t1 and t2 are charsets, make their difference.
]]--
function lp_sub(t1, t2)
t1 = getpatt(t1)
t2 = getpatt(t2)
local t1c = tocharset(t1)
local t2c = tocharset(t2)
if t1c ~= nil and t2c ~= nil then
return cs_diff(t1c, t2c)
else
return newroot2sib(
TTag.Seq,
newroot1sib(TTag.Not, t2),
t1
)
end
end
--[[
A set with the given characters
]]--
function lp_set(s)
local t = newcharset()
local extra = nil
for _,c in compat.utf8codes(s) do
if c > CHARMAX then
-- non ascii, we can't use charset for these
local one = newleaf(TTag.Char, c)
if extra == nil then
extra = one
else
extra = newroot2sib(TTag.Choice, extra, one)
end
else
t.set[c] = true
end
end
if extra == nil then
return t
else
return newroot2sib(TTag.Choice, t, extra)
end
end
function lp_range(...)
local t = newcharset()
local extra = nil
for _,v in ipairs{...} do
if type(v) ~= "string" then
error("argument must be string")
else
local first, second
for _,c in compat.utf8codes(v) do
if first == nil then
first = c
elseif second == nil then
second = c
else
error("range must have two characters")
end
end
if first == nil or second == nil then
error("range must have two characters")
end
if first > second then
if LPEG_COMPAT then
-- ignore, just silently create an empty range
else
error("empty range")
end
elseif second <= CHARMAX then -- ascii range
for i = first, second do
t.set[i] = true
end
else
local r = lp_utfr(first, second)
if extra == nil then
extra = r
else
extra = newroot2sib(TTag.Choice, extra, one)
end
end
end
end
if extra == nil then
return t
else
return newroot2sib(TTag.Choice, t, extra)
end
end
function lp_utfr(from, to)
from = checkint(from)
to = checkint(to)
if from > to then
error("empty range")
end
if to > 0x10FFFF then
error("invalid code point")
end
if to <= CHARMAX then -- ascii range?
local t = newcharset() -- code it as a regular charset
for i = from, to do
t.set[i] = true
end
return t
end
-- multibyte utf-8 range
return setmetatable({
tag = TTag.UTFR,
code = nil,
from = from,
to = to,
}, metareg)
end
--[[
Look-behind predicate
]]--
function lp_behind(v)
local tree1 = getpatt(v)
local n = fixedlen(tree1)
if n < 0 then
error("pattern may not have fixed length")
end
if hascaptures(tree1) then
error("pattern has captures")
end
if n > MAXBEHIND then
error("pattern too long to look behind")
end
return setmetatable({
tag = TTag.Behind,
code = nil,
sib1 = tree1,
n = n,
}, metareg)
end
--[[ labeled failure begin ]]--
--[[
** Throws a label
]]--
local lp_throw = define_type_visitor{
[{"string","number"}] = newthrowleaf,
default = function() error("not a string") end,
}
--[[ labeled failure end ]]--
--[[
** Create a non-terminal
]]--
function lp_V(v)
if v == nil then
error("non-nil value expected")
end
return setmetatable({
tag = TTag.Call,
code = nil,
key = v,
}, metareg)
end
--[[
** Create a tree for a non-empty capture, with a body and
** optionally with an associated Lua value (at index 'labelidx' in the
** stack)
]]--
function capture_aux(capkind, patt, val)
local t = newroot1sib(TTag.Capture, getpatt(patt))
t.cap = capkind
t.key = val
return t
end
function newemptycap(capkind, val)
return capture_aux(capkind, newleaf(TTag.True), val)
end
--[[
** Captures with syntax p / v
** (function capture, query capture, string capture, or number capture)
]]--
local divcapture_helper = define_type_visitor{
["function"] = function(v, p)
return capture_aux(CapKind["function"], p, v)
end,
["table"] = function(v, p)
return capture_aux(CapKind.query, p, v)
end,
["string"] = function(v, p)
return capture_aux(CapKind.string, p, v)
end,
["number"] = function(v, p)
v = checkint(v)
if v < 0 or v > 65536 then
error("invalid number")
end
return capture_aux(CapKind.num, p, v)
end,
default = function(v)
error("unexpected "..ptype(v).." as 2nd operand to LPeg '/'") end,
}
function lp_divcapture(p, v)
return divcapture_helper(v, p) -- dispatch on v
end
function lp_acccapture(p, v)
return capture_aux(CapKind.acc, p, v)
end
-- the match for patt with the values from nested captures replacing their
-- matches
function lp_substcapture(patt)
return capture_aux(CapKind.subst, patt)
end
-- a table with all captures from patt
function lp_tablecapture(patt)
return capture_aux(CapKind.table, patt)
end
-- the values produced by patt, optionally tagged with key
function lp_groupcapture(patt, key)
-- key can be nil
return capture_aux(CapKind.group, patt, key)
end
-- folding capture (deprecated)
function lp_foldcapture(patt, func)
if type(func) ~= "function" then
error("Bad function argument")
end
return capture_aux(CapKind.fold, patt, func)
end
-- the match for patt plus all captures made by patt
function lp_simplecapture(patt)
return capture_aux(CapKind.simple, patt)
end
-- the current position (matches the empty string)
function lp_poscapture()
return newemptycap(CapKind.position)
end
-- the value of the nth extra argument to lpeg.match (matches the empty string)
function lp_argcapture(n)
n = checkint(n)
if n <= 0 or n > 65536 then error("invalid argument index") end
return newemptycap(CapKind.arg, n)
end
-- the value produced by the previous group capture named `key`
-- (matches the empty string)
function lp_backref(key)
return newemptycap(CapKind.backref, key)
end
-- Constant capture (matches the empty string)
function lp_constcapture(...)
local arg = compat.pack(...)
if arg.n == 0 then -- no values?
return newleaf(TTag.True) -- no capture
else
local i = arg.n
local tree = newemptycap(CapKind.const, arg[i])
while i > 1 do
i = i - 1
tree = newroot2sib(
TTag.Seq,
newemptycap(CapKind.const, arg[i]),
tree
)
end
if arg.n == 1 then
-- single constant capture
return tree
else
-- create a group capture with all values
return lp_groupcapture( tree )
end
end
end
-- the returns of function applied to the captures of patt; the application
-- is done at match time
function lp_matchtime(patt, func)
if type(func) ~= 'function' then
error('not a function')
end
return setmetatable({
tag = TTag.RunTime,
code = nil,
key = func,
sib1 = getpatt(patt),
}, metareg)
end
--[[======================================================]]--
--[[
** ======================================================
** Grammar - Tree generation
** ======================================================
]]--
--[[
** push on the stack the index and the pattern for the
** initial rule of grammar at index 'arg' in the stack;
** also add that index into position table.
]]--
function getfirstrule(tbl)
local first_name, first_rule
first_name = tbl[1]
-- is this the name of an initial rule?
if type(first_name) == 'number' or type(first_name) == 'string' then
first_rule = tbl[first_name] -- get associated rule
else
first_name,first_rule = 1,first_name
end
if not is_pattern(first_rule) then
if first_rule == nil then
error("grammar has no initial rule")
else
error(string.format("initial rule '%s' is not a pattern", first_name))
end
end
-- rule position (after TGrammar)
-- return map from name to position, and from position to name
return { [first_name] = 1 }, { first_name }
end
--[[
** traverse grammar at index 'arg', pushing all its keys and patterns
** into the stack. Create a new table (before all pairs key-pattern) to
** collect all keys and their associated positions in the final tree
** (the "position table").
** Return the number of rules and (in 'totalsize') the total size
** for the new tree.
]]--
function collectrules(tbl)
-- find the first rule and put in position table
local postab, rpostab = getfirstrule(tbl)
-- collect and sort rule names (for repeatability)
local names = {}
for k,v in pairs(tbl) do
if k == 1 or postab[k] == 1 then -- initial rule?
-- skip the initial rules, it's already in the position table
else
table.insert(names, k)
end
end
table.sort(names, function(a,b)
return tostring(a) < tostring(b)
end)
-- fill out rule, name, and position maps
for _,k in ipairs(names) do
local v = tbl[k]
if not is_pattern(v) then
error("rule '" .. val2str(k) .. "' is not a pattern")
end
table.insert(rpostab, k)
postab[k] = #rpostab
end
return postab, rpostab
end
function buildgrammar(g, tbl, postab, rpostab)
local trees = {}
for i,name in ipairs(rpostab) do
local rule = setmetatable({
tag = TTag.Rule,
code = nil,
key = nil, -- will be fixed when rule is used
n = i, -- rule number
name = name,
sib1 = tbl[name], -- pattern
sib2 = nil,
}, metareg)
table.insert(trees, rule)
g.ruletab[name] = rule
end
-- link up siblings
for i = 1, #trees-1 do
trees[i].sib2 = trees[i+1]
end
trees[#trees].sib2 = newleaf(TTag.True) -- finish list of rules
g.sib1 = trees[1]
end
--[[
** Check whether a tree has potential infinite loops
]]--
function checkloops(grammar, tree)
local n = numsiblings[tree.tag]
if tree.tag == TTag.Rep and nullable_with_grammar(tree.sib1, grammar) then
return true
elseif tree.tag == TTag.Grammar then
return false -- sub-grammars already checked
elseif n == 1 then
return checkloops(grammar, tree.sib1) -- tail call
elseif n == 2 then
if checkloops(grammar, tree.sib1) then
return true
else
return checkloops(grammar, tree.sib2) -- tail call
end
elseif n == 0 then
return false
else
error("surprising number of siblings")
end
end
--[[
** Give appropriate error message for 'verifyrule'. If a rule appears
** twice in 'passed', there is path from it back to itself without
** advancing the subject.
]]--
function verifyerror(grammar, passed, npassed)
local i, j
for i = npassed,1,-1 do -- search for a repetition
for j = i-1,1,-1 do
if passed[i] == passed[j] then
error(string.format("rule '%s' may be left recursive", val2str(passed[i])))
end
end
end
error("too many left calls in grammar")
end
--[[
** Check whether a rule can be left recursive; raise an error in that
** case; otherwise return 1 iff pattern is nullable.
** The return value is used to check sequences, where the second pattern
** is only relevant if the first is nullable.
** Parameter 'nb' works as an accumulator, to allow tail calls in
** choices. ('nb' true makes function returns true.)
** Parameter 'passed' is a list of already visited rules, 'npassed'
** counts the elements in 'passed'.
** Assume ktable at the top of the stack.
]]--
local verifyrule
verifyrule = define_tree_visitor{
[{
TTag.Char, TTag.Set, TTag.Any, TTag.False, TTag.UTFR,
TTag.Throw, -- labeled failure
}] = function(tree, g, passed, n, nb)
return nb -- cannot pass from here
end,
[{
TTag.True, TTag.Behind, -- look-behind cannot have calls
}] = function(tree, g, passed, n, nb)
return true
end,
[{ TTag.Not, TTag.And, TTag.Rep, }] = function(tree, g, passed, n, nb)
return verifyrule(tree.sib1, g, passed, n, true) -- tail call
end,
[{ TTag.Capture, TTag.RunTime, TTag.XInfo, }] = function(tree, g, passed, n, nb)
return verifyrule(tree.sib1, g, passed, n, nb) -- tail call
end,
[ TTag.Call ] = function(tree, g, passed, n, nb)
local rule = g.ruletab[tree.key] -- look up rule
return verifyrule(rule, g, passed, n, nb) -- tail call
end,
[ TTag.Seq ] = function(tree, g, passed, n, nb)
-- only check 2nd child if first is nb
if not verifyrule(tree.sib1, g, passed, n, false) then
return nb
else
-- note that we don't propagate new npassed from 1st child
return verifyrule(tree.sib2, g, passed, n, nb) -- tail call
end
end,
[ TTag.Choice ] = function(tree, g, passed, n, nb)
-- must check both children
nb = verifyrule(tree.sib1, g, passed, n, nb)
-- note that we don't propagate new npassed from 1st child
return verifyrule(tree.sib2, g, passed, n, nb) -- tail call
end,
[ TTag.Rule ] = function(tree, g, passed, n, nb)
if n >= MAXRULES then -- too many steps?
return verifyerror(g, passed, n) -- error
else
passed[n+1] = tree.key -- add rule to path
return verifyrule(tree.sib1, g, passed, n + 1, nb) -- tail call
end
end,
[ TTag.Grammar ] = function(tree, g, passed, n, nb)
return nullable(tree) -- sub-grammar cannot be left recursive
end,
}
function verifygrammar(grammar)
local passed = {}
-- check left-recursive rules
local rule = grammar.sib1
while rule.tag == TTag.Rule do
if rule.key ~= nil then -- skip unused rules
verifyrule(rule.sib1, grammar, passed, 0, false)
end
rule = rule.sib2
end
if rule.tag ~= TTag.True then
error("assertion failure")
end
-- check infinite loops inside rules
rule = grammar.sib1
while rule.tag == TTag.Rule do
if rule.key ~= nil then -- skip unused rules
if checkloops(grammar, rule.sib1) then
error("empty loop in rule '" .. val2str(rule.name) .. "'")
end
end
rule = rule.sib2
end
if rule.tag ~= TTag.True then
error("assertion failure")
end
end
--[[
** Fix a TOpenCall into a TCall node, using table 'postable' to
** translate a key to its rule address in the tree. Raises an
** error if key does not exist.
]]--
function fixonecall(g, t, postab)
local name = t.key
local rule = g.ruletab[name]
if t.tag == TTag.Call then
if rule == nil then
error(string.format("rule '%s' undefined in given grammar", val2str(name)))
end
-- unlike our upstream, we don't clone patterns when we build a grammar
-- so we can't mutate this tree w/o possibly breaking any other grammars
-- which might hold an alias of this call. So we don't distinguish
-- Call and OpenCall and we don't mutate the tag here and
-- don't link it up. However, we can mutate the Rule
-- as those are not shared
rule.key = name -- mark this as used
elseif rule ~= nil then -- TTag.Throw
-- As before, we can't mutate the tree
rule.key = name -- mark this as used
end
end
--[[
** Transform left associative constructions into right
** associative ones, for sequence and choice; that is:
** (t11 + t12) + t2 => t11 + (t12 + t2)
** (t11 * t12) * t2 => t11 * (t12 * t2)
** (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2))
]]--
function correctassociativity (tree)
local tag = tree.tag
if tag ~= TTag.Choice and tag ~= TTag.Seq then
error("impossible")
end
local t1 = tree.sib1
while t1.tag == tree.tag do
local t11, t12 = t1.sib1, t1.sib2
local t2 = tree.sib2
-- don't mutate t1 in place as others may be keeping copies of it;
-- mutating 'tree' in place is okay as we're not changing its semantics
tree.sib1 = t11
tree.sib2 = newroot2sib(tag, t12, t2)
t1 = tree.sib1
end
return tree
end
--[[
** Make final adjustments in a tree. Fix open calls in tree 't',
** making them refer to their respective rules or raising appropriate
** errors (if not inside a grammar). Correct associativity of associative
** constructions (making them right associative). Assume that tree's
** ktable is at the top of the stack (for error messages).
]]--
local finalfix_helper = define_tree_visitor{
[TTag.Grammar] = function(t)
return t -- subgrammars were already fixed
end,
[TTag.Call] = function(t, g, postab)
if g == nil then
error("rule '" .. val2str(t.key) .. "' used outside a grammar")
else
return fixonecall(g, t, postab)
end
end,
[TTag.Throw] = function(t, g, postab)
if g ~= nil then
return fixonecall(g, t, postab)
end
end,
[{TTag.Seq, TTag.Choice}] = function(t, g, postab)
return correctassociativity(t)
end,
default = function(t) return t end,
}
function finalfix(g, t, postab)
finalfix_helper(t, g, postab)
if t.tag == TTag.Grammar then return end
local n = numsiblings[t.tag]
if n == 1 then
return finalfix(g, t.sib1, postab) -- tail call
elseif n == 2 then
finalfix(g, t.sib1, postab)
return finalfix(g, t.sib2, postab) -- tail call
elseif n == 0 then
return
else
error("strange number of siblings")
end
end
--[[
** Give a name for the initial rule if it is not referenced
]]--
function initialrulename(grammar)
if grammar.sib1.key == nil then -- initial rule is not referenced?
grammar.sib1.key = grammar.sib1.name
end
end
function newgrammar(tbl)
local postab, rpostab = collectrules(tbl)
local g = setmetatable({
tag = TTag.Grammar,
code = nil,
sib1 = nil, -- will fill this in
n = #rpostab, -- number of rules
ruletab = {}, -- map rule names to rules
}, metareg)
buildgrammar(g, tbl, postab, rpostab)
finalfix(g, g.sib1, postab)
initialrulename(g)
verifygrammar(g)
return g
end
--[[ ====================================================== ]]--
function prepcompile(p)
finalfix(nil, p, {}) -- correct associativity
return compile(p)
end
function lp_printtree(patt, c)
local tree = getpatt(patt)
if c then
finalfix(nil, tree, {}) -- correct associativity
end
print("[]") -- for compatibility, this is a fake 'ktable'
io.write(table.concat(printtree(tree, 0, {})))
end
function lp_printcode(patt)
local p = getpatt(patt)
if p.code == nil then
prepcompile(p)
end
print("[]") -- for compatibility, this is a fake 'ktable'
io.write(table.concat(printpatt(p.code, {})))
end
--[[
** Get the initial position for the match, interpreting negative
** values from the end of the subject. Result is 1-based.
]]--
function initposition(ii, len)
if ii > 0 then -- positive index?
if ii <= len then -- inside the string?
return ii -- return it (no correction to 0-base)
else
return len + 1 -- crop at the end
end
else -- negative index
if (-ii) <= len then -- inside the string?
return len + 1 - (-ii) -- return position from the end
else
return 1
end
end
end
-- Main match function
function lp_match(pattern, subject, init, ...)
local p = getpatt(pattern)
if p.code == nil then prepcompile(p) end
local code = p.code
if type(subject) ~= 'string' then error("subject is not a string") end
local i
if init == nil then
i = 1
else
i = initposition(checkint(init), #subject)
end
return match(subject, i, code, ...)
end
--[[
** ======================================================
** Library creation and functions not related to matching
** ======================================================
]]--
function lp_setmax(lim)
lim = 0 + lim -- convert to integer
if lim <= 0 then
error("out of range")
end
MAXSTACK = lim
end
local lp_type = define_type_visitor{
pattern = function() return "pattern" end,
default = function() return nil end,
}
function lp_gc(p)
p._code = nil
end
function createcat(charspec)
local t = newcharset()
for i=0,CHARMAX do -- XXX not unicode safe
local s = compat.utf8char(i)
if s:find(charspec) ~= nil then
t.set[i] = true
end
end
return t
end
function lp_locale(tbl)
if tbl == nil then
tbl = {}
end
tbl.alnum = createcat("%w")
tbl.alpha = createcat("%a")
tbl.cntrl = createcat("%c")
tbl.digit = createcat("%d")
tbl.graph = createcat("[%p%w]") -- printable except space
tbl.lower = createcat("%l")
tbl.print = createcat("%C") -- printable = "not a control character"?
tbl.punct = createcat("%p") -- "printable but not space or alnum
tbl.space = createcat("%s")
tbl.upper = createcat("%u")
tbl.xdigit = createcat("%x")
return tbl
end
--[[ lpltree.c ]]--
metareg.__mul = lp_seq
metareg.__add = lp_choice
metareg.__pow = lp_star
metareg.__gc = lp_gc
metareg.__len = lp_and
metareg.__div = lp_divcapture
metareg.__mod = lp_acccapture
metareg.__unm = lp_not
metareg.__sub = lp_sub
metareg.__tostring = printrepl
local pattreg = {
ptree = lp_printtree,
pcode = lp_printcode,
match = lp_match,
B = lp_behind,
V = lp_V,
C = lp_simplecapture,
Cc = lp_constcapture,
Cmt = lp_matchtime,
Cb = lp_backref,
Carg = lp_argcapture,
Cp = lp_poscapture,
Cs = lp_substcapture,
Ct = lp_tablecapture,
Cf = lp_foldcapture,
Cg = lp_groupcapture,
P = lp_P,
S = lp_set,
R = lp_range,
utfR = lp_utfr,
locale = lp_locale,
version = "LLPegLabel " .. VERSION,
setmaxstack = lp_setmax,
type = lp_type,
T = lp_throw, -- labeled failure throw
}
metareg.__index = pattreg
return pattreg
end)
local modules = {}
modules['bit32'] = require('bit32')
modules['string'] = require('string')
modules['strict'] = {}
modules['table'] = require('table')
local function myrequire(name)
if modules[name] == nil then
modules[name] = true
modules[name] = (builders[name])(myrequire)
end
return modules[name]
end
return myrequire('llpeg')
end)()