-- Module:User:Theknightwho/regex

-- Overview:
-- 1. Parse the pattern and build a finite-state machine (FSM).
-- 2. Parse the string using the FSM, iterating over each position.

local codepoint = require("Module:string utilities").codepoint local setmetatable = setmetatable

local m_parser = require("Module:parser")

-- `new` has to be called: its first result is used below as a class-like
-- table (methods are attached to it and `Parser:parse` is invoked on it),
-- which a bare function value could not support. The second result is kept
-- as `Node`; it is not referenced in the visible part of this module.
local Parser, Node = m_parser.new()

-- Lookup table consulted by handle_percent for the character that follows `%`
-- inside a character set. Nothing in the visible code populates it yet.
local charsets = {}

--- Compiled pattern: an array of state functions plus a cursor (`head`).
-- Each state is called as `state(pattern, ch, head)` and returns two flags,
-- `str_adv, pat_adv`: whether to step to the next input character and
-- whether to step to the next state. Both falsy means the match failed.
local Pattern = {}
Pattern.__index = Pattern

--- Turn an array of state functions into a Pattern object.
-- @param t table Array of state functions. Modified in place: it receives a
--   `len` field (its array length) and the Pattern metatable.
-- @return table The same table `t`.
function Pattern:new(t)
	t.len = #t
	return setmetatable(t, Pattern)
end

--- Run the state machine over `str`, starting at its first character.
-- The cursor is stored in `self.head`, so a Pattern object carries state
-- between and during calls.
-- @param str string Text to test.
-- @return boolean `true` once every state has been passed, `false` as soon
--   as a state neither consumes input nor lets the pattern advance.
function Pattern:match(str)
	str = {codepoint(str, 1, -1)} -- Temporary: input will be codepoint array.
	local i_str, len = 1, self.len + 1
	self.head = 1
	-- A `while` guard (rather than repeat/until) lets an empty pattern succeed
	-- immediately instead of calling a nil state.
	while self.head < len do
		local ch, head = str[i_str], self.head
		-- Must be locals: these were previously leaked as globals.
		local str_adv, pat_adv = self[head](self, ch, head)
		if str_adv then
			i_str = i_str + 1
			if pat_adv then
				self.head = self.head + 1
			end
		elseif pat_adv then
			-- Re-read self.head rather than using `head`: a state may have
			-- moved the cursor itself.
			self.head = self.head + 1
		else
			return false
		end
	end
	return true
end

-- Table of public functions; returned at the end of the module.
local export = {}

-- Character-set parsing (`[...]`, with an optional suffix `*`, `+`, `-`, `?`).
-- The handlers below are installed one after another on the current parser
-- layer (`self[-1].handler`) as the set is read. Literal members are recorded
-- in the `ind` lookup table; ranges are stored as predicate functions in the
-- array part of the sublayer.
do
	-- Forward declarations: the handlers refer to one another.
	local do_charset
	local handle_start
	local handle_charset
	local handle_percent
	local handle_range
	local handle_suffix -- was missing, which made handle_suffix a global

	--- Route entry point: open a sublayer for the set and step past `[`.
	function do_charset(self)
		self:push_sublayer()
		-- rawset bypasses any metamethods on the layer.
		rawset(self[-1], "ind", {})
		self:advance()
	end

	--- Handle the first character after `[`.
	-- A leading `]` is a literal member and a leading `^` inverts the set;
	-- anything else is re-dispatched to handle_charset.
	function handle_start(self, this)
		self[-1].handler = handle_charset
		if this == 0x5D then -- ]
			self[-1].ind[this] = true
		elseif this == 0x5E then -- ^
			self[-1].inverse = true
		else
			return self:consume()
		end
	end

	--- Handle an ordinary position inside the set.
	-- NOTE(review): end of input is not handled here; confirm that the parser
	-- framework reports an unterminated set on its own.
	function handle_charset(self, this)
		if this == 0x25 then -- %
			self[-1].handler = handle_percent
		elseif this == 0x5D then -- ]
			self[-1].handler = handle_suffix
		elseif self:read(1) == 0x2D then -- -
			-- `this` opens a range: remember it and skip over the hyphen.
			self[-1].start = this
			self:advance(1)
			self[-1].handler = handle_range
		else
			self[-1].ind[this] = true
		end
	end

	--- Handle the character following `%`.
	function handle_percent(self, this)
		if charsets[this] then
			-- TODO
		else
			self[-1].ind[this] = true
		end
		-- The escape covers a single character. Without handing control back,
		-- every later character (including the closing `]`) was treated as
		-- escaped and the set could never end.
		self[-1].handler = handle_charset
	end

	--- Handle the character following `x-`.
	function handle_range(self, this)
		local start = self[-1].start
		if this == 0x5D then -- ]
			-- `x-]`: no range after all; both `x` and `-` are literal members.
			self[-1].ind[start] = true
			self[-1].ind[0x2D] = true -- -
			-- NOTE(review): this pops directly instead of passing through
			-- handle_suffix, so no state is emitted for the set; confirm
			-- whether that is intended.
			return self:pop()
		elseif this >= start then
			-- handle_suffix calls these predicates with the character as the
			-- only argument, so they must not take a leading `self`.
			self:emit(function(ch)
				return ch >= start and ch <= this
			end)
		end
		self[-1].handler = handle_charset
	end

	--- Handle the character after the closing `]`.
	-- Pops the sublayer, builds a membership test from it and emits the
	-- state(s) matching the suffix. States return `str_adv, pat_adv`.
	function handle_suffix(self, this)
		local chars = self:pop_sublayer()
		local inverse = chars.inverse
		local ind = chars.ind
		local function test(ch)
			if not ch then
				return false
			elseif ind[ch] then
				return not inverse
			end
			-- Falls back to the array length in case the sublayer carries no
			-- `len` field (presumably only Pattern:new sets one; verify).
			for i = 1, chars.len or #chars do
				if chars[i](ch) then
					return not inverse
				end
			end
			return inverse
		end
		if this == 0x2A then -- *
			self:emit(function(_, ch)
				local is = test(ch)
				return is, not is
			end)
		elseif this == 0x2B then -- +
			-- One mandatory occurrence, then the same state as `*`.
			self:emit(function(_, ch)
				local is = test(ch)
				return is, is
			end)
			self:emit(function(_, ch)
				local is = test(ch)
				return is, not is
			end)
		elseif this == 0x2D then -- -
			-- Not implemented yet: nothing is emitted for this suffix.
		elseif this == 0x3F then -- ?
			self:emit(function(_, ch)
				return test(ch), true
			end)
		else
			self:emit(function(_, ch)
				local is = test(ch)
				return is, is
			end)
			-- `this` was not a suffix: step back so that it is read again.
			self:advance(-1)
		end
		return self:pop()
	end

	--- Parse a character set at the current position and emit its states
	-- into the enclosing layer.
	-- Raises an error if the route comes back as the layer's `bad_route`.
	function Parser:parse_charset()
		local charset = self:get(handle_start, do_charset)
		if charset == self[-1].bad_route then
			error("Missing close-bracket for character set beginning at pattern character " .. self.head .. ".")
		end
		self:emit_tokens(charset)
	end
end

-- Top-level pattern parsing and the public entry point.
do
	--- Handler for one pattern character outside any character set.
	-- Emits state functions that Pattern:match later calls as
	-- `state(pattern, ch, head)`; each returns `str_adv, pat_adv`.
	-- @param this Codepoint at the current position, or "" at the end of the
	--   pattern.
	local function main_handler(self, this)
		-- The empty branches are placeholders: those characters currently
		-- fall through and are treated as literals below.
		if this == 0x24 then -- $
		elseif this == 0x25 then -- %
		elseif this == 0x28 then -- (
		elseif this == 0x29 then -- )
		elseif this == 0x2E then -- .
			-- Any character; fails only when the input is exhausted.
			return self:emit(function(_, ch)
				local is = ch and true
				return is, is
			end)
		elseif this == 0x5B then -- [
			return self:parse_charset()
		elseif this == 0x5D then -- ]
		elseif this == "" then
			return self:pop()
		end
		-- Literal character, optionally followed by a quantifier.
		local nxt = self:read(1)
		if nxt == 0x2A then -- *
			self:emit(function(_, ch)
				local is = ch == this
				return is, not is
			end)
			self:advance()
		elseif nxt == 0x2B then -- +
			-- One mandatory occurrence, then the same state as `*`.
			self:emit(function(_, ch)
				local is = ch == this
				return is, is
			end)
			self:emit(function(_, ch)
				local is = ch == this
				return is, not is
			end)
			self:advance()
		elseif nxt == 0x2D then -- -
			-- Lazy repetition: try the following state first, and consume
			-- `this` only if that state makes no progress.
			self:emit(function(pattern, ch, head)
				head = head + 1
				local test = pattern[head]
				if not test then
					-- No following state: leave this one without consuming.
					return false, true
				end
				local str_adv, pat_adv = test(pattern, ch, head)
				if str_adv or pat_adv then
					-- The following state ran on our behalf, so move the
					-- cursor onto it before reporting its result.
					pattern.head = pattern.head + 1
					return str_adv, pat_adv
				end
				return ch == this, false
			end)
			self:advance()
		elseif nxt == 0x3F then -- ?
			self:emit(function(_, ch)
				return ch == this, true
			end)
			self:advance()
		else
			self:emit(function(_, ch)
				local is = ch == this
				return is, is
			end)
		end
	end

	--- Route named in export.parse: installs main_handler on the current layer.
	function Parser:do_parse()
		self[-1].handler = main_handler
	end

	--- Compile a pattern string.
	-- @param text string Pattern source; converted to an array of codepoints
	--   before being handed to the parser.
	-- @return The second value returned by `Parser:parse` (presumably the
	--   Pattern node built from `node = {Pattern}`; confirm against
	--   Module:parser).
	function export.parse(text)
		text = {codepoint(text, 1, -1)}
		-- The outer parentheses truncate the result to exactly one value.
		return (select(2, Parser:parse{
			text = text,
			node = {Pattern},
			route = {"do_parse"}
		}))
	end
end

return export