-- Module:User:Theknightwho/parser/tokenizer

local error = error
local ipairs = ipairs
local require = require
local setmetatable = setmetatable
local type = type

local math = math
local floor = math.floor
local min = math.min

local string = string
local find = string.find
local lower = string.lower
local rep = string.rep
local sub = string.sub

local table = table
local concat = table.concat
local insert = table.insert
local remove = table.remove

-- Contexts.
local contextualizer = require("Module:User:Theknightwho/parser/contextualizer")

local has = contextualizer.has
local unset = contextualizer.unset
local safe_set = contextualizer.safe_set

local TEMPLATE = 2
local TEMPLATE_PARAM_KEY = 3
local ARGUMENT = 5
local LANGUAGE_BLOCK = 7
local WIKILINK = 11
local HEADING = 13
local FAIL_ON_END = TEMPLATE * ARGUMENT
local COMPLETE_ON_END = WIKILINK * LANGUAGE_BLOCK
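-- A worked example of the context encoding (a hedged sketch: it assumes the
-- contextualizer treats a context as a product of distinct prime flags, which
-- the composite masks above suggest; only has, unset and safe_set are relied
-- on in this module):
--   safe_set(TEMPLATE, TEMPLATE_PARAM_KEY)  --> 6  (both flags set)
--   unset(6, TEMPLATE_PARAM_KEY)            --> 2  (back to TEMPLATE only)
--   has(6, FAIL_ON_END)                     --> true (6 and 10 share factor 2)
-- Because each flag is a distinct prime, a composite like FAIL_ON_END can test
-- for "TEMPLATE or ARGUMENT" in a single has() call.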

-- Tokens.
local Tokens = require("Module:User:Theknightwho/parser/tokens")

local Text = Tokens.Text

local TemplateOpen = Tokens.TemplateOpen
local TemplateParamSeparator = Tokens.TemplateParamSeparator
local TemplateParamEquals = Tokens.TemplateParamEquals
local TemplateClose = Tokens.TemplateClose

local ArgumentOpen = Tokens.ArgumentOpen
local ArgumentSeparator = Tokens.ArgumentSeparator
local ArgumentClose = Tokens.ArgumentClose

local HeadingStart = Tokens.HeadingStart
local HeadingStarts = {}
for i = 1, 6 do
	HeadingStarts[i] = HeadingStart:new{i}
end
local HeadingEnd = Tokens.HeadingEnd

local ParserTagOpen = Tokens.ParserTagOpen
local ParserTagClose = Tokens.ParserTagClose
local ParserTagSelfClose = Tokens.ParserTagSelfClose
local Ignored = Tokens.Ignored

-- Returned by a bad route.
local BadRoute = {}

-- Constants.
local START = {}
local END = {}

local BEFORE_HEADING = {
	["\n"] = true,
	[START] = true
}

local CHUNKS = {
	["\t"] = true, ["\n"] = true, ["\v"] = true, ["\f"] = true, ["\r"] = true,
	[" "] = true, ["!"] = true, ["-"] = true, ["/"] = true, ["<"] = true,
	["="] = true, [">"] = true, ["["] = true, ["]"] = true, ["{"] = true,
	["|"] = true, ["}"] = true, [START] = true, [END] = true
}

local SPACES_TABS = {
	["\t"] = true,
	[" "] = true
}

local TAGS = {
	categorytree = true, ce = true, charinsert = true, chem = true,
	dynamicpagelist = true, gallery = true, graph = true, hiero = true,
	imagemap = true, indicator = true, inputbox = true, langconvert = true,
	mapframe = true, maplink = true, math = true, nowiki = true, poem = true,
	pre = true, ref = true, references = true, score = true, section = true,
	source = true, syntaxhighlight = true, talkpage = true, templatedata = true,
	templatestyles = true, thread = true, timeline = true
}

local WHITESPACE = {
	["\t"] = true, ["\n"] = true, ["\v"] = true, ["\f"] = true, ["\r"] = true,
	[" "] = true
}

local Tokenizer = {}
Tokenizer.__index = Tokenizer

function Tokenizer:new()
	local p = setmetatable({
		stack_len = 0
	}, Tokenizer)
	p.stack = setmetatable({}, {
		__index = function(t, k)
			return t[p.stack_len][k]
		end,
		__newindex = function(t, k, v)
			t[p.stack_len][k] = v
		end
	})
	return p
end
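-- The metatable above makes `p.stack` a proxy for its topmost layer: string
-- keys are redirected to the layer at index `p.stack_len`, while numeric keys
-- are managed raw by table.insert/table.remove (which bypass metamethods in
-- Lua 5.1). A hedged sketch of the effect:
--   local tk = Tokenizer:new()
--   tk:push()                      -- raw: stack[1] = { tokens = {}, ... }
--   tk.stack.context = TEMPLATE    -- proxied: writes stack[1].context
--   tk:push(HEADING)               -- raw: stack[2] = { context = 13, ... }
--   assert(tk.stack.context == HEADING)  -- proxied: reads the top layer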

function Tokenizer:push(context)
	context = context or 1
	insert(self.stack, {
		tokens = {},
		context = context,
		textbuffer = {},
		init_head = self.head,
		init_context = context
	})
	self.stack_len = self.stack_len + 1
end

function Tokenizer:push_attempt(context)
	local bad_routes = self.bad_routes
	local head = self.head
	context = context or 1
	if (
		bad_routes[head] and
		bad_routes[head][context]
	) then
		return BadRoute
	end
	self:push(context)
end

function Tokenizer:push_textbuffer()
	local stack = self.stack
	local textbuffer = stack.textbuffer
	if #textbuffer > 0 then
		insert(
			stack.tokens,
			concat(textbuffer)
		)
		stack.textbuffer = {}
	end
end

function Tokenizer:pop()
	self:push_textbuffer()
	local layer = remove(self.stack)
	self.stack_len = self.stack_len - 1
	return layer.tokens
end

function Tokenizer:fail_route()
	local bad_routes = self.bad_routes
	local stack = self.stack
	local init_head = stack.init_head
	if not bad_routes[init_head] then
		bad_routes[init_head] = {}
	end
	self.bad_routes[init_head][stack.init_context] = true
	self:pop()
	return BadRoute
end
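-- Together, push_attempt and fail_route memoize dead ends: once a parse
-- starting at a given head position in a given context has failed,
-- bad_routes[init_head][init_context] is set and push_attempt refuses to
-- retry from that exact state. For example, if a TEMPLATE attempt starting at
-- position 12 has already hit END and failed, a later brace run that would
-- re-attempt TEMPLATE at position 12 short-circuits straight to its fallback
-- instead of re-scanning the rest of the page.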

function Tokenizer:emit(token)
	self:push_textbuffer()
	insert(self.stack.tokens, token)
end

function Tokenizer:emit_first(token)
	self:push_textbuffer()
	insert(self.stack.tokens, 1, token)
end

function Tokenizer:emit_text(text)
	insert(self.stack.textbuffer, text)
end

function Tokenizer:emit_all(tokens)
	-- If the first token is a string, route it through the textbuffer so that
	-- it merges with any pending text.
	if #tokens > 0 and type(tokens[1]) == "string" then
		self:emit_text(remove(tokens, 1))
	end
	self:push_textbuffer()
	local insert = insert
	local stack_tokens = self.stack.tokens
	for _, token in ipairs(tokens) do
		insert(stack_tokens, token)
	end
end

function Tokenizer:emit_block(open, block, close)
	self:emit_first(open)
	self:emit_all(block)
	self:emit(close)
end

function Tokenizer:read(delta)
	local index = self.head + (delta or 0)
	if index < 1 then
		return START
	end
	return self.text[index] or END
end

function Tokenizer:handle_single_brace()
	self.head = self.head - 1
	self:emit_first("{")
	return 0
end

-- Tries to parse the head as the first brace of a template. Returns
-- successfully if it encounters "}}", or BadRoute if it encounters END.
-- Templates start with TemplateOpen ("{{"). Parameters are delineated with
-- TemplateParamSeparator ("|"), and (where the key is named) keys and values
-- are separated by TemplateParamEquals ("=").
function Tokenizer:parse_template(braces)
	local reset = self.head
	if self:push_attempt(TEMPLATE) ~= BadRoute then
		local template = self:parse()
		if template ~= BadRoute then
			self:emit_block(TemplateOpen, template, TemplateClose)
			return braces - 2
		end
	end
	self.head = reset - 1
	self:emit_first(rep("{", braces))
	return 0
end
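-- For instance, "{{foo|bar=baz}}" should leave this stage as the sequence
--   TemplateOpen, "foo", TemplateParamSeparator, "bar", TemplateParamEquals,
--   "baz", TemplateClose
-- with the bare strings later merged into Text tokens by tokenize().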

-- Tries to parse the head as the first brace of an argument. Returns
-- successfully if it encounters "}}}", or BadRoute if it encounters "}}" (with
-- no third brace) or END. Arguments start with ArgumentOpen ("{{{"). Argument
-- names and default values are delineated with ArgumentSeparator ("|").
-- Note: it is possible for an argument to contain ArgumentSeparator multiple
-- times. Under normal circumstances, the parser simply ignores everything
-- between the second ArgumentSeparator and ArgumentClose. It is still
-- necessary to parse it at this stage, though, because if the argument fails
-- then it will need to be rendered.
function Tokenizer:parse_argument(braces)
	local reset = self.head
	if self:push_attempt(ARGUMENT) ~= BadRoute then
		local argument = self:parse()
		if argument ~= BadRoute then
			self:emit_block(ArgumentOpen, argument, ArgumentClose)
			return braces - 3
		end
	end
	self.head = reset
	return self:parse_template(braces)
end
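-- For instance, "{{{1|default}}}" should yield
--   ArgumentOpen, "1", ArgumentSeparator, "default", ArgumentClose
-- while a failed argument (e.g. "{{{foo}}" before END) falls back to being
-- re-parsed as a template from the same reset point, giving "{" .. {{foo}}.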

Tokenizer.brace_handlers = setmetatable({
	Tokenizer.handle_single_brace,
	Tokenizer.parse_template,
	Tokenizer.parse_argument
}, {
	__index = function(t)
		return t[3]
	end
})

function Tokenizer:parse_braces()
	self.head = self.head + 2
	local braces = 2
	while self:read() == "{" do
		self.head = self.head + 1
		braces = braces + 1
	end
	self:push()
	local handlers = self.brace_handlers
	repeat
		braces = handlers[braces](self, braces)
		if braces > 0 then
			self.head = self.head + 1
		end
	until braces == 0
	self:emit_all(self:pop())
end
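-- Brace runs are consumed greedily and resolved from the innermost braces
-- outwards: brace_handlers[2] tries a template, brace_handlers[3] (and, via
-- the metatable, any larger count) tries an argument first, and a final
-- leftover brace is emitted as literal text. E.g. "{{{{foo}}}}" (four braces)
-- should resolve as "{" .. {{{foo}}}, i.e.
--   "{", ArgumentOpen, "foo", ArgumentClose
-- with the remaining "}" then emitted as text by the "}" chunk handler.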

-- Parses the head as the first character of a language conversion block.
-- Language conversion blocks are not enabled, but the wikitext is parsed as
-- though they were: anything that comes between the opening and closure of the
-- block is treated as the content of the block, which prevents the closure of
-- any outer blocks (e.g. "{{template | -{ some text }}" does not contain a
-- valid template closure, because "some text }}" is treated as being inside
-- the language block).
-- Note: this applies even if the language block is never closed, since END is
-- treated as the closure.

-- Parses the head as the first character of a wikilink.
-- Wikilinks are not actually processed at this stage: they are processed after
-- all templates have been expanded, because this means templates can add or
-- modify wikilinks. However, the default preprocessor still treats any
-- wikilink-like syntax as being a block: anything that comes between the
-- opening and closure of the block is treated as the content of the block,
-- which prevents the closure of any outer blocks (e.g.
-- "{{template | [[ some text }}" does not contain a valid template closure,
-- because "some text }}" is treated as being inside the wikilink block).
-- Note: this applies even if the wikilink block is never closed, since END is
-- treated as the closure, as well as when the wikilink contains invalid syntax
-- (such as disallowed characters or new lines).

function Tokenizer:handle_heading_end()
	local stack = self.stack
	local textbuffer = stack.textbuffer
	for i = #textbuffer, 1, -1 do
		if not SPACES_TABS[textbuffer[i]] then
			self:emit_all(self:pop())
			return
		end
	end
	local tokens = stack.tokens
	local tokens_len = #tokens
	local prev_token = tokens[tokens_len]
	if not (type(prev_token) == "string" and find(prev_token, "^=+$")) then
		self:emit_all(self:pop())
		return
	-- Single string of equals signs on its own line.
	elseif tokens_len == 1 then
		local prev_token_len = #prev_token
		local level = min(6, floor((prev_token_len - 1) / 2))
		if level == 0 then
			self:emit_all(self:pop())
			return
		end
		tokens[1] = rep("=", prev_token_len - level * 2)
		self:emit_first(HeadingStarts[level])
		self:emit_all(self:pop())
		self:emit(HeadingEnd)
		return
	end
	local start_len, end_len = #tokens[1], #tokens[tokens_len]
	local level = min(6, start_len, end_len)
	if start_len > level then
		tokens[1] = rep("=", start_len - level)
		self:emit_first(HeadingStarts[level])
		tokens_len = tokens_len + 1
	else
		tokens[1] = HeadingStarts[level]
	end
	if end_len > level then
		tokens[tokens_len] = rep("=", end_len - level)
		self:emit(HeadingEnd)
	else
		tokens[tokens_len] = HeadingEnd
	end
	self:emit_all(self:pop())
end
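-- Worked examples of the heading trimming above:
--   "== Foo ==\n"  --> HeadingStart{2}, " Foo ", HeadingEnd, "\n"
--   "=== Foo ==\n" --> HeadingStart{2}, "= Foo ", HeadingEnd, "\n"
--                      (min(6, 3, 2) == 2, so the surplus "=" stays as text
--                      inside the heading)
--   "==\n"         --> no heading: for a lone run of equals signs the level is
--                      floor((2 - 1) / 2) == 0, so "==" is left as plain text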

function Tokenizer:handle_onlyinclude()
	local this, nxt, nxt2 = self:read(), self:read(1), self:read(2)
	while not (this == "<" and nxt == "onlyinclude" and nxt2 == ">") do
		if nxt2 == END then
			break
		end
		self.head = self.head + 1
		this, nxt, nxt2 = nxt, nxt2, self:read(2)
	end
	self.head = self.head + 2
end

-- If a line only contains comments, the parser ignores the entire line
-- (including leading/trailing spaces and tabs for each comment). This isn't
-- applied to the first or last lines (including when there's an unclosed
-- comment), as the parser checks for a preceding and following \n.
-- If the check for this fails, loop back and parse each comment
-- conventionally, because the intermediate whitespace needs to be included.
-- `backtrack` counts the number of leading spaces and tabs on the line. Since
-- they've already been added to the textbuffer, they will need to be removed
-- (along with \n itself) if the line only contains comments.
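-- E.g. tokenizing "a\n <!--x--> \nb" should leave just "a\nb": `backtrack`
-- removes the leading space and the preceding "\n" from the textbuffer, the
-- comment and its trailing space are stepped over via `lookahead`, and the
-- second "\n" survives as the line break.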

function Tokenizer:find_close_tag(name, this)
	local reset = self.head
	local nxt, nxt2 = self:read(1), self:read(2)
	if this ~= "<" or nxt ~= "/" then
		return false
	end
	local nxt2_lower = lower(nxt2)
	if nxt2_lower ~= name then
		return false
	end
	self.head = self.head + 2
	repeat
		self.head = self.head + 1
		local this = self:read()
		if this == ">" then
			return true
		end
	until not WHITESPACE[this]
	self.head = reset
	return false
end

-- Note: Ignored tokens are necessary to block handle_heading_end from creating
-- a heading if ignored wikitext is included after heading wikitext on the same
-- line. This matches the default parser, which only accounts for comments in
-- that position.
function Tokenizer:parse_tag(name)
	local ignored_tag = self.ignored_tags[name]
	local reset = self.head
	self:push()
	self.head = self.head + 2
	local this = self:read()
	while this ~= ">" and this ~= END do
		self:emit_text(this)
		self.head = self.head + 1
		this = self:read()
	end
	if this == END then
		self.head = reset
		self:pop()
		self:emit_text("<")
	elseif ignored_tag then
		-- Discard.
		self:pop()
		self:emit(Ignored)
	elseif name == self.ignored_element then
		self:pop()
		if self:read(-1) ~= "/" then
			repeat
				self.head = self.head + 1
				this = self:read()
			until self:find_close_tag(name, this) or this == END
		end
		self:emit(Ignored)
	elseif self:read(-1) == "/" then
		local attr = concat(self:pop())
		self:emit(ParserTagSelfClose:new{name, attr})
	else
		local attr = concat(self:pop())
		self:push()
		self.head = self.head + 1
		this = self:read()
		while not (self:find_close_tag(name, this) or this == END) do
			self:emit_text(this)
			self.head = self.head + 1
			this = self:read()
		end
		if this == END then
			self.head = reset
			self.no_close[name] = reset
			self:pop()
			self:emit_text("<")
		else
			self:emit_first(ParserTagOpen:new{name, attr})
			self:emit_all(self:pop())
			self:emit(ParserTagClose)
		end
	end
end
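-- For instance, "<nowiki>a|b</nowiki>" should produce
--   ParserTagOpen{"nowiki", ""}, "a", "|", "b", ParserTagClose
-- (the "|" is inert inside the tag, since the contents bypass the chunk
-- handlers). A tag in ignored_tags (e.g. "<includeonly>" when transcluding)
-- becomes a single Ignored token while its contents are parsed normally,
-- whereas ignored_element (e.g. "<noinclude>" when transcluding) is skipped
-- together with its contents up to the matching close tag or END.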

Tokenizer.chunk_handlers = {
	["{"] = function(self)
		if self:read(1) == "{" then
			self:parse_braces()
		else
			self:emit_text("{")
		end
	end,

	["|"] = function(self, stack)
		local context = stack.context
		if has(context, TEMPLATE) then
			stack.context = safe_set(context, TEMPLATE_PARAM_KEY)
			self:emit(TemplateParamSeparator)
		elseif has(context, ARGUMENT) then
			self:emit(ArgumentSeparator)
		else
			self:emit_text("|")
		end
	end,

	["}"] = function(self, stack)
		local nxt = self:read(1)
		if nxt == "}" then
			local context = stack.context
			if has(context, TEMPLATE) then
				self.head = self.head + 1
				return self:pop()
			elseif has(context, ARGUMENT) then
				if self:read(2) == "}" then
					self.head = self.head + 2
					return self:pop()
				end
				return self:fail_route()
			end
			self:emit_text("}")
		elseif nxt == "-" and has(stack.context, LANGUAGE_BLOCK) then
			self:emit_text("}-")
			self:emit_all(self:pop())
			self.head = self.head + 1
		else
			self:emit_text("}")
		end
	end,

	["="] = function(self, stack)
		local context = stack.context
		if has(context, HEADING) then
			local count = 1
			while self:read(1) == "=" do
				self.head = self.head + 1
				count = count + 1
			end
			self:emit(rep("=", count))
			return
		end
		-- As a special case, a single equals sign on a new line is treated as
		-- a parameter key indicator when possible, instead of the start of an
		-- L1 heading. Two or more equals signs will always be treated as the
		-- start of a heading, though.
		local can_heading = BEFORE_HEADING[self:read(-1)]
		if can_heading and self:read(1) == "=" then
			self:push(HEADING)
			self.head = self.head - 1
		elseif has(context, TEMPLATE_PARAM_KEY) then
			stack.context = unset(context, TEMPLATE_PARAM_KEY)
			self:emit(TemplateParamEquals)
		elseif can_heading then
			self:push(HEADING)
			self.head = self.head - 1
		else
			self:emit_text("=")
		end
	end,

	["\n"] = function(self, stack)
		if has(stack.context, HEADING) then
			self:handle_heading_end()
		end
		self:emit_text("\n")
	end,

	["-"] = function(self)
		if self:read(1) == "{" then
			self:emit_text("-")
			if self:read(2) == "{" then
				self.head = self.head + 1
				local template_or_argument = self:parse_braces()
				if template_or_argument == BadRoute then
					self:push(LANGUAGE_BLOCK)
				end
			else
				self:push(LANGUAGE_BLOCK)
			end
		else
			self:emit_text("-")
		end
	end,

	["["] = function(self)
		if self:read(1) == "[" then
			self:push(WIKILINK)
			self:emit_text("[[")
			self.head = self.head + 1
		else
			self:emit_text("[")
		end
	end,

	["]"] = function(self, stack)
		if self:read(1) == "]" and has(stack.context, WIKILINK) then
			self:emit_text("]]")
			self:emit_all(self:pop())
			self.head = self.head + 1
		else
			self:emit_text("]")
		end
	end,

	["<"] = function(self, stack, ignored_element, ignored_tags, no_close, onlyinclude)
		local nxt, nxt2, nxt3 = self:read(1), self:read(2), self:read(3)
		if nxt == "!" then
			if nxt2 == "-" and nxt3 == "-" then
				self.head = self.head + 4
				local lookbehind = -6
				local this, nxt, nxt2 = self:read(), self:read(1), self:read(2)
				while not (this == "-" and nxt == "-" and nxt2 == ">") do
					if nxt2 == END then
						self.head = self.head + 2
						return
					end
					self.head = self.head + 1
					lookbehind = lookbehind - 1
					this, nxt, nxt2 = nxt, nxt2, self:read(2)
				end
				self.head = self.head + 2
				local backtrack, chunk = 0
				repeat
					lookbehind = lookbehind - 1
					backtrack = backtrack + 1
					chunk = self:read(lookbehind)
				until not SPACES_TABS[chunk]
				if chunk ~= "\n" then
					return
				end
				local lookahead = 0
				repeat
					lookahead = lookahead + 1
					chunk = self:read(lookahead)
				until not SPACES_TABS[chunk]
				while chunk ~= "\n" do
					if not (
						chunk == "<" and
						self:read(lookahead + 1) == "!" and
						self:read(lookahead + 2) == "-" and
						self:read(lookahead + 3) == "-"
					) then
						return
					end
					lookahead = lookahead + 4
					this, nxt, nxt2 = self:read(lookahead), self:read(lookahead + 1), self:read(lookahead + 2)
					while not (this == "-" and nxt == "-" and nxt2 == ">") do
						if nxt2 == END then
							return
						end
						lookahead = lookahead + 1
						this, nxt, nxt2 = nxt, nxt2, self:read(lookahead + 2)
					end
					lookahead = lookahead + 2
					repeat
						lookahead = lookahead + 1
						chunk = self:read(lookahead)
					until not SPACES_TABS[chunk]
				end
				local remove = remove
				local textbuffer = stack.textbuffer
				for _ = 1, backtrack do
					remove(textbuffer)
				end
				self.head = self.head + lookahead - 1
			else
				self:emit_text("<")
			end
		elseif nxt == "/" then
			if nxt2 == "onlyinclude" and nxt3 == ">" and onlyinclude then
				self.head = self.head + 4
				self:handle_onlyinclude()
			else
				-- When looking for tags to ignore, the parser only uses the
				-- rules for opening tags, but treats (e.g.) "noinclude" and
				-- "/noinclude" as separate tag types. This causes anomalies,
				-- such as "</noinclude/>" and "</noinclude >" being treated as
				-- valid tags to ignore.
				local nxt2_lower = lower(nxt2)
				if (
					ignored_tags[nxt2_lower] and
					(nxt3 == ">" or (nxt3 == "/" and self:read(4) == ">") or WHITESPACE[nxt3])
				) then
					self:parse_tag(nxt2_lower)
				else
					self:emit_text("<")
				end
			end
		elseif nxt2 == ">" or (nxt2 == "/" and nxt3 == ">") or WHITESPACE[nxt2] then
			local nxt_lower = lower(nxt)
			if (
				(TAGS[nxt_lower] and (not no_close[nxt_lower] or no_close[nxt_lower] > self.head)) or
				ignored_tags[nxt_lower] or
				nxt_lower == ignored_element
			) then
				self:parse_tag(nxt_lower)
			else
				self:emit_text("<")
			end
		else
			self:emit_text("<")
		end
	end,

	[END] = function(self, stack)
		local context = stack.context
		if has(context, FAIL_ON_END) then
			return self:fail_route()
		elseif has(context, COMPLETE_ON_END) then
			self:emit_all(self:pop())
		elseif has(context, HEADING) then
			self:handle_heading_end()
		else
			return self:pop()
		end
	end
}
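-- Hedged dispatch examples for the chunk handlers above:
--   "{{a|b=c}}"   -- inside TEMPLATE, "|" and "=" are structural and emit
--                 -- TemplateParamSeparator / TemplateParamEquals
--   "{{a|\n=b}}"  -- a single "=" on a new line while a parameter key is
--                 -- still expected emits TemplateParamEquals rather than
--                 -- opening an L1 heading; "==" there would start a HEADING
--   "[[x|y"       -- inside WIKILINK, "|" is plain text, and END completes
--                 -- the block (COMPLETE_ON_END) instead of failing it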

function Tokenizer:parse()
	local handlers = self.chunk_handlers
	local ignored_element = self.ignored_element
	local ignored_tags = self.ignored_tags
	local no_close = self.no_close
	local onlyinclude = self.onlyinclude
	local stack = self.stack
	repeat
		local this = self:read()
		local handler = handlers[this]
		if handler then
			local ret = handler(
				self,
				stack,
				ignored_element,
				ignored_tags,
				no_close,
				onlyinclude
			)
			if ret then
				return ret
			end
		else
			self:emit_text(this)
		end
		self.head = self.head + 1
	until false
end

function Tokenizer:tokenize(str, transclude, t)
	local concat, insert, sub, type = concat, insert, sub, type
	self.bad_routes = {}
	self.head = 1
	self.no_close = {}
	self.onlyinclude = nil
	self.title = t
	local str_len, text, i, from = #str, {}, 1, 1
	for n = 1, str_len do
		local chunk = sub(str, n, n)
		if CHUNKS[chunk] then
			if n > from then
				text[i] = sub(str, from, n - 1)
				i = i + 1
			end
			text[i] = chunk
			i = i + 1
			from = n + 1
		end
	end
	if from <= str_len then
		text[i] = sub(str, from)
	end
	self.text = text
	if transclude then
		self.ignored_element = "noinclude"
		self.ignored_tags = {
			includeonly = true
		}
		-- The parser only enables onlyinclude if there are exact matches for
		-- both patterns below (which don't have to be in the right order),
		-- which is unexpected behaviour for two reasons: (1) the rules for
		-- parsing all other tags are looser, and (2) like noinclude and
		-- includeonly, onlyinclude is able to treat the end of the page as a
		-- closing tag, though due to the requirement already mentioned, this
		-- only becomes relevant if there are multiple opening onlyinclude
		-- tags on the page.
		-- Any additional onlyinclude tags must also be exact matches in order
		-- to work. The only exception is when onlyinclude tags are ignored on
		-- non-transcluded pages, as the ignored tag rules apply instead.
		-- If onlyinclude is enabled, the parser treats the start of the page
		-- as though it were immediately preceded by a closing onlyinclude tag
		-- (i.e. it starts by looking for "<onlyinclude>", ignoring everything
		-- else).
		local find = find
		if find(str, "<onlyinclude>") and find(str, "</onlyinclude>") then
			self.onlyinclude = true
			self:handle_onlyinclude()
			self.head = self.head + 1
		end
	else
		self.ignored_element = "includeonly"
		self.ignored_tags = {
			noinclude = true,
			onlyinclude = true
		}
	end
	self:push()
	local tokens_raw = self:parse()
	if tokens_raw == BadRoute then
		error("Tokenizer exited with bad route.")
	elseif self.stack_len > 0 then
		error("Tokenizer exited with non-empty token stack.")
	end
	local tokens, str_tokens = {}, {}
	for _, token in ipairs(tokens_raw) do
		if type(token) == "string" then
			insert(str_tokens, token)
		elseif token ~= Ignored then
			if #str_tokens > 0 then
				insert(tokens, Text:new{concat(str_tokens)})
				str_tokens = {}
			end
			insert(tokens, token)
		end
	end
	if #str_tokens > 0 then
		insert(tokens, Text:new{concat(str_tokens)})
	end
	return tokens
end
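-- Example usage (a sketch):
--   local Tokenizer = require("Module:User:Theknightwho/parser/tokenizer")
--   local tokens = Tokenizer:new():tokenize("{{foo|bar}}", true)
--   -- tokens --> TemplateOpen, Text{"foo"}, TemplateParamSeparator,
--   --            Text{"bar"}, TemplateClose
-- `transclude` selects which of noinclude/includeonly/onlyinclude are
-- honoured; `t` is stored as self.title (unused within this module).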

return Tokenizer