-- Module:User:Theknightwho/parser/tokenizer old

local error = error local ipairs = ipairs local require = require local type = type local unpack = unpack local xpcall = xpcall

local traceback = debug.traceback

local table = table local concat = table.concat local insert = table.insert local remove = table.remove

-- -- Context --

-- Contexts inform the tokenizer of the immediate assumptions it should make, and each records boolean values for all the base context states listed below. To allow rapid setting and checking, they are stored as numbers (not tables), as tens of thousands of them may be generated during a single tokenization. Ideally they would use bitwise operators (one bit for each context state), but the backported bit32 library is too slow. Instead, each context state is assigned a prime number, and multiple context states can be set by simply multiplying them. This is at least an order of magnitude faster. -- The context state can be checked with the Euclidean greatest common divisor algorithm: if the GCD is >1, then the context state is true. This is particularly useful with aggregate context states (e.g. TEMPLATE), where a GCD >1 means that at least one of the component states is true.

-- Base context states: each one is a distinct prime. Aggregate states are the
-- product of their component primes.

-- Templates.
local TEMPLATE_NAME = 2
local TEMPLATE_PARAM_KEY = 3
local TEMPLATE_PARAM_VALUE = 5
local TEMPLATE = TEMPLATE_NAME * TEMPLATE_PARAM_KEY * TEMPLATE_PARAM_VALUE

-- Arguments.
local ARGUMENT_NAME = 7
local ARGUMENT_DEFAULT = 11
local ARGUMENT = ARGUMENT_NAME * ARGUMENT_DEFAULT

-- Wikilinks.
local WIKILINK_TITLE = 13
local WIKILINK_TEXT = 17
local WIKILINK = WIKILINK_TITLE * WIKILINK_TEXT

-- External links.
local EXT_LINK_URI = 19
local EXT_LINK_TITLE = 23
local EXT_LINK = EXT_LINK_URI * EXT_LINK_TITLE

-- Headings.
local HEADING_1 = 29
local HEADING_2 = 31
local HEADING_3 = 37
local HEADING_4 = 41
local HEADING_5 = 43
local HEADING_6 = 47
local HEADING = HEADING_1 * HEADING_2 * HEADING_3 * HEADING_4 * HEADING_5 * HEADING_6

-- Tags.
local TAG_OPEN = 53
local TAG_ATTR = 59
local TAG_BODY = 61
local TAG_CLOSE = 67
local TAG = TAG_OPEN * TAG_ATTR * TAG_BODY * TAG_CLOSE

-- Styles.
local STYLE_ITALICS = 71
local STYLE_BOLD = 73
local STYLE_PASS_AGAIN = 79
local STYLE_SECOND_PASS = 83
local STYLE = STYLE_ITALICS * STYLE_BOLD * STYLE_PASS_AGAIN * STYLE_SECOND_PASS

-- Standalone states.
local HTML_ENTITY = 89

local DL_TERM = 97

-- Safety-check states.
local HAS_TEXT = 101
local FAIL_ON_TEXT = 103
local FAIL_NEXT = 107
local FAIL_ON_LBRACE = 109
local FAIL_ON_RBRACE = 113
local FAIL_ON_EQUALS = 127
local HAS_TEMPLATE = 131

-- Cross-cutting aggregates.
-- NOTE(review): FAIL is the product of 22 primes (roughly 1.4e31), which is
-- larger than 2^53, so it cannot be stored exactly in a double-precision
-- number. A remainder-based GCD test against it is therefore not reliable.
local FAIL = TEMPLATE * ARGUMENT * WIKILINK * EXT_LINK_TITLE * HEADING * TAG * STYLE
local UNSAFE = TEMPLATE_NAME * TEMPLATE_PARAM_KEY * ARGUMENT_NAME * WIKILINK_TITLE * EXT_LINK_TITLE * TAG_CLOSE
local DOUBLE = TEMPLATE_PARAM_KEY * TAG_CLOSE
local NO_EXT_LINKS = TEMPLATE_NAME * ARGUMENT_NAME * WIKILINK_TITLE * EXT_LINK

-- Reports whether context `c` shares at least one prime factor with `cxt`.
-- Runs the Euclidean algorithm on the pair; a greatest common divisor above 1
-- means at least one of the states encoded in `cxt` is set in `c`.
local function has(c, cxt)
	local a, b = c, cxt
	repeat
		a, b = b, a % b
	until b == 0
	return a > 1
end

-- Careful: these have no safety checks against double-(un)setting, to maximise speed.

-- Sets state `cxt` in context `c` by multiplying it in. Setting a state that
-- is already present multiplies its prime in a second time.
local function set(c, cxt)
	return c * cxt
end

-- Clears state `cxt` from context `c` by dividing it out. There is no check
-- that `cxt` was actually set; dividing by an absent factor yields a
-- non-integer context.
local function unset(c, cxt)
	local cleared = c / cxt
	return cleared
end

-- Includes a safety check against double-unsetting. This should only be used
-- to ensure all components of an aggregate state are unset, or when a base
-- state could be unset already.
-- Divides `c` by the greatest common divisor of `c` and `cxt`, so only the
-- factors of `cxt` that are really present in `c` are removed.
local function safe_unset(c, cxt)
	local orig_c = c
	-- Euclidean algorithm: when the loop ends, `c` holds gcd(orig_c, cxt).
	repeat
		c, cxt = cxt, c % cxt
	until cxt == 0
	return orig_c / c
end

-- -- Tokenizer --

-- Tokens.
-- Each token class is cached in a local so the tokenizer does not repeat the
-- table lookup every time it emits one.
local Tokens = require("Module:User:Theknightwho/parser/tokens")

local TemplateOpen = Tokens.TemplateOpen
local TemplateParamSeparator = Tokens.TemplateParamSeparator
local TemplateParamEquals = Tokens.TemplateParamEquals
local TemplateClose = Tokens.TemplateClose

local ArgumentOpen = Tokens.ArgumentOpen
local ArgumentSeparator = Tokens.ArgumentSeparator
local ArgumentClose = Tokens.ArgumentClose

local WikilinkOpen = Tokens.WikilinkOpen
local WikilinkSeparator = Tokens.WikilinkSeparator
local WikilinkClose = Tokens.WikilinkClose

local ExternalLinkOpen = Tokens.ExternalLinkOpen
local ExternalLinkSeparator = Tokens.ExternalLinkSeparator
local ExternalLinkClose = Tokens.ExternalLinkClose

local HTMLEntityStart = Tokens.HTMLEntityStart
local HTMLEntityNumeric = Tokens.HTMLEntityNumeric
local HTMLEntityHex = Tokens.HTMLEntityHex
local HTMLEntityEnd = Tokens.HTMLEntityEnd

local TagOpenOpen = Tokens.TagOpenOpen
local TagAttrStart = Tokens.TagAttrStart
local TagAttrEquals = Tokens.TagAttrEquals
local TagAttrQuote = Tokens.TagAttrQuote
local TagCloseOpen = Tokens.TagCloseOpen
local TagCloseSelfclose = Tokens.TagCloseSelfclose
local TagOpenClose = Tokens.TagOpenClose
local TagCloseClose = Tokens.TagCloseClose

local CommentStart = Tokens.CommentStart
local CommentEnd = Tokens.CommentEnd

-- Errors.
-- Unique sentinel tables, raised with error() and recognised by identity.
-- BadRoute also carries a `context` field, written just before it is raised.
local BadRoute = {}
local StopIteration = {}

-- Constants.
local constants = require("Module:User:Theknightwho/parser/constants")

-- Sentinels returned by read() before the first chunk and after the last one.
local START = constants.START
local END = constants.END

local CHUNKS = constants.CHUNKS
local DL_MARKERS = constants.DL_MARKERS
local HTML_ENTITIES -- required if needed
local HTML_NUMERIC = constants.HTML_NUMERIC
local HTML_HEX = constants.HTML_HEX
local HTML_NAMED = constants.HTML_NAMED
local LIST_MARKERS = constants.LIST_MARKERS
local MARKERS = constants.MARKERS
local NEWLINE = constants.NEWLINE
local URI_CHARS = constants.URI_CHARS
local URI_SCHEMES = constants.URI_SCHEMES
local URI_END = constants.URI_END

-- Set of the sentinel error values defined by this module.
local ERRORS = {}
ERRORS[BadRoute] = true
ERRORS[StopIteration] = true

-- Internal state. All of these are (re)initialised by tokenize().
local bad_routes -- memo of failed routes: bad_routes[head][context] = true
local depth -- current template/argument nesting depth
local head -- index into `text` of the chunk being read
local text -- array of chunks produced by set_text()
local title -- optional title object, used only in error messages

-- The layer stack. Integer keys hold the layers themselves; any key that is
-- not present in the table is redirected to the topmost layer, so that
-- `stack.context`, `stack.tokens` and so on read and write the current layer.
local stack
do
	local function read_top(layers, key)
		return layers[#layers][key]
	end

	local function write_top(layers, key, value)
		layers[#layers][key] = value
	end

	stack = setmetatable({}, {
		__index = read_top,
		__newindex = write_top
	})
end

-- Functions. Some function variables are declared early, to allow mutual recursion.

-- Splits `str` into an array of chunks. Every single character that is a key
-- of CHUNKS becomes a chunk of its own; each run of other characters between
-- them becomes one chunk.
-- Returns the array (empty for an empty string).
local function set_text(str)
	local str_len, chunks, i, from = #str, {}, 1, 1
	for n = 1, str_len do
		local chunk = str:sub(n, n)
		if CHUNKS[chunk] then
			-- Flush the plain run that precedes this special character.
			if n > from then
				chunks[i] = str:sub(from, n - 1)
				i = i + 1
			end
			chunks[i] = chunk
			i = i + 1
			from = n + 1
		end
	end
	-- Trailing plain run, if any.
	if from <= str_len then
		chunks[i] = str:sub(from)
	end
	return chunks
end

local read

-- Opens a new layer on the stack with the given context (default 1, i.e. no
-- states set). If the (head, context) pair has already been recorded as a bad
-- route, raises BadRoute instead of pushing.
local function push(context)
	context = context or 1
	local known_bad = bad_routes[head]
	if known_bad and known_bad[context] then
		BadRoute.context = context
		error(BadRoute)
	end
	local layer = {
		tokens = {},
		context = context,
		textbuffer = {},
		ident_head = head,
		ident_context = context
	}
	-- Must go through table.insert: a plain `stack[n] = layer` assignment to
	-- an absent key would be intercepted by the stack's __newindex.
	insert(stack, layer)
end

-- Flushes the current layer's pending text: if the text buffer is non-empty,
-- its pieces are joined into one string, appended to the layer's tokens, and
-- the buffer is replaced by a fresh empty table.
local function push_textbuffer()
	if #stack.textbuffer > 0 then
		insert(
			stack.tokens,
			concat(stack.textbuffer)
		)
		stack.textbuffer = {}
	end
end

-- Closes the current layer and returns its token list.
-- Pending text is flushed first. If `keep_context` is truthy, the context of
-- the removed layer is copied onto the layer that is now on top.
local function pop(keep_context)
	push_textbuffer()
	local layer = remove(stack)
	if keep_context then
		stack.context = layer.context
	end
	return layer.tokens
end

-- Returns true while the nesting depth is below the hard limit of 49.
local function can_recurse()
	return depth < 49
end

-- Records the current layer's starting (head, context) pair as a bad route,
-- so that push() refuses to start the same route again.
local function memoize_bad_route()
	if not bad_routes[stack.ident_head] then
		bad_routes[stack.ident_head] = {}
	end
	bad_routes[stack.ident_head][stack.ident_context] = true
end

-- Abandons the current layer: stores its context on BadRoute, memoizes the
-- route as bad, pops the layer and raises BadRoute. Never returns.
local function fail_route()
	BadRoute.context = stack.context
	memoize_bad_route()
	pop()
	error(BadRoute)
end

-- Runs `try` under xpcall.
-- On success, returns whatever `try` returned.
-- On error, looks the raised value up in `err_handlers`; if a handler is
-- registered for it, that handler is called and its result is returned.
-- Any other error is re-raised with a traceback (prefixed with the page
-- title when one is set).
local function error_handler(try, err_handlers)
	local ret = {xpcall(
		try,
		function(err)
			local handler = err_handlers[err]
			if handler then
				-- Call the handler; returning the function itself would make
				-- every `== BadRoute` comparison at the call sites fail.
				return handler()
			elseif title then
				error("Error tokenizing " .. title.fullText .. ": " .. traceback(err), 2)
			else
				error(traceback(err), 2)
			end
		end
	)}
	-- ret[1] is xpcall's status flag; the payload starts at index 2.
	return unpack(ret, 2)
end

-- Appends `token` to the current layer's tokens, flushing pending text first
-- so that the text ends up before the token.
local function emit(token)
	push_textbuffer()
	insert(stack.tokens, token)
end

-- Inserts `token` at the front of the current layer's tokens, after flushing
-- pending text into the token list.
local function emit_first(token)
	push_textbuffer()
	insert(stack.tokens, 1, token)
end

-- Queues a piece of text on the current layer's text buffer. The buffer is
-- only turned into a token when push_textbuffer() runs.
local function emit_text(text)
	local buffer = stack.textbuffer
	buffer[#buffer + 1] = text
end

-- Appends every token in `tokens` to the current layer.
-- If the first entry is a string (i.e. text), it is removed from `tokens` and
-- sent through the text buffer instead, so that it merges with any text
-- already pending on this layer. Note that this mutates `tokens`.
local function emit_all(tokens)
	-- `not type(x) == "string"` parses as `(not type(x)) == "string"`, which
	-- is always false; the intended test is a plain type comparison.
	if #tokens > 0 and type(tokens[1]) == "string" then
		emit_text(remove(tokens, 1))
	end
	push_textbuffer()
	for _, token in ipairs(tokens) do
		insert(stack.tokens, token)
	end
end

-- Pops the current layer, emits `text` on the layer beneath it, then re-emits
-- the popped tokens after that text. Finally steps the head back by one.
local function emit_text_then_tokens(text)
	local tokens = pop()
	emit_text(text)
	if tokens then
		emit_all(tokens)
	end
	head = head - 1
end

-- Returns the chunk at `head + delta` (delta defaults to 0).
-- Before the first chunk it returns START. Past the last chunk it returns END,
-- unless `strict` is truthy, in which case the current route is failed.
function read(delta, strict)
	local index = head + (delta or 0)
	if index < 1 then
		return START
	elseif text[index] then
		return text[index]
	elseif strict then
		fail_route()
	else
		return END
	end
end

local parse

-- Parses a template body starting at the current head, in a new layer with
-- TEMPLATE_NAME set. On success, wraps the resulting tokens in
-- TemplateOpen/TemplateClose on the current layer. On a bad route, restores
-- the head and re-raises BadRoute.
local function parse_template()
	depth = depth + 1
	local reset = head
	local context = set(1, TEMPLATE_NAME)
	local template = error_handler(
		function()
			return parse(context)
		end,
		{[BadRoute] = function()
			head = reset
			return BadRoute
		end}
	)
	depth = depth - 1
	if template == BadRoute then
		error(BadRoute)
	end
	emit_first(TemplateOpen)
	emit_all(template)
	emit(TemplateClose)
end

-- Parses an argument body starting at the current head, in a new layer with
-- ARGUMENT_NAME set. On success, wraps the resulting tokens in
-- ArgumentOpen/ArgumentClose on the current layer. On a bad route, restores
-- the head and re-raises BadRoute.
local function parse_argument()
	depth = depth + 1
	local reset = head
	local context = set(1, ARGUMENT_NAME)
	local argument = error_handler(
		function()
			return parse(context)
		end,
		{[BadRoute] = function()
			head = reset
			return BadRoute
		end}
	)
	depth = depth - 1
	if argument == BadRoute then
		error(BadRoute)
	end
	emit_first(ArgumentOpen)
	emit_all(argument)
	emit(ArgumentClose)
end

-- Handles a run of two or more "{" at the head.
-- Counts the braces, then repeatedly consumes them from the inside out:
-- three at a time as an argument where possible, otherwise two at a time as a
-- template. Braces that cannot be matched are emitted as literal text in
-- front of whatever was parsed.
local function parse_template_or_argument()
	head = head + 2
	local braces = 2
	while read() == "{" do
		head = head + 1
		braces = braces + 1
	end
	-- Scratch layer that collects the parsed template/argument tokens.
	push()
	while braces > 0 do
		if braces == 1 then
			return emit_text_then_tokens("{")
		elseif braces == 2 then
			if error_handler(
				function()
					parse_template()
				end,
				{[BadRoute] = function()
					return BadRoute
				end}
			) == BadRoute then
				return emit_text_then_tokens("{{")
			end
			break
		else
			-- Three or more braces: try an argument first and fall back to a
			-- template if that route fails.
			if error_handler(
				function()
					parse_argument()
					braces = braces - 3
				end,
				{[BadRoute] = function()
					return error_handler(
						function()
							parse_template()
							braces = braces - 2
						end,
						{[BadRoute] = function()
							return BadRoute
						end}
					)
				end}
			) == BadRoute then
				return emit_text_then_tokens(("{"):rep(braces))
			end
		end
		if braces > 0 then
			head = head + 1
		end
	end
	emit_all(pop())
	stack.context = safe_unset(stack.context, FAIL_NEXT)
end

-- Handles "|" inside a template. If the current layer is a parameter key, it
-- is closed and its tokens are emitted; otherwise the current layer's
-- template state is switched to TEMPLATE_PARAM_KEY. A separator token is then
-- emitted and a new layer is pushed with the current context.
local function handle_template_param()
	if has(stack.context, TEMPLATE_PARAM_KEY) then
		emit_all(pop())
	else
		stack.context = set(
			safe_unset(stack.context, TEMPLATE),
			TEMPLATE_PARAM_KEY
		)
	end
	emit(TemplateParamSeparator)
	push(stack.context)
end

-- Handles "=" in a template parameter key: closes the key layer, emits its
-- tokens, switches the context from TEMPLATE_PARAM_KEY to
-- TEMPLATE_PARAM_VALUE, and emits the equals token.
local function handle_template_param_value()
	emit_all(pop())
	stack.context = set(
		unset(stack.context, TEMPLATE_PARAM_KEY),
		TEMPLATE_PARAM_VALUE
	)
	emit(TemplateParamEquals)
end

-- Handles "}}" closing a template. An open parameter-key layer is closed and
-- emitted first; the head is advanced by one and the template layer's tokens
-- are returned.
local function handle_template_end()
	if has(stack.context, TEMPLATE_PARAM_KEY) then
		emit_all(pop())
	end
	head = head + 1
	return pop()
end

-- Handles "|" inside an argument: switches the context from ARGUMENT_NAME to
-- ARGUMENT_DEFAULT and emits the separator token.
local function handle_argument_separator()
	stack.context = set(
		unset(stack.context, ARGUMENT_NAME),
		ARGUMENT_DEFAULT
	)
	emit(ArgumentSeparator)
end

-- Handles "}}}" closing an argument: advances the head by two and returns the
-- argument layer's tokens.
local function handle_argument_end()
	head = head + 2
	return pop()
end

-- Returns true if every single character of `str` is a truthy key in the
-- lookup table `chars`, and false otherwise. An empty string is valid.
local function all_valid(str, chars)
	local pos, len = 1, #str
	while pos <= len do
		if not chars[str:sub(pos, pos)] then
			return false
		end
		pos = pos + 1
	end
	return true
end

-- Parses the body of an HTML entity after "&", emitting the entity tokens on
-- the current layer. Handles named, decimal ("#") and hexadecimal ("#x")
-- forms. Fails the route if the text ends early, the name/number contains
-- invalid characters, the ";" is missing, a numeric value is out of range, or
-- a named entity is unknown.
local function really_parse_entity()
	emit(HTMLEntityStart)
	head = head + 1
	local this = read(0, true)
	local numeric, hexadecimal
	if this == "#" then
		numeric = true
		emit(HTMLEntityNumeric)
		head = head + 1
		this = read(0, true)
		local this_1 = this:sub(1, 1)
		if this_1:lower() == "x" then
			hexadecimal = true
			emit(HTMLEntityHex:new(this_1))
			this = this:sub(2)
			if this == "" then
				fail_route()
			end
		end
	end
	local valid = hexadecimal and HTML_HEX or numeric and HTML_NUMERIC or HTML_NAMED
	if not all_valid(this, valid) then
		fail_route()
	end
	head = head + 1
	if read() ~= ";" then
		fail_route()
	elseif numeric then
		local test = tonumber(this, hexadecimal and 16 or 10)
		-- Valid code points run from 1 to 0x10FFFF. The nil guard avoids an
		-- arithmetic-comparison error should the conversion fail.
		if not test or test < 1 or test > 0x10FFFF then
			fail_route()
		end
	else
		-- The named-entity table is only loaded the first time it is needed.
		HTML_ENTITIES = HTML_ENTITIES or require("Module:User:Theknightwho/parser/html entities")
		if not HTML_ENTITIES[this] then
			fail_route()
		end
	end
	emit(this)
	emit(HTMLEntityEnd)
end

-- Tries to parse an HTML entity at the head in its own HTML_ENTITY layer.
-- On success the layer's tokens are emitted; on a bad route the head is
-- restored and the chunk at the head is emitted as plain text.
local function parse_entity()
	local reset = head
	if error_handler(
		function()
			push(HTML_ENTITY)
			really_parse_entity()
		end,
		{[BadRoute] = function()
			head = reset
			emit_text(read())
			return BadRoute
		end}
	) ~= BadRoute then
		emit_all(pop())
	end
end

-- Parses an HTML comment whose "<!--" opener starts at the head.
-- Text up to the closing "-->" is collected in a scratch layer and emitted
-- between CommentStart and CommentEnd. If the input ends first, the scratch
-- layer is discarded and the opener is emitted as literal text.
-- NOTE(review): the end-of-input branch and the "-->" test were missing from
-- the text this was restored from and have been reconstructed to match the
-- opener check used by the callers; confirm against the upstream module.
local function parse_comment()
	-- Skip the four chunks of the opener.
	head = head + 4
	local reset = head - 1
	push()
	repeat
		local this = read()
		if this == END then
			pop()
			head = reset
			emit_text("<!--")
			return
		elseif this == "-" and read(1) == "-" and read(2) == ">" then
			emit_first(CommentStart)
			emit(CommentEnd)
			emit_all(pop())
			-- Leave the head on the final ">" of the terminator.
			head = head + 2
			stack.context = safe_unset(stack.context, FAIL_NEXT)
			return
		end
		emit_text(this)
		head = head + 1
	until false
end

-- Emits the tag tokens for the list marker at the head. A ";" marker also
-- sets DL_TERM on the current context.
local function handle_list_marker()
	local markup = read()
	-- set() has no double-setting protection, and repeated ";" markers would
	-- otherwise multiply DL_TERM in more than once, so a single unset would
	-- not clear it.
	if markup == ";" and not has(stack.context, DL_TERM) then
		stack.context = set(stack.context, DL_TERM)
	end
	emit(TagOpenOpen:new(markup))
	emit_text(LIST_MARKERS[markup])
	emit(TagCloseSelfclose)
end

-- Handles a run of consecutive list markers starting at the head, emitting
-- one marker for each and leaving the head on the last one.
local function handle_list()
	handle_list_marker()
	while LIST_MARKERS[read(1)] do
		head = head + 1
		handle_list_marker()
	end
end

-- Checks `scheme` (case-insensitively) against URI_SCHEMES.
-- With `slashes` truthy, returns whether the scheme has any entry at all;
-- otherwise returns the entry's stored value (nil if there is no entry).
local function is_scheme(scheme, slashes)
	scheme = scheme:lower()
	if slashes then
		return URI_SCHEMES[scheme] ~= nil
	end
	return URI_SCHEMES[scheme]
end

-- Parses the URI scheme of a bracketed external link into a new EXT_LINK_URI
-- layer. Accepts either a protocol-relative "//" or "scheme:" optionally
-- followed by "//". Fails the route if no ":" follows the scheme characters
-- or the scheme is not recognised.
local function parse_bracketed_uri_scheme()
	push(EXT_LINK_URI)
	if read() == "/" and read(1) == "/" then
		emit_text("//")
		head = head + 2
		return
	end
	local scheme = {}
	local this = read()
	while this ~= END and all_valid(this, URI_CHARS) do
		insert(scheme, this)
		emit_text(this)
		head = head + 1
		this = read()
	end
	if this ~= ":" then
		fail_route()
	end
	emit_text(":")
	head = head + 1
	local slashes = read() == "/" and read(1) == "/"
	if slashes then
		emit_text("//")
		head = head + 2
	end
	if not is_scheme(concat(scheme), slashes) then
		fail_route()
	end
end

-- Parses the URI scheme of a free (unbracketed) external link. The scheme has
-- already been emitted as text, so it is recovered by scanning the current
-- text buffer backwards until whitespace. Raises BadRoute (without popping
-- anything) if a non-URI character is found or the scheme is not recognised.
-- On success, pushes a new layer with EXT_LINK_URI added to the context and
-- emits the scheme, ":" and optional "//" on it.
local function parse_free_uri_scheme()
	local scheme = {}
	if error_handler(
		function()
			for i = #stack.textbuffer, 1, -1 do
				local chunk = stack.textbuffer[i]
				for j = #chunk, 1, -1 do
					local char = chunk:sub(j, j)
					if char:find("%s") then
						-- Whitespace marks the start of the scheme.
						error(StopIteration)
					elseif not URI_CHARS[char] then
						error(BadRoute)
					end
					insert(scheme, 1, char)
				end
			end
		end,
		{
			[BadRoute] = function()
				return BadRoute
			end,
			[StopIteration] = function()
				return
			end
		}
	) == BadRoute then
		error(BadRoute)
	end
	scheme = concat(scheme)
	local slashes = (read() == "/" and read(1) == "/")
	if not is_scheme(scheme, slashes) then
		error(BadRoute)
	end
	-- set() is pure, so its result has to be passed on; calling it as a bare
	-- statement left the new layer without EXT_LINK_URI.
	push(set(stack.context, EXT_LINK_URI))
	emit_text(scheme)
	emit_text(":")
	if slashes then
		emit_text("//")
		head = head + 2
	end
end

-- Emits a chunk of free-link text while holding back trailing punctuation.
-- `punct` is the set of characters treated as trailing punctuation, `tail` is
-- the list of held-back pieces, and `this` is the chunk to handle.
-- Once a "(" has been seen, ")" stops counting as punctuation (this mutates
-- `punct`). Returns the updated `punct` and `tail`.
local function handle_free_link_text(punct, tail, this)
	if this:find("%(") and punct[")"] then
		punct[")"] = nil
	end
	local this_len = #this
	if punct[this:sub(this_len, this_len)] then
		-- Index of the last non-punctuation character; 0 when the whole chunk
		-- is punctuation (leaving it nil would crash on `n + 1` below).
		local n = 0
		for i = this_len - 1, 1, -1 do
			if not punct[this:sub(i, i)] then
				n = i
				break
			end
		end
		local stripped = this:sub(1, n)
		-- Earlier held-back punctuation turned out to be inside the link.
		if #stripped > 0 and #tail > 0 then
			emit_text(concat(tail))
			tail = {}
		end
		insert(tail, this:sub(n + 1))
		this = stripped
	elseif #tail > 0 then
		emit_text(concat(tail))
		tail = {}
	end
	emit_text(this)
	return punct, tail
end

-- Decides whether chunk `this` (followed by chunk `nxt`) ends the URI being
-- read. Returns a truthy value if so, and false otherwise. The checks run in
-- a fixed order: the URI_END lookup, an embedded space, then the delimiters
-- that only matter inside certain contexts.
local function is_uri_end(this, nxt)
	local ctx = stack.context
	local hit = URI_END[this]
	if hit then
		return hit
	end
	hit = this:find(" ")
	if hit then
		return hit
	end
	if this == "|" and has(ctx, TEMPLATE) then
		return true
	end
	if this == "=" and has(ctx, TEMPLATE_PARAM_KEY * HEADING) then
		return true
	end
	if this == "}" and nxt == "}" then
		if has(ctx, TEMPLATE) then
			return true
		end
		return read(2) == "}" and has(ctx, ARGUMENT)
	end
	return false
end

-- Parses an external link starting at the head.
-- `brackets` selects between a bracketed link ("[uri title]") and a free link.
-- Returns the link's tokens; for free links it also returns `tail`, the list
-- of trailing text pieces that must be emitted after the link.
-- Raises BadRoute when the context forbids external links or the link turns
-- out to be invalid.
local function really_parse_external_link(brackets)
	if has(stack.context, NO_EXT_LINKS) then
		-- No layer has been pushed by this function yet, so raise directly:
		-- fail_route() would pop and memoize the caller's layer instead.
		BadRoute.context = stack.context
		error(BadRoute)
	end
	local invalid = {
		["\n"] = true,
		[" "] = true,
		["]"] = true,
	}
	local punct = {}
	if brackets then
		parse_bracketed_uri_scheme()
	else
		parse_free_uri_scheme()
		invalid["["] = true
		punct = {
			[","] = true,
			[";"] = true,
			["\\"] = true,
			["."] = true,
			[":"] = true,
			["!"] = true,
			["?"] = true,
			[")"] = true
		}
	end
	local this = read()
	if this == END or invalid[this:sub(1, 1)] then
		fail_route()
	end
	local nxt, tail = read(1), {}
	repeat
		if this == "&" then
			if #tail > 0 then
				emit_text(concat(tail))
				tail = {}
			end
			parse_entity()
		elseif this == "<" and nxt == "!" and read(2) == "-" and read(3) == "-" then
			if #tail > 0 then
				emit_text(concat(tail))
				tail = {}
			end
			parse_comment()
		elseif this == "{" and nxt == "{" and can_recurse() then
			if #tail > 0 then
				emit_text(concat(tail))
				tail = {}
			end
			parse_template_or_argument()
		elseif brackets then
			if this == END or this == "\n" then
				fail_route()
			elseif this == "]" then
				return pop()
			elseif is_uri_end(this, nxt) then
				if this:find(" ") then
					-- The space separates the URI from the link title.
					local before, after = this:match("^([^ ]*) (.*)$")
					emit_text(before)
					emit(ExternalLinkSeparator:new(false))
					if #after > 0 then
						emit_text(after)
					end
					head = head + 1
				else
					emit(ExternalLinkSeparator:new(true))
				end
				stack.context = set(
					unset(stack.context, EXT_LINK_URI),
					EXT_LINK_TITLE
				)
				-- Continue on the same layer to read the title.
				return parse(1, false)
			end
			emit_text(this)
		else
			if is_uri_end(this, nxt) then
				if this ~= END and this:find(" ") then
					local before, after = this:match("^([^ ]*) (.*)$")
					punct, tail = handle_free_link_text(punct, tail, before)
					insert(tail, " " .. after)
				else
					head = head - 1
				end
				return pop(), tail
			end
			punct, tail = handle_free_link_text(punct, tail, this)
		end
		head = head + 1
		this, nxt = read(), read(1)
	until false
end

-- Removes the last #scheme characters from the end of the current layer's
-- text buffer, dropping whole buffer entries where necessary and trimming the
-- final affected entry.
local function remove_uri_scheme_from_textbuffer(scheme)
	local remaining = #scheme
	local buffer = stack.textbuffer
	while remaining > 0 do
		local last = buffer[#buffer]
		local last_len = #last
		if remaining < last_len then
			-- Only part of this entry belongs to the scheme.
			buffer[#buffer] = last:sub(1, -remaining - 1)
			break
		end
		remaining = remaining - last_len
		remove(buffer)
	end
end

-- Ends a definition-list term: clears DL_TERM from the context, then treats a
-- ":" at the head as a list marker and anything else as a newline.
local function handle_dl_term()
	stack.context = unset(stack.context, DL_TERM)
	if read() == ":" then
		handle_list_marker()
	else
		emit_text("\n")
	end
end

-- Tries to parse an external link at the head (`brackets` selects the
-- bracketed or the free form).
-- On success the link is emitted between ExternalLinkOpen and
-- ExternalLinkClose; for free links the scheme, which was already emitted as
-- text, is first removed from the text buffer, and any trailing text is
-- emitted after the link.
-- On a bad route the head is restored and the chunk at the head is emitted as
-- text (or handled as the end of a definition-list term).
local function parse_external_link(brackets)
	local reset = head
	head = head + 1
	local link, extra = error_handler(
		function()
			return really_parse_external_link(brackets)
		end,
		{[BadRoute] = function()
			head = reset
			if (
				not brackets and
				has(stack.context, DL_TERM)
			) then
				handle_dl_term()
			else
				emit_text(read())
			end
			return BadRoute
		end}
	)
	if link ~= BadRoute then
		if not brackets then
			-- Everything in the first token before the first ":".
			local scheme = link[1]:match("^[^:]*%f[%z:]")
			remove_uri_scheme_from_textbuffer(scheme)
		end
		emit(ExternalLinkOpen:new(brackets))
		emit_all(link)
		emit(ExternalLinkClose)
		if extra then
			emit_text(concat(extra))
		end
	end
end

-- Handles "[[" at the head. First tries to read it as a bracketed external
-- link wrapped in an extra "[" (e.g. "[[http://...]"); if that route fails,
-- tries a wikilink; if that fails too, emits "[[" as text.
local function parse_wikilink()
	local reset = head + 1
	head = head + 2
	local link = error_handler(
		function()
			-- The tokens must be returned, otherwise `link` is nil on
			-- success and emit_all(link) below cannot work.
			return really_parse_external_link(true)
		end,
		{[BadRoute] = function()
			head = reset + 1
			local wikilink = error_handler(
				function()
					return parse(WIKILINK_TITLE)
				end,
				{[BadRoute] = function()
					head = reset
					emit_text("[[")
					return BadRoute
				end}
			)
			if wikilink == BadRoute then
				return BadRoute
			end
			emit(WikilinkOpen)
			emit_all(wikilink)
			emit(WikilinkClose)
			-- BadRoute here only signals "not an external link" to the code
			-- below; the wikilink has already been emitted.
			return BadRoute
		end}
	)
	if link ~= BadRoute then
		if has(stack.context, EXT_LINK_TITLE) then
			head = reset
			emit_text("[[")
		else
			emit_text("[")
			emit(ExternalLinkOpen:new(true))
			emit_all(link)
			emit(ExternalLinkClose)
		end
	end
end

-- Handles "|" in a wikilink title: switches the context from WIKILINK_TITLE
-- to WIKILINK_TEXT and emits the separator token.
local function handle_wikilink_separator()
	stack.context = set(
		unset(stack.context, WIKILINK_TITLE),
		WIKILINK_TEXT
	)
	emit(WikilinkSeparator)
end

-- Handles "]]" closing a wikilink: advances the head by one and returns the
-- wikilink layer's tokens.
local function handle_wikilink_end()
	head = head + 1
	return pop()
end

-- Handles the end of the input. If the current layer is in a state that must
-- be closed explicitly, the route fails (popping an extra layer first when a
-- DOUBLE state is set); otherwise the layer's tokens are returned.
local function handle_end()
	local context = stack.context
	-- The full FAIL aggregate is larger than 2^53 and so is not exactly
	-- representable as a number, which makes a remainder-based test against
	-- it unreliable. Testing its exactly representable parts separately
	-- covers the same set of states.
	if (
		has(context, TEMPLATE * ARGUMENT * WIKILINK * EXT_LINK_TITLE) or
		has(context, HEADING) or
		has(context, TAG) or
		has(context, STYLE)
	) then
		if has(context, DOUBLE) then
			pop()
		end
		fail_route()
	end
	return pop()
end

-- Checks whether chunk `this` is acceptable in the current (unsafe) context,
-- updating the FAIL_* bookkeeping states on the current layer as it goes.
-- Returns false if the route should fail, true otherwise.
local function verify_safe(this)
	local ctx = stack.context

	-- A failure was scheduled by the previous chunk.
	if has(ctx, FAIL_NEXT) then
		return false
	end

	if has(ctx, TEMPLATE_NAME) then
		if this == "{" then
			stack.context = set(ctx, FAIL_NEXT)
		end
		return true
	end

	if has(ctx, FAIL_ON_EQUALS) then
		return this ~= "="
	end

	if has(ctx, FAIL_ON_LBRACE) then
		if this == "{" or (read(-1) == "{" and read(-2) == "{") then
			local flag = has(ctx, TEMPLATE) and FAIL_ON_EQUALS or FAIL_NEXT
			stack.context = set(ctx, flag)
		else
			stack.context = unset(ctx, FAIL_ON_LBRACE)
		end
		return true
	end

	if has(ctx, FAIL_ON_RBRACE) then
		if this == "}" then
			stack.context = set(ctx, FAIL_NEXT)
		else
			stack.context = unset(ctx, FAIL_ON_RBRACE)
		end
		return true
	end

	-- No bookkeeping state is active: a lone brace arms the matching check.
	if this == "{" then
		stack.context = set(ctx, FAIL_ON_LBRACE)
	elseif this == "}" then
		stack.context = set(ctx, FAIL_ON_RBRACE)
	end
	return true
end

-- Main tokenizer loop. Reads chunks from the head onwards and dispatches on
-- markers until a handler closes the current layer, then returns that layer's
-- tokens.
-- `context` is the context for the new layer (default 1). Passing
-- `do_push == false` continues on the existing top layer instead of pushing.
-- Raises BadRoute if the route fails.
function parse(context, do_push)
	context = context or 1
	if do_push ~= false then
		push(context)
	end
	repeat
		-- The inner single-pass loop exists so that `break` acts as
		-- "continue": it skips the fall-through emit_text below and goes
		-- straight to advancing the head.
		repeat
			local this = read()
			if (
				has(stack.context, UNSAFE) and
				not verify_safe(this)
			) then
				if has(stack.context, DOUBLE) then
					pop()
				end
				fail_route()
			end
			if not MARKERS[this] then
				emit_text(this)
				break
			elseif this == END then
				return handle_end()
			end
			local nxt = read(1)
			if this == "{" and nxt == "{" and can_recurse() then
				parse_template_or_argument()
				break
			elseif this == "|" then
				if has(stack.context, TEMPLATE) then
					handle_template_param()
					break
				elseif has(stack.context, ARGUMENT) then
					handle_argument_separator()
					break
				elseif has(stack.context, WIKILINK_TITLE) then
					handle_wikilink_separator()
					break
				end
			elseif this == "=" and has(stack.context, TEMPLATE_PARAM_KEY) then
				handle_template_param_value()
				break
			elseif this == "}" and nxt == "}" then
				if has(stack.context, TEMPLATE) then
					return handle_template_end()
				elseif read(2) == "}" and has(stack.context, ARGUMENT) then
					return handle_argument_end()
				end
			elseif this == "[" then
				if nxt == "[" then
					if not has(stack.context, EXT_LINK_URI) then
						parse_wikilink()
						break
					end
				else
					parse_external_link(true)
					break
				end
			else
				local prev = read(-1)
				if this == ":" and not MARKERS[prev] then
					parse_external_link(false)
					break
				elseif this == "]" then
					if has(stack.context, WIKILINK) then
						return handle_wikilink_end()
					elseif has(stack.context, EXT_LINK_TITLE) then
						return pop()
					end
				elseif this == "&" then
					parse_entity()
					break
				elseif this == "<" and nxt == "!" and read(2) == "-" and read(3) == "-" then
					parse_comment()
					break
				elseif LIST_MARKERS[this] and NEWLINE[prev] then
					handle_list()
					break
				elseif DL_MARKERS[this] and has(stack.context, DL_TERM) then
					handle_dl_term()
					break
				end
			end
			-- A marker with no special meaning here is plain text.
			emit_text(this)
		until true
		head = head + 1
	until false
end

-- Entry point. Tokenizes the string `str` and returns the list of tokens.
-- `t` is an optional title object; its `fullText` is used in error messages.
-- Raises an error if tokenization ends on a bad route or leaves layers on the
-- stack.
local function tokenize(str, t)
	text = set_text(str)
	title = t
	head = 1
	depth = 0
	bad_routes = {}
	local tokens = error_handler(
		function()
			return parse()
		end,
		{[BadRoute] = function()
			error(traceback("Tokenizer exited with bad route."))
		end}
	)
	if #stack > 0 then
		error("Tokenizer exited with non-empty token stack.")
	end
	return tokens
end

return tokenize