Module:User:Theknightwho/wikitext parser

local anchor_encode = mw.uri.anchorEncode local byte = string.byte local char = string.char local concat = table.concat local explode = require("Module:string utilities").explode_utf8 local format = string.format local insert = table.insert local load_data = mw.loadData local lower = string.lower local match = string.match local pairs = pairs local rawset = rawset local remove = table.remove local rep = string.rep local require = require local select = select local setmetatable = setmetatable local sub = string.sub local tonumber = tonumber local tostring = tostring local type = type local ulower = string.ulower local umatch = mw.ustring.match local unpack = unpack local upper = string.upper local uupper = string.uupper

local m_parser = require("Module:parser") local d = load_data("Module:User:Theknightwho/wikitext parser/data")

-- Obtain the Parser and Node base objects from Module:parser. `new` has to be
-- called: assigning the bare function would bind it to `Parser` and leave
-- `Node` nil, breaking every `function Node:...` definition below.
local Parser, Node = m_parser.new()

local export = {}

-- -- Helper functions --

-- Like tostring, but no character escapes are applied.
-- Tables are converted through their `__rawstring` method; every other value
-- (and a table whose `__rawstring` returns a falsy value) goes through
-- `tostring`.
local function rawstring(this)
	return type(this) == "table" and this:__rawstring() or tostring(this)
end

-- -- Nodes --

-- Index-addressable view over a flat run of characters. Three parallel arrays
-- are kept on each instance: `__chars` (the characters), `__parents` (the
-- table each character is stored in) and `__keys` (its key in that table).
local Proxy = {}

-- Methods defined on Proxy win; anything else is looked up as a character.
function Proxy:__index(k)
	local method = Proxy[k]
	if method then
		return method
	end
	return self.__chars[k]
end

-- Writes the new value to both the cached character and the parent table.
-- A recorded key of `false` marks the character as immutable; a missing key
-- means the index does not exist.
function Proxy:__newindex(k, v)
	local key = self.__keys[k]
	if key == false then
		error("Character is immutable.")
	elseif not key then
		error("Invalid key.")
	end
	self.__chars[k] = v
	self.__parents[k][key] = v
end

-- Appends one entry: character `a`, its parent table `b` and its key `c`.
function Proxy:build(a, b, c)
	local chars, parents, keys = self.__chars, self.__parents, self.__keys
	insert(chars, a)
	insert(parents, b)
	insert(keys, c)
end

-- Iterator step for a generic `for`: returns the next index, the character,
-- the proxy itself, the key and the parent table, or nothing when there is
-- no character at the next index.
function Proxy:iter(i)
	local idx = i + 1
	if not self.__chars[idx] then
		return
	end
	return idx, self[idx], self, self.__keys[idx], self.__parents[idx]
end

-- Creates an empty Proxy that remembers this node in `__node`.
-- (The parameter list `()` was missing from the definition.)
function Node:new_proxy()
	return setmetatable({
		__node = self,
		__chars = {},
		__keys = {},
		__parents = {}
	}, Proxy)
end

-- Iterates over display characters.
-- Walks the node with the "next_display" traversal, collects every string
-- value into a Proxy, and returns the triple expected by a generic `for`
-- (Proxy.iter, the proxy, 0).
function Node:pairs_display()
	local proxy = self:new_proxy()
	for char, parent, key in self:__pairs("next_display") do
		if type(char) == "string" then
			proxy:build(char, parent, key)
		end
	end
	return Proxy.iter, proxy, 0
end

-- Iterates over raw wikitext characters.
-- Walks the node with the "next_raw" traversal and collects strings and
-- "apostrophes" nodes into a Proxy. When the traversal reports an item as not
-- mutable, its key is recorded as `false`, which makes Proxy reject writes to
-- it.
function Node:pairs_raw()
	local proxy = self:new_proxy()
	for char, parent, key, mut in self:__pairs("next_raw") do
		if (
			type(char) == "string" or
			char.type == "apostrophes"
		) then
			proxy:build(char, parent, mut and key or false)
		end
	end
	return Proxy.iter, proxy, 0
end

do
	-- Converts one UTF-8 encoded character (1-4 bytes) into a decimal numeric
	-- character reference such as "&#91;". The subtracted constants remove the
	-- UTF-8 lead/continuation marker bits from the weighted byte sum.
	local function escape(this)
		local len = #this
		if len == 1 then
			this = byte(this)
		elseif len == 2 then
			local b1, b2 = byte(this, 1, 2)
			this = 0x40 * b1 + b2 - 0x3080
		elseif len == 3 then
			local b1, b2, b3 = byte(this, 1, 3)
			this = 0x1000 * b1 + 0x40 * b2 + b3 - 0xE2080
		elseif len == 4 then
			local b1, b2, b3, b4 = byte(this, 1, 4)
			this = 0x40000 * b1 + 0x1000 * b2 + 0x40 * b3 + b4 - 0x3C82080
		end
		return "&#" .. this .. ";"
	end

	-- True for a missing value (start of input) as well as "\n" and "\r".
	local function is_newline(this)
		return not this or
			this == "\n" or
			this == "\r"
	end

	-- Decides whether the character at position `i` has to be escaped.
	-- `key` is falsy for immutable characters, which are returned untouched.
	-- `word` holds the characters of the word accumulated so far and
	-- `wikilink` counts the consecutive raw "]" characters just emitted.
	local function iterate(i, this, proxy, key, word, wikilink)
		if not key then
			return this
		elseif (
			d.NOWIKI[this] or
			wikilink == 2 and match(this, "^%a$")
		) then
			return escape(this)
		end
		local prev = proxy[i - 1]
		if (
			(is_newline(prev) and d.NOWIKI_START[this]) or
			prev == "_" and this == "_"
		) then
			return escape(this)
		end
		local nxt, nxt2 = proxy[i + 1], proxy[i + 2]
		if (
			this == ":" and nxt == "/" and nxt2 == "/" or
			is_newline(prev) and this == "-" and nxt == "-" and nxt2 == "-" and proxy[i + 3] == "-" or
			this == ":" and d.EL_SCHEMES_UNSLASHED[concat(word)] or
			d.SPACE_SEPARATOR[this] and d.MAGIC_LINKS[concat(word)]
		) then
			return escape(this)
		end
		return this
	end

	-- Serialises the node from its raw characters, escaping the ones that
	-- `iterate` flags.
	-- (The `()` on the definition and on `self:pairs_raw` were missing.)
	function Node:__tostring()
		local ret, word, ret_len, word_len, wikilink = {}, {}, 0, 0, 0
		for i, this, proxy, key in self:pairs_raw() do
			this = iterate(i, this, proxy, key, word, wikilink)
			ret_len = ret_len + 1
			ret[ret_len] = this
			if match(this, "[%w_]") then
				word_len = word_len + 1
				word[word_len] = this
			else
				-- Any non-word character resets the accumulated word.
				for j = 1, word_len do
					word[j] = nil
				end
				word_len = 0
			end
			-- Raw "]" only appears at the end of a bracketed external link or
			-- wikilink. If there are two in a row, then it must be the end of
			-- a wikilink.
			wikilink = this == "]" and wikilink + 1 or 0
		end
		return concat(ret)
	end

	-- Concatenates the raw characters without applying any escaping.
	function Node:__rawstring()
		local ret, len = {}, 0
		for _, this in self:pairs_raw() do
			len = len + 1
			ret[len] = this
		end
		return concat(ret)
	end
end

local Wikitext = Node:new_class("wikitext")

do
	local base_new = Wikitext.new

	-- Constructor wrapper: a string argument is first passed through
	-- `explode` (explode_utf8) before being handed to the inherited
	-- constructor.
	function Wikitext:new(t, force_wrapper)
		local items = t
		if type(items) == "string" then
			items = explode(items)
		end
		return base_new(self, items, force_wrapper)
	end
end

-- Steps to the next numeric index; every child is reported as mutable.
function Wikitext:next_display(i)
	local nxt = i + 1
	return self[nxt], nxt, true
end

-- Raw traversal is the same as display traversal for plain wikitext.
Wikitext.next_raw = Wikitext.next_display
Wikitext.__tostring = Node.__tostring

-- Return no children.
-- Shared traversal stub for node classes that have nothing to iterate.
-- (The parameter list `()` was missing from the definition.)
function Node:next_no_op()
end

-- A run of apostrophes; `num` is the number of apostrophes it stands for.
-- All three traversals yield no children.
local Apostrophes = Node:new_class("apostrophes")
Apostrophes.next = Node.next_no_op
Apostrophes.next_display = Node.next_no_op
Apostrophes.next_raw = Node.next_no_op

-- Renders the run as `num` literal apostrophes.
-- (The parameter list `()` was missing from the definition.)
function Apostrophes:__tostring()
	return rep("'", self.num)
end

local ExternalLink = Node:new_class("external link")

-- Structural traversal: scheme (if present), then url, then display.
function ExternalLink:next(i)
	local key
	if i == 0 then
		key = self.scheme and "scheme" or "url"
	elseif i == "scheme" then
		key = "url"
	elseif i == "url" then
		key = "display"
	else
		return nil
	end
	return self[key], key
end

-- FIXME: need to return something immutable if there's no display
-- Display traversal: only the display text is yielded.
function ExternalLink:next_display(i)
	if i ~= 0 then
		return
	end
	return self.display, "display"
end

-- Raw traversal. Numeric states produce the literal punctuation ("[", ":",
-- " ", "]"), which is reported as immutable; string states produce the
-- scheme, url and display children, which are reported as mutable.
function ExternalLink:next_raw(i)
	if i == 0 then
		if self.bracketed then
			return "[", 1, false
		end
		-- Unbracketed links skip straight to the first child.
		i = 1
	end
	local key
	if i == 1 then
		key = self.scheme and "scheme" or "url"
	elseif i == "scheme" then
		return ":", 2, false
	elseif i == 2 then
		key = "url"
	elseif i == "url" then
		if self.display then
			return " ", 3, false
		elseif self.bracketed then
			return "]", nil, false
		end
		return nil
	elseif i == 3 then
		key = "display"
	elseif i == "display" and self.display then
		return "]", nil, false
	else
		return nil
	end
	return self[key], key, true
end

local HTMLEntity = Node:new_class("html entity")

-- Display traversal: yields only the `char` field.
function HTMLEntity:next_display(i)
	if i == 0 then
		i = "char"
		return self[i], i
	end
end

-- Raw traversal: steps through the numeric indices, reporting every item as
-- immutable.
function HTMLEntity:next_raw(i)
	i = i + 1
	return self[i], i, false
end

-- Both string conversions delegate to `char`.
-- (The `()` on the definitions and on the delegated calls were missing.)
function HTMLEntity:__tostring()
	return self.char:__tostring()
end

function HTMLEntity:__rawstring()
	return self.char:__rawstring()
end

local HTMLTag = Node:new_class("html tag")
HTMLTag.next = Node.next_no_op

--[==[
function HTMLTag:__tostring()
	local str = "<"
	if self["end"] then
		str = str .. "/"
	end
	str = str .. tostring(self.name)
	if self.attributes then
		for i = 1, #self.attributes, 2 do
			str = str .. " " .. tostring(self.attributes[i]) .. "=\"" .. tostring(self.attributes[i + 1]) .. "\""
		end
	end
	if self.self_closing then
		str = str .. "/"
	end
	return str .. ">"
end
]==]

-- Percent-encoded character; traversal is shared with HTMLEntity.
local PercentEncoding = Node:new_class("percent-encoding")
PercentEncoding.next_display = HTMLEntity.next_display
PercentEncoding.next_raw = HTMLEntity.next_raw

-- Both string conversions return the `char` field as-is.
-- (The parameter list `()` was missing from the definition.)
function PercentEncoding:__tostring()
	return self.char
end

PercentEncoding.__rawstring = PercentEncoding.__tostring

-- Strip markers are traversed exactly like plain wikitext.
local StripMarker = Node:new_class("strip marker")
StripMarker.next_display = Wikitext.next_display
StripMarker.next_raw = Wikitext.next_raw

local Wikilink = Node:new_class("wikilink")

-- Structural traversal: prefix (if present), title, fragment (if present),
-- then display.
function Wikilink:next(i)
	local key
	if i == 0 then
		key = self.prefix and "prefix" or "title"
	elseif i == "prefix" then
		key = "title"
	elseif i == "title" then
		key = self.fragment and "fragment" or "display"
	elseif i == "fragment" then
		key = "display"
	else
		return nil
	end
	return self[key], key
end

Wikilink.next_display = ExternalLink.next_display

-- Raw traversal. Literal punctuation is returned first (always immutable);
-- the remaining states map to a child field, which is returned as mutable.
function Wikilink:next_raw(i)
	if i == 0 or i == 1 then
		return "[", i + 1, false
	elseif i == 2 then
		return ":", 3, false
	elseif i == "title" then
		if self.fragment then
			return "#", 4, false
		end
		return "|", 5, false
	elseif i == "fragment" then
		return "|", 5, false
	elseif i == "display" then
		return "]", 6, false
	elseif i == 6 then
		return "]", nil, false
	end
	local key
	if i == 3 then
		key = self.prefix and "prefix" or "title"
	elseif i == "prefix" then
		key = "title"
	elseif i == 4 then
		key = "fragment"
	elseif i == 5 then
		key = "display"
	else
		return nil
	end
	return self[key], key, true
end

local Prefix = Node:new_class("prefix")

-- Raw traversal: odd steps yield the prefix parts in order (mutable), even
-- steps yield the ":" that follows each part (immutable).
function Prefix:next_raw(i)
	local pos = i + 1
	if pos % 2 ~= 1 then
		return ":", pos, false
	end
	return self[(pos + 1) / 2], pos, true
end

--[==[
function Prefix:__tostring()
	local output = {}
	for i = 1, #self do
		insert(output, tostring(self[i]))
	end
	return concat(output, ":") .. ":"
end
]==]

local Category = Node:new_class("category")

--[==[
function Category:__tostring()
	local sortkey = self.sortkey and tostring(self.sortkey) or nil
	return ""
end
]==]

local Multipart = Node:new_class("multipart")

--[==[
function Multipart:__tostring()
	local output = {}
	for i = 1, #self do
		local v = self[i]
		insert(output, type(v) == "table" and v:__tostring() or tostring(v))
	end
	return concat(output, "‎／ ")
end
]==]

-- -- Parser --

-- Opens a new layer for `route`: a fresh, empty Wikitext-backed table that
-- records the current head, is stored at the next stack index and becomes the
-- current layer `self.n`.
function Parser:push(route)
	local depth = self.len + 1
	local layer = setmetatable({
		head = self.head,
		route = route,
		len = 0
	}, Wikitext)
	self[depth] = layer
	self.len = depth
	self.n = layer
end

-- Opens a sublayer on top of the current layer. The current layer is turned
-- into a metatable for the sublayer: its metamethods are copied onto it as
-- raw fields and its `__index`/`__newindex` are pointed at itself, so fields
-- the sublayer does not own are read from, and new fields written to, the
-- parent layer.
function Parser:push_sublayer(handler)
	local parent = self.n
	rawset(parent, "__concat", parent.__concat)
	rawset(parent, "__index", parent)
	rawset(parent, "__newindex", parent)
	rawset(parent, "__pairs", parent.__pairs)
	rawset(parent, "__tostring", parent.__tostring)
	local depth = self.len + 1
	local sublayer = setmetatable({
		handler = handler,
		sublayer = true,
		len = 0
	}, parent)
	self[depth] = sublayer
	self.len = depth
	self.n = sublayer
end

-- -- Italics and bold --

-- A direct copy of doQuotes in Parser.php: -- (1) ... is treated as ..., but an open  is implicitly closed as ...'. -- (2) A lone '' or  at the end of a line is treated as ' or ' respectively, but a lone  is completely ignored.

-- adjust_style_apostrophes is run if the number of StyleApostrophes2 and StyleApostrophes3 on the line are both odd, which converts one of the StyleApostrophes3 into an apostrophe followed by StyleApostrophes2. Parsoid uses the following priorities, and picks the first occurrence of the highest priority found: -- (1) After a single ASCII character after a normal space (" X'''"). -- (2) After multiple non-space characters ("XXX") or a non-ASCII character ("É"). -- (3) After a space (" '''"). -- Otherwise, no adjustment. -- If the new apostrophe is added straight after a free external link then the new apostrophe becomes part of the link, so the preceding FreeExternalLinkClose must be moved ahead of it. Note that this will cause any trailing punctuation between the old end and the apostrophe to be included as part of the link as well, since the new apostrophe means it is no longer trailing: for example, https://example.com/!' goes from being https://example.com/ followed by "!" to https://example.com/!' if adjusted (but remember that only one adjustment is made, even if the adjusted sequence is repeated). 
-- Rebalances apostrophe runs on a line where both the italic and the bold
-- counts are odd, by turning one three-apostrophe run into a literal "'"
-- followed by a two-apostrophe run.
-- Candidates, in order of preference:
--   * a run preceded by exactly one character that itself follows a space
--     (applied immediately);
--   * the first run not directly preceded by a space ("word" candidate);
--   * the first run directly preceded by a space ("space" candidate).
-- (The `()` on `layer:pairs_raw` was missing.)
function Parser:handle_odd_number_italics_and_bold(layer)
	-- A run of 5 counts towards both totals.
	local italics, bold = 0, 0
	for token in pairs(layer) do
		if token.type == "apostrophes" then
			if token.num ~= 2 then
				bold = bold + 1
			end
			if token.num ~= 3 then
				italics = italics + 1
			end
		end
	end
	if italics % 2 == 0 or bold % 2 == 0 then
		return
	end
	local word_token, word_parent, word_key, space_token, space_parent, space_key
	for i, token, proxy, key, parent in layer:pairs_raw() do
		if token.type == "apostrophes" and token.num == 3 then
			if proxy[i - 1] == " " then
				-- Only the first candidate of each kind is remembered.
				space_token = space_token or token
				space_parent = space_parent or parent
				space_key = space_key or key
			elseif proxy[i - 1] and proxy[i - 2] == " " then
				token.num = 2
				insert(parent, key, "'")
				parent.len = parent.len + 1
				return
			else
				word_token = word_token or token
				word_parent = word_parent or parent
				word_key = word_key or key
			end
		end
	end
	if word_token then
		word_token.num = 2
		insert(word_parent, word_key, "'")
		word_parent.len = word_parent.len + 1
		-- Walk back over punctuation to see whether it directly follows a
		-- free (unbracketed) external link.
		-- NOTE(review): this indexes `layer` with `word_key`, which assumes
		-- the word candidate's parent is `layer` itself - confirm.
		local i, prev = 0
		repeat
			i = i + 1
			prev = layer[word_key - i]
		until not (
			type(prev) == "string" and
			match(prev, "^[!%),%.:;%?\\]$")
		)
		if not (
			type(prev) == "table" and
			prev.type == "external link" and
			not prev.bracketed
		) then
			return
		end
		-- Move the punctuation and the new apostrophe into the link's url.
		i = i - 1
		local token
		for _ = 1, i + 1 do
			token = remove(layer, word_key - i)
			layer.len = layer.len - 1
			insert(prev.url, token)
			prev.url.len = prev.url.len + 1
		end
	elseif space_token then
		space_token.num = 2
		insert(space_parent, space_key, "'")
		space_parent.len = space_parent.len + 1
	end
	return
end

-- Replaces the "apostrophes" tokens of a layer with opening/closing <i> and
-- <b> HTMLTag nodes.
-- `state` tracks which tags are open: nil or "" (none), "i", "b", "bi" (b
-- opened first, then i), "ib" (i first, then b) and "both" (a run of 5 was
-- seen with nothing open; its position is kept in both_parent/both_key and
-- the token is only replaced once the next run shows which tag closes first).
-- Whenever a tag is inserted next to the current one, the parent's `len` and
-- the loop's `key` are advanced to account for the extra element.
function Parser:substitute_apostrophes(layer)
	local state, both_parent, both_key, final_parent, final_key
	for token, parent, key in pairs(layer) do
		-- Reset on every token, so these are only set after the loop if the
		-- last token visited was an apostrophes run.
		final_parent = false
		final_key = false
		if token.type == "apostrophes" then
			if token.num == 5 then
				if state == "b" then
					-- Close b, open i.
					parent[key] = HTMLTag:new{
						name = "b",
						["end"] = true
					}
					insert(parent, key + 1, HTMLTag:new{
						name = "i",
					})
					parent.len = parent.len + 1
					key = key + 1
					state = "i"
				elseif state == "i" then
					-- Close i, open b.
					parent[key] = HTMLTag:new{
						name = "i",
						["end"] = true
					}
					insert(parent, key + 1, HTMLTag:new{
						name = "b",
					})
					parent.len = parent.len + 1
					key = key + 1
					state = "b"
				elseif state == "bi" then
					-- Close the inner i, then the outer b.
					parent[key] = HTMLTag:new{
						name = "i",
						["end"] = true
					}
					insert(parent, key + 1, HTMLTag:new{
						name = "b",
						["end"] = true
					})
					parent.len = parent.len + 1
					key = key + 1
					state = ""
				elseif state == "ib" then
					-- Close the inner b, then the outer i.
					parent[key] = HTMLTag:new{
						name = "b",
						["end"] = true
					}
					insert(parent, key + 1, HTMLTag:new{
						name = "i",
						["end"] = true
					})
					parent.len = parent.len + 1
					key = key + 1
					state = ""
				elseif state == "both" then
					-- Close b then i here, and resolve the pending run of 5
					-- as <i><b>.
					parent[key] = HTMLTag:new{
						name = "b",
						["end"] = true
					}
					insert(parent, key + 1, HTMLTag:new{
						name = "i",
						["end"] = true
					})
					parent.len = parent.len + 1
					key = key + 1
					both_parent[both_key] = HTMLTag:new{
						name = "i",
					}
					insert(both_parent, both_key + 1, HTMLTag:new{
						name = "b",
					})
					both_parent.len = both_parent.len + 1
					-- The insertion at the pending position shifted the
					-- current position if both share a parent.
					if both_parent == parent then
						key = key + 1
					end
					both_parent = nil
					both_key = nil
					state = ""
				else
					-- Nothing open: defer the decision.
					both_parent = parent
					both_key = key
					state = "both"
				end
			else
				-- A run of 2 toggles "i"; any other run reaching this branch
				-- toggles "b".
				local this = token.num == 2 and "i" or "b"
				local other = this == "i" and "b" or "i"
				if state == this then
					parent[key] = HTMLTag:new{
						name = this,
						["end"] = true
					}
					state = ""
				elseif state == other .. this then
					-- `this` is the inner tag: just close it.
					parent[key] = HTMLTag:new{
						name = this,
						["end"] = true
					}
					state = other
				elseif state == this .. other then
					-- `this` is the outer tag: close both, then reopen
					-- `other`.
					parent[key] = HTMLTag:new{
						name = other,
						["end"] = true
					}
					insert(parent, key + 1, HTMLTag:new{
						name = this,
						["end"] = true
					})
					insert(parent, key + 2, HTMLTag:new{
						name = other,
					})
					parent.len = parent.len + 2
					key = key + 2
					state = other
				elseif state == "both" then
					-- `this` closes first, so the pending run of 5 opens
					-- `other` then `this`.
					parent[key] = HTMLTag:new{
						name = this,
						["end"] = true
					}
					both_parent[both_key] = HTMLTag:new{
						name = other,
					}
					insert(both_parent, both_key + 1, HTMLTag:new{
						name = this,
					})
					both_parent.len = both_parent.len + 1
					if both_parent == parent then
						key = key + 1
					end
					both_parent = nil
					both_key = nil
					state = other
				else
					-- Open `this`, nested inside `other` if that is open.
					parent[key] = HTMLTag:new{
						name = this,
					}
					state = state == other and other .. this or this
				end
			end
			final_parent = parent
			final_key = key
		end
	end
	-- No open tags at the end of a line.
	if final_parent and (
		final_parent[final_key].type == "apostrophes" or
		not final_parent[final_key]["end"]
	) then
		-- The last token is an unresolved run or an opening tag: drop it.
		final_parent[final_key] = nil
		final_parent.len = final_parent.len - 1
		if state == "i" or state == "b" or state == "both" then
			return
		end
		-- Only the outer tag (first letter of the state) remains open.
		state = sub(state, 1, 1)
	end
	-- Emit closing tags for whatever is still open, innermost first.
	if state == "b" or state == "ib" then
		self:emit(HTMLTag:new{
			name = "b",
			["end"] = true
		})
	end
	if state == "i" or state == "bi" or state == "ib" then
		self:emit(HTMLTag:new{
			name = "i",
			["end"] = true
		})
	end
	if state == "bi" then
		self:emit(HTMLTag:new{
			name = "b",
			["end"] = true
		})
	elseif state == "both" then
		-- Resolve the pending run of 5 as <b><i> and close i then b.
		self:emit(HTMLTag:new{
			name = "i",
			["end"] = true
		})
		self:emit(HTMLTag:new{
			name = "b",
			["end"] = true
		})
		both_parent[both_key] = HTMLTag:new{
			name = "b",
		}
		insert(both_parent, both_key + 1, HTMLTag:new{
			name = "i",
		})
		both_parent.len = both_parent.len + 1
	end
	return
end

-- Finishes the current layer at the end of a line: resolves apostrophe runs
-- (only if the layer recorded any in `apos`) and then unwraps nodes that were
-- kept as objects until that point.
-- Fixes: the percent-encoding branch used `char`, which in that scope is the
-- file-level alias of string.char rather than the token's character, so a
-- function was stored in the parent; it now stores `token.char`. The `()` on
-- the definition was also missing.
function Parser:finalize_line()
	if self.n.apos then
		self:handle_odd_number_italics_and_bold(self.n)
		self:substitute_apostrophes(self.n)
	end
	-- Conversions that need to be done after apostrophes have been processed.
	for token, parent, key in pairs(self.n) do
		local token_type = token.type
		if token_type == "html entity" then
			-- A single-element `char` is replaced by that element.
			local decoded = token.char
			parent[key] = #decoded == 1 and decoded[1] or decoded
		elseif token_type == "percent-encoding" then
			parent[key] = token.char
		elseif (
			token_type == "external link" and
			token.display and
			#token.display == 0
		) then
			-- An empty display is treated the same as no display.
			token.display = nil
		end
	end
end

-- -- Apostrophes --

do
	-- Handler for a run of apostrophes. Each further "'" records the current
	-- head in the layer's `apos` list; the first other character ends the run.
	-- `apos` is the recorded count plus one. A count of 1 fails the route; 2,
	-- 3 and 5 are used as-is; 4 emits one literal "'" and becomes 3; anything
	-- above 5 emits the surplus as literal "'" and becomes 5.
	-- (The `()` on `fail_route` and `pop` were missing.)
	local function handle_apostrophes(self, this)
		if this == "'" then
			self.n.apos = self.n.apos or {}
			insert(self.n.apos, self.head)
		else
			local apos = self.n.apos and #self.n.apos + 1 or 1
			if apos == 1 then
				return self:fail_route()
			elseif apos == 2 or apos == 3 or apos == 5 then
				self.n.num = apos
			elseif apos == 4 then
				self:emit("'")
				self.n.num = 3
			else
				for _ = 1, apos - 5 do
					self:emit("'")
				end
				self.n.num = 5
			end
			return self:pop()
		end
	end

	-- Route entry point: installs the handler and advances.
	function Parser:do_apostrophes()
		self:set("handler", handle_apostrophes)
		self:advance()
	end

	-- Attempts the apostrophes route. Returns nil if it fails; otherwise
	-- emits the route's tokens, steps back by one and returns an Apostrophes
	-- node built from the route's result.
	function Parser:apostrophes()
		local apostrophes = self:get("do_apostrophes")
		if apostrophes == self.n.bad_route then
			return nil
		end
		self:emit_tokens(apostrophes)
		self:advance(-1)
		return Apostrophes:new(apostrophes)
	end
end

-- -- Carriage return --

-- "\r" and "\r\n" are both treated as "\n".

-- Two-step handler. The first call only installs itself as the layer's
-- override. The second call (for the following character) clears the
-- override, steps back by one unless that character is "\n", and consumes
-- "\n" in its place.
function Parser:carriage_return(this)
	if self.n.override ~= self.carriage_return then
		self.n.override = self.carriage_return
		return
	end
	self.n.override = nil
	if this ~= "\n" then
		self:advance(-1)
	end
	return self:consume("\n")
end

-- -- Comment --

do
	-- Handlers.
	-- NOTE(review): `traverse_comment` and `handle_end` are declared here but
	-- never assigned anywhere in this block, and the string literal compared
	-- against in `handle_start` is empty. This looks like text lost when the
	-- source was extracted (presumably the comment delimiters and the code
	-- between them) - restore from the original page before relying on this
	-- block. Only the missing call parentheses have been repaired here.
	local handle_start
	local traverse_comment
	local handle_end

	function handle_start(self, this)
		self.n.i = self.n.i + 1
		if this ~= sub("", self.n.i, self.n.i) then
			self:advance()
			-- NOTE(review): `traverse_comment` is nil as the block stands.
			return traverse_comment(self)
		elseif self.n.i == 3 then
			return self:pop()
		end
	end

	-- Route entry point: installs the handler, disables magic words for the
	-- layer, initialises the position counter and advances.
	function Parser:do_comment()
		self:set("handler", handle_start)
		self.n.no_magic_word = true
		self.n.i = 1
		self:advance()
	end

	-- Attempts the comment route; if it fails, the current character is
	-- consumed normally. Nothing is returned on success.
	function Parser:comment()
		local comment = self:get("do_comment")
		if comment == self.n.bad_route then
			return self:consume()
		end
	end
end

-- -- External link --

-- Note: the Parsoid implementation of URLs is pretty crude, and doesn't respect the URL spec at https://url.spec.whatwg.org/ in many cases.

do local function is_invalid(this) return this == "" or			this == "\239\191\189" or -- U+FFFD Replacement Character this ~= "\t" and byte(this) <= 0x1F -- C0 control characters except \t end -- Handlers. local handle_bracketed_start local handle_double_bracketed_start local handle_bracketed_scheme local handle_free_scheme local handle_slashes local handle_after_scheme local handle_ip local handle_decoded_ip local handle_uri local handle_free_uri_trail local handle_bracketed_uri_whitespace local handle_uri_end local handle_bracketed_text -- If another "[" is found, record the position after it as wikilink_on_fail, which will be used as the head of a wikilink if this route fails. function handle_bracketed_start(self, this) if this == "[" then self.n.handler = handle_double_bracketed_start return end if this == "/" then self.n.handler = handle_slashes self.n.i = 0 else self:push_sublayer(handle_bracketed_scheme) end return self:consume end function handle_double_bracketed_start(self, this) self.n.wikilink_on_fail = self.head if this == "[" then return self:fail_route end self.n.handler = handle_bracketed_start return self:consume end function handle_bracketed_scheme(self, this) if this == ":" then local scheme = self:pop_sublayer local normalized_scheme = lower(concat(scheme)) if d.EL_SCHEMES_SLASHED[normalized_scheme] then self.n.handler = handle_slashes self.n.i = 0 elseif d.EL_SCHEMES_UNSLASHED[normalized_scheme] then self.n.handler = handle_after_scheme else return self:fail_route end self.n.scheme = Wikitext:new(scheme) elseif match(this, "^[%w%+%-%.]$") then self:emit(this) else return self:fail_route end end function handle_free_scheme(self) local i, this, nxt = 0, ":" repeat i = i - 1 this, nxt = self:emitted(i), this until type(this) ~= "string" or not match(this, "^[%w%+%-%.]$") if (			match(nxt, "^%a$") and -- Schemes must start with a letter.			
not (type(this) == "string" and umatch(this, "^%w$"))		) then local scheme = self:concat(-1, i + 1) local normalized_scheme = lower(scheme) self.n.scheme_pos = i + 1 self.n.pattern = "^[!%),%.:;%?\\]$"			if d.EL_SCHEMES_SLASHED[normalized_scheme] then				self:push_sublayer(handle_slashes)				self.n.i = 0			elseif d.EL_SCHEMES_UNSLASHED[normalized_scheme] then				self:push_sublayer(handle_after_scheme)			else				return self:fail_route			end			self.n.scheme = Wikitext:new(scheme)		else			return self:fail_route		end	end	function handle_slashes(self, this)		if this ~= "/" then			return self:fail_route		end		self:emit(this)		self.n.i = self.n.i + 1		if self.n.i == 2 then			self.n.handler = handle_after_scheme		end	end	-- Parsoid bugs:	-- (1) Entities for "[" (e.g. &lsqb;) aren't treated as the start of an IP address.	-- (2) Only "%5B" is converted to "[", not "%5b".	function handle_after_scheme(self, this)		self:push_sublayer(handle_uri)		if this == "%" then			this = self:percent_encoding or "%"			if rawstring(this) == "[" and this.code == "%5B" then self:emit(this) self:push_sublayer(handle_decoded_ip) return end self.head = this and this.head or self.head elseif this == "[" then self:emit("[") self:push_sublayer(handle_ip) return end return self:consume end -- IP URLs starting with "[" must have a matching "]". Fails if a non-IP character is found, since "[" is otherwise invalid. function handle_ip(self, this) if this == "]" then if #self.n == 0 then return self:fail_route end self:emit_tokens(self:pop_sublayer) self:emit("]") self.n.ip = true self.n.handler = handle_uri elseif match(this, "^[%x%.:]$") then self:emit(this) else return self:fail_route end end -- IP URLs starting with "%5B" must have a matching "%5D". If a non-IP character is found, "[" is converted back to "%5B". -- Parsoid bug: Only "%5D" is converted to "]", not "%5d". 
function handle_decoded_ip(self, this) if this == "%" then this = self:percent_encoding or "%" if (				#self.n > 0 and				rawstring(this) == "]" and				this.code == "%5D"			) then self:emit_tokens(self:pop_sublayer) self:emit(this) self.n.ip = true return end self.head = this and this.head or self.head elseif match(this, "^[%x%.:]$") then self:emit(this) return end self:emit_tokens(self:pop_sublayer) local i = 0 repeat i = i - 1 this = self:emitted(i) until rawstring(this) == "[" self:replace(i, "%") self:emit(i + 1, "5") self:emit(i + 1, "B") return self:consume end -- Note: Some valid wikitext characters which are invalid in URLs resolve to percent-encoding. -- Parsoid bugs: -- (1) In bracketed links, "<", ">" (and corresponding entities &lt; and &gt;) end the URI and start the text even if they come straight after the scheme, resulting in invalid targets like "https://". -- (2) In free links, the entities for "<", ">" and the non-breaking space are supposed to end the URI, but Parsoid doesn't account for &LT; &GT; and &NonBreakingSpace;. 
function handle_uri(self, this) local trail_pos = self.n.trail and #self.n + 1 if this == "&" then this = self:html_entity if not this then self:emit("&") elseif not this.char then this.char = Wikitext:new("\239\191\189") -- U+FFFD Replacement Character self:emit(this) else local decoded = rawstring(this) if decoded == " " then self:emit("+") elseif self.n.bracketed and (					this.code == "&lt;" or					this.code == "&gt;"				) or not self.n.bracketed and (					decoded == "<" and this.code ~= "&LT;" or					decoded == ">" and this.code ~= "&GT;" or					decoded == "\194\160" and this.code ~= "&NonBreakingSpace;"				) then self.head = this.head return handle_uri_end(self, trail_pos) elseif match(decoded, "^[\t\n\"<>%[%]|]$") then					this = format("%02X", byte(decoded))					self:emit("%")					self:emit(sub(this, 1, 1))					self:emit(sub(this, 2, 2))				else					self:emit(this)				end			end		elseif this == "'" then			this = self:apostrophes			if this then				self.n.apos = true				self.head = this.head				return handle_uri_end(self, trail_pos)			end			self:emit("'")		elseif this == "]" then			return handle_uri_end(self, trail_pos, true)		elseif not self.n.bracketed and this == "(" then -- Remove ")" from the trail pattern.			
self.n.pattern = "^[!,%.:;%?\\]$"			self:emit("(")		elseif this == "|" then			self:emit("%")			self:emit("7")			self:emit("C")		elseif this == "\127" then			this = self:strip_marker			if this then				self.head = this.head				return handle_uri_end(self, trail_pos)			end			self:emit("?")		elseif not self.n.bracketed and match(this, self.n.pattern) then self:push_sublayer(handle_free_uri_trail) self.n.trail_head = self.head return self:consume elseif d.SPACE_SEPARATOR[this] then if not self.n.bracketed then return handle_uri_end(self, trail_pos) end self.n.handler = handle_bracketed_uri_whitespace elseif match(this, "^[\"<>%[]$") then			return handle_uri_end(self, trail_pos)		elseif is_invalid(this) then			if self.n.bracketed then				return self:fail_route			end			return handle_uri_end(self, trail_pos)		elseif #this > 1 and (			d.IGNORED_IN_URI[this] or			match(this, "^\243\160[\128-\191][\128-\191]$") -- U+E0000–E0FFF		) then			return		else			self:emit(this)		end		if self.n.trail and #self.n >= trail_pos then			self:emit_tokens(trail_pos, self.n.trail)			self.n.trail = nil		end	end	-- Gather any trail characters and save them. Later, they will be added to the URI if we know that the end doesn't come straight after them. If it does, they'll be discarded and the head set to the start of the trail. Note: Parsoid never adds decoded entities to the trail.	-- Parsoid bug: If "(" is given as an entity, it does not cause ")" to be excluded from the trail characters.	function handle_free_uri_trail(self, this) if match(this, self.n.pattern) then self:emit(this) else local trail = self:pop_sublayer self.n.trail = trail return self:consume end end function handle_bracketed_uri_whitespace(self, this) if not d.SPACE_SEPARATOR[this] then return handle_uri_end(self) end end -- Fail if end comes straight after the scheme (+ slashes where applicable). If wikilink_on_fail is set (e.g. 
https://), then reset it to nil, since Parsoid won't parse it as a wikilink either (even though "https://" is a valid title!). -- For free links, we also still need to determine if the trail needs to be added (e.g. there may be excess apostrophes after it). -- IP square brackets use percent-encoding if the URI continues after "]", even if entered as raw characters. function handle_uri_end(self, trail_pos, force_pop) if #self.n == 0 then self.n.wikilink_on_fail = nil return self:fail_route elseif self.n.trail then if #self.n >= trail_pos then self:emit_tokens(trail_pos, self.n.trail) else self.head = self.n.trail_head end end self:emit_tokens(self:pop_sublayer) if self.n.ip and rawstring(self:emitted) ~= "]" then local i, this = 0 repeat i = i - 1 this = self:emitted(i) if rawstring(this) == "]" then self:replace(i, "%") self:emit(i + 1, "5") self:emit(i + 1, "D") elseif rawstring(this) == "[" then self:replace(i, "%") self:emit(i + 1, "5") self:emit(i + 1, "B") break end until not this end local url = Wikitext:new(self:pop_sublayer) self.n.url = url if not self.n.bracketed or force_pop then return self:pop end self:push_sublayer(handle_bracketed_text) return self:consume end function handle_bracketed_text(self, this) if this == "&" then self:emit(self:html_entity or "&") elseif this == "'" then this = self:apostrophes self.n.apos = self.n.apos or this and true or nil self:emit(this or "'") elseif this == "<" then self:html_tag elseif this == "]" then local raw_display = self:pop_sublayer self.n.display = Wikitext:new(raw_display) return self:pop elseif this == "\127" then self:emit(self:strip_marker or "?") elseif is_invalid(this) then return self:fail_route else self:emit(this) end end function Parser:do_bracketed_external_link self:set("handler", handle_bracketed_start) self.n.bracketed = true self:advance self:push_sublayer end function Parser:bracketed_external_link local link = self:get("do_bracketed_external_link") if link == self.n.bad_route then if 
link.wikilink_on_fail then self:wikilink(link.wikilink_on_fail) else self:emit("[") end else if link.wikilink_on_fail then self:emit("[") end self:emit(ExternalLink:new(link)) end end function Parser:do_free_external_link self:set("handler", handle_free_scheme) end function Parser:free_external_link local link = self:get("do_free_external_link") if link == self.n.bad_route then if self.n.dl then self:emit(self.DescriptionListSeparator) self.n.dl = nil else self:emit(":") end else -- Account for already-emitted scheme. for _ = -1, link.scheme_pos, -1 do				self:remove end self:emit(ExternalLink:new(link)) self:advance(-1) end end end

-- -- Heading --

do
	-- Handlers, forward-declared because they refer to one another.
	local handle_start
	local handle_start_whitespace
	local handle_start_excess
	local handle_only_equals_signs
	local handle_body
	local handle_body_whitespace
	local handle_end
	local handle_end_whitespace
	
	-- True for a space or a tab.
	local function is_blank(this)
		return this == " " or this == "\t"
	end
	
	-- True for a newline or the empty string (presumably the end-of-input marker; confirm against Module:parser).
	local function is_line_end(this)
		return this == "\n" or this == ""
	end
	
	-- Emits `count` equals signs at the end of the current layer.
	local function emit_equals(self, count)
		for _ = 1, count do
			self:emit("=")
		end
	end
	
	-- Counts the opening run of = signs (the first one is accounted for by do_heading).
	function handle_start(self, this)
		if this == "=" then
			self.n.eq = self.n.eq + 1
			return
		elseif is_line_end(this) then
			return handle_only_equals_signs(self)
		elseif is_blank(this) then
			self.n.handler = handle_start_whitespace
			return
		end
		handle_start_excess(self)
		self.n.handler = handle_body
		return self:consume()
	end
	
	-- Skips whitespace between the opening = signs and the heading text.
	function handle_start_whitespace(self, this)
		if is_line_end(this) then
			return handle_only_equals_signs(self)
		elseif is_blank(this) then
			return
		end
		handle_start_excess(self)
		self.n.handler = handle_body
		return self:consume()
	end
	
	-- Emit any excess = signs once we know it's a conventional heading. Up till now, we couldn't know if the heading is just a string of = signs (e.g. ========), so it wasn't guaranteed that the heading text starts after the 6th.
	function handle_start_excess(self)
		local excess = self.n.eq - 6
		if excess > 0 then
			emit_equals(self, excess)
			self.n.eq = 6
		end
	end
	
	-- The line consists only of = signs: ===== is "=" as an L2; ======== is "==" as an L3 etc.
	function handle_only_equals_signs(self)
		local total = self.n.eq
		if total < 3 then
			return self:fail_route()
		end
		-- The largest even number below the total (capped at 12) is split between the two sides of the heading.
		local used = total - 1
		used = used - used % 2
		if used > 12 then
			used = 12
		end
		-- Whatever is left over becomes the heading text.
		emit_equals(self, total - used)
		self.n.level = used / 2
		return self:pop()
	end
	
	-- Handles the heading text, including the attempt to close the heading whenever "=" is found.
	function handle_body(self, this)
		if this == "=" then
			local closing = self:get("do_heading_end")
			if closing == self.n.bad_route then
				-- = signs are just part of the heading.
				self:advance(#self.n.bad_route)
				self:emit_tokens(self.n.bad_route)
				return self:consume()
			end
			local opening = self.n.eq
			if closing > opening then
				-- Surplus closing = signs belong to the heading text.
				emit_equals(self, closing - opening)
				self.n.level = opening
				return self:pop()
			end
			-- Surplus opening = signs are put back at the start of the heading text.
			for _ = 1, opening - closing do
				self:emit(1, "=")
			end
			-- Remove already-emitted whitespace before end.
			while is_blank(self:emitted()) do
				self:remove()
			end
			self.n.level = closing
			return self:pop()
		elseif is_blank(this) then
			self:emit(this)
			self.n.override = handle_body_whitespace
		elseif is_line_end(this) then
			return self:fail_route()
		elseif this == "&" then
			self:emit(self:html_entity() or "&")
		elseif this == "'" then
			local styled = self:apostrophes()
			self.n.apos = self.n.apos or styled and true or nil
			self:emit(styled or "'")
		elseif this == ":" then
			self:free_external_link()
		elseif this == "<" then
			self:html_tag()
		elseif this == "I" or this == "P" or this == "R" then
			self:magic_link(this)
		elseif this == "[" then
			self:bracketed_external_link()
		elseif this == "\127" then
			self:emit(self:strip_marker() or "?")
		else
			self:emit(this)
		end
	end
	
	-- Swallows the rest of a run of whitespace in the heading text (the first character was already emitted).
	function handle_body_whitespace(self, this)
		if not is_blank(this) then
			self.n.override = nil
			return self:consume()
		end
	end
	
	-- Collects a candidate closing run of = signs; on success, returns the length of the popped layer.
	function handle_end(self, this)
		if this == "=" then
			self:emit("=")
		elseif is_line_end(this) then
			return #self:pop()
		elseif is_blank(this) then
			self.n.handler = handle_end_whitespace
		else
			return self:fail_route()
		end
	end
	
	-- Only whitespace may follow the closing = signs on the line.
	function handle_end_whitespace(self, this)
		if is_line_end(this) then
			return #self:pop()
		elseif not is_blank(this) then
			return self:fail_route()
		end
	end
	
	function Parser:do_heading()
		self:set("handler", handle_start)
		self.n.eq = 1
		self:advance()
	end
	
	function Parser:do_heading_end()
		self:set("handler", handle_end)
	end
	
	-- Tries to parse a heading. On success, emits <hN><span class="mw-headline" id="...">, the heading tokens, then the two closing tags.
	function Parser:heading()
		local heading = self:get("do_heading")
		if heading ~= self.n.bad_route then
			self:emit(HTMLTag:new{
				name = Wikitext:new("h" .. heading.level)
			})
			self:emit(HTMLTag:new{
				name = Wikitext:new("span"),
				attributes = {
					Wikitext:new("class"),
					Wikitext:new("mw-headline"),
					Wikitext:new("id"),
					export.parse_nowiki(anchor_encode(tostring(heading)))
				}
			})
			self:emit_tokens(heading)
			self:emit(HTMLTag:new{
				name = Wikitext:new("span"),
				["end"] = true
			})
			self:emit(HTMLTag:new{
				name = Wikitext:new("h" .. heading.level),
				["end"] = true
			})
		end
		self:advance(-1)
	end
end

-- -- Horizontal rule --

do
	-- Counts consecutive hyphens. The rule is complete at the first non-hyphen, provided at least four hyphens were seen.
	local function handle_horizontal_rule(self, this)
		if this == "-" then
			self.n.i = self.n.i + 1
			return
		end
		if self.n.i < 4 then
			return self:fail_route()
		end
		self:pop()
		return true
	end
	
	-- The counter starts at 1; presumably the first hyphen triggered the route (confirm against the caller).
	function Parser:do_horizontal_rule()
		self:set("handler", handle_horizontal_rule)
		self.n.i = 1
		self:advance()
	end
	
	-- Emits a self-closing <hr> tag if the route succeeds. The head is stepped back by one either way.
	function Parser:horizontal_rule()
		local result = self:get("do_horizontal_rule")
		if result ~= self.n.bad_route then
			self:emit(HTMLTag:new{
				name = Wikitext:new("hr"),
				self_closing = true
			})
		end
		self:advance(-1)
	end
end

-- -- HTML entity --

-- Parsoid regex: &([A-Za-z0-9\x80-\xff]+;)|&\#([0-9]+)|&\#[xX]([0-9A-Fa-f]+)|(&)

-- If the route decodes to an invalid entity (e.g. &#xD800;), then the route still succeeds, but the output is the original wikitext. This matches Parsoid, which processes such entities but makes the output the same as the input string. This means that inputs such as [[&#xD800;]] are treated as attempted links to pages with an HTML entity in the title (invalid), and not as a link to "&" with the fragment "xD800;".

-- Characters which are never valid in HTML entities. Note that non-ASCII characters are treated as valid in entity names by the Parsoid regex, since it supports some nonstandard entities that use them.

do
	local floor = math.floor
	
	-- True for the empty string and for any single-byte character that isn't alphanumeric. Multi-byte characters are never treated as invalid here.
	local function is_invalid(this)
		return this == "" or #this == 1 and not match(this, "^%w$")
	end
	
	-- Converts a codepoint to the equivalent character. Characters which aren't decoded by Parsoid return nil.
	-- Every byte value is floored explicitly, so string.char is never handed a fractional number.
	local function utf8_char(cp)
		if (
			cp <= 0x08 or
			cp >= 0x0B and cp <= 0x1F or
			cp >= 0x7F and cp <= 0x9F or
			cp >= 0xD800 and cp <= 0xDFFF or
			cp == 0xFFFE or cp == 0xFFFF or
			cp > 0x10FFFF
		) then
			return nil
		elseif cp < 0x80 then
			return char(cp)
		elseif cp < 0x800 then
			return char(
				0xC0 + floor(cp / 0x40),
				0x80 + cp % 0x40
			)
		elseif cp < 0x10000 then
			return char(
				0xE0 + floor(cp / 0x1000),
				0x80 + floor(cp / 0x40) % 0x40,
				0x80 + cp % 0x40
			)
		end
		return char(
			0xF0 + floor(cp / 0x40000),
			0x80 + floor(cp / 0x1000) % 0x40,
			0x80 + floor(cp / 0x40) % 0x40,
			0x80 + cp % 0x40
		)
	end
	
	-- Handlers.
	local handle_start
	local handle_numeric
	local handle_numeric_code
	local handle_dec_code
	local handle_hex_code
	local handle_named
	local handle_percent_encoding
	
	-- First character after "&": "#" starts a numeric entity, anything valid starts a named one.
	function handle_start(self, this)
		if this == "#" then
			self:emit(this)
			self.n.handler = handle_numeric
		elseif this == "%" then
			return handle_percent_encoding(self)
		elseif is_invalid(this) then
			return self:fail_route()
		else
			self:emit(this)
			self.n.handler = handle_named
		end
	end
	
	-- First character after "&#": "x"/"X" selects hexadecimal, a digit selects decimal.
	function handle_numeric(self, this)
		if this == "%" then
			return handle_percent_encoding(self)
		elseif this == "X" or this == "x" then
			self:emit(this)
			self.n.handler = handle_hex_code
		elseif match(this, "^%d$") then
			self:emit(this)
			self.n.handler = handle_dec_code
		else
			return self:fail_route()
		end
	end
	
	-- Shared body of the decimal and hexadecimal handlers.
	-- `digit_pattern` matches one valid digit, `start` is the index of the first digit token, and `base` is passed to tonumber (nil means decimal).
	function handle_numeric_code(self, this, digit_pattern, start, base)
		if this == "%" then
			return handle_percent_encoding(self)
		elseif this == ";" then
			local codepoint = tonumber(self:concat(start), base)
			if not codepoint then
				-- No digits at all (e.g. "&#x;"). This isn't an entity, so it's a plain parse failure rather than an undecodable character.
				return self:fail_route()
			end
			local decoded = utf8_char(codepoint)
			if not decoded then
				self.n.no_char = true
				return self:fail_route()
			end
			self:emit(";")
			self.n.char = Wikitext:new(decoded)
			return self:pop()
		elseif not match(this, digit_pattern) then
			return self:fail_route()
		end
		self:emit(this)
	end
	
	-- Digits start at token 3: "&", "#", then the digits.
	function handle_dec_code(self, this)
		return handle_numeric_code(self, this, "^%d$", 3)
	end
	
	-- Digits start at token 4: "&", "#", "x", then the digits.
	function handle_hex_code(self, this)
		return handle_numeric_code(self, this, "^%x$", 4, 16)
	end
	
	-- Named entity: the name (everything after "&") is looked up in Module:data/entities when ";" is reached.
	function handle_named(self, this)
		if this == "%" then
			return handle_percent_encoding(self)
		elseif this == ";" then
			local decoded = load_data("Module:data/entities")[self:concat(2)]
			if not decoded then
				self.n.no_char = true
				return self:fail_route()
			end
			self:emit(";")
			self.n.char = Wikitext:new(decoded)
			return self:pop()
		elseif is_invalid(this) then
			return self:fail_route()
		end
		self:emit(this)
	end
	
	-- Decodes a percent-encoding found inside the entity (only when `decode_percent` is set), then handles the decoded character in its place.
	function handle_percent_encoding(self)
		if not self.n.decode_percent then
			return self:fail_route()
		end
		local this = rawstring(self:percent_encoding() or "%")
		if this == "%" then -- Avoid double-decoding.
			return self:fail_route()
		end
		return self:consume(this)
	end
	
	-- `decode_percent` denotes underlying contexts in which percent-decoding should be attempted, since Parsoid decodes percent-encoding then HTML entities in that order (e.g. "%26%79%65%6E%3B" → "&yen;" → "¥").
	function Parser:do_html_entity(decode_percent)
		self:set("handler", handle_start)
		self.n.decode_percent = decode_percent
		self:emit("&")
		self:advance()
	end
	
	-- Returns nil if the parse fails (e.g. "&exam ple;", "&#123x;" or "&#x;"), and false if no character can be decoded (e.g. "&notvalid;" is not associated with any character, "&#0;" is a codepoint that doesn't get resolved, and "&#x100000;" is a codepoint that's too high). This is because the second type will cause wikilinks to fail, whereas the first will not.
	function Parser:html_entity(decode_percent)
		local entity = self:get("do_html_entity", decode_percent)
		if entity == self.n.bad_route then
			if self.n.bad_route.no_char then
				return false
			end
			return nil
		end
		entity.code = concat(entity)
		return HTMLEntity:new(entity)
	end
end

-- -- HTML tag --

do
	-- HTML whitespace.
	local function is_space(this)
		return this == " " or this == "\t" or this == "\n" or this == "\f"
	end
	
	-- Emits `this`, lowercasing it if it's an ASCII uppercase letter.
	local function emit_lowered(self, this)
		if match(this, "^%u$") then
			self:emit(lower(this))
		else
			self:emit(this)
		end
	end
	
	-- Pops the current sublayer and stores it as the tag name.
	-- The sublayer is popped before self.n is indexed, as in every other place a popped sublayer is stored.
	local function take_name(self)
		local name = Wikitext:new(self:pop_sublayer())
		self.n.name = name
	end
	
	-- Pops the current sublayer and emits it as a single Wikitext token (an attribute name or value).
	local function emit_sublayer(self)
		self:emit(Wikitext:new(self:pop_sublayer()))
	end
	
	-- Pops the attribute sublayer and stores it as the tag's attributes. Unless `even_if_empty` is set, an empty list is not stored.
	local function take_attributes(self, even_if_empty)
		local attributes = self:pop_sublayer()
		if even_if_empty or #attributes > 0 then
			self.n.attributes = attributes
		end
	end
	
	-- Handlers.
	local handle_start
	local handle_open_tag_name
	local handle_before_attribute_name
	local handle_attribute_name
	local handle_after_attribute_name
	local handle_before_attribute_value
	local handle_quoted_attribute_value
	local handle_unquoted_attribute_value
	local handle_self_closing_tag
	local handle_end_tag_start
	local handle_end_tag_name
	local handle_end_tag_remainder
	
	-- First character after "<": "/" starts an end tag, a letter starts an open tag.
	function handle_start(self, this)
		if this == "/" then
			self:push_sublayer(handle_end_tag_start)
		elseif match(this, "^%a$") then
			self:push_sublayer(handle_open_tag_name)
			self:emit(lower(this))
		else
			return self:fail_route()
		end
	end
	
	-- Collects the (lowercased) name of an open tag.
	function handle_open_tag_name(self, this)
		if this == "/" then
			take_name(self)
			self.n.handler = handle_self_closing_tag
		elseif this == ">" then
			take_name(self)
			return self:pop()
		elseif this == "" then
			return self:fail_route()
		elseif is_space(this) then
			take_name(self)
			self:push_sublayer(handle_before_attribute_name)
		else
			emit_lowered(self, this)
		end
	end
	
	-- Between attributes. A leading "=" is treated as the first character of an attribute name.
	function handle_before_attribute_name(self, this)
		if this == "/" then
			self.n.handler = handle_self_closing_tag
		elseif this == "=" then
			self:push_sublayer(handle_attribute_name)
			self:emit("=")
		elseif this == ">" then
			take_attributes(self)
			return self:pop()
		elseif this == "" then
			return self:fail_route()
		elseif not is_space(this) then
			self:push_sublayer(handle_attribute_name)
			return self:consume()
		end
	end
	
	-- Collects the (lowercased) attribute name.
	function handle_attribute_name(self, this)
		if this == "/" or this == ">" or is_space(this) then
			emit_sublayer(self)
			self.n.handler = handle_after_attribute_name
			return self:consume()
		elseif this == "=" then
			emit_sublayer(self)
			self.n.handler = handle_before_attribute_value
		elseif this == "" then
			return self:fail_route()
		else
			emit_lowered(self, this)
		end
	end
	
	-- After an attribute name. If no "=" follows, an empty Wikitext is emitted as the attribute's value.
	function handle_after_attribute_name(self, this)
		if this == "/" then
			self:emit(Wikitext:new{})
			self.n.handler = handle_self_closing_tag
		elseif this == "=" then
			self.n.handler = handle_before_attribute_value
		elseif this == ">" then
			self:emit(Wikitext:new{})
			take_attributes(self, true)
			return self:pop()
		elseif this == "" then
			return self:fail_route()
		elseif not is_space(this) then
			self:emit(Wikitext:new{})
			self:push_sublayer(handle_attribute_name)
			return self:consume()
		end
	end
	
	-- After "=": a quote starts a quoted value, ">" ends the tag with an empty value, and anything else that isn't whitespace starts an unquoted value.
	function handle_before_attribute_value(self, this)
		if this == "\"" or this == "'" then
			self:push_sublayer(handle_quoted_attribute_value)
			self:set("quoter", this)
		elseif this == ">" then
			self:emit(Wikitext:new{})
			take_attributes(self, true)
			return self:pop()
		elseif not is_space(this) then
			self:push_sublayer(handle_unquoted_attribute_value)
			return self:consume()
		end
	end
	
	-- Collects a quoted value up to the matching quote character.
	function handle_quoted_attribute_value(self, this)
		if this == self.n.quoter then
			emit_sublayer(self)
			self.n.handler = handle_before_attribute_name
		elseif this == "&" then
			self:emit(self:html_entity() or "&")
		elseif this == "" then
			return self:fail_route()
		else
			self:emit(this)
		end
	end
	
	-- Collects an unquoted value up to whitespace or ">".
	function handle_unquoted_attribute_value(self, this)
		if this == "&" then
			self:emit(self:html_entity() or "&")
		elseif this == ">" then
			emit_sublayer(self)
			take_attributes(self, true)
			return self:pop()
		elseif this == "" then
			return self:fail_route()
		elseif is_space(this) then
			emit_sublayer(self)
			self.n.handler = handle_before_attribute_name
		else
			self:emit(this)
		end
	end
	
	-- After "/": only ">" makes the tag self-closing. Anything else is handled as if between attributes.
	function handle_self_closing_tag(self, this)
		if this ~= ">" then
			self.n.handler = handle_before_attribute_name
			return self:consume()
		end
		self.n.self_closing = true
		take_attributes(self)
		return self:pop()
	end
	
	-- First character after "</": must be a letter.
	function handle_end_tag_start(self, this)
		if not match(this, "^%a$") then
			return self:fail_route()
		end
		self.n["end"] = true
		self:emit(lower(this))
		self.n.handler = handle_end_tag_name
	end
	
	-- Collects the (lowercased) name of an end tag.
	function handle_end_tag_name(self, this)
		if this == "/" or is_space(this) then
			take_name(self)
			self.n.handler = handle_end_tag_remainder
		elseif this == ">" then
			take_name(self)
			return self:pop()
		elseif this == "" then
			return self:fail_route()
		else
			emit_lowered(self, this)
		end
	end
	
	-- Everything between the end tag's name and ">" is ignored.
	function handle_end_tag_remainder(self, this)
		if this == ">" then
			return self:pop()
		elseif this == "" then
			return self:fail_route()
		end
	end
	
	function Parser:do_html_tag()
		self:set("handler", handle_start)
		self:advance()
	end
	
	-- Emits an HTMLTag if the route succeeds, or a literal "<" if it fails.
	function Parser:html_tag()
		local tag = self:get("do_html_tag")
		if tag == self.n.bad_route then
			self:emit("<")
		else
			self:emit(HTMLTag:new(tag))
		end
	end
end

-- -- Magic link --

-- Parsoid regexes:
	-- ISBN: \bISBN$spaces((?:97[89]$spdash?)?(?:[0-9]$spdash?){9}[0-9Xx])\b
	-- PMID/RFC: \b(?:RFC|PMID)$spaces([0-9]+)\b
	-- where:
	-- $spaces is (?:\t| |&\#0*160;|&\#[Xx]0*[Aa]0;|\p{Zs})++
	-- $spdash is (?:-|\t| |&\#0*160;|&\#[Xx]0*[Aa]0;|\p{Zs})

do
	-- Handlers.
	local handle_prefix
	local handle_whitespace
	local handle_isbn13_number_first
	local handle_isbn_number
	local handle_isbn_spdash
	local handle_isbn_end
	local handle_other_number
	
	-- Parses an HTML entity at the head and returns a truthy value only if it decodes to a non-breaking space and isn't written as &NonBreakingSpace;.
	local function nbsp_entity(self)
		local entity = self:html_entity()
		return entity and
			entity.code ~= "&NonBreakingSpace;" and
			rawstring(entity) == "\194\160"
	end
	
	-- Matches the prefix (ISBN, PMID or RFC) character by character. Once complete, the route fails if the token emitted before the prefix is an alphanumeric character.
	function handle_prefix(self, this)
		self.n.i = self.n.i + 1
		if this ~= sub(self.n.prefix, self.n.i, self.n.i) then
			return self:fail_route()
		end
		self:emit(this)
		if self.n.i == #self.n.prefix then
			if (
				type(self.n.prev) == "string" and
				umatch(self.n.prev, "^%w$")
			) then
				return self:fail_route()
			end
			self.n.handler = handle_whitespace
		end
	end
	
	-- Handles the whitespace between the prefix and the number, which is normalized to a single space once the first digit is found.
	function handle_whitespace(self, this)
		if this == "&" then
			if not nbsp_entity(self) then
				return self:fail_route()
			end
			self.n.ws_found = true
		elseif d.SPACE_SEPARATOR[this] then
			self.n.ws_found = true
		elseif match(this, "^%d$") then
			-- $spaces must match at least once, so a number which comes straight after the prefix (e.g. "RFC2616") is not a magic link.
			if not self.n.ws_found then
				return self:fail_route()
			end
			self:emit(" ")
			if self.n.prefix == "ISBN" then
				local number = self:get("do_isbn_number")
				if number == self.n.bad_route then
					return self:fail_route()
				end
				self:emit_tokens(number)
				return self:pop()
			end
			self.n.handler = handle_other_number
			return self:consume()
		else
			return self:fail_route()
		end
	end
	
	-- spdash is not allowed between the first three digits of an ISBN13 number.
	function handle_isbn13_number_first(self, this)
		self.n.i = self.n.i + 1
		if (
			self.n.i == 1 and this ~= "9" or
			self.n.i == 2 and this ~= "7" or
			self.n.i == 3 and this ~= "8" and this ~= "9"
		) then
			return self:fail_route()
		end
		self:emit(this)
		if self.n.i == 3 then
			return self:pop()
		end
	end
	
	-- Ten digits, the last of which may be "X" or "x". Each of the first nine may be followed by one spdash.
	function handle_isbn_number(self, this)
		self.n.i = self.n.i + 1
		if self.n.i == 10 and match(this, "^[%dXx]$") then
			self:emit(this)
			self.n.handler = handle_isbn_end
		elseif self.n.i < 10 and match(this, "^%d$") then
			self:emit(this)
			self.n.override = handle_isbn_spdash
		else
			return self:fail_route()
		end
	end
	
	-- Optional separator after an ISBN digit. A non-breaking space entity is emitted in its decoded form.
	function handle_isbn_spdash(self, this)
		self.n.override = nil
		if this == "&" then
			if not nbsp_entity(self) then
				return self:fail_route()
			end
			self:emit("\194\160")
		elseif this == "-" or d.SPACE_SEPARATOR[this] then
			self:emit(this)
		else
			return self:consume()
		end
	end
	
	-- The ISBN must not be followed by an alphanumeric character.
	function handle_isbn_end(self, this)
		if umatch(this, "^%w$") then
			return self:fail_route()
		end
		return self:pop()
	end
	
	-- PMID/RFC number: digits, which must not be followed by any other alphanumeric character.
	function handle_other_number(self, this)
		if match(this, "^%d$") then
			self:emit(this)
		elseif umatch(this, "^%w$") then
			return self:fail_route()
		else
			return self:pop()
		end
	end
	
	-- `this` is the first letter of the prefix ("I", "P" or "R").
	function Parser:do_magic_link(this)
		self:set("handler", handle_prefix)
		self.n.prev = self:emitted()
		self.n.prefix = this == "I" and "ISBN" or
			this == "P" and "PMID" or
			this == "R" and "RFC"
		self.n.i = 0
	end
	
	-- Tries an ISBN13 first (3-digit prefix + 10 digits). If that fails, the head is reset and this layer parses a plain 10-digit ISBN.
	function Parser:do_isbn_number()
		self:set("handler", handle_isbn_number)
		local isbn13_first = self:get("do_isbn13_number_first")
		if isbn13_first ~= self.n.bad_route then
			local isbn13_rem = self:get("do_isbn13_number_remainder")
			if isbn13_rem ~= self.n.bad_route then
				self:emit_tokens(isbn13_first)
				self:emit_tokens(isbn13_rem)
				return self:pop()
			end
			self.head = isbn13_first.head
		end
		self.n.i = 0
	end
	
	function Parser:do_isbn13_number_first()
		self:set("handler", handle_isbn13_number_first)
		self.n.i = 0
	end
	
	-- Starts with the spdash override set, since a separator is permitted straight after the 3-digit prefix.
	function Parser:do_isbn13_number_remainder()
		self:set("handler", handle_isbn_number)
		self.n.i = 0
		self.n.override = handle_isbn_spdash
		self:advance()
	end
	
	-- Emits the magic link as a Wikilink to Special:BookSources (ISBN) or as an ExternalLink (PMID/RFC). If the route fails, `this` is emitted as a literal.
	function Parser:magic_link(this)
		local magic_link = self:get("do_magic_link", this)
		if magic_link == self.n.bad_route then
			self:emit(this)
			return
		elseif magic_link.prefix == "ISBN" then
			local prefix = Prefix:new{Wikitext:new("Special")}
			local title = Wikitext:new("BookSources/")
			-- Tokens 1-5 are "ISBN "; only digits and "X" go into the title, without separators.
			for i = 6, #magic_link do
				if match(magic_link[i], "^[%dXx]$") then
					insert(title, upper(magic_link[i]))
				end
			end
			self:emit(Wikilink:new{
				prefix = prefix,
				title = title,
				display = Wikitext:new(magic_link)
			})
		else
			-- `i` is the index of the first digit token; `i + c` is the position in `url` at which the first digit is inserted.
			local url, scheme, i, c
			if magic_link.prefix == "PMID" then
				-- Digits go between "pubmed/" and "?dopt=Abstract".
				url, i, c = explode("//www.ncbi.nlm.nih.gov/pubmed/?dopt=Abstract"), 6, 25
			else
				-- Digits go at the end.
				url, i, c = explode("//tools.ietf.org/html/rfc"), 5, 21
				scheme = "https"
			end
			for n = i, #magic_link do
				insert(url, n + c, magic_link[n])
			end
			self:emit(ExternalLink:new{
				scheme = scheme and Wikitext:new(scheme) or nil,
				url = Wikitext:new(url),
				display = Wikitext:new(magic_link),
				bracketed = true
			})
		end
		self:advance(-1)
	end
end

-- -- Magic word --

do
	-- Handlers.
	local handle_start
	local handle_body
	local handle_end
	
	-- A second "_" must follow the one which triggered the route.
	function handle_start(self, this)
		if this ~= "_" then
			return self:fail_route()
		end
		self.n.handler = handle_body
	end
	
	-- Collects the letters of the name. A "_" may be the start of the closing "__".
	function handle_body(self, this)
		if this == "_" then
			self.n.handler = handle_end
		elseif match(this, "^%a$") then
			self:emit(this)
		else
			return self:fail_route()
		end
	end
	
	-- After a "_" in the body. A second "_" ends the word, which is returned in uppercase if it's a known magic word. A letter means the "_" was part of the name.
	function handle_end(self, this)
		if this == "_" then
			local magic_word = self:concat()
			if d.MAGIC_WORDS_CS[magic_word] then -- Case sensitive.
				return uupper(magic_word)
			end
			magic_word = uupper(magic_word)
			if d.MAGIC_WORDS_NOT_CS[magic_word] then -- Case insensitive.
				return magic_word
			end
			return self:fail_route()
		elseif match(this, "^%a$") then
			self:emit("_")
			self:emit(this)
			self.n.handler = handle_body
		else
			return self:fail_route()
		end
	end
	
	-- `no_magic_word` is set on the new layer, which makes Parser:magic_word consume the character without opening another route while that layer is current.
	function Parser:do_magic_word()
		self:set("handler", handle_start)
		self.n.no_magic_word = true
		self:advance()
	end
	
	-- Tries to parse a magic word (e.g. __TOC__). On success, the word is appended to self.magic_words; otherwise, the current character is consumed normally.
	function Parser:magic_word()
		if self.n.no_magic_word then
			return self:consume()
		end
		local magic_word = self:get("do_magic_word")
		if magic_word == self.n.bad_route then
			return self:consume()
		end
		self:pop()
		-- Create the list only if it doesn't exist yet. The check is on the same table that's appended to, so words found earlier are kept.
		if not self.magic_words then
			self.magic_words = {}
		end
		insert(self.magic_words, magic_word)
	end
end

-- -- Newline --

-- If a newline is found, the current layer is retained as the main layer for the current parse, but sublayers are used for each subsequent newline. This allows finalize_line to do line-by-line postprocessing (matching Parsoid), which can then be emitted to the main layer once finalised.
function Parser:newline()
	-- Strip any spaces and tabs already emitted at the end of the line.
	while true do
		local last = self:emitted()
		if last ~= " " and last ~= "\t" then
			break
		end
		self:remove()
	end
	self:finalize_line()
	-- Flush the finished line's sublayer (if there is one) into the layer beneath it.
	if self.n.sublayer then
		self:emit_tokens(self:pop_sublayer())
	end
	self:emit("\n")
	-- Start a fresh sublayer for the next line.
	self:push_sublayer()
end

-- -- Multipart --

-- Parses the input as a sequence of sections, restarting the parse after each one until a parse reports the end of the input.
-- `data` is the parse data used for each section. `on_fail` is optional parse data used to re-parse a section from the same head if the parse with `data` fails.
-- Returns the tokens directly if there was only one section, or a Multipart of all the sections otherwise.
function Parser:multipart(data, on_fail)
	data.route[2] = true -- multipart
	data.route[3] = 1 -- head
	if on_fail then
		data.allow_fail = true
		on_fail.route[2] = true -- multipart
	end
	local sections
	while true do
		local ok, tokens, parser = Parser:parse(data)
		if not ok then
			on_fail.route[3] = data.route[3] -- head
			tokens, parser = select(2, Parser:parse(on_fail))
		end
		if parser["end"] then
			-- Final section.
			if not sections then
				return tokens
			end
			insert(sections, tokens)
			return Multipart:new(sections)
		end
		-- More input remains, so the next section starts just after this parser's head.
		data.route[3] = parser.head + 1
		sections = sections or {}
		insert(sections, tokens)
	end
end

-- -- Percent-encoding --

-- If decoding fails, this will normally cause the containing wikilink to fail, since any bytes decoded up to that point would decode to an invalid UTF-8 sequence on their own, which is invalid anywhere in a link. However, if decoding fails on the leading byte due to an invalid raw character, then the wikilink will not fail, because the link will not contain any valid percent-encodings. e.g. %0G is a valid link, but %C2%0G and %80 will both fail, since "%C2" must have a trailing byte and "%80" can't be a leading byte in UTF-8.

do
	-- Handlers.
	local handle_leading_byte
	local handle_trailing_byte
	local handle_digit
	
	-- Parses the first "%XX". If the two characters after "%" aren't hex digits, the failure is flagged with `no_fail_wikilink`, since nothing has been decoded yet.
	function handle_leading_byte(self)
		local lead = self:get("do_digit")
		-- Compare against the returned route, as every other caller of `get` does, rather than testing whether a bad route merely exists.
		if lead == self.n.bad_route then
			self.n.no_fail_wikilink = true
			return self:fail_route()
		end
		local val = lead.val
		-- 0x80-0xC1 and 0xF5-0xFF can never start a UTF-8 sequence.
		if val > 0x7F and val < 0xC2 or val > 0xF4 then
			return self:fail_route()
		end
		self:emit_tokens(lead)
		if val < 0x80 then
			-- ASCII: complete after one byte.
			self.n.char = char(val)
			return self:pop()
		end
		self.n.bytes = {val}
		-- Total number of bytes in the sequence, determined by the leading byte.
		self.n.num = val < 0xE0 and 2 or val < 0xF0 and 3 or 4
		self.n.handler = handle_trailing_byte
	end
	
	-- Parses each subsequent "%XX", which must be a continuation byte (0x80-0xBF). The second byte is restricted further after certain leading bytes, which excludes overlong forms, surrogates and codepoints above U+10FFFF.
	function handle_trailing_byte(self, this)
		if this ~= "%" then
			return self:fail_route()
		end
		local trail = self:get("do_digit")
		if (
			trail == self.n.bad_route or
			trail.val < 0x80 or
			trail.val > 0xBF or
			#self.n.bytes == 1 and (
				self.n.bytes[1] == 0xE0 and trail.val < 0xA0 or
				self.n.bytes[1] == 0xED and trail.val > 0x9F or
				self.n.bytes[1] == 0xF0 and trail.val < 0x90 or
				self.n.bytes[1] == 0xF4 and trail.val > 0x8F
			)
		) then
			return self:fail_route()
		end
		self:emit_tokens(trail)
		insert(self.n.bytes, trail.val)
		if #self.n.bytes == self.n.num then
			self.n.char = char(unpack(self.n.bytes))
			return self:pop()
		end
	end
	
	-- Collects exactly two hex digits after "%" and stores their value in `val`.
	function handle_digit(self, this)
		if not match(this, "^%x$") then
			return self:fail_route()
		end
		self:emit(this)
		self.n.i = self.n.i + 1
		if self.n.i == 2 then
			-- Token 1 is "%", so the digits start at token 2.
			self.n.val = tonumber(self:concat(2), 16)
			return self:pop()
		end
	end
	
	function Parser:do_percent_encoding()
		self:set("handler", handle_leading_byte)
	end
	
	function Parser:do_digit()
		self:set("handler", handle_digit)
		self.n.i = 0
		self:emit("%")
		self:advance()
	end
	
	-- Returns a PercentEncoding on success. On failure, returns "%" if the leading byte wasn't followed by two hex digits (so nothing was decoded), or nil otherwise.
	function Parser:percent_encoding()
		local percent = self:get("do_percent_encoding")
		if percent == self.n.bad_route then
			return self.n.bad_route.no_fail_wikilink and "%" or nil
		end
		percent.code = concat(percent)
		return PercentEncoding:new(percent)
	end
end

-- -- Strip marker --

do
	local unstrip_nowiki = mw.text.unstripNoWiki
	
	-- The fixed text which follows the opening "\127" and precedes the closing one.
	local MARKER_PREFIX = "'\"`UNIQ--"
	local MARKER_SUFFIX = "-QINU`\"'\127"
	-- Index of the first token of the tag name: token 1 is "\127", followed by the prefix.
	local TAG_START = #MARKER_PREFIX + 2
	
	-- Handlers.
	local handle_prefix
	local handle_tag
	local handle_hex_code
	local handle_dec_code
	local handle_suffix
	
	-- Matches the fixed prefix character by character.
	function handle_prefix(self, this)
		local i = self.n.i + 1
		self.n.i = i
		if this ~= sub(MARKER_PREFIX, i, i) then
			return self:fail_route()
		end
		self:emit(this)
		if i == #MARKER_PREFIX then
			self.n.handler = handle_tag
		end
	end
	
	-- Collects the lowercase tag name up to "-", then selects the hex or decimal code handler according to the tag.
	function handle_tag(self, this)
		if this == "-" then
			self.n.tag = self:concat(TAG_START)
			self:emit("-")
			if d.STRIP_MARKERS_HEX[self.n.tag] then
				self.n.i = 0
				self.n.handler = handle_hex_code
			elseif d.STRIP_MARKERS_DEC[self.n.tag] then
				self.n.handler = handle_dec_code
			else
				return self:fail_route()
			end
		elseif match(this, "^%l$") then
			self:emit(this)
		else
			return self:fail_route()
		end
	end
	
	-- Exactly 8 characters (digits or uppercase letters), then "-".
	function handle_hex_code(self, this)
		if this == "-" then
			if self.n.i ~= 8 then
				return self:fail_route()
			end
			self:emit("-")
			-- Ends -QINU`\"'\127 (one dash), so the suffix's own dash has already been matched.
			self.n.i = 1
			self.n.handler = handle_suffix
		elseif match(this, "^[%d%u]$") then
			self:emit(this)
			self.n.i = self.n.i + 1
		else
			return self:fail_route()
		end
	end
	
	-- Any number of digits, then "-".
	function handle_dec_code(self, this)
		if this == "-" then
			self:emit("-")
			-- Ends --QINU`\"'\127 (two dashes), so the suffix is matched from its start.
			self.n.i = 0
			self.n.handler = handle_suffix
		elseif match(this, "^%d$") then
			self:emit(this)
		else
			return self:fail_route()
		end
	end
	
	-- Matches the fixed suffix character by character, from wherever the code handler left the counter.
	function handle_suffix(self, this)
		local i = self.n.i + 1
		self.n.i = i
		if this ~= sub(MARKER_SUFFIX, i, i) then
			return self:fail_route()
		end
		self:emit(this)
		if i == #MARKER_SUFFIX then
			return self:pop()
		end
	end
	
	function Parser:do_strip_marker()
		self:set("handler", handle_prefix)
		self.n.i = 0
		self:emit("\127")
		self:advance()
	end
	
	-- Returns a StripMarker, or nil if the route fails. A nowiki marker is unstripped and its contents parsed with export.parse_nowiki, keeping the original `tag` and `head`.
	function Parser:strip_marker()
		local marker = self:get("do_strip_marker")
		if marker == self.n.bad_route then
			return nil
		end
		if marker.tag == "nowiki" then
			local head = marker.head
			marker = export.parse_nowiki(unstrip_nowiki(concat(marker)))
			marker.tag = "nowiki"
			marker.head = head
		end
		return StripMarker:new(marker)
	end
end

-- -- Wikilink --

do
	-- Returns true if `this` cannot appear in a link target: nil, the empty
	-- string, U+FFFD, or anything matching `pattern`.
	local function is_invalid_target(this, pattern)
		return not not (
			not this or
			this == "" or
			this == "\239\191\189" or -- U+FFFD Replacement Character
			match(this, pattern)
		)
	end

	-- First pass: tokenize the raw wikilink (target, optional display text and trail).
	do
		local handle_target_decoding
		local handle_target
		local handle_target_whitespace
		local handle_target_escape
		local handle_capitalizer
		local handle_multipart
		local handle_end_after_target
		local handle_default_display_text
		local handle_after_pipe
		local handle_rsqb_after_pipe
		local handle_text
		local handle_text_after_newline
		local handle_end_after_text
		local handle_end_after_extra_rsqb
		local handle_trail

		-- Resolves "%" and "&" via self:percent_encoding() / self:html_entity(true).
		-- Returns two values: the token to emit and its raw string form.
		-- Returns nil if self:html_entity(true) returns false.
		function handle_target_decoding(self, this)
			if this == "%" then
				this = self:percent_encoding()
				if type(this) == "table" then
					-- Run the raw string back through, in case it is "%" or "&".
					return handle_target_decoding(self, rawstring(this))
				end
				return this, this
			elseif this == "&" then
				this = self:html_entity(true)
				if this == false then
					return nil
				end
			end
			return this or "&", type(this) == "table" and rawstring(this) or this or "&"
		end

		-- Main handler for the target portion of a wikilink.
		function handle_target(self, this)
			if this == "'" then
				this = self:apostrophes()
				-- Record that style apostrophes were seen, for wikilink_target.
				self.n.apos = self.n.apos or this and true or nil
				self:emit(this or "'")
				return
			elseif this == "\\" then
				self.n.override = handle_target_escape
				return
			elseif this == "^" then
				-- NOTE(review): no return here, so "^" also falls through to
				-- the decoding/emit logic below; confirm this is intended.
				self.n.override = handle_capitalizer
			elseif self.unembedded_link then
				if this == "/" then
					self.n.override = handle_multipart
					return
				elseif this == "" then
					local ret = handle_default_display_text(self, true)
					if ret then
						return ret
					end
					self["end"] = true
					return self:pop()
				end
			-- Only if not self.unembedded_link.
			elseif this == "]" then
				if #self.n == 0 then
					return self:fail_route()
				end
				self.n.handler = handle_end_after_target
				return
			elseif this == "|" then
				if #self.n == 0 then
					return self:fail_route()
				end
				local wikilink = self:wikilink_target(Wikitext:new(self:pop_sublayer(), true))
				if not wikilink then
					return self:fail_route()
				elseif wikilink.other then
					return self:pop()
				end
				self.n.handler = handle_after_pipe
				return
			end
			local decoded
			if self.n.fragment then
				-- "<" and ">" are valid as literals in fragments.
				if is_invalid_target(this, "^[%z\1-\31%[%]{|}\127]$") then
					return self:fail_route()
				end
				this, decoded = handle_target_decoding(self, this)
				if not decoded then
					return self:fail_route()
				end
			else
				this, decoded = handle_target_decoding(self, this)
				if is_invalid_target(decoded, "^[%z\1-\31<>%[%]{|}\127]$") then
					return self:fail_route()
				end
			end
			if decoded == "#" then
				self:emit(this)
				self.n.fragment = true
			elseif d.BIDI[decoded] then
				-- Characters listed in d.BIDI are dropped from the target.
				return
			elseif d.WIKILINK_SPACE[decoded] then
				self:emit(this)
				self.n.override = handle_target_whitespace
			else
				self:emit(this)
			end
		end

		-- Active after a spacing character in the target: plain spaces are
		-- skipped, other d.WIKILINK_SPACE characters are emitted, and anything
		-- else hands control back to the normal handler.
		function handle_target_whitespace(self, this)
			if this == " " then
				return
			elseif d.WIKILINK_SPACE[this] then
				self:emit(this)
			else
				self.n.override = nil
				return self:consume()
			end
		end

		-- Handles the character following a backslash in the target.
		function handle_target_escape(self, this)
			self.n.override = nil
			if this == "" then
				return self:consume()
			-- Retain escape for second pass.
			elseif this == "#" or this == ":" or this == "\\" then
				self:emit("\\")
			end
			self:emit(this)
		end

		-- Placeholder for "^" handling; currently only clears the override.
		function handle_capitalizer(self, this)
			self.n.override = nil
			-- TODO
		end

		-- Follows "/" in an unembedded link: a second "/" ends the link,
		-- otherwise the "/" was literal.
		function handle_multipart(self, this)
			self.n.override = nil
			if this == "/" then
				local ret = handle_default_display_text(self, true)
				if ret then
					return ret
				end
				return self:pop()
			else
				self:emit("/")
				return self:consume(this)
			end
		end

		-- Expects the second "]" of a link with no pipe, then sets up trail parsing.
		function handle_end_after_target(self, this)
			if this ~= "]" then
				return self:fail_route()
			end
			local ret = handle_default_display_text(self)
			if ret then
				return ret
			end
			local display = self.n.display
			-- Push self.n.display onto the stack for the trail.
			display.handler = handle_trail
			display.head = self.head
			display.route = handle_trail
			local len = self.len + 1
			self[len] = display
			self.n = display
			self.len = len
		end

		-- For links with no explicit display text: parses the target from a copy
		-- of the popped sublayer, and uses the sublayer itself as the display text.
		-- Returns a value only when the route fails or the layer is popped early.
		function handle_default_display_text(self, unembedded_link)
			local raw_display = self:pop_sublayer()
			-- Generate the target using a clone of raw_display, in case it gets trashed.
			local wikilink = self:wikilink_target(
				Wikitext:new({unpack(raw_display)}, true),
				unembedded_link
			)
			if not wikilink then
				return self:fail_route()
			elseif wikilink.other then
				return self:pop()
			end
			self.n.display = Wikitext:new(raw_display, true)
			-- Style apostrophes are parsed before the trail is added.
			self:substitute_apostrophes(self.n.display)
		end

		-- First character after "|".
		function handle_after_pipe(self, this)
			if this == "]" then
				self.n.handler = handle_rsqb_after_pipe
				return
			end
			self:push_sublayer(handle_text)
			return self:consume()
		end

		-- "|]" was seen: a further "]" (i.e. "|]]") fails the route; otherwise
		-- the "]" is literal display text.
		function handle_rsqb_after_pipe(self, this)
			if this == "]" then
				return self:fail_route()
			end
			self:push_sublayer(handle_text)
			self:emit("]")
			return self:consume()
		end

		-- Note: except for trails, sortkeys are parsed like display text, since Parsoid parses them before doing the category logic.
		function handle_text(self, this)
			if this == "\n" then
				self:newline()
				self.n.override = handle_text_after_newline
			elseif this == "&" then
				self:emit(self:html_entity() or "&")
			elseif this == "'" then
				this = self:apostrophes()
				self.n.apos = self.n.apos or this and true or nil
				self:emit(this or "'")
			elseif this == "<" then
				self:html_tag()
			elseif this == "[" then
				-- "[[" inside display text fails the route (except for files).
				if self.n.len > 0 and self:emitted() == "[" then
					if self.n.other == "file" then
						-- TODO
					else
						return self:fail_route()
					end
				end
				self.n.extra_rsqb = true
				self:emit("[")
			elseif this == "]" then
				self.n.handler = handle_end_after_text
				if self.n.extra_rsqb then
					-- After a stray "[", try to read "]]]" so the first "]" is literal.
					local end_of_text = self:get("do_wikilink_end_after_extra_rsqb")
					if end_of_text ~= self.n.bad_route then
						self:emit_tokens(end_of_text)
						return self:consume()
					end
				end
			elseif this == "{" then
				-- TODO: table
			elseif this == "|" and self.n.other == "file" then
				-- TODO
			elseif this == "\127" then
				self:emit(self:strip_marker() or "?")
			elseif this == "" then
				-- End of input before the link was closed.
				return self:fail_route()
			else
				self:emit(this)
			end
		end

		-- After a newline in display text: skip spaces/tabs, then check for
		-- line-start syntax.
		function handle_text_after_newline(self, this)
			if this == " " or this == "\t" then
				return
			end
			self.n.override = nil
			if this == "-" then
				self:horizontal_rule()
			elseif this == "=" then
				self:heading()
			else
				return self:consume()
			end
		end

		-- A "]" was seen in display text; a second one closes the link.
		function handle_end_after_text(self, this)
			if this == "]" then
				-- Style apostrophes are parsed before the trail is added.
				-- This is (bizarrely) even applied to sortkeys.
				self:finalize_line()
				if self.n.other == "category" then
					local sortkey = Wikitext:new(self:pop_sublayer())
					self.n.sortkey = sortkey
					return self:pop()
				end
				self.n.handler = handle_trail
			else
				self:emit("]")
				self.n.handler = handle_text
				return self:consume()
			end
		end

		-- Used by do_wikilink_end_after_extra_rsqb: requires two further "]".
		function handle_end_after_extra_rsqb(self, this)
			if this == "]" then
				self.n.i = self.n.i + 1
				if self.n.i == 2 then
					return self:pop()
				end
			else
				return self:fail_route()
			end
		end

		-- Appends characters matching %a to the display text; the first
		-- non-matching character ends the link and is stepped back over.
		function handle_trail(self, this)
			if not match(this, "^%a$") then
				local display = Wikitext:new(self:pop_sublayer())
				self.n.display = display
				self:advance(-1)
				return self:pop()
			end
			self:emit(this)
		end

		-- Route: first pass over a wikilink, optionally starting at `head`.
		function Parser:do_wikilink(head)
			self:set("handler", handle_target)
			self.head = head or self.head
			self:push_sublayer()
		end

		-- Route: reads "]]]", emitting the first "]" as a literal.
		function Parser:do_wikilink_end_after_extra_rsqb()
			self:set("handler", handle_end_after_extra_rsqb)
			self.n.i = 0
			self:emit("]")
			self:advance()
		end
	end

	-- Second pass over wikilink target:
	-- Get normalized prefixes: capitalization is ignored, and spacing characters + "_" become spaces.
	-- Get any fragment.
	-- Check for the colon trick.
	-- Ignore style apostrophes.
	do
		local handle_target_decoding_2
		local handle_prefix
		local handle_target_2
		local handle_target_escape_2
		-- Previously declared as `handle_category`, which left
		-- handle_file_or_category as an accidental global.
		local handle_file_or_category

		-- Table tokens are either apostrophes (reported as the string
		-- "apostrophes") or decoded entities, whose characters from `this.char`
		-- are spliced into self.text at the current position.
		function handle_target_decoding_2(self, this)
			if type(this) == "table" then
				if this.type == "apostrophes" then
					return "apostrophes"
				end
				-- Replace HTML entities and percent-encoding with the relevant characters.
				local decoded, pos = this.char, self.head
				self.text[pos] = decoded[1]
				for i = 2, #decoded do
					pos = pos + 1
					insert(self.text, pos, decoded[i])
				end
				return decoded[1]
			end
			return this
		end

		-- Reads one candidate prefix up to ":" and classifies it as a namespace
		-- or interwiki; fails the route if it is neither.
		function handle_prefix(self, this)
			if this == "" then
				return self:fail_route()
			end
			this = handle_target_decoding_2(self, this)
			if this == "apostrophes" then
				return self:fail_route()
			elseif this == ":" then
				-- Empty prefix (leading or doubled colon); the caller interprets it.
				if #self.n == 0 then
					return self:pop()
				end
				local raw_prefix = concat(self.n)
				local prefix = load_data("Module:data/namespaces")[raw_prefix]
				if prefix then
					self.n.prefix_type = "namespace"
					-- Normalize namespace.
					if raw_prefix == prefix then
						self.n.normalized = self.n
					else
						self.n.normalized = Wikitext:new(explode(prefix))
					end
					self.n.str = prefix
					return self:pop()
				end
				local prefix_type = load_data("Module:data/interwikis")[raw_prefix]
				if not prefix_type then
					return self:fail_route()
				else
					self.n.prefix_type = prefix_type
					self.n.normalized = self.n
					-- `prefix` is always nil on this path (the namespace lookup
					-- failed), so the old `self.n.str = prefix` stored nothing.
					-- NOTE(review): raw_prefix is the presumed intent - confirm.
					self.n.str = raw_prefix
					return self:pop()
				end
			-- Don't emit spaces at the start or end.
			elseif d.WIKILINK_SPACE[this] then
				if self.n.can_emit_space then
					self.n.do_emit_space = true
				end
			else
				if self.n.do_emit_space then
					self:emit(" ")
					self.n.do_emit_space = nil
				end
				if #this == 1 then
					-- Single-byte characters must match %w.
					if not match(this, "^%w$") then
						return self:fail_route()
					end
					self:emit(lower(this))
				else
					self:emit(ulower(this))
				end
				self.n.can_emit_space = true
			end
		end

		-- Reads the title and, after "#", the fragment.
		function handle_target_2(self, this)
			if this == "\\" then
				self.n.override = handle_target_escape_2
				return
			elseif this == "" then
				local layer = Wikitext:new(self:pop_sublayer())
				-- If the title is already set, this layer is the fragment.
				self.n[self.n.title and "fragment" or "title"] = layer
				return self:pop()
			end
			this = handle_target_decoding_2(self, this)
			if this == "apostrophes" then
				return
			elseif not self.n.title then
				if this == "#" then
					local title = Wikitext:new(self:pop_sublayer())
					self.n.title = title
					self:push_sublayer()
					return
				end
				-- TODO: add a title length counter and fail if too long
				if this == "%" then
					-- TODO: check for percent-encoding format
				elseif this == "&" then
					-- TODO: check for HTML entity format
				elseif this == "." then
					-- TODO: check for dot slash notation
				elseif this == "/" then
					-- TODO: ditto
				elseif this == "~" then
					-- TODO: check for 3+ consecutive tildes
				end
			end
			self:emit(this)
		end

		-- Emits the escaped character as-is.
		function handle_target_escape_2(self, this)
			self.n.override = nil
			self:emit(this)
		end

		-- Reads the title of a file or category link: apostrophe tokens become
		-- literal "'" characters, and other table tokens their raw string.
		function handle_file_or_category(self, this)
			if this == "" then
				local layer = Wikitext:new(self:pop_sublayer())
				self.n.title = layer
				return self:pop()
			elseif type(this) == "table" then
				if this.type == "apostrophes" then
					for _ = 1, this.num do
						self:emit("'")
					end
					return
				end
				this = rawstring(this)
			end
			self:emit(this)
		end

		-- Route: a single prefix.
		function Parser:do_prefix()
			self.n.handler = handle_prefix
		end

		-- Route: second pass over a link target. Collects prefixes until one
		-- fails or a namespace is reached, then reads the title/fragment.
		function Parser:do_wikilink_2(unembedded_link)
			local colons, prefix, prefixes, prev_prefix_type = 0
			while true do
				prefix = self:get("do_prefix")
				if prefix == self.n.bad_route then
					break
				elseif not prefixes then
					if prefix.len == 0 then
						if colons == 1 then
							return self:fail_route()
						end
						self.n.colon_trick = true
					elseif prefix.prefix_type == "current" then
						self.n.colon_trick = true
					else
						prefixes = Prefix:new{}
						insert(prefixes, prefix.normalized)
						prev_prefix_type = prefix.prefix_type
					end
					colons = 1
				elseif #prefixes == 1 and prefix.len == 0 then
					if (
						colons == 2 or
						colons == 1 and not (
							prev_prefix_type == "local" or
							prev_prefix_type == "external"
						)
					) then
						return self:fail_route()
					end
					colons = 2
				elseif prefix.len > 0 then
					insert(prefixes, prefix.normalized)
					colons = 1
					prev_prefix_type = prefix.prefix_type
				end
				self:advance()
				-- Category prefix in an unembedded link always links to the category.
				if not self.n.colon_trick and (
					prefix.str == "file" or
					prefix.str == "category" and not unembedded_link
				) then
					self.n.handler = handle_file_or_category
					self.n.other = prefix.str
					self:push_sublayer()
					return
				elseif prefix.prefix_type == "namespace" then
					break
				end
			end
			self.n.prefix = prefixes
			self.n.handler = handle_target_2
			self:push_sublayer()
		end

		-- Runs the second pass over `target` in a fresh Parser and copies the
		-- results onto the current layer. Returns the parsed layer, or nil if
		-- the target is invalid.
		function Parser:wikilink_target(target, unembedded_link)
			if self.n.apos then
				self:handle_odd_number_italics_and_bold(target)
				self.n.apos = nil
			end
			local parser = Parser:new(target)
			local wikilink = parser:get("do_wikilink_2", unembedded_link)
			if wikilink == parser.bad_route then
				return nil
			end
			self.n.title = wikilink.title
			if wikilink.other then
				self.n.other = wikilink.other
			else
				self.n.colon_trick = wikilink.colon_trick
				self.n.prefix = wikilink.prefix
				self.n.fragment = wikilink.fragment
			end
			return wikilink
		end
	end

	-- Tries to parse a wikilink; on failure emits the two opening brackets as
	-- literals, otherwise emits a Category or Wikilink node.
	function Parser:wikilink(head)
		local wikilink = self:get("do_wikilink", head)
		if wikilink == self.n.bad_route then
			self:emit("[")
			self:emit("[")
			self:advance()
		elseif wikilink.other == "category" then
			self:emit(Category:new(wikilink))
		else
			self:emit(Wikilink:new(wikilink))
		end
	end

	do
		-- Traversal for link templates: handles "\r" and "<" specially, skips
		-- "\0", and passes everything else to self:consume until a layer is
		-- returned.
		local function traverse_link_template(self)
			local this, layer
			repeat
				this = self:read()
				if this == "\r" then
					layer = self:carriage_return("\r")
				elseif this == "<" then
					layer = self:comment()
				elseif this ~= "\0" then
					layer = self:consume(this)
				end
				self:advance()
			until layer
			self:advance(-1)
			return layer
		end

		-- Route: an unembedded link (no surrounding brackets), starting at `head`.
		function Parser:do_link_template(_, head)
			self.traverse = traverse_link_template
			self.unembedded_link = true
			self.head = head
			return self:do_wikilink()
		end
	end

	-- Parses `str` as a multipart link-template argument: a Wikilink part via
	-- "do_link_template" and a Wikitext part via "do_default", over the same
	-- exploded text.
	function export.parse_link_template(str)
		local text = explode(str)
		return Parser:multipart(
			{
				text = text,
				node = {Wikilink},
				route = {"do_link_template"}
			},
			{
				text = text,
				node = {Wikitext},
				route = {"do_default"}
			}
		)
	end
end

-- -- Parser --

do
	-- Handlers.
	local handle_plaintext
	local handle_plaintext_whitespace
	local handle_plaintext_after_newline
	local handle_multipart

	-- Default handler for top-level wikitext: dispatches on the current character.
	function handle_plaintext(self, this)
		if this == " " or this == "\t" then
			self:emit(this)
			self.n.override = handle_plaintext_whitespace
		elseif this == "\n" then
			self:newline()
			self.n.override = handle_plaintext_after_newline
		elseif this == "&" then
			self:emit(self:html_entity() or "&")
		elseif this == "'" then
			this = self:apostrophes()
			-- Record that style apostrophes were seen on this layer.
			self.n.apos = self.n.apos or this and true or nil
			self:emit(this or "'")
		elseif this == "/" and self.multi then
			self.n.override = handle_multipart
		elseif this == ":" then
			self:free_external_link()
		elseif this == "<" then
			self:html_tag()
		elseif this == "I" or this == "P" or this == "R" then
			self:magic_link(this)
		elseif this == "[" then
			self:bracketed_external_link()
		elseif this == "\127" then
			self:emit(self:strip_marker() or "?")
		elseif this == "" then
			-- End of input.
			self:finalize_line()
			self["end"] = true
			return self:pop()
		else
			self:emit(this)
		end
	end

	-- After a space or tab: further spaces/tabs are dropped; anything else is
	-- handed back to the normal handler.
	function handle_plaintext_whitespace(self, this)
		if this ~= " " and this ~= "\t" then
			self.n.override = nil
			return self:consume(this)
		end
	end

	-- First character of a line: list markers, horizontal rules and headings.
	function handle_plaintext_after_newline(self, this)
		self.n.override = nil
		if this == "#" then
			self:emit(self.OrderedListMarker)
		elseif this == "*" then
			self:emit(self.UnorderedListMarker)
		elseif this == "-" then
			self:horizontal_rule()
		elseif this == ":" then
			self:emit(self.IndentationMarker)
		elseif this == ";" then
			self:emit(self.DescriptionListMarker)
			self.n.dl = true
		elseif this == "=" then
			self:heading()
		else
			return self:consume(this)
		end
	end

	-- Follows "/" in multipart mode: a second "/" ends the current part,
	-- otherwise the "/" was literal.
	function handle_multipart(self, this)
		self.n.override = nil
		if this == "/" then
			self:finalize_line()
			return self:pop()
		else
			self:emit("/")
			return self:consume(this)
		end
	end

	do
		-- Default traversal: handles "\r", "<" and "_" specially, skips "\0",
		-- and passes everything else to self:consume until a layer is returned.
		local function traverse_default(self)
			local this, layer
			repeat
				this = self:read()
				if this == "\r" then
					layer = self:carriage_return("\r")
				elseif this == "<" then
					layer = self:comment()
				elseif this == "_" then
					layer = self:magic_word()
				elseif this ~= "\0" then
					layer = self:consume(this)
				end
				self:advance()
			until layer
			self:advance(-1)
			return layer
		end

		-- Route: ordinary wikitext. `multipart` and `head` are only applied
		-- when `multipart` is truthy.
		function Parser:do_default(multipart, head)
			self.traverse = traverse_default
			if multipart then
				self.multi = multipart
				self.head = head
			end
			self:set("handler", handle_plaintext)
		end
	end

	-- Parses `str` as wikitext. With `multipart`, returns whatever
	-- Parser:multipart returns; otherwise returns only the second value
	-- returned by Parser:parse.
	function export.parse(str, multipart)
		local data = {
			text = explode(str),
			node = {Wikitext},
			route = {"do_default"}
		}
		if multipart then
			return Parser:multipart(data)
		end
		return (select(2, Parser:parse(data)))
	end
end

do
	local handle_nowiki
	local handle_multipart

	-- Handler for nowiki content: only carriage returns, HTML entities and (in
	-- multipart mode) "/" are special; NUL characters are dropped.
	function handle_nowiki(self, this)
		if this == "\r" then
			return self:carriage_return("\r")
		elseif this == "&" then
			self:emit(self:html_entity() or "&")
		elseif this == "/" and self.multi then
			self.n.override = handle_multipart
		elseif this == "" then
			-- End of input.
			self:finalize_line()
			self["end"] = true
			return self:pop()
		-- This was compared against "/0" (a two-character string), which is
		-- always unequal to a single character, so NULs were emitted. "\0"
		-- matches the NUL check used by the default traversal.
		elseif this ~= "\0" then
			self:emit(this)
		end
	end

	-- Follows "/" in multipart mode: a second "/" ends the current part,
	-- otherwise the "/" was literal.
	function handle_multipart(self, this)
		self.n.override = nil
		if this == "/" then
			return self:pop()
		else
			self:emit("/")
			return self:consume(this)
		end
	end

	-- Route: nowiki content. `multipart` and `head` are only applied when
	-- `multipart` is truthy.
	function Parser:do_nowiki(multipart, head)
		if multipart then
			self.multi = multipart
			self.head = head
		end
		self:set("handler", handle_nowiki)
	end

	-- Parses `str` as nowiki content; returns only the second value returned
	-- by Parser:parse.
	function export.parse_nowiki(str)
		local data = {
			text = explode(str),
			node = {Wikitext},
			route = {"do_nowiki"}
		}
		return (select(2, Parser:parse(data)))
	end
end

-- Module return value: the table holding the public parse functions.
return export