-- Module:User:Theknightwho/doSubstitutions

-- Run the string-library setup exported by Module:User:Theknightwho.
-- NOTE(review): the call parentheses are restored here because a bare field access is not a valid Lua
-- statement; what the call installs is not visible in this file (string.gsub_old, used by the entry
-- point, presumably comes from it) - confirm against that module.
require("Module:User:Theknightwho").convert_string_libraries()

-- Converts any specified exceptions into PUA characters, to avoid having diacritics stripped.
-- Uses the supplementary PUA planes (U+FXXXX & U+10XXXX), to ensure that any characters in the
-- BMP (U+0XXXX) or SMP (U+1XXXX) can be round-trip converted to PUA.
-- This will need to be reviewed if any characters in the SIP (U+2XXXX) or TIP (U+3XXXX) need to be
-- processed by it, but as these planes are exclusively CJK characters as of 2022, this is unlikely to
-- happen for the time being. However, it is unwise to start using non-PUA codepoints in the
-- U+4XXXX-U+EXXXX range, as support for these is completely untested, so they may result in
-- unpredictable behaviour.
--
-- Parameters:
--   text              - the string to process.
--   sc                - script object; only its toFixedNFD method is used here.
--   remove_exceptions - optional array of exception strings; if nil/false, text is returned unchanged.
--   undo              - if truthy, map the shifted sequences back to the original exceptions instead.
-- Returns the processed text.
local function removeExceptions(text, sc, remove_exceptions, undo)
	if not remove_exceptions then
		return text
	end
	local u, codepoint, ulen = mw.ustring.char, mw.ustring.codepoint, mw.ustring.len
	for _, exception in ipairs(remove_exceptions) do
		exception = sc:toFixedNFD(exception)
		-- Shift every codepoint of the exception up by 0xF0000, which builds the stand-in sequence.
		local shifted = {codepoint(exception, 1, ulen(exception))}
		for i, point in ipairs(shifted) do
			shifted[i] = u(point + 0xF0000)
		end
		local substitute = table.concat(shifted)
		if undo then
			-- The exception is the replacement string here, so a literal "%" has to be doubled.
			local literal_replacement = exception:gsub("%%", "%%%%")
			text = text:gsub(substitute, literal_replacement)
		else
			-- The stand-in is built codepoint by codepoint, so an exception can only ever be a literal
			-- string; escape pattern magic characters so that it is matched as one.
			local literal_pattern = exception:gsub("[%^%$%(%)%%%.%[%]%*%+%-%?]", "%%%0")
			text = text:gsub(literal_pattern, substitute)
		end
	end
	return text
end

-- Applies one layer of substitution data to `text`.
--
-- Parameters:
--   text              - the string to process.
--   self              - language object; only getCode() is called on it here.
--   sc                - script object; getCode(), toFixedNFD(), fixDiscouragedSequences() and
--                       toFixedNFC() are called on it here.
--   substitution_data - either a table (optional script-keyed or [1] fallback sub-data, `from`/`to`
--                       arrays, `remove_diacritics`, `remove_exceptions`) or a string naming a module
--                       (loaded as "Module:" .. substitution_data).
--   function_name     - name of the function to call on a dedicated module; "tr" skips the NFD
--                       normalization of the input.
--   recursed          - truthy when called from within this function; suppresses the final
--                       clean-up and NFC normalization.
-- Returns text, fail, cats. `fail` and `cats` come from the dedicated module or the nested call when
-- one ran; otherwise they stay at their initial values (nil and an empty table).
-- Any other type of substitution_data leaves text untouched apart from the final normalization.
local function doSubstitutions(text, self, sc, substitution_data, function_name, recursed)
	local fail, cats = nil, {}
	-- If there are language-specific substitutes given in the data module, use those.
	if type(substitution_data) == "table" then
		-- If a script is specified, run this function with the script-specific data before continuing.
		local sc_code = sc:getCode()
		if substitution_data[sc_code] then
			text, fail, cats = doSubstitutions(text, self, sc, substitution_data[sc_code], function_name, true)
		-- Hant, Hans and Hani are usually treated the same, so add a special case to avoid having to
		-- specify each one separately.
		elseif sc_code:match("^Han") and substitution_data.Hani then
			text, fail, cats = doSubstitutions(text, self, sc, substitution_data.Hani, function_name, true)
		-- Substitution data with key 1 in the outer table may be given as a fallback.
		elseif substitution_data[1] then
			text, fail, cats = doSubstitutions(text, self, sc, substitution_data[1], function_name, true)
		end
		-- Iterate over all strings in the "from" subtable, and gsub with the corresponding string in
		-- "to". We work with the NFD decomposed forms, as this simplifies many substitutions.
		if substitution_data.from then
			for i, from in ipairs(substitution_data.from) do
				-- We normalize each loop, to ensure multi-stage substitutions work correctly.
				text = sc:toFixedNFD(text)
				-- Check whether specific magic characters are present, as they rely on UTF-8
				-- compatibility. If not, just use string.gsub. In most cases, doing this is faster
				-- than using mw.ustring.gsub every time.
				local gsub
				if from:match("[%%.[%]*+%-?]") then
					gsub = mw.ustring.gsub
				else
					gsub = string.gsub
				end
				-- A missing "to" entry means the match is deleted.
				text = gsub(text, sc:toFixedNFD(from), substitution_data.to[i] or "")
			end
		end
		if substitution_data.remove_diacritics then
			text = sc:toFixedNFD(text)
			-- Convert exceptions to PUA.
			text = removeExceptions(text, sc, substitution_data.remove_exceptions)
			-- Strip diacritics. This must use mw.ustring.gsub, to ensure the character class is
			-- UTF-8 compatible.
			text = mw.ustring.gsub(text, "[" .. substitution_data.remove_diacritics .. "]", "")
			-- Convert exceptions back.
			text = removeExceptions(text, sc, substitution_data.remove_exceptions, true)
		end
	elseif type(substitution_data) == "string" then
		-- If there is a dedicated function module, use that.
		local is_module, handler = pcall(require, "Module:" .. substitution_data)
		if not is_module then
			error("Substitution data does not match an existing module.")
		end
		if function_name == "tr" then
			text, fail, cats = handler[function_name](text, self:getCode(), sc:getCode())
		else
			text, fail, cats = handler[function_name](sc:toFixedNFD(text), self:getCode(), sc:getCode())
		end
	end
	-- Don't normalize to NFC if this is the inner loop or if a module returned nil.
	if recursed or not text then
		return text, fail, cats
	end
	-- Fix any discouraged sequences created during the substitution process, and normalize into the
	-- final form.
	text = sc:fixDiscouragedSequences(text)
	return sc:toFixedNFC(text), fail, cats
end

-- Module entry point: runs doSubstitutions on `text` while shielding any BMP Private Use Area
-- characters (U+E000-U+F8FF) already present in it.
--
-- Each such character is swapped for the placeholder "\1" before processing and put back afterwards,
-- in order of appearance. Restoration is positional, so it assumes the input contains no "\1" of its
-- own and that the substitutions neither add, drop nor reorder placeholders.
--
-- Parameters are passed straight through to doSubstitutions.
-- Returns text, fail, cats as produced by doSubstitutions; placeholders are only restored when text
-- is not nil/false.
return function (text, self, sc, substitution_data, function_name)
	local PUA = {}
	-- In UTF-8, U+E000-U+EFFF is \238 followed by two continuation bytes, and U+F000-U+F8FF is \239
	-- followed by \128-\163 and one continuation byte. Lua patterns have no alternation, so match
	-- every three-byte sequence with either lead byte and filter out the non-PUA remainder
	-- (U+F900-U+FFFF) in the callback.
	text = text:gsub("([\238\239])([\128-\191])([\128-\191])", function (lead, second, third)
		if lead == "\239" and not second:match("[\128-\163]") then
			-- Returning nil leaves the matched character in place.
			return nil
		end
		table.insert(PUA, lead .. second .. third)
		return "\1"
	end)
	local fail, cats
	text, fail, cats = doSubstitutions(text, self, sc, substitution_data, function_name)
	if text then
		-- NOTE(review): string.gsub_old is not a standard library function; it is presumably installed
		-- by convert_string_libraries - confirm. The limit of 1 restores one placeholder per stored
		-- character, left to right.
		for _, char in ipairs(PUA) do
			text = string.gsub_old(text, "\1", char, 1)
		end
	end
	return text, fail, cats
end