-- Module:User:Erutuon/patterns

-- Table that collects this module's public functions; it is returned at the end of the file.
local export = {}

-- Non-ASCII bytes. Only \128-\191 and \194-\244 are actually used in UTF-8.
-- nonASCIIByte is a set matching any single byte from \128 to \255.
local nonASCIIByte = "[\128-\255]"
-- nonASCIIChar matches one byte from \194-\244 followed by one or more bytes
-- from \128-\191, i.e. the byte sequence of one multi-byte UTF-8 character.
local nonASCIIChar = "[\194-\244][\128-\191]+"

-- Character classes that will match non-ASCII codepoints if they are used in
-- Ustring pattern-matching functions.
-- The pattern matches a literal percent sign followed by one of the listed
-- class letters.
local ustringClass = "%%[aAcCdDlLpPsSuUwWxX]"

-- Reports whether any piece of `str` yielded by iterating `patt1` over it
-- contains a match for `patt2`. Returns true on the first such piece,
-- false if there is none.
local function isPattInPatt(str, patt1, patt2)
	local found = false
	for piece in string.gmatch(str, patt1) do
		if string.find(piece, patt2) ~= nil then
			found = true
			break
		end
	end
	return found
end

-- Function to determine whether a pattern will behave exactly the same in the
-- basic string functions as it does in the Ustring functions.

-- The Lua-implemented version of Ustring has a function that supposedly makes
-- this determination, but it's overly conservative (disqualifying a pattern
-- if it contains non-ASCII bytes). Not sure about the PHP version of Ustring
-- that is actually used by Scribunto.

-- This does not check that the string is well-formed UTF-8.
--
-- @param pattern  Lua pattern (string) to examine.
-- @return true if none of the checks below finds a construct that would
--         behave differently in the Ustring functions; false otherwise.
-- Raises an error if `pattern` is not a string.
function export.canUseString(pattern)
	assert(
		type(pattern) == "string",
		"argument #1 to canUseString should be string, but is " .. type(pattern)
	)
	
	-- Remove percent sign followed by anything besides a letter.
	-- After this step, any remaining "%" introduces a letter class or item,
	-- and any remaining "." is an unescaped dot.
	pattern = string.gsub(pattern, "[%%]%A", "")
	
	-- If non-ASCII inside a set: "[αειου]", "[^αειου]"
	if isPattInPatt(pattern, "%b[]", nonASCIIByte) then
		return false
	end
	
	-- In Ustring, the classes listed in the pattern all contain multi-byte characters.
	if string.find(pattern, ustringClass) then
		return false
	end
	
	-- Quantifier following multi-byte character:
	-- in basic string function, quantifiers act on bytes, not on UTF-8 characters.
	if string.find(pattern, nonASCIIChar .. "[?*%-+]") then
		return false
	end
	
	-- In basic string function, dot matches a single byte, not a UTF-8 character.
	if string.find(pattern, "%.") then
		return false
	end
	
	return true
end

-- Replaces mw.ustring.find, match, gmatch and gsub with wrappers that report,
-- through mw.log, every call whose pattern passes export.canUseString, and
-- then forward the call to the function that was installed before.
-- Each call to export.log wraps whatever is currently installed, so calling
-- it more than once stacks wrappers.
function export.log()
	for _, funcname in ipairs{ "find", "match", "gmatch", "gsub" } do
		local old_func = mw.ustring[funcname]
		mw.ustring[funcname] = function (str, patt, ...)
			-- Only inspect string patterns: canUseString raises an error for
			-- any other type, and the logging wrapper must not fail where the
			-- wrapped function itself would decide how to handle the argument.
			if type(patt) == "string" and export.canUseString(patt) then
				mw.log(funcname, str, patt, "can use string")
			end
			return old_func(str, patt, ...)
		end
	end
end

-- Hand the table of public functions to whoever loads this module.
return export