-- Module:User:Erutuon/UTF-8

local export = {}

local m_debug = require("Module:debug")

-- Character class matching a single UTF-8 continuation byte (0x80-0xBF).
local continuationByte = "[\128-\191]"
-- Pattern for one UTF-8 character: an ASCII byte or a leading byte, followed
-- by any continuation bytes.
-- Excludes null byte, which is supposed to be able to be included in Lua strings,
-- but causes patterns to fail.
local UTF8Char = "[\1-\127\194-\244]" .. continuationByte .. "*"
-- Pattern for one multi-byte (non-ASCII) UTF-8 character.
local nonASCII = "[\194-\244]" .. continuationByte .. "+"

-- Format for showing a byte as a decimal escape: a backslash plus digits.
local escapePatt = "\\%d"
-- Format for showing a number as uppercase hexadecimal with a "0x" prefix.
local hexPatt = "0x%X"

local floor = math.floor

-- Passes `text` to m_debug.highlight with the `inline` option set to true
-- and returns whatever that call returns.
local function highlight(text)
	local options = { inline = true }
	return m_debug.highlight(text, options)
end

-- Formats `number` using the module-level hexPatt format string.
local function hex(number)
	return string.format(hexPatt, number)
end

-- Formats a byte value `number` using the module-level escapePatt format
-- string.
local function byteEscape(number)
	return string.format(escapePatt, number)
end

-- Returns `str` with every byte >= 128 replaced by the text produced by
-- byteEscape for that byte; ASCII bytes are copied through unchanged.
local function escapeBytes(str)
	-- One gsub pass over the high bytes only. This avoids unpacking every byte
	-- of the string onto the Lua stack with string.byte(str, 1, -1), which
	-- raises "string slice too long" for long inputs.
	-- A string returned by the replacement function is inserted verbatim
	-- (no % processing), so the backslash in the escape is safe.
	local escaped = string.gsub(str, "[\128-\255]", function(highByte)
		return byteEscape(string.byte(highByte))
	end)
	-- Assigning to a single local drops gsub's second result (the match count).
	return escaped
end

-- Based on the helpful byte chart at UTF-8.
-- Returns two values for a non-ASCII codepoint: the leading byte of its UTF-8
-- encoding and the number of continuation bytes that follow it.
-- Raises an error for codepoints below 0x80 (no leading byte) and for
-- codepoints above U+10FFFF.
local function getLeadingContinuation(codepoint)
	if codepoint < 0x80 then
		error("ASCII does not have leading bytes.")
	elseif codepoint < 0x800 then
		-- 110xxxxx: the leading byte carries the bits above the low 6.
		return 0xC0 + floor(codepoint / 0x40), 1
	elseif codepoint < 0x10000 then
		-- 1110xxxx: the leading byte carries the bits above the low 12.
		-- Divide the codepoint itself; subtracting the range start first
		-- gives the wrong byte (e.g. 224 instead of 225 for U+1000).
		return 0xE0 + floor(codepoint / 0x1000), 2
	elseif codepoint < 0x110000 then
		-- 11110xxx: the leading byte carries the bits above the low 18.
		-- The upper bound is U+10FFFF, the last Unicode codepoint.
		return 0xF0 + floor(codepoint / 0x40000), 3
	else
		error(("Codepoint U+%X is outside valid range."):format(codepoint))
	end
end

-- Builds a display string for the character range `lower`-`higher`: the two
-- characters, their codepoints in hexadecimal, and a byte-level pattern made
-- of a leading-byte class followed by continuation-byte classes.
-- Raises an error if `lower` needs more continuation bytes than `higher`,
-- or if either character is rejected by getLeadingContinuation.
function export.makeUTF8Pattern(lower, higher)
	local lowCode = mw.ustring.codepoint(lower)
	local highCode = mw.ustring.codepoint(higher)
	local lowLead, lowTrail = getLeadingContinuation(lowCode)
	local highLead, highTrail = getLeadingContinuation(highCode)

	-- One continuation-byte class per continuation byte of the lower bound.
	local trail = string.rep(continuationByte, lowTrail)
	if lowTrail > highTrail then
		error(string.format("The first character to makeUTF8Pattern (U+%X) should have a lower codepoint than the second (U+%X).", lowCode, highCode))
	end
	if lowTrail < highTrail then
		-- The upper bound is longer, so allow additional continuation bytes.
		trail = trail .. "+"
	end

	-- A single escaped byte when both bounds share a leading byte,
	-- otherwise a bracketed range of leading bytes.
	local leading
	if lowLead == highLead then
		leading = byteEscape(lowLead)
	else
		leading = "[" .. byteEscape(lowLead) .. "-" .. byteEscape(highLead) .. "]"
	end

	local codepointRange = highlight(hex(lowCode) .. "-" .. hex(highCode))
	local bytePattern = highlight(leading .. escapeBytes(trail))
	return lower .. "-" .. higher .. " (" .. codepointRange .. "): " .. bytePattern
end

-- Removes every hyphen from `characters`, then scans the remaining text one
-- UTF8Char match at a time and returns two values: the smallest and the
-- largest match under Lua's string comparison.
-- Both results are nil when nothing matches.
function export.makeRange(characters)
	-- Assigning to a single local keeps only gsub's first result.
	local stripped = string.gsub(characters, "%-", "")
	local lower = string.match(stripped, UTF8Char)
	local higher = lower
	for character in string.gmatch(stripped, UTF8Char) do
		if character < lower then
			lower = character
		elseif character > higher then
			higher = character
		end
	end
	return lower, higher
end

-- Looks up the script with code `scCode` through Module:scripts, takes its
-- character set, and returns the display string that makeUTF8Pattern builds
-- for the range found by makeRange.
-- Raises an error if the lookup returns nothing or the script object
-- returns no character set.
function export.charPatternForScript(scCode)
	local sc = require("Module:scripts").getByCode(scCode)
	if not sc then
		-- Guard in case the lookup yields nil/false; otherwise the method
		-- call below would fail with an opaque "attempt to index" error.
		error(("No script was found for code '%s'."):format(tostring(scCode)))
	end
	-- getCharacters is a method and must be called; a bare `sc:getCharacters`
	-- is a syntax error in Lua.
	local characters = sc:getCharacters()
	if not characters then
		error(("Script '%s' has no character set to build a pattern from."):format(tostring(scCode)))
	end
	-- makeRange returns two values; as the last argument both are forwarded.
	return export.makeUTF8Pattern(export.makeRange(characters))
end

-- Entry point taking a frame: reads the script code from frame.args[1],
-- falls back to "polytonic" when that argument is nil or false, and returns
-- the result of charPatternForScript for it.
function export.show(frame)
	local scCode = frame.args[1]
	if not scCode then
		scCode = "polytonic"
	end
	return export.charPatternForScript(scCode)
end

return export