Module:User:Theknightwho/scripts/charToScript

-- Table of functions that this module returns to its callers.
local subexport = {}

-- Local aliases for the Scribunto library functions used further down.
local cp = mw.ustring.char
local split = mw.text.split

-- Cache of completed lookups, keyed by character. Every entry is the full list
-- of matching script codes, so one cached entry can answer both the single-code
-- and the `all_scripts` form of the call.
local memo = {}

local charToScriptData = mw.loadData("Module:User:Theknightwho/recognition data")

--[=[
	Takes a codepoint or a character and finds the script code(s) (if any) that
	are appropriate for it, using the data module
	Module:User:Theknightwho/recognition data.

	The data is walked one byte of the character at a time. At each level the
	byte is looked up directly as a key; failing that, any key longer than one
	byte is treated as the body of a Lua character class (e.g. a byte range) and
	tested against the byte. A string value ends the walk and is split on commas
	into the list of script codes; any other value is taken as the next level.

	Parameters:
	* `char`: a string holding a single UTF-8 character, or a number, which is
	  converted to a character with mw.ustring.char.
	* `all_scripts`: if set, return the table of all matching codes instead of
	  only the first one (i.e. the code we take to be the default).

	Returns the first script code as a string, or (with `all_scripts`) a table of
	codes. `"None"` / `{"None"}` is returned when nothing matches. The table is
	the cached one, so callers should treat it as read-only.

	Throws an error if `char` is a string with more than one character, or is
	neither a string nor a number.
]=]
function subexport.charToScript(char, all_scripts)
	local t = type(char)
	if t == "string" then
		-- A lead byte, optional continuation bytes, then another lead byte
		-- means the string holds more than one character.
		if char:find("[%z\1-\127\194-\244][\128-\191]*[%z\1-\127\194-\244]") then
			error("bad argument #1 to 'charToScript' (expected a single character)")
		end
	elseif t == "number" then
		-- Previously called an undefined `u`; `cp` is the alias this file
		-- defines for mw.ustring.char.
		char = cp(char)
	else
		error(("bad argument #1 to 'charToScript' (expected string or a number, got %s)"):format(t))
	end

	local codes = memo[char]
	if not codes then
		local data = charToScriptData
		for byte in char:gmatch(".") do
			local new_data = data[byte]
			if not new_data then
				-- No exact key for this byte: try the multi-byte keys as
				-- character-class bodies.
				for k, v in pairs(data) do
					if #k > 1 and byte:find("[" .. k .. "]") then
						new_data = v
						break
					end
				end
			end
			if not new_data then
				codes = {"None"}
				break
			elseif type(new_data) == "string" then
				codes = split(new_data, "%s*,%s*")
				break
			else
				data = new_data
			end
		end
		-- The walk can run out of bytes without reaching a string (e.g. for an
		-- empty string); treat that as no match instead of indexing nil.
		codes = codes or {"None"}
		-- Always cache the whole list: caching only the first code would hand
		-- a string to a later caller that asked for `all_scripts`.
		memo[char] = codes
	end

	if all_scripts then
		return codes
	end
	return codes[1]
end

--[=[
	Finds the best script for a string in a language-agnostic way.

	The text is first reduced to plaintext with get_plaintext from
	Module:utilities. Each UTF-8 character is then passed to charToScript with
	`all_scripts` set, and every script code returned gets a tally of two
	counters: slot 1 counts primary matches (the script was listed first for the
	character) and slot 2 counts secondary matches (it was listed later).

	Scripts are ranked by the total of both counters; if the totals are equal,
	the number of primary and then secondary matches are used as tiebreakers.
	For example, this is used to ensure that `Grek` takes priority over
	`polytonic` if no characters which exclusively match `polytonic` are found,
	as `Grek` is a subset of `polytonic`. Two scripts that tie on every counter
	are not reordered, so the one `pairs` yields first is kept.

	Returns the result of getByCode from Module:scripts for the winning code, or
	for "None" if no script was counted at all.

	NOTE(review): a previous comment here said Jpan and Kore are special-cased;
	no such handling is visible in this function.
]=]
function subexport.findBestScriptWithoutLang(text)
	-- True when tally `a` ranks strictly below tally `b`.
	local function ranks_below(a, b)
		local total_a, total_b = a[1] + a[2], b[1] + b[2]
		if total_a ~= total_b then
			return total_a < total_b
		end
		if a[1] ~= b[1] then
			return a[1] < b[1]
		end
		return a[2] < b[2]
	end

	-- Maps script code -> {primary matches, secondary matches}.
	local tallies = {}

	text = require("Module:utilities").get_plaintext(text)

	for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
		local codes = subexport.charToScript(character, true)
		for position, code in ipairs(codes) do
			local tally = tallies[code]
			if tally == nil then
				tally = {0, 0}
				tallies[code] = tally
			end
			-- Only the first code listed counts as a primary match.
			local slot = position > 1 and 2 or 1
			tally[slot] = tally[slot] + 1
		end
	end

	local best_code, best_tally
	for code, tally in pairs(tallies) do
		if best_tally == nil or ranks_below(best_tally, tally) then
			best_code, best_tally = code, tally
		end
	end

	return require("Module:scripts").getByCode(best_code or "None")
end

return subexport