-- Module:User:Erutuon/scripts

-- Table of functions that this module returns to its callers.
local export = {}

-- Local shortcuts to library functions, plus the script lookup data. The
-- data table is read below through its "individual" and "blocks" fields and
-- through numeric block indexes.
local floor = math.floor
local lookup = mw.loadData("Module:Unicode data/scripts")
local get_codepoint = mw.ustring.codepoint

-- Pairs of scripts that share many characters: a detected script (key) may
-- stand in for the related script (value) when the language uses the latter.
local shares_some_characters_with = {
	Cyrl = "Cyrs",
	Grek = "polytonic",
	Latn = "Latinx",
}

-- Compound script codes mapped to the list of scripts they are made up of.
local compound_scripts = {
	Jpan = { "Hani", "Hira", "Kana" },
	Kore = { "Hang", "Hani" },
}

-- When a script on the left is found in a string, the count of the script on
-- the right is folded into it.
-- What scripts wouldn't these override?
local overrides = {
	Hani = "Latn",
	Jpan = "Latn",
	Kore = "Latn",
	-- ...
}

-- For each detected script (key), the script codes whose characters are the
-- same as that script's characters.
local scripts_with_identical_characters = {
	Latn = {
		"Latf", "Latn", "Zyyy", "nv-Latn", "pjt-Latn",
	},
	Arab = {
		"Arab",
		"fa-Arab", "kk-Arab", "ks-Arab", "ku-Arab",
		"ms-Arab", "mzn-Arab", "ota-Arab", "pa-Arab",
		"ps-Arab", "sd-Arab", "tt-Arab", "ug-Arab",
		"ur-Arab",
	},
	Hani = {
		"Hani", "Hans", "Hant",
	},
}

-- Reports whether val is one of the elements of the sequence array.
-- A nil array is treated as containing nothing.
local function in_array(array, val)
	if array ~= nil then
		for _, element in ipairs(array) do
			if element == val then
				return true
			end
		end
	end
	return false
end

-- Walks array1 in order and returns its first element that also occurs in
-- array2. Nothing is returned when the two arrays have no element in common.
local function get_intersection(array1, array2)
	for _, candidate in ipairs(array1) do
		local shared = in_array(array2, candidate)
		if shared then
			return candidate
		end
	end
end

-- Ordering function for table.sort: a range sorts before another when its
-- first element is strictly smaller.
local function compare_range_arrays(range1, range2)
	local first1, first2 = range1[1], range2[1]
	return first1 < first2
end

--[[
	Binary search over an array of codepoint ranges. More efficient for the
	longer lists of codepoint ranges than for the shorter ones.

	ranges: array of ranges sorted in ascending order, each range being an
		array whose first and second elements are its inclusive lower and
		upper bounds. May be nil. If the table has a "length" field, it is
		used as the number of ranges; otherwise the size function of
		Module:table is asked for it.
	value: the number to look for.

	Returns the range (not its index) that contains value, or nil if ranges
	is nil or empty or no range contains value. Assumes the ranges do not
	overlap.
]]
local function binary_search(ranges, value)
	if not ranges then
		return nil
	end

	-- Can't use # because table is loaded by mw.loadData.
	local low = 1
	local high = ranges.length or require("Module:table").size(ranges)

	-- An empty list (high == 0) never enters the loop and falls through to nil.
	while low <= high do
		local middle = floor((low + high) / 2)
		local range = ranges[middle]

		if value < range[1] then
			-- value lies below this range: continue in the lower half.
			high = middle - 1
		elseif value <= range[2] then
			-- value lies within the bounds of this range.
			return range
		else
			-- value lies above this range: continue in the upper half.
			low = middle + 1
		end
	end

	return nil
end

-- Scans ranges from the first entry onwards for one whose bounds (elements 1
-- and 2, inclusive) contain number, and returns that entry's third element.
-- The scan gives up with nil as soon as an entry starts above number, so it
-- relies on the entries being in ascending order.
local function look_up_in_order(number, ranges)
	for _, range in ipairs(ranges) do
		if number < range[1] then
			return nil
		end
		if number <= range[2] then
			return range[3]
		end
	end
end

-- Codepoint ranges that have already matched a character, kept sorted by
-- lower bound, in case another character is in the same range.
local ranges_cache = {}

--[=[
	Takes a codepoint and finds the script code (if any) that is appropriate
	for it, using the data module Module:Unicode data/scripts. The data module
	was generated from the patterns in Module:scripts/data using
	Module:User:Erutuon/script recognition.

	The lookups are tried in this order:
	1. the table of individual characters;
	2. the cache of ranges matched by earlier calls;
	3. the table of whole 4096-character blocks, keyed by block index;
	4. a binary search of the ranges defined for the codepoint's block, whose
	   result is added to the cache.

	Returns the script code found, else returns "None".
]=]
local individual_lookup = lookup.individual

local function codepoint_to_script(codepoint)
	local script = individual_lookup[codepoint]
	if script then
		return script
	end

	script = look_up_in_order(codepoint, ranges_cache)
	if script then
		return script
	end

	-- Index of the 4096-character (0x1000) block containing the codepoint.
	local block_index = floor(codepoint / 0x1000)

	script = look_up_in_order(block_index, lookup.blocks)
	if script then
		return script
	end

	local matching_range = binary_search(lookup[block_index], codepoint)
	if not matching_range then
		return "None"
	end

	-- Remember the range; re-sort so that look_up_in_order can keep stopping
	-- at the first cached range that starts above the codepoint.
	table.insert(ranges_cache, matching_range)
	table.sort(ranges_cache, compare_range_arrays)
	return matching_range[3]
end

-- Counts the characters of str by script. Returns a plain table (no
-- metatable) that maps each script code returned by codepoint_to_script,
-- including "None", to the number of codepoints of str assigned to it.
function export.get_script_counts(str)
	local script_counts = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = codepoint_to_script(codepoint)
		-- A script seen for the first time starts from zero.
		local seen_so_far = script_counts[script] or 0
		script_counts[script] = seen_so_far + 1
	end
	return script_counts
end

-- Fetches the data table for a language code from the language data module
-- selected by the length of the code: three-letter codes are split across
-- modules by first letter, two-letter codes share one module, and every other
-- code is looked up in the "exceptional" module. Yields nil when the module
-- has no entry for the code.
-- The function is wrapped in memoize from Module:fun (presumably so that each
-- code is only looked up once -- confirm against that module).
local get_lang_data = require "Module:fun".memoize(function (lang_code)
	local code_length = #lang_code
	local data_module
	if code_length == 3 then
		data_module = "Module:languages/data/3/" .. lang_code:sub(1, 1)
	elseif code_length == 2 then
		data_module = "Module:languages/data/2"
	else
		data_module = "Module:languages/data/exceptional"
	end
	return mw.loadData(data_module)[lang_code]
end)

-- Moves the character count of one script onto another script.
--
-- script_counts: table mapping script codes to counts; modified in place.
-- from_script_code: script whose count is moved; it is left at 0.
-- to_script_code: script that receives the count (created if absent).
--
-- Nothing happens when the source script has no count, or when both codes
-- are the same: in those cases there is nothing to move, and a combined
-- assignment to one key twice would leave the result to Lua's unspecified
-- assignment order.
local function transfer_count(script_counts, from_script_code, to_script_code)
	local moved = script_counts[from_script_code]
	if moved == nil or from_script_code == to_script_code then
		return
	end

	script_counts[to_script_code] = (script_counts[to_script_code] or 0) + moved
	script_counts[from_script_code] = 0
end

--[[
	Picks the script code that accounts for the most characters in str.

	str: the text to examine.
	lang_code: optional language code. When given, the counts of detected
		scripts are first folded into the scripts listed for that language:
		* a script that shares characters with one of the language's scripts
		  (in Ancient Greek: Grek -> polytonic);
		* a constituent of the language's compound script
		  (in Japanese: Kana -> Jpan);
		* a script whose characters are identical to those of one of the
		  language's scripts (in Navajo: Latn -> nv-Latn).
		Finally, a script listed in the overrides table absorbs the count of
		the script it overrides.

	Returns the script code with the greatest count, or nil if no character
	was assigned to a script other than "None". Which script wins a tie
	depends on the traversal order of pairs.

	Raises an error if lang_code is given but no data is found for it.
]]
function export.find_best_script(str, lang_code)
	local script_counts = export.get_script_counts(str)

	-- Show string and list of scripts.
	-- mw.log(str, table.concat(require "Module:fun".mapIter(function(value, key) return key end, pairs(script_counts)), ", "))

	-- Might save a little processing time.
	script_counts.None = nil

	if lang_code then
		local data = get_lang_data(lang_code)
			or error("Language code " .. lang_code .. " not recognized.")
		-- NOTE(review): a language entry without a "scripts" field is treated
		-- as having no scripts instead of failing inside ipairs -- confirm
		-- that this is the desired fallback.
		local scripts = data.scripts or {}

		-- If several of the language's scripts are compound scripts, the
		-- last one listed is used.
		local compound_script
		for _, script in ipairs(scripts) do
			if compound_scripts[script] then
				compound_script = script
			end
		end

		-- Take a snapshot of the detected scripts before looping: the loop
		-- below can create new keys in script_counts, and assigning to a
		-- non-existent field during a pairs traversal is undefined behavior.
		local detected_scripts = {}
		for script in pairs(script_counts) do
			detected_scripts[#detected_scripts + 1] = script
		end

		local overriding
		for _, script in ipairs(detected_scripts) do
			if not in_array(scripts, script) then
				local similar_script = shares_some_characters_with[script]
				local other_script

				-- in Ancient Greek: Grek -> polytonic
				if similar_script and in_array(scripts, similar_script) then
					other_script = similar_script
				-- in Japanese: Kana -> Jpan
				elseif compound_script
					and in_array(compound_scripts[compound_script], script) then
					other_script = compound_script
				-- in Navajo: Latn -> nv-Latn
				elseif scripts_with_identical_characters[script] then
					other_script = get_intersection(
						scripts,
						scripts_with_identical_characters[script]
					)
				end

				-- Transfer character count of original script to new script.
				if other_script then
					transfer_count(script_counts, script, other_script)
					script = other_script
				end
			end

			if overrides[script] then
				overriding = script
			end
		end

		-- Fold remaining constituent scripts into the compound script. Only
		-- keys that already exist are assigned here, which pairs permits.
		if compound_script and script_counts[compound_script] then
			local constituent_scripts = compound_scripts[compound_script]
			for script, count in pairs(script_counts) do
				if count > 0 and in_array(constituent_scripts, script) then
					transfer_count(script_counts, script, compound_script)
				end
			end
		end

		if overriding then
			local overridden = overrides[overriding]
			if script_counts[overridden] then
				transfer_count(script_counts, overridden, overriding)
			end
		end
	end

	local greatest_count = 0
	local best_script
	for script, count in pairs(script_counts) do
		if count > greatest_count and script ~= "None" then
			greatest_count = count
			best_script = script
		end
	end

	return best_script
end

-- Hand the table of exported functions to the code that loads this module.
return export