Module:User:Erutuon/script recognition

local export = {}

local getCodepoint = mw.ustring.codepoint local U = mw.ustring.char local floor = math.floor

local title = mw.title.getCurrentTitle.fullText

local function check(funcName, expectType) return function(argIndex, arg) require("libraryUtil").checkType(funcName, argIndex, arg, expectType) end end

function mw.logf(...) return mw.log(string.format(...)) end

local output_mt = {} function output_mt:insert(str) self.n = self.n + 1 self[self.n] = str end

function output_mt:insert_format(...) self:insert(string.format(...)) end

output_mt.join = table.concat

output_mt.__index = output_mt

local function Output return setmetatable({ n = 0 }, output_mt) end

local function dump(val) local output = Output output:insert('{\n') local range_format = { 0x%05X, 0x%05X, "%s" }, local length_format = -- also close range array length = %d,	}, for i = 0, 0x10FFFF / 0x100 do		local ranges = val[i] if ranges then output:insert_format( 	[0x%02X] = {, i)			for j, range in ipairs(ranges) do				output:insert_format(range_format, unpack(range)) end output:insert_format(length_format, ranges.length or -1) end end output:insert [[

individual = { ]]	for codepoint, script in require "Module:table".sortedPairs(val.individual) do		output:insert_format( 		[0x%05X] = "%s", ,		codepoint, script) end output:insert [[	},

blocks = { ]]	for _, blockRange in ipairs(val.blocks) do		output:insert_format( 		{ 0x%02X, 0x%02X, "%s" }, ,		unpack(blockRange)) end output:insert }, }	return require "Module:debug".highlight(table.concat(output)) end

local function printRanges(ranges) local output = Output output:insert("Ranges:") for _, range in ipairs(ranges) do		output:insert_format('\n\tU+%04X-U+%04X: %s', unpack(range)) end mw.log(output:join) end

local function hasContents(t) if next(t) then return true else return false end end

local function log(message) if title:match("testcases/documentation$") then mw.log(message) end end

local function makeRangeKey(codepoint) return floor(codepoint / 0x1000) end

local function isInRange(value, lower, upper) -- mw.log(value, lower, upper) local check = check("isInRange", "number") check(1, value) check(2, lower) check(3, upper) return value >= lower and value <= upper end

local function lookupCharacter(characterLookup, character) local codepoint if type(character) == "string" then if mw.ustring.len(character) == 1 then codepoint = getCodepoint(character) else error("Character " .. character .. " has length " .. mw.ustring.len(character) .. ". It is supposed to be a single character.") end elseif type(character) == "number" then codepoint = character else error("Character is the wrong type: " .. type(character) .. ".") end if characterLookup.smallest and not isInRange(codepoint, characterLookup.smallest, characterLookup.largest) then return false elseif characterLookup.values and characterLookup.values[codepoint] then return true else for i, range in ipairs(characterLookup) do			if isInRange(codepoint, range[1], range[2]) then return true end end end return false end

local function forEachChar(str, func) if type(func) == "function" then for i = 1, mw.ustring.len(str) do			char = mw.ustring.sub(str, i, i)			func(char) end end end

function export.makeCharacterLookup(pattern) local characterLookup = {} local values = {} local allValues = {} local i = 1 -- Create ranges in which all characters belong to the script. local workingString = mw.ustring.gsub(		pattern,		"([^-])%-([^-])",		function(item1, item2)			local codepoint1, codepoint2 = getCodepoint(item1), getCodepoint(item2)			--			if not (codepoint1 < codepoint2) then				error("Wrong codepoint order with " .. U(codepoint1) .. " and " .. U(codepoint2) .. "!")			end						table.insert(characterLookup, { codepoint1, codepoint2 })			allValues[codepoint1] = true			allValues[codepoint2] = true			return ""		end	) if workingString ~= "" then workingString = mw.ustring.gsub(			workingString,			".",			function(char)				local codepoint = getCodepoint(char)				values[codepoint] = true				allValues[codepoint] = true			end		) end --		Place the tables of ranges in the Unicode order (the patterns		should already be in that order, but just to be safe). table.sort(		characterLookup,		function(item1, item2)			return item1[1] < item2[1]		end) local allValuesKeys = require("Module:table").numKeys(allValues) local smallest, largest = allValuesKeys[1], allValuesKeys[#allValuesKeys] -- Don't create an empty values table. if hasContents(values) then characterLookup.values = values end --		Don't record the smallest and largest values if they're found in the		first range. if not (smallest == characterLookup[1][1] and largest == characterLookup[1][2]) then characterLookup.smallest, characterLookup.largest = smallest, largest end return characterLookup end

function export.makeAllScriptsCharacterLookup local allScriptsCharacterLookup = {} local patternToScript = {} for code, data in pairs(require("Module:scripts/data")) do		if not code:find("-", 1, true) then if data.characters then -- Don't generate identical lookup table twice. local scriptWithPattern = patternToScript[data.characters] if scriptWithPattern then allScriptsCharacterLookup[code] = allScriptsCharacterLookup[scriptWithPattern] else allScriptsCharacterLookup[code] = export.makeCharacterLookup(data.characters) end patternToScript[data.characters] = code end end end return allScriptsCharacterLookup end

-- fa-Arab → Arab-fa local function switchLangSc(scriptCode) return scriptCode:gsub("^([^-]+)%-(.+)$", "%2-%1") end

-- To ensure that Grek and Latn appear first. -- This also makes Grek and Latn take precedence when generating -- the codepoint-to-script lookup table. local scriptCodeReplacements = { polytonic = "Grek2", Latinx = "Latnx", Latf = "Latnf", }

local function modifyAdHocCode(code) if scriptCodeReplacements[code] then return scriptCodeReplacements[code] elseif not (code:find("%u%l%l%l") or code:find("%l%l%l%-%u%l%l%l")) then return code:gsub("^(.+)$", "~%1") else return code end end local function keySort(key1, key2) local type1, type2 = type(key1), type(key2) if type1 == "number" and type2 == "string" then return true elseif type1 == "string" and type2 == "number" then return false elseif type1 == "string" then key1, key2 = modifyAdHocCode(key1), modifyAdHocCode(key2) key1, key2 = switchLangSc(key1), switchLangSc(key2) local lower1, lower2 = mw.ustring.lower(key1), mw.ustring.lower(key2) return lower1 < lower2 else return key1 < key2 end end

local function hex(number) return string.format("0x%X", number) end

local function divideRange(lower, upper, width, testing) local ranges = {} if not (lower and upper) then mw.log("divideRange failed:", lower, upper, width, testing) return nil end local position = floor(lower / width) local start = position * width local i = 0 local increment = i * width repeat local range1 = start + increment local range2 = range1 + width - 1 if range1 < lower then range1 = lower end if range2 > upper then range2 = upper end if testing then range1, range2 = hex(range1), hex(range2) end ranges[position + i] = { range1, range2 } i = i + 1 increment = i * width until start + increment > upper return ranges end

function export.showDividedRange(frame) local lower = 0x2A700 local higher = 0x2B73F local width = 0x1000 local dividedRange = divideRange(lower, higher, width, true) return table.concat({ hex(lower), hex(higher) }, ", ") .. dump(dividedRange) end

-- Scripts that consist entirely of characters from another script. local scriptBlacklist = { ["Latf"]		= true; ["Latg"]		= true; ["Hans"]		= true; ["Hant"]		= true; ["Kore"]		= true; ["Jpan"]		= true; ["Zyyy"]		= true; }

local function sortRange(range1, range2) local number1, number2 = tonumber(range1[1]), tonumber(range2[1]) if number1 == number2 then return keySort(range1[3], range2[3]) else return number1 < number2 end end

local function printScriptRange(range, hideScriptName) if hideScriptName then return ("U+%04X-U+%04X"):format(range[1], range[2]) else return ("%s (U+%04X-U+%04X)"):format(range[3], range[1], range[2]) end end -- When there is overlap between ranges belonging to two different scripts, -- the key in this table overrides the value. local overrides = { Beng = "as-Beng", Cyrl = "Cyrs", Grek = "polytonic", Latn = "Latinx", }

local function fixRangeOverlaps(ranges) local prev local i = 1 while ranges[i] do		range = ranges[i] prev = ranges[i - 1] if prev and (range[1] <= prev[1] or range[2] <= prev[2]) then -- mw.logf("%s in conflict with %s",				-- printScriptRange(prev), printScriptRange(range)) local overrider, overridden if overrides[range[3]] == prev[3] then overrider, overridden = range, prev elseif overrides[prev[3]] == range[3] then overrider, overridden = prev, range end if overrider and overridden then mw.logf("%s overrides %s", printScriptRange(overrider),					printScriptRange(overridden)) else mw.logf("Should %s override %s or the other way around?",					printScriptRange(prev), printScriptRange(range)) end if overrider[1] <= overridden[1] then -- low end of overridden is inside overrider if overridden[2] <= overrider[2] then -- overridden entirely within overrider table.remove(ranges, overridden == range and i or i - 1) -- remove overridden if overridden == prev then i = i - 1 end else -- upper part of overridden outside of overrider if overridden[2] - overrider[2] == 1 then -- one codepoint of overridden is outside overrider table.remove(ranges, overridden == range and i or i - 1) -- remove overridden if overridden == prev then i = i - 1 end individual[overridden[2]] = overridden[3] else overridden[1] = overrider[2] + 1 end end else -- overridden[1] < overrider[1]: low end of overridden is outside overrider -- single codepoint at low end of overridden is outside overrider table.remove(ranges, overridden == range and i or i - 1) -- remove overridden if overridden == prev then i = i - 1 end if overrider[1] - overridden[1] == 1 then individual[overridden[1]] = overridden[3] else -- multiple codepoints at low end of overridden are outside overrider ranges:insert(i,						{ overridden[1], overrider[1] - 1, overridden[3] }) i = i + 1 end if overrider[2] < overridden[2] then -- high end of overridden is outside overrider -- single codepoint at high end of overridden is outside of overrider if overridden[2] - overrider[2] == 1 then individual[overridden[2]] = overridden[3] else ranges:insert(i,							{ overrider[2] + 1, overridden[2], overridden[3] }) i = i + 1 end end end end i = i + 1 end end

local function checkRangeOverlaps(ranges) local prev for i, range in ipairs(ranges) do		if prev and prev[2] >= range[1] then mw.logf("%s overlaps with %s",				printScriptRange(prev), printScriptRange(range)) end prev = range end end

local function makeCodepointToScriptLookup(testing) local output = {} local ranges_mt = { insert = function (self, i, value) if value ~= nil then if self[i][1] < value[1] then i = i + 1 end mw.logf("Inserting %s below %s",					printScriptRange(value), printScriptRange(self[i])) table.insert(self, i, value) else value = i				table.insert(self, value) end end, remove = table.remove, }	ranges_mt.__index = ranges_mt setmetatable(output,		{			__index = function (self, key)				local val = setmetatable({}, ranges_mt)				self[key] = val				return val			end,		}) output.individual = {} local individual = output.individual local rangeStrings = {} local allScriptsCharacterLookup = export.makeAllScriptsCharacterLookup for scriptCode, lookup in require("Module:table").sortedPairs(allScriptsCharacterLookup, keySort) do		if not scriptBlacklist[scriptCode] then for key, value in ipairs(lookup) do				if type(value) == "table" then local newRanges = divideRange(value[1], value[2], 0x1000, testing) if newRanges then for position, newRange in pairs(newRanges) do local rangeString = newRange[1] .. newRange[2] if rangeStrings[rangeString] then mw.logf("The range U+%04X-U+%04X is already "									.. "recorded as belonging to the script "									.. "code %s, conflicting with %s.",									newRange[1], newRange[2], rangeStrings[rangeString], scriptCode) else rangeStrings[rangeString] = scriptCode output[position]:insert({ newRange[1], newRange[2], scriptCode }) end end end end end if lookup.values then for codepoint in pairs(lookup.values) do					if individual[codepoint] then mw.logf("The codepoint %s is already recorded as " ..							"belonging to the script code %s, conflicting with %s.",							hex(codepoint), individual[codepoint], scriptCode) else individual[codepoint] = scriptCode end end end end end for position, ranges in pairs(output) do		if type(position) == "number" then local prevRange local i = 1 while ranges[i] do				range = ranges[i] if prevRange and range[3] == prevRange[3] and prevRange[2] == range[1] - 1 then mw.logf("Merged %s with %s",						printScriptRange(range), printScriptRange(prevRange)) prevRange[2] = range[2] table.remove(ranges, i)					i = i - 1 -- to compensate for removed element end prevRange = range i = i + 1 end table.sort(ranges, sortRange) end end local individualCodepoints = require "Module:table".numKeys(individual) local minimumCodepointRange = 3 local i = 1 while individualCodepoints[i] do		local codepoint = individualCodepoints[i] local script = individual[codepoint] if not script then error(("No script for U+%04X"):format(codepoint)) end local startOfRun = codepoint while individual[codepoint + 1] == script do			codepoint = codepoint + 1 i = i + 1 end if codepoint - startOfRun + 1 >= minimumCodepointRange and makeRangeKey(startOfRun) == makeRangeKey(codepoint) then for j = startOfRun, codepoint do				individual[j] = nil end local rangeKey = makeRangeKey(startOfRun) local ranges = output[rangeKey] if not ranges then ranges = {} output[rangeKey] = ranges end ranges:insert({ startOfRun, codepoint, script }) mw.logf("Added range %s from a run in individual map",				printScriptRange { startOfRun, codepoint, script }) table.sort(ranges, sortRange) end i = i + 1 end for position, ranges in pairs(output) do		if type(position) == "number" then if ranges[2] then fixRangeOverlaps(ranges) end end end -- Add length field to range arrays and check that there are no overlaps -- between ranges. output.blocks = {} local prevScript, blockRange for index, ranges in pairs(output) do		if type(index) == "number" then ranges.length = #ranges if ranges[2] then checkRangeOverlaps(ranges) end local firstScript = ranges[1][3] if not ranges[2] or require "Module:fun".all(					function (range)						return range[3] == firstScript					end,					ranges) then -- All ranges contain the same script. if prevScript and firstScript == prevScript then if not blockRange then blockRange = { index - 1, index, prevScript } table.insert(output.blocks, blockRange) else blockRange[2] = index end else blockRange = nil prevScript = firstScript end else prevScript = nil end end end setmetatable(output, nil) return output end

--	Binary search: more efficient for the longer lists of codepoint ranges than	for the shorter ones. local function binarySearch(ranges, value) --	Initialize numbers. local iStart, iMid = 1, 0 -- Can't use # because table is loaded by mw.loadData. local iEnd = require("Module:table").size(ranges) if iEnd == 0 then return nil end local iterations = 0 -- Do search. while iStart <= iEnd do		iterations = iterations + 1 -- Calculate middle. iMid = floor((iStart + iEnd) / 2) -- Get compare value. local range = ranges[iMid] -- Return matching index. Assumes there are no duplicates. if isInRange(value, range[1], range[2]) then return range -- Keep searching. elseif value < range[1] then iEnd = iMid - 1 else iStart = iMid + 1 end end return nil end

local function lookupInOrder(number, ranges) for i, range in ipairs(ranges) do		if isInRange(number, range[1], range[2]) then -- mw.log(mw.ustring.char(number), hex(number), i)			return range[3] end if number < range[1] then -- mw.log(mw.ustring.char(number), hex(number), i)			return nil end end end

-- Save previously used codepoint ranges in case another character is in the -- same range. local rangesCache = {}

function export.charToScript(char) local lookup = mw.loadData("Module:User:Erutuon/script recognition/data") -- makeCodepointToScriptLookup local codepoint = mw.ustring.codepoint(char) local individualMatch = lookup.individual[codepoint] if individualMatch then return individualMatch else local script = lookupInOrder(codepoint, rangesCache) if script then return script end local index = makeRangeKey(codepoint) script = lookupInOrder(index, lookup.blocks) if script then return script end local range = binarySearch(lookup[index], codepoint) if range then table.insert(rangesCache, range) table.sort(rangesCache, sortRange) return range[3] end end return nil end

function export.show(frame) local allScriptsCharacterLookup = mw.loadData("Module:User:Erutuon/script recognition/data") local str = frame.args[1] or "ABCD一丨丶丿乙亅" local result = {} forEachChar(		str,		function(char)			table.insert(result, tostring(export.charToScript(char)))		end	)

return str .. ": " .. table.concat(result, ", ") end

function export.show(frame) return dump(makeCodepointToScriptLookup) end

return export