Module:User:Theknightwho/lua-uca/lua-uca-collator

-- object for Unicode string collation local tailoring_lib = require "Module:User:Theknightwho/lua-uca/lua-uca-tailoring" local reordering_table = mw.loadData "Module:User:Theknightwho/lua-uca/lua-uca-reordering-table"

-- Prototype table for collator objects. It is its own __index, so instances
-- created with setmetatable({}, collator) resolve their methods through it.
local collator = {} collator.__index = collator

--- Recursively duplicate a table.
-- Values that are tables are copied at every depth; every other value is
-- assigned as-is. Keys are reused unchanged and metatables are not copied.
-- @param tbl table to duplicate
-- @return a new table holding the same keys as `tbl`
local function copy_table(tbl)
	local result = {}
	for key, value in pairs(tbl) do
		-- copy_table always returns a table (truthy), so and/or is safe here
		result[key] = type(value) == "table" and copy_table(value) or value
	end
	return result
end

-- Expose the local copy_table helper as a field of the collator table.
collator.copy_table = copy_table

--- Create a new collator object.
-- @param codes tree with mappings from codepoints to collation elements
-- @return a table whose metatable is `collator`, with these fields:
--   codes                - the `codes` argument, stored as given
--   updated_codes        - entries written by collator:update_codes
--   stringcache          - sort keys cached by collator:compare_strings
--   tailoring_multiplier - per-level multipliers read by collator:tailor
function collator.new(codes)
	local self = setmetatable({}, collator)
	-- tree with mappings from codepoints to collation elements
	self.codes = codes
	-- This table must hold its entries strongly. The entries are tables that
	-- nothing else references, so a weak-valued table (__mode = "kv") would
	-- let the garbage collector silently drop tailorings.
	self.updated_codes = {}
	-- cached sort keys
	self.stringcache = {}
	self.tailoring_multiplier = {1, 1, 1, 1}
	return self
end

--- Produce a fallback weight for a codepoint that has no database entry.
-- @param codepoints array of codepoints
-- @param pos index of the codepoint to weigh
-- @return a list holding one collation element, and `pos + 1`
function collator:get_implicit_weight(codepoints, pos)
	-- implicit weight is based on the codepoint value
	local codepoint = codepoints[pos]
	-- NOTE(review): the first return expression was missing here (the line
	-- read `return, pos + 1`, which does not parse). It is reconstructed as a
	-- single collation element with the codepoint as its first weight; the
	-- zero weights are skipped by collator:update_levels. Confirm against the
	-- upstream module.
	return {{codepoint, 0, 0}}, pos + 1
end

--- Look up the database entry for the codepoint at `pos`.
-- Tries to find contractions first and returns the entry for the longest
-- matched run of codepoints.
-- @param codepoints array of codepoints
-- @param pos index of the first codepoint to look up
-- @return the matched entry (nil when nothing is found), and the position
--   to continue reading from
function collator:read_weight(codepoints, pos)
	-- Descend through `parent` following the codepoints after `index`.
	-- Returns the deepest child found and the index of its codepoint, or nil.
	local function read_children(parent, index)
		local next_index = index + 1
		-- declared local: the original assigned a global here
		local next_codepoint = codepoints[next_index]
		-- if we go out of the codepoint array
		if not next_codepoint then return nil end
		local child = type(parent) == "table" and parent[next_codepoint] or nil
		if not child then return nil end
		-- prefer a longer match when one exists
		local deeper, deeper_index = read_children(child, next_index)
		if deeper then return deeper, deeper_index end
		return child, next_index
	end

	local current_codepoint = codepoints[pos]
	local codes = self.updated_codes[current_codepoint] or self.codes[current_codepoint]
	if not codes then return nil, pos + 1 end
	-- first try to read contractions
	-- (new_pos is declared local: the original assigned a global here)
	local weights, new_pos = read_children(codes, pos)
	if weights then return weights, new_pos + 1 end
	-- if no contraction, fall back to the entry for the codepoint itself
	-- NOTE(review): this fallback reads self.codes only, so an entry that
	-- exists solely in self.updated_codes yields nil here - confirm intended.
	local base_entry = self.codes[current_codepoint]
	if not base_entry then return nil, pos + 1 end
	return base_entry, pos + 1
end

--- Get weights for the next characters.
-- @param codepoints array of codepoints
-- @param pos index to start reading at
-- @return the weights, and the next position to read from; the position is
--   nil once it would pass the end of `codepoints`
function collator:get_weights(codepoints, pos)
	local found, following = self:read_weight(codepoints, pos)
	if not found then
		-- codepoints missing from the database get implicit weights
		found, following = self:get_implicit_weight(codepoints, pos)
	end
	-- signal the end of input instead of stepping past the array
	if following > #codepoints then
		return found, nil
	end
	return found, following
end

-- Append `element` to the list stored at levels[index], creating that list
-- when needed. Zero and non-number elements are ignored.
local function append_to_level(levels, index, element)
	if type(element) == "number" and element ~= 0 then
		local bucket = levels[index] or {}
		bucket[#bucket + 1] = element
		levels[index] = bucket
	end
end

--- Distribute weights into per-level lists.
-- `weights` may be a single number (goes to level 1), a list of numbers
-- (item i goes to level i), or a list of lists (item j of each inner list
-- goes to level j). Any other value leaves `levels` untouched.
-- @param levels table of per-level lists, modified in place
-- @param weights the weights to distribute
-- @return `levels`
function collator:update_levels(levels, weights)
	local kind = type(weights)
	if kind == "number" then
		append_to_level(levels, 1, weights)
	elseif kind == "table" then
		for i, w in ipairs(weights) do
			if type(w) == "table" then
				-- a collation element: one weight per level
				for j, x in ipairs(w) do
					append_to_level(levels, j, x)
				end
			else
				append_to_level(levels, i, w)
			end
		end
	end
	return levels
end

--- Make a sort key from a codepoints array.
-- Weights are gathered per level, then the levels are written out one after
-- another, each followed by a 0.
-- @param codepoints array of codepoints
-- @return flat array of numbers
function collator:make_sort_key(codepoints)
	local levels = {}
	local pos = 1
	repeat
		local weights
		weights, pos = self:get_weights(codepoints, pos)
		levels = self:update_levels(levels, weights)
		-- get_weights returns a nil position at the end of the array
	until not pos

	local sort_key = {}
	for _, elements in ipairs(levels) do
		for _, element in ipairs(elements) do
			sort_key[#sort_key + 1] = element
		end
		-- zero separates levels in the sort key
		sort_key[#sort_key + 1] = 0
	end
	return sort_key
end

--- Compare two sort keys.
-- @param a first sort key (array of numbers)
-- @param b second sort key (array of numbers)
-- @return true when `a` orders before `b`
function collator:compare(a, b)
	local shared = math.min(#a, #b)
	local i = 1
	while i <= shared do
		local left, right = a[i], b[i]
		-- the first differing element decides the order
		if left ~= right then
			return left < right
		end
		i = i + 1
	end
	-- One key is a prefix of the other, or they are identical. For identical
	-- keys this is false, which table.sort requires; otherwise it reports an
	-- "invalid order function for sorting" error.
	return #a < #b
end

-- Module-level table; nothing in the visible part of this file reads or writes it.
-- NOTE(review): presumably meant as a cache of codepoint arrays - confirm or remove.
local codepoints_cache = {}

--- Compare two strings by their sort keys.
-- Each string is passed through mw.ustring.toNFD and turned into a sort key.
-- Keys are stored in self.stringcache and reused on later calls.
-- @param a first string
-- @param b second string
-- @return true when `a` orders before `b`
function collator:compare_strings(a, b)
	local ustring = mw.ustring
	local cache = self.stringcache

	-- build a sort key for one string
	local function build_key(text)
		local decomposed = ustring.toNFD(text)
		local points = {ustring.codepoint(decomposed, 1, ustring.len(decomposed))}
		return self:make_sort_key(points)
	end

	local key_a = cache[a]
	if not key_a then
		key_a = build_key(a)
	end
	local key_b = cache[b]
	if not key_b then
		key_b = build_key(b)
	end
	cache[a] = key_a
	cache[b] = key_b
	return self:compare(key_a, key_b)
end

--- Update collation codes.
-- Stores `elements` in self.updated_codes under the codepoint sequence
-- `key`. Each node keeps its elements in slot 1 and its subtree in slot 2.
-- @param key array of codepoints
-- @param elements value to store for that sequence
function collator:update_codes(key, elements)
	local depth = #key

	-- Insert into `node` (created when nil) starting at key[index], and
	-- return the node.
	local function insert_at(node, index)
		node = node or {}
		local codepoint = key[index]
		local entry = node[codepoint] or {}
		if index == depth then
			-- end of the key: store the elements here
			entry[1] = elements
		elseif index < depth then
			-- more codepoints follow: descend into the subtree
			entry[2] = insert_at(entry[2], index + 1)
		end
		node[codepoint] = entry
		return node
	end

	insert_at(self.updated_codes, 1)
end

--- Change sorting ordering.
-- Reads the weights of `base`, offsets every level of every collation
-- element by the matching `tailoring_table` entry times the matching
-- self.tailoring_multiplier entry, and stores the result for `target`.
-- @param base array of codepoints whose weights are the starting point
-- @param target array of codepoints that receives the new weights
-- @param tailoring_table per-level offsets; missing levels count as 0
function collator:tailor(base, target, tailoring_table)
	-- get the value of the base character
	local value = self:get_weights(base, 1)
	local new_value = {}
	-- create a new collation element
	for k, v in ipairs(value) do
		local subtable = {}
		for x, y in ipairs(v) do
			local offset = tailoring_table[x] or 0
			-- Default the multiplier on its own. In the original expression
			-- `*` bound tighter than `or`, so the `or 1` never applied and a
			-- level without a multiplier raised an arithmetic error.
			local multiplier = self.tailoring_multiplier[x] or 1
			subtable[x] = y + offset * multiplier
		end
		new_value[k] = subtable
	end
	-- When tailoring sets an equivalent character, the new value is flagged
	-- with `equal = true`.
	-- NOTE(review): the original comment says the flag is consumed by
	-- collator:weight_to_codepoints, which is not in the visible source.
	local is_equivalent = 0
	for _, x in ipairs(tailoring_table) do
		is_equivalent = is_equivalent + x
	end
	if is_equivalent == 0 then
		new_value.equal = true
	end
	self:update_codes(target, new_value)
end

--- Reorder scripts.
-- @param tbl table with the script names to reorder, applied in order
function collator:reorder(tbl)
	-- work on a private copy of the reordering table
	local order = copy_table(reordering_table)
	for _, script_name in ipairs(tbl) do
		tailoring_lib.reorder(script_name, order)
	end
	-- apply reordering to the collator object
	tailoring_lib.reorder_collator(self, order)
end

--- Expand characters to other characters.
-- Collects the weights of every position in `target` into one list and
-- stores that list as the codes for `base`.
-- @param base array of codepoints that receives the combined weights
-- @param target array of codepoints whose weights are collected
function collator:equal(base, target)
	local new_weight = {}
	local pos = 1
	repeat
		-- declared local: the original declared `values` but assigned
		-- `value`, which leaked a global
		local value
		value, pos = self:get_weights(target, pos)
		for _, v in ipairs(value) do
			new_weight[#new_weight + 1] = v
		end
		-- get_weights returns a nil position at the end of the array
	until not pos
	self:update_codes(base, new_weight)
end

return collator