Module:Hani-sortkey

local export = {}

local m_str_utils = require("Module:string utilities")

-- Lua standard library functions, localized for speed and brevity.
local byte = string.byte
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local sub = string.sub
local concat = table.concat
local insert = table.insert

-- Unicode-aware functions from Scribunto's mw.ustring library.
local ugsub = mw.ustring.gsub
local umatch = mw.ustring.match

-- Helpers taken from Module:string utilities.
local codepoint = m_str_utils.codepoint
local explode = m_str_utils.explode_utf8
local u = m_str_utils.char
local upper = m_str_utils.upper

-- Iteration-mark conversion taken from Module:Hani.
local convert_iteration_marks = require("Module:Hani").convert_iteration_marks

-- Serialized sortkey data. export.getData reads it with string.sub, taking
-- two consecutive bytes for each codepoint.
local m_data = require("Module:Hani-sortkey/data/serialized")
-- Core tables, loaded with mw.loadData. This file reads its `ids`,
-- `radicals`, `ranges`, `preconvert` and `unsupported` fields.
local m_data_core = mw.loadData("Module:Hani-sortkey/data/core")
-- Memoizes the result of export.getData for each character that
-- export.makeSortKey looks up.
local cache = {}

--[[
	Returns the index in `text` at which the ideographic description sequence
	(IDS) ends. The sequence begins with the ideographic description character
	(IDC) `IDchar`, located at index `i`. Recurses whenever another IDC is found
	in a component position.

	text:   array of characters (indexed from 1).
	IDchar: the IDC found at text[i].
	i:      index of that IDC in `text`.

	Returns nil if any argument is missing, if `IDchar` has no entry in
	m_data_core.ids, or if the text ends before every expected component
	(including the components of nested sequences) has been found.
]]
local function findEndOfIDS(text, IDchar, i)
	if not (text and IDchar and i) then
		return nil
	end

	-- Number of components expected after the current IDC.
	local components = m_data_core.ids[IDchar]
	if not components then
		-- Not an IDC: there is no sequence to measure.
		return nil
	end

	local j = i
	for _ = 1, components do
		j = j + 1
		local char = text[j]
		if not char then
			-- The text ran out before the sequence was complete.
			return nil
		elseif m_data_core.ids[char] then
			-- A nested IDS counts as one component; jump to its last character.
			j = findEndOfIDS(text, char, j)
			if not j then
				-- The nested sequence is incomplete, so this one is as well.
				-- Returning here also prevents arithmetic on nil in the next
				-- iteration, which previously raised a runtime error.
				return nil
			end
		end
	end

	-- Every expected component was found; `j` is the index of the last one.
	return j
end

--[[
	Decodes one two-byte entry of the serialized data.

	The byte value of `a` is used as a key into m_data_core.radicals. The byte
	value of `b`, minus 10, is formatted as a zero-padded two-digit number and
	appended to the value found there.
]]
local function unserialize(a, b)
	local prefix = m_data_core.radicals[byte(a)]
	local number = format("%02d", byte(b) - 10)
	return prefix .. number
end

-- The data is stored in Module:Hani-sortkey/data. It is not accessed directly
-- (because of the large amount of memory that would consume); instead it is
-- read from a serialized form, Module:Hani-sortkey/data/serialized. If the
-- data is changed, new serialized data can be generated with
-- Module:Hani-sortkey/data/serializer.
--
-- Returns the unserialized data for a single character.
--   char: a string (converted with `codepoint`) or a numeric codepoint.
-- Raises an error for any other argument type. If the codepoint lies in none
-- of the ranges listed in m_data_core.ranges, the character itself is returned.
function export.getData(char)
	local char_type = type(char)
	if char_type == "string" then
		char = codepoint(char)
	elseif char_type ~= "number" then
		error("getData must operate on a single character or codepoint.")
	end

	-- `ranges` holds (first, last) codepoint pairs at consecutive indices,
	-- with the number of entries in its `n` field.
	local ranges = m_data_core.ranges
	-- Total size of the ranges that end below `char`.
	local preceding = 0
	for i = 2, ranges.n, 2 do
		local first, last = ranges[i - 1], ranges[i]
		if char > last then
			preceding = preceding + last - first + 1
		elseif char >= first then
			-- Each codepoint owns two consecutive bytes of the serialized
			-- string; `stop` is the position of the second one.
			local stop = 2 * (preceding + char - first + 1)
			local entry = sub(m_data, stop - 1, stop)
			-- The outer parentheses drop gsub's second return value.
			return (gsub(entry, "(.)(.)", unserialize))
		end
	end

	return u(char)
end

--[[
	Builds the sortkey for `text`.

	text: the term to be sorted.
	lang: not used by this function.
	sc:   optional script code. If given and not one of Hani, Hans, Hant, Jpan
	      or Kore, the function simply returns upper(text).

	Otherwise each character of the cleaned-up text contributes one piece to
	the sortkey, and the pieces are concatenated and returned.
]]
function export.makeSortKey(text, lang, sc)
	local scripts = {
		Hani = true,
		Hans = true,
		Hant = true,
		Jpan = true,
		Kore = true,
	}
	if sc and not scripts[sc] then
		return upper(text)
	end

	-- Expand iteration marks into full characters and strip whitespace.
	-- Punctuation is stripped too, unless the term consists of nothing but
	-- punctuation (so that entries for punctuation characters still sort).
	text = ugsub(convert_iteration_marks(text), "%s+", "")
	if not umatch(text, "^%p+$") then
		text = ugsub(text, "%p+", "")
	end

	local chars = explode(text)
	local keys = {}
	local len = #chars
	local pos = 0

	while pos < len do
		pos = pos + 1
		local char = chars[pos]

		-- A character with a preconvert entry is replaced in place by the
		-- characters of that entry; any extra characters are spliced in
		-- directly after it and are processed on later iterations.
		local replacement = m_data_core.preconvert[char]
		if replacement then
			local at = pos
			for piece in gmatch(replacement, ".[\128-\191]*") do
				if at == pos then
					chars[at] = piece
				else
					insert(chars, at, piece)
				end
				at = at + 1
			end
			char = chars[pos]
			len = #chars
		end

		if m_data_core.ids[char] then
			-- `char` is an ideographic description character (IDC). Check
			-- whether it starts a complete ideographic description sequence
			-- (IDS) and, if so, whether a sortkey is listed for that IDS.
			local ids_end = findEndOfIDS(chars, char, pos)
			local ids, key
			if ids_end then
				ids = concat(chars, nil, pos, ids_end)
				key = m_data_core.unsupported[ids]
			end

			if key then
				-- Known IDS: use its sortkey and continue after the sequence.
				keys[#keys + 1] = key
				pos = ids_end
			else
				-- No sortkey available: track the reason, then emit the IDC
				-- itself and continue with the character that follows it.
				if ids then
					require("Module:debug").track("Hani-sortkey/IDS-without-sortkey")
					mw.log("ideographic description sequence without sortkey: '"
						.. ids .. "'")
				else
					require("Module:debug").track("Hani-sortkey/invalid-IDS")
					mw.log("invalid ideographic description sequence at the beginning of '"
						.. chars[pos] .. "'")
				end
				keys[#keys + 1] = char
			end
		else
			-- Ordinary character: look its data up once and memoize it.
			local known = cache[char]
			if not known then
				known = export.getData(char)
				cache[char] = known
			end
			keys[#keys + 1] = known
		end
	end

	return concat(keys)
end

return export