-- Module:User:Theknightwho/Jpan-sortkey

-- Table of functions exported by this module.
local export = {}

-- Character-range strings from the shared Japanese range data. They are
-- spliced into "[...]" character classes further down in this file.
local range_data = mw.loadData("Module:ja/data/range")
local kanji_pattern = range_data.kanji
local ideograph_pattern = range_data.ideograph
local kana_graph_pattern = range_data.kana_graph
local latin_pattern = range_data.latin

-- Localised Unicode-aware string helpers.
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local u = mw.ustring.char

-- Byte-level pattern that matches one UTF-8 encoded character.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"

--- Turn a kana string into its sortkey.
-- Katakana is converted to hiragana and the text is NFD-decomposed so that
-- dakuten/handakuten become separate combining characters. A leading voiced
-- kana is then unvoiced and marked with trailing apostrophes, long vowel marks
-- are expanded, word separators become spaces, and the result is passed
-- through Module:Hani-sortkey.
-- @param text string to convert
-- @param lang language code; also used to build the tracking page name
-- @param sc script argument, forwarded unchanged to Module:Hani-sortkey
-- @return the value returned by Module:Hani-sortkey's makeSortKey
function export.sortkey_from_string(text, lang, sc)
	text = toNFD(require("Module:ja").kata_to_hira(text))

	-- If the first character has dakuten, replace it with the corresponding
	-- character without dakuten and add an apostrophe to the end,
	-- e.g. がす > かす'
	text = text:gsub("^(" .. UTF8_char .. ")" .. u(0x3099) .. "(.*)", "%1%2'")
	-- Similar thing, but with handakuten and two apostrophes,
	-- e.g. ぱす -> はす''
	text = text:gsub("^(" .. UTF8_char .. ")" .. u(0x309A) .. "(.*)", "%1%2''")

	-- Replace the long vowel mark with the vowel that it stands for.
	if text:find("ー", 1, true) then
		-- from[i] lists the kana after which "ー" is replaced by to[i].
		local from = {
			"あぁかさたなはまやゃらわ",
			"いぃきしちにひみり",
			"うぅくすつぬふむゆゅる",
			"えぇけせてねへめれ",
			"おぉこそとのほもよょろ",
			"ん",
		}
		local to = {"あ", "い", "う", "え", "お", "ん"}
		-- The text is in NFD, so an optional combining (han)dakuten may sit
		-- between the kana and the long vowel mark.
		local dh = u(0x3099) .. u(0x309A)
		for i, v in ipairs(from) do
			text = gsub(text, "([" .. v .. "][" .. dh .. "]?)ー", "%1" .. to[i])
		end
	end

	-- Middle dot and double hyphen are replaced by a space.
	text = gsub(text, "[・゠]", " ")

	local ret = require("Module:Hani-sortkey").makeSortKey(text, lang, sc)
	-- Track inputs that Module:Hani-sortkey changed.
	if ret ~= text then
		require("Module:debug/track"){"Jpan-sortkey/fallback", "Jpan-sortkey/fallback/" .. lang}
	end
	return ret
end

--- Build a reading for `text` from the arguments of a "<lang>-kanjitab" call.
-- Each positional argument is read as kana optionally followed by a digit
-- string giving how many kanji the reading spans (default 1). "k<i>" overrides
-- the reading and "o<i>" is appended to it. A resulting reading of "-"
-- contributes nothing.
-- @param text the term containing kanji
-- @param kanjitab argument table of the kanjitab template
-- @return the assembled string (may be empty)
local function kanjitab_sortkey(text, kanjitab)
	-- Collect the non-kanji runs: non_kanji[1] precedes the first kanji and
	-- non_kanji[k + 1] follows the k-th kanji. The position captures are what
	-- give the callback the character offsets it slices with.
	local non_kanji = {}
	local kanji_border = 1
	mw.ustring.gsub(text, "()[" .. kanji_pattern .. "々]()", function(p1, p2)
		non_kanji[#non_kanji + 1] = mw.ustring.sub(text, kanji_border, p1 - 1)
		kanji_border = p2
	end)
	non_kanji[#non_kanji + 1] = mw.ustring.sub(text, kanji_border)

	local parts = {non_kanji[1]}
	local id = 1
	for i, arg in ipairs(kanjitab) do
		local reading_kana, reading_length = mw.ustring.match(arg, "^([^0-9]*)([0-9]*)$")
		if reading_kana == "" then
			reading_kana = nil
		end

		local reading, span
		if reading_kana then
			span = tonumber(reading_length) or 1
			reading = (kanjitab["k" .. i] or reading_kana) .. (kanjitab["o" .. i] or "")
			if reading == "-" then
				reading = nil
			end
		else
			span = 1
		end

		-- Skip past the kanji covered by this reading, then append the
		-- non-kanji run that follows them.
		id = id + span
		parts[#parts + 1] = (reading or "") .. (non_kanji[id] or "")
	end
	return table.concat(parts)
end

--- Make a sortkey for `text`, looking readings up on entry pages if needed.
-- While `text` still contains digits or characters from the kanji, ideograph,
-- kana_graph or latin ranges (and `lang` is not "mul"), the page titled `text`
-- is fetched and its section for the language is searched for headword
-- templates or a kanjitab that supply a reading. Each page is visited at most
-- once. The final text is passed to export.sortkey_from_string.
-- @param text term to sort
-- @param lang language code
-- @param sc script argument, forwarded to export.sortkey_from_string
-- @return the sortkey
function export.makeSortKey(text, lang, sc)
	local langname = require("Module:languages").getByCode(lang):getCanonicalName()
	-- Escape every punctuation character so the name is matched literally.
	local langname_pattern = langname:gsub("%p", "%%%0")
	-- Text matching this still needs a reading to be looked up.
	local needs_lookup = "[0-9" .. kanji_pattern .. ideograph_pattern .. kana_graph_pattern .. latin_pattern .. "]"
	local seen_pages = {}

	-- Templates whose first positional argument is taken as the reading.
	local templates = {
		[lang .. "-noun"] = true,
		[lang .. "-verb"] = true,
		[lang .. "-adj"] = true,
		[lang .. "-phrase"] = true,
		[lang .. "-verb form"] = true,
		[lang .. "-verb-suru"] = true,
		[lang .. "-see"] = true,
		[lang .. "-see-kango"] = true,
		[lang .. "-gv"] = true,
	}
	-- Templates whose second positional argument is taken as the reading.
	local templates2 = {
		[lang .. "-head"] = true,
		[lang .. "-pos"] = true,
	}

	-- Search one section's wikitext for a reading of `text`; returns the
	-- reading, or `text` unchanged when none is found.
	local function parse_section(section_content, text)
		local findTemplates = require("Module:templateparser").findTemplates
		local kanjitab, found
		for template, args in findTemplates(section_content) do
			if templates[template] and args[1] then
				text = args[1]:gsub("[ %-%.^%%]", "")
				found = true
				break
			elseif templates2[template] and args[2] then
				text = args[2]:gsub("[ %-%.^%%]", "")
				found = true
				break
			elseif (template == "head" or template == "head-lite") and args[1] == lang then
				-- The reading is the argument following a literal "kana".
				-- As before, only the inner loop stops here; later
				-- templates in the section are still examined.
				for i, arg in ipairs(args) do
					if arg == "kana" then
						local kana = args[i + 1]
						if kana then
							text = kana
							found = true
							break
						end
					end
				end
			elseif template == lang .. "-kanjitab" then
				-- Remember only the first kanjitab in the section.
				kanjitab = kanjitab or args
			end
		end

		if found or not kanjitab then
			return text
		end

		require("Module:debug/track"){"Jpan-sortkey/kanjitab", "Jpan-sortkey/kanjitab/" .. lang}
		-- An explicit sortkey parameter wins over the computed reading.
		if kanjitab.sortkey then
			return kanjitab.sortkey
		end
		local sortkey = kanjitab_sortkey(text, kanjitab)
		if sortkey ~= "" then
			return sortkey
		end
		return text
	end

	-- Fetch the page titled `text` and try to replace `text` by a reading
	-- found in its section for this language.
	local function scrape_page(text)
		seen_pages[text] = true
		-- mw.title.new returns nil for an invalid title.
		local title = mw.title.new(toNFC(text))
		local content = title and title:getContent()
		if not content then
			return text
		end

		-- NOTE(review): get_header1 is read as a plain field and used in
		-- arithmetic below, so it is presumably a section number; confirm
		-- against Module:User:Theknightwho/get_header.
		local section = require("Module:User:Theknightwho/get_header").get_header1

		-- Count headings up to and including the language's heading.
		local i = 1
		for heading in content:gmatch("(%f[^%z%s](=+)[^\n=]-%S+[^\n=]*%2%f[%z%s])") do
			i = i + 1
			if heading:find("==%s*" .. langname_pattern .. "%s*==") then
				break
			end
		end

		local loc1, loc2 = content:find("%f[^%z%s]==[^\n=]*" .. langname_pattern .. "[^\n=]*==")
		if not loc1 then
			return text
		end
		-- Cut the content down to the language section, which ends just
		-- before the next level-2 heading (or at the end of the page).
		local next_l2 = content:find("%f[^%z%s]==[^\n=]+==", loc2)
		content = content:sub(loc1, next_l2 and next_l2 - 1 or -1)
		-- Make the section number relative to the trimmed content.
		section = section - i + 1

		-- Byte offsets at which each heading of the trimmed content starts.
		local sections = {}
		for pos in content:gmatch("()%f[^%z%s](=+)[^\n=]-%S+[^\n=]*%2%f[%z%s]") do
			sections[#sections + 1] = pos
		end

		-- Walk backwards from the target section until a reading is found.
		for n = section, 1, -1 do
			local first = sections[n]
			if first then
				local nxt = sections[n + 1]
				text = parse_section(content:sub(first, nxt and nxt - 1 or -1), text)
				if not find(text, needs_lookup) then
					break
				end
			end
		end
		return text
	end

	while lang ~= "mul" and (not seen_pages[text]) and find(text, needs_lookup) do
		text = scrape_page(text)
	end
	return export.sortkey_from_string(text, lang, sc)
end

-- Hand the table of exported functions back to whoever loads this module.
return export