-- Module:ja

local m_str_utils = require("Module:string utilities")

-- Table of public functions returned by this module.
local export = {}

-- Lua standard library.
local concat, insert = table.concat, table.insert
local find = string.find

-- mw library.
local load_data = mw.loadData
local toNFC, toNFD = mw.ustring.toNFC, mw.ustring.toNFD
local umatch = mw.ustring.match

-- Module:string utilities.
local codepoint = m_str_utils.codepoint
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local usub = m_str_utils.sub

-- Module:languages.
local get_by_code = require("Module:languages").getByCode

-- note that arrays loaded by mw.loadData cannot be directly used by gsub

-- generated when needed, to avoid an infinite loop with Module:Jpan-sortkey
local pagename
-- Namespace text of the current page; getCurrentTitle is a function and must be
-- called before its result can be indexed.
local namespace = mw.title.getCurrentTitle().nsText

-- Lookup tables from Module:ja/data, with the frequently used ones aliased.
local data = load_data("Module:ja/data")
local long_vowels_hira, long_vowels_kata = data.long_vowels_hira, data.long_vowels_kata
local voice_marks = data.voice_marks

-- Character-range strings from Module:ja/data/range; they are spliced into
-- "[...]" pattern sets further down.
local range = load_data("Module:ja/data/range")
local r_hiragana, r_vowels = range.hiragana, range.vowels
local r_kana_combining_characters = range.kana_combining_characters

-- Builds a replacement callback that maps a character to the character whose
-- codepoint is `added_value` higher (or lower, for a negative value).
local function change_codepoint(added_value)
	local function shift(char)
		return u(codepoint(char) + added_value)
	end
	return shift
end

-- Converts hiragana in `text` to katakana.
-- `text` is either a string or a frame-like table, in which case args[1] is used.
-- The text is decomposed (NFD) before conversion and recomposed (NFC) afterwards.
function export.hira_to_kata(text)
	if type(text) == "table" then
		text = text.args[1]
	end
	local converted = toNFD(text)
	-- Characters in these ranges map onto their counterpart by a fixed codepoint offset.
	converted = ugsub(converted, "[ぁ-ゖゝゞ]", change_codepoint(96))
	converted = ugsub(converted, "[𛅐-𛅒]", change_codepoint(20))
	-- The remaining characters are looked up individually.
	converted = ugsub(converted, "[𛀁𛀆𛄟𛄲]", data.hira_to_kata)
	return toNFC(converted)
end

-- Converts katakana in `text` to hiragana.
-- `text` is either a string or a frame-like table, in which case args[1] is used.
-- The text is decomposed (NFD) before conversion and recomposed (NFC) afterwards.
function export.kata_to_hira(text)
	if type(text) == "table" then
		text = text.args[1]
	end
	local converted = toNFD(text)
	-- Characters in these ranges map onto their counterpart by a fixed codepoint offset.
	converted = ugsub(converted, "[ァ-ヶヽヾ]", change_codepoint(-96))
	converted = ugsub(converted, "[𛅤-𛅦]", change_codepoint(-20))
	-- The remaining characters are looked up individually.
	converted = ugsub(converted, "[𛀀𛄠-𛄢𛅕]", data.kata_to_hira)
	return toNFC(converted)
end

-- removes spaces and hyphens from input
-- intended to be used when checking manual romaji to allow the
-- insertion of spaces or hyphens in manual romaji without appearing "wrong"
--
-- `f` is either a string or a frame-like table, in which case args[1] is used.
-- Apostrophes and full stops are stripped as well (see the pattern below).
-- Returns only the cleaned string (the substitution count is discarded).
function export.rm_spaces_hyphens(f)
	local text = type(f) == "table" and f.args[1] or f
	-- NOTE(review): as written, the second pass removes a character the first
	-- pass already covers; confirm whether a different character (for example
	-- a non-breaking space) was intended there.
	return (text:gsub("[ '%-.]+", "")
		:gsub(" ", ""))
end

do
	-- Syllable tables for the romaji clusters handled before the generic data.rk lookup.
	local ts_kana = {["tsu"]="ツ",["tso"]="ツォ",["tsi"]="ツィ",["tse"]="ツェ",["tsa"]="ツァ"}
	local sh_kana = {["shu"]="シュ",["sho"]="ショ",["shi"]="シ",["she"]="シェ",["sha"]="シャ"}
	local ch_kana = {["chu"]="チュ",["cho"]="チョ",["chi"]="チ",["che"]="チェ",["cha"]="チャ"}
	local n_kana = {["nu"]="ヌ",["no"]="ノ",["ni"]="ニ",["ne"]="ネ",["na"]="ナ"}
	local vowel_kana = {
		u = "ウ", o = "オ", i = "イ", e = "エ", a = "ア"
	}

	-- Expands a macron-bearing character: "o" becomes "ou", anything else is doubled.
	local function handle_macron(ch)
		if ch == "o" then
			return "ou"
		end
		return ch .. ch
	end

	-- Converts romaji to katakana.
	-- `f` is either a string or a frame-like table, in which case args[1] is used.
	-- The substitutions are order-dependent: each pass works on the output of the previous one.
	function export.romaji_to_kata(f)
		local text = type(f) == "table" and f.args[1] or f
		text = ulower(toNFD(text))
		-- Character followed by a combining macron (bytes \204\132).
		text = text:gsub("(.[\128-\191]*)\204\132", handle_macron)
		-- Doubled characters and "tc" get a small tsu.
		text = text:gsub("(.)%1", "ッ%1")
		text = text:gsub("tc", "ッc")
		-- Multi-letter consonant clusters.
		text = text:gsub("tsyu", "ツュ")
		text = text:gsub("ts[uoiea]", ts_kana)
		text = text:gsub("sh[uoiea]", sh_kana)
		text = text:gsub("ch[uoiea]", ch_kana)
		-- "n" + vowel; a bare "n" or "n'" has no table entry and is left for the later pass.
		text = text:gsub("n[uoiea']?", n_kana)
		-- Generic consonant (+ optional y/w) + vowel syllables.
		text = text:gsub("[wvtrpsnmlkjhgfdbzy][yw]?[uoiea]", data.rk)
		-- Remaining moraic n, then remaining bare vowels.
		text = text:gsub("n'?", "ン")
		text = text:gsub("[aeiou]", vowel_kana)
		return text
	end
end

-- expects: any mix of kanji and kana
-- determines the script types used
-- e.g. given イギリス人, it returns Kana+Hani
--
-- `f` is either a string or a frame-like table, in which case args[1] is used.
-- Returns the detected labels joined with "+", in the fixed order
-- Hira, Kana, Hani, Romaji, Number, Abbreviation (an empty string if none match).
function export.script(f)
	local text = type(f) == "table" and f.args[1] or f
	local script = {}
	-- For Hira and Kana, we remove any characters which also feature in the other first, so that we don't get false positives for ー etc.
	local no_overlap = ugsub(text, "[" .. range.kana_overlap .. "]+", "")
	if umatch(no_overlap, "[" .. r_hiragana .. "ゟ]") then
		insert(script, "Hira")
	end
	if umatch(no_overlap, "[" .. range.katakana .. "ヿ]") then
		insert(script, "Kana")
	end
	-- The remaining checks run against the unmodified text.
	if umatch(text, "[" .. range.kanji .. "]") then
		insert(script, "Hani")
	end
	if umatch(text, "[" .. range.latin .. "]") then
		insert(script, "Romaji")
	end
	if umatch(text, "[" .. range.numbers .. "]") then
		insert(script, "Number")
	end
	if umatch(text, "[〆々]") then
		insert(script, "Abbreviation")
	end

	return concat(script, "+")
end

do
	-- Characters that attach to the mora in progress instead of starting a new one.
	local submoraic = range.submoraic_kana .. r_kana_combining_characters
	-- Pattern-set fragment: whitespace, punctuation and a few symbols listed explicitly.
	local spacing_punc = "%s%p%$%+=>%^`|~"

	-- Appends a run of spacing/punctuation/markup to the mora being built.
	-- The mora is flagged (`sp`) so that the next character starts a new mora,
	-- unless the run consists solely of "^", "%" and "'".
	-- Returns the consumed text and the mora table.
	local function handle_spacing_punc(ch, mora)
		insert(mora, ch)
		if ch:match("[^%^%%']") then
			mora.sp = true
		end
		return ch, mora
	end

	-- Consumes one unit of `text` at character position `start`: a run of
	-- spacing/punctuation, a "<...>" tag, or a single character.
	-- Completed morae are appended to `morae`; `mora` is the one in progress.
	-- Returns the consumed text and the (possibly new) mora in progress.
	local function iterate_mora(text, start, morae, mora)
		mora = mora or {}
		local ch = umatch(text, "^[" .. spacing_punc .. "]+", start)
		if ch then
			return handle_spacing_punc(ch, mora)
		end
		ch = usub(text, start, start)
		if ch == "<" then
			-- Prefer a complete tag; otherwise treat the "<" as punctuation.
			ch = umatch(text, "^<.->", start) or umatch(text, "^[<" .. spacing_punc .. "]+", start)
			return handle_spacing_punc(ch, mora)
		elseif (
			mora.sp or
			mora.kana and umatch(ch, "[^" .. submoraic .. "]")
		) then
			-- The mora in progress is complete: flush it and start a new one.
			insert(morae, concat(mora))
			mora = {}
		end
		mora.kana = true
		insert(mora, ch)
		return ch, mora
	end

	-- Returns an array of morae.
	-- Small vowel kana (and any combining dakuten/handakuten) are grouped with any prior word characters, which should be kana. Non-word characters (spaces, punctuation etc.) are accounted for, and grouped with surrounding morae wherever possible.
	function export.moraify(text)
		local morae, start, text_len, mora = {}, 1, ulen(text)
		while start <= text_len do
			local ch
			ch, mora = iterate_mora(text, start, morae, mora)
			start = start + ulen(ch)
		end
		-- Flush whatever is still in progress (nothing for empty input).
		if mora then
			insert(morae, concat(mora))
		end
		return morae
	end

	-- Strips "<...>" tags and all spacing/punctuation characters from `text`.
	local function remove_formatting(text)
		return ugsub(text:gsub("<.->", ""), "[<" .. spacing_punc .. "]+", "")
	end

	-- Counts the number of morae.
	-- Entries that are empty once formatting is removed are not counted.
	function export.count_morae(text)
		text = export.moraify(text)
		local morae = #text
		for i = 1, morae do
			if #remove_formatting(text[i]) == 0 then
				morae = morae - 1
			end
		end
		return morae
	end

	-- Replaces the first "ー" in text[i] with the long-vowel kana that matches
	-- the final character of the preceding mora. Modifies `text` in place.
	local function do_long_vowel(i, text)
		if not text[i]:find("ー") then
			return
		end
		local prev = text[i - 1]
		if not prev then
			return
		end
		-- Reduce the previous mora to its final character, ignoring formatting
		-- and combining marks.
		prev = ugsub(remove_formatting(prev), "[" .. r_kana_combining_characters .. "]+", "")
			:match("[^\128-\191][\128-\191]*$")
		if not prev then
			-- The previous mora held nothing but formatting, so there is no
			-- vowel to extend; without this guard kana:find(nil) would raise.
			return
		end
		for vowel, kana in pairs(r_vowels) do
			if kana:find(prev) then
				-- Pick the hiragana or katakana table to match the preceding character.
				local v = (umatch(prev, "[" .. r_hiragana .. "]") and long_vowels_hira or long_vowels_kata)[vowel]
				text[i] = text[i]:gsub("ー", v, 1)
			end
		end
	end

	-- Handles iteration marks while the caller walks `text` from last mora to first.
	-- `n` is the number of consecutive iteration marks seen so far.
	-- Returns n + 1 when text[i] contains a mark (still counting); otherwise
	-- expands any pending marks in place and returns nothing.
	local function do_iteration_mark(i, n, text)
		local mora = text[i]
		if mora:find("ゝ") or mora:find("ヽ") then
			return n + 1
		elseif n == 0 then
			return
		end
		-- Count backwards once for each iteration mark, but stop early if we find something which can't be iterated, as that marks the start of the set to be repeated.
		local anchor = i
		for j = 0, n - 1 do
			local prev = text[anchor - j]
			if not prev then
				n = j
				break
			end
			prev = remove_formatting(prev)
			if prev:find("ゝ") or prev:find("ヽ") or umatch(prev, "[%s%p]") then
				n = j
				break
			end
		end
		if n == 0 then
			return
		end
		i = i - n + 1
		-- Replace iteration marks ahead with the relevant character.
		for j = i, i + n - 1 do
			-- Drop a combining voicing mark (bytes \227\130\153 / \227\130\154)
			-- directly after the first character of the source mora.
			mora = remove_formatting(text[j]):gsub("^(.[\128-\191]*)\227\130[\153\154]", "%1")
			text[j + n] = ugsub(text[j + n], "([ゝヽ])([゙゚]?)", function(mark, voicing)
				-- Carry over whatever voicing mark followed the iteration mark.
				local repl = mora:gsub("^.[\128-\191]*", "%0" .. voicing)
				return mark == "ゝ" and export.kata_to_hira(repl) or export.hira_to_kata(repl)
			end)
		end
		return
	end

	-- Normalizes long vowels, iteration marks and non-combining voice marks to the standard equivalents.
	-- Note: output text is normalized to NFD.
	function export.normalize_kana(text)
		text = export.moraify((toNFD(text):gsub("[\227\239][\130\190][\155\156\158\159]", voice_marks)))
		local n, morae = 0, #text
		-- Iteration marks are resolved walking backwards through the morae.
		for i = morae, 1, -1 do
			n = do_iteration_mark(i, n, text) or 0
		end
		for i = 1, morae do
			do_long_vowel(i, text)
		end
		-- Normalize again to be safe.
		return toNFD(concat(text))
	end
end

-- returns the "stem" of a verb or -i adjective, that is the term minus the final character
-- `f` is a frame-like table; the term is read from f.args[1].
function export.definal(f)
	return usub(f.args[1], 1, -2)
end

-- Deletes every "^", "-", ".", space and "%" from `text` and returns the result.
function export.remove_ruby_markup(text)
	local stripped = text:gsub("[%^%-%. %%]", "")
	return stripped
end

-- do the work of Template:ja-kanji, Template:ryu-kanji etc.
-- should probably be folded into Module:Jpan-headword
--
-- `frame` is the invocation frame: frame.args[1] is the language code and the
-- template parameters are read from the parent frame.
-- Returns the headline wikitext followed by formatted categories, or nothing
-- when the current page is outside the main namespace.
function export.kanji(frame)
	pagename = pagename or load_data("Module:headword/data").pagename
	-- only do this if this entry is a kanji page and not some user's page
	if namespace == "" then
		local params = {
			grade = {}, -- To be removed.
			rs = {},
			shin = {},
			kyu = {},
			head = {},
		}
		local lang_code = frame.args[1]
		local lang = get_by_code(lang_code)
		-- Both getCanonicalName and getParent are methods and must be called.
		local lang_name = lang:getCanonicalName()
		local args = require("Module:parameters").process(frame:getParent().args, params, nil, "ja", "kanji")
		local sortkey = args.rs or require("Module:Hani-sortkey").makeSortKey(pagename) or pagename -- radical sort
		local shin = args.shin
		local kyu = args.kyu
		local wikitext, categories = {}, {}
		-- display the kanji itself at the top at 275% size
		-- NOTE(review): the string literals in this function contain no markup
		-- for size or links; confirm they were not truncated upstream.
		insert(wikitext, " " .. (args.head or pagename) .. " ")

		-- display information for the grade

		-- determine grade
		-- NOTE(review): kanji_grade can return false; that case is not handled
		-- below (the grade lookup and `grade <= 6` assume a number) - confirm.
		local grade, in_parenthesis = export.kanji_grade(pagename), {}
		insert(in_parenthesis, data.grade_links[grade])
		if args.grade then
			require("Module:debug/track")("ja/redundant grade parameter")
		end
		-- link to shinjitai if shinjitai was specified, and link to kyujitai if kyujitai was specified
		if kyu then
			insert(in_parenthesis, "shinjitai kanji, kyūjitai form " .. kyu .. " ")
		elseif shin then
			insert(in_parenthesis, "kyūjitai kanji, shinjitai form " .. shin .. " ")
		end
		insert(wikitext, "''(" .. concat(in_parenthesis, ", ") .. "'')")

		-- add categories
		insert(categories, lang_name .. " kanji")
		insert(categories, lang_name .. " " .. data.grades[grade])
		if grade <= 6 then
			insert(categories, lang_name .. " kyōiku kanji")
			insert(categories, lang_name .. " jōyō kanji") -- Grade 7 get this from the data.
		end
		-- Namespace 14 is the Category namespace; the notice goes first in the output.
		if mw.title.new(lang_name .. " terms spelled with " .. pagename, 14).exists then
			insert(wikitext, 1, "See also:Category:" .. lang_name .. " terms spelled with " .. pagename .. "  ")
		end
		return concat(wikitext) .. require("Module:utilities").format_categories(categories, lang, sortkey)
	end
end

-- Looks up the grade of `kanji`.
-- Returns the index of the first entry of data.grade_kanji that contains it
-- (plain-text search); failing that, 9 if it contains a character from
-- range.kanji, and false otherwise.
function export.kanji_grade(kanji)
	for grade, members in ipairs(data.grade_kanji) do
		if find(members, kanji, 1, true) then
			return grade
		end
	end
	if umatch(kanji, "[" .. range.kanji .. "]") then
		return 9
	end
	return false
end

-- Hand the table of public functions back to whoever loads this module.
return export