-- Module:ko-pron

local export = {}

local m_data = mw.loadData("Module:ko-pron/data") local m_ko_utilities = require("Module:ko") local m_str_utils = require("Module:string utilities")

local codepoint = m_str_utils.codepoint local concat = table.concat local floor = math.floor local gmatch = m_str_utils.gmatch local gsub = m_str_utils.gsub local insert = table.insert local len = m_str_utils.len local match = m_str_utils.match local pattern_escape = m_str_utils.pattern_escape local sub = m_str_utils.sub local toNFC = mw.ustring.toNFC local u = m_str_utils.char local upper = m_str_utils.upper

local PAGENAME = mw.loadData("Module:headword/data").pagename local PAGENAME2 = gsub(PAGENAME, "-", "")

-- Maps a system abbreviation to its index in system_list below.
local system_lookup = {
	["ph"] = 1, ["rr"] = 2, ["rrr"] = 3,
	["mr"] = 4, ["yr"] = 5, ["ipa"] = 6,
}

-- Suffix appended to the display label of each romanisation system.
local question_mark = "?"

-- One entry per transcription system, in system_index order.
--   abbreviation: the key used for this system in system_lookup
--   display:      label shown in front of the transcription
--   separator:    string placed between alternative transcriptions
local system_list = {
	{
		abbreviation = "ph",
		display = "Phonetic hangul: ",
		separator = "/",
	},
	{
		abbreviation = "rr",
		display = "Revised Romanization" .. question_mark,
		separator = "/",
	},
	{
		abbreviation = "rrr",
		display = "Revised Romanization (translit.)" .. question_mark,
		separator = "/"
	},
	{
		-- Was "mc"; system_lookup addresses this entry as "mr".
		abbreviation = "mr",
		display = "McCune–Reischauer" .. question_mark,
		separator = "/"
	},
	{
		abbreviation = "yr",
		display = "Yale Romanization" .. question_mark,
		separator = "/"
	},
	{
		abbreviation = "ipa",
		display = "(SK Standard/Seoul) IPA(key): ",
		separator = " ~ "
	}
}

--[[

vowel_variation: rules for vowel transformation.
	key: the number of a syllable's vowel (vowel_id):
		floor(((codepoint('가') - 0xAC00) % 588) / 28) = 0
		floor(((codepoint('개') - 0xAC00) % 588) / 28) = 1
	value: an integer that is added to the decimal codepoint of the syllable:
		u(codepoint('개') + 112) = '게'

allowed_vowel_scheme: a list of which systems vowel transformation is reflected in.
	key: vowel_id .. "-" .. system_index
		system_index: see system_list above. IPA is #6
	value: 1, representing true

]]

-- Rewrites the placeholder finals used by export.romanise before the boundary
-- lookup: an empty final becomes "Ø" and the "X" placeholder becomes "".
local final_syllable_conversion = { [""] = "Ø", ["X"] = "" }

-- Replacements for the last letter of a junction when the "com" parameter
-- marks the syllable and the system is McCune–Reischauer (index 4).
local com_mc = { ["g"] = "k", ["d"] = "t", ["b"] = "p", ["j"] = "ch", ["sy"] = "s", ["s"] = "ss" }

-- Plain initial → tense initial, applied for systems 1 and 6 when the "com"
-- parameter marks the syllable.
local com_ph = { ["ᄀ"] = "ᄁ", ["ᄃ"] = "ᄄ", ["ᄇ"] = "ᄈ", ["ᄉ"] = "ᄊ", ["ᄌ"] = "ᄍ" }

-- vowel_id → offset added to the syllable's codepoint (one vowel step = 28).
local vowel_variation = {
	[1] = 112, -- 개→게
	[3] = 112, -- 걔→계
	[10] = 140, -- 괘→궤

	[7] = -56, -- 계→게

	[11] = 112, -- 괴→궤
	[16] = 0,  -- 귀→귀
}

-- Keys are vowel_id .. "-" .. system_index; a value of 1 means the variation
-- is produced for that system.
local allowed_vowel_scheme = {
	["1-1"] = 1,
	["1-6"] = 1,
	["3-1"] = 1,
	["3-6"] = 1,
	["10-1"] = 1,
	["10-6"] = 1,

	["7-1"] = 1,
	["7-6"] = 1,

	["11-1"] = 1,
	["11-6"] = 1,
	["16-6"] = 1,
}

-- Letter pairs that receive a separator when they meet across a syllable
-- boundary ("'" for RR and MR, "." for Yale).
local ambiguous_intersyllabic_rr = { ["oe"] = 1, ["eo"] = 1, ["eu"] = 1, ["ae"] = 1, ["ui"] = 1 }
local ambiguous_intersyllabic_mr = { ["oe"] = 1, ["ae"] = 1 }
local ambiguous_intersyllabic_yr = { ["ay"] = 1, ["ey"] = 1, ["oy"] = 1, ["uy"] = 1, ["̄y"] = 1, ["ya"] = 1, ["ye"] = 1, ["yo"] = 1, ["yu"] = 1 }

-- Splits `word` into single characters and returns an array containing the
-- result of m_ko_utilities.decompose_jamo for each character, in order.
local function decompose_syllable(word)
	local parts = {}
	for char in mw.text.gsplit(word, "") do
		local jamo = m_ko_utilities.decompose_jamo(char)
		parts[#parts + 1] = jamo
	end
	return parts
end

-- Walks the original spelling and its phonetic respelling in parallel and
-- returns the respelling with the characters that differ from the original
-- wrapped in markup.
--   original:  the word as spelled
--   romanised: the phonetic hangul respelling of that word
local function tidy_phonetic(original, romanised)
	-- j indexes `original`, k indexes `romanised`, w collects output pieces.
	local j, k, w = 1, 1, {}
	for i = 1, len(romanised) do
		local romanised_syllable = sub(romanised, k, k)
		local original_syllable = sub(original, j, j)
		if romanised_syllable ~= original_syllable then
			-- NOTE(review): the string operands around romanised_syllable were
			-- missing in the reviewed copy (a bare `..x..`, which does not
			-- compile). Bold tags are presumed; confirm against page history.
			insert(w, "<b>" .. romanised_syllable .. "</b>")
			-- A length mark or space present on only one side must not
			-- consume a character from the other side.
			local original_advance = match(original_syllable, "[^ː ]")
			local romanised_advance = match(romanised_syllable, "[^ː ]")
			if original_advance or not romanised_advance then
				k = k + 1
			end
			if romanised_advance or not original_advance then
				j = j + 1
			end
		else
			insert(w, ' '..romanised_syllable..' ')
			j, k = j + 1, k + 1
		end
	end
	return concat(w)
end

-- Applies allophonic adjustments to a finished IPA string and returns it.
-- The substitutions run in the order written; later rules see the output of
-- earlier ones.
local function tidy_ipa(ipa)
	ipa = gsub(ipa, "ʌ̹%(ː%)", "ɘ(ː)") -- TODO: 멀다 really should be [ˈmʌ̹ɭda̠] ~ [ˈmɘːɭda̠] instead of [ˈmɘ(ː)ɭda̠]
	ipa = gsub(ipa, "ɭɭi", "ʎʎi")
	ipa = gsub(ipa, "ɭɭj", "ʎʎ")
	ipa = gsub(ipa, "s([ʰ͈])ɥi" ,"ʃ%1ɥi")
	ipa = gsub(ipa, "ss͈([ji])" ,"ɕɕ͈%1")
	ipa = gsub(ipa, "s([ʰ͈])([ji])" ,"ɕ%1%2")
	ipa = gsub(ipa, "nj", "ɲ")
	ipa = gsub(ipa, "([ʑɕ])([ʰ͈]?)j", "%1%2")
	-- Table replacements: the whole match is the lookup key.
	ipa = gsub(ipa, "kʰ[ijɯ]", {
		["kʰi"] = "cçi",
		["kʰj"] = "cç",
		["kʰɯ"] = "kxɯ"
	})
	ipa = gsub(ipa, "[hɦ][ijɯouw]", {
		["hi"] = "çi",
		["hj"] = "ç",
		["hɯ"] = "xɯ",
		["ho"] = "ɸʷo",
		["hu"] = "ɸʷu",
		["hw"] = "ɸw",
		["ɦi"] = "ʝi",
		["ɦj"] = "ʝ",
		["ɦɯ"] = "ɣɯ",
		["ɦo"] = "βo",
		["ɦu"] = "βu",
		["ɦw"] = "βw"
	})
	if match(ipa, "ɥi") then
		-- Only the second half of the string has "ɥi" rewritten to "y"; the
		-- first half is left as it is.
		local midpoint = floor(len(ipa) / 2)
		ipa = sub(ipa, 1, midpoint) .. gsub(sub(ipa, midpoint+1, -1), "ɥi", "y")
	end
	return ipa
end

-- Transcribes every run of hangul in `text_param` into one transcription
-- system and returns the resulting text.
--   text_param:   the text to transcribe, or a frame object (then the text and
--                 system are read from the parent frame's args 1 and 2).
--   system_index: an index into system_list (1-6) or an abbreviation found in
--                 system_lookup.
--   args:         table of optional parameters (nn, l, com, cap, ni, ui, uie,
--                 nobc, bcred, svar, iot); may be omitted.
-- Returns the transcribed text (alternative respellings are joined with the
-- system's separator), or nil when a final/initial pair has no entry in
-- m_data.boundary.
function export.romanise(text_param, system_index, args)
	if type(text_param) == "table" then
		args = text_param:getParent().args
		system_index = args[2] or 2
		text_param = args[1]
	end
	-- Callers may omit args entirely.
	args = args or {}

	-- p[name] is a set of the comma-separated values given for that parameter;
	-- numeric values are stored under number keys, anything else as a string.
	local p, optional_params = {}, { "nn", "l", "com", "cap", "ni" }
	for _, pm in ipairs(optional_params) do
		p[pm] = { }
		if args[pm] then
			for pp in mw.text.gsplit(args[pm], ",") do
				p[pm][tonumber(pp) or pp] = 1
			end
		end
	end

	local categories = {}
	local vowel_ui_i, vowel_ui_e, no_batchim, batchim_reduce, s_variation, iotation = args.ui, args.uie, args.nobc, args.bcred, args.svar, args.iot
	-- Accept an abbreviation ("rr"), a number, or a numeric string ("2").
	system_index = system_lookup[system_index] or tonumber(system_index) or system_index
	text_param = gsub(text_param, '["](.)', "%1")
	for primitive_word in gmatch(text_param, "[%-ᄀ-ᄒ".."ᅡ-ᅵ".."ᆨ-ᇂ" .. "ㄱ-ㅣ가-힣' 􀀀-􏿽]+") do
		local the_original = primitive_word
		-- NOTE(review): both ''' literals below were empty strings in the
		-- reviewed copy, which cannot work (an empty pattern matches at every
		-- position). Wiki bold markup is presumed, since the character class
		-- above admits apostrophes; confirm against page history.
		primitive_word = gsub(primitive_word, "'''", "ß")
		local formatting_position, formatting_count = {}, 0
		-- The leading "()" position capture supplies m1; the callback does
		-- arithmetic on m1 and compares m2, so both captures are required.
		primitive_word = gsub(primitive_word, "()([ß􀀀-􏿽])", function(m1, m2)
			formatting_position[m1 + formatting_count] = m2 == "ß" and "'''" or m2
			return ""
		end)

		-- Record which vowel_ids occur in the word.
		local has_vowel = {}
		for ch in gmatch(primitive_word, ".") do
			local jungseong = floor(((codepoint(ch) - 0xAC00) % 588) / 28)
			if not match(ch, "[예옛옘례롄]") and match(ch, "[가-힣]") then
				has_vowel[jungseong] = true
			end
		end

		-- word_set holds the word plus every alternative respelling.
		local word_set = { primitive_word }
		-- For systems 1 and 6, appends a copy of each current respelling with
		-- the character at position `variable` (and optionally the one after
		-- it) transformed.
		local function add_respelling(variable, modification, modification2)
			modification2 = modification2 or function(x) return x end
			if variable and match(system_index, "[16]") then
				variable = tonumber(variable)
				local pre_length = #word_set
				for i = 1, pre_length do
					local item = mw.text.split(word_set[i], "")
					item[variable] = modification(item[variable])
					item[variable + 1] = modification2(item[variable + 1])
					word_set[pre_length + i] = concat(item)
				end
			end
		end
		add_respelling(vowel_ui_i, function(x) return "이" end)
		add_respelling(vowel_ui_e, function(x) return "에" end)
		add_respelling(no_batchim,
			function(x) return u(codepoint(x) - (codepoint(x) - 0xAC00) % 28) end,
			function(y) return u(codepoint(y) + 588) end)
		add_respelling(s_variation, function(x) return u(codepoint(x) - 12) end)
		add_respelling(iotation, function(x) return u(codepoint(x) + 56) end)

		for vowel_id, vowel_variation_increment in pairs(vowel_variation) do
			if has_vowel[vowel_id] and allowed_vowel_scheme[vowel_id .. "-" .. system_index] then
				local pre_length = #word_set
				for i = 1, pre_length do
					local item = mw.text.split(word_set[i], "")
					for num, it in ipairs(item) do
						-- Only precomposed syllables carry a vowel_id. Without
						-- this guard the formula also yields 3 for a space or
						-- apostrophe and 10 for some jamo, and those characters
						-- would be shifted to unrelated codepoints.
						if match(it, "[가-힣]") and floor(((codepoint(it) - 0xAC00) % 588) / 28) == vowel_id then
							item[num] = u(codepoint(it) + vowel_variation_increment)
						end
					end
					if vowel_id == 11 then
						insert(word_set, i, concat(item))
					else
						insert(word_set, concat(item))
					end
				end
			end
		end

		local word_set_romanisations = {}
		for _, respelling in ipairs(word_set) do
			local decomposed_syllables = decompose_syllable(respelling)
			local romanisation = {}
			local formatting_insert_count = 0
			-- Index 0 is a virtual empty syllable in front of the word, so the
			-- first real initial is emitted through the same boundary lookup.
			for index = 0, #decomposed_syllables, 1 do
				local this_syllable_text = index ~= 0 and sub(respelling, index, index) or ""
				if this_syllable_text == "-" then
					-- skip it, it will be handled below
				else
					local syllable = decomposed_syllables[index] or { initial = "Ø", vowel = "Ø", final = "X" }
					-- Find the next non-hyphen character, remembering whether
					-- a hyphen was skipped.
					local next_index = index
					local next_syllable_text
					local saw_hyphen_after = false
					while true do
						next_index = next_index + 1
						next_syllable_text = next_index > #decomposed_syllables and "" or sub(respelling, next_index, next_index)
						if next_syllable_text ~= "-" then
							break
						end
						saw_hyphen_after = true
					end
					local next_syllable = decomposed_syllables[next_index] or { initial = "Ø", vowel = "Ø", final = "Ø" }
					syllable.final = final_syllable_conversion[syllable.final] or syllable.final
					if system_index == 5 and syllable.vowel == "ᅮ" and match(syllable.initial, "[ᄆᄇᄈᄑ]") then
						syllable.vowel = "ᅳ"
					end
					if match(system_index, "[1246]") then
						-- After ᄌ/ᄍ/ᄎ the y-glide vowels become their plain
						-- counterparts.
						if match(syllable.initial, "[ᄌᄍᄎ]") then
							if syllable.vowel == "ᅣ" then
								syllable.vowel = "ᅡ"
							elseif syllable.vowel == "ᅤ" then
								syllable.vowel = "ᅢ"
							elseif syllable.vowel == "ᅧ" then
								syllable.vowel = "ᅥ"
							elseif syllable.vowel == "ᅨ" then
								syllable.vowel = "ᅦ"
							elseif syllable.vowel == "ᅭ" then
								syllable.vowel = "ᅩ"
							elseif syllable.vowel == "ᅲ" then
								syllable.vowel = "ᅮ"
							end
						end
					end
					if match(system_index, "[16]") then
						if syllable.vowel == "ᅴ" and this_syllable_text ~= "의" then
							syllable.vowel = "ᅵ"
						end
					end
					if match(system_index, "[1246]") then
						if this_syllable_text == "넓" then
							if match(next_syllable.initial, "[ᄌᄉ]") then
								syllable.final = "ᆸ"
							elseif next_syllable.initial == "ᄃ" then
								if match(next_syllable.vowel, "[^ᅡᅵ]") then
									syllable.final = "ᆸ"
								end
							end
						end
					end
					local vowel = m_data.vowels[syllable.vowel][system_index]

					if p.nn[next_index] and match(system_index, "[1246]") then
						next_syllable.initial = "ᄂ"
					end
					if p.com[index] and match(system_index, "[16]") then
						next_syllable.initial = com_ph[next_syllable.initial] or next_syllable.initial
					end
					if p.ni[next_index] and system_index ~= 3 then
						next_syllable.initial = (system_index == 5 and syllable.final == "ᆯ") and "ᄅ" or "ᄂ"
					end
					if match(system_index, "[1246]") then
						if tonumber(batchim_reduce or -1) == index then
							syllable.final = m_data.boundary[syllable.final .. "-Ø"][1]
						end
						if index ~= 0 and this_syllable_text == "밟" and not match(next_syllable.initial, "[ᄋᄒ]") then
							syllable.final = "ᆸ"
						end
						-- Before 없 the final is reduced to a single consonant.
						if next_syllable_text == "없" then
							if match(syllable.final, "[ᆩᆪᆰᆿ]") then
								syllable.final = "ᆨ"
							elseif match(syllable.final, "[ᆬᆭ]") then
								syllable.final = "ᆫ"
							elseif match(syllable.final, "[ᆺᆻᆽᆾᇀ]") then
								syllable.final = "ᆮ"
							elseif match(syllable.final, "[ᆲᆳᆴᆶ]") then
								syllable.final = "ᆯ"
							elseif syllable.final == "ᆱ" then
								syllable.final = "ᆷ"
							elseif match(syllable.final, "[ᆵᆹᇁ]") then
								syllable.final = "ᆸ"
							end
						end
						if tonumber(batchim_reduce or -1) ~= index then
							if match(syllable.final .. next_syllable.initial, "ᇀᄋ") then
								if next_syllable.vowel == "ᅵ" then
									syllable.final = "ᆾ"
								elseif next_syllable.vowel == "ᅧ" then
									syllable.final = "ᆾ"
									next_syllable.vowel = "ᅥ"
								end
							elseif match(syllable.final .. next_syllable.initial, "ᆴᄋ") then
								if next_syllable.vowel == "ᅵ" then
									syllable.final = "ᆯ"
									next_syllable.initial = "ᄎ"
								elseif next_syllable.vowel == "ᅧ" then
									syllable.final = "ᆯ"
									next_syllable.initial = "ᄎ"
									next_syllable.vowel = "ᅥ"
								end
							elseif match(syllable.final .. next_syllable.initial, "ᆮᄋ") and tonumber(s_variation or -1) ~= index then
								if next_syllable.vowel == "ᅵ" then
									syllable.final = "ᆽ"
								elseif next_syllable.vowel == "ᅧ" then
									syllable.final = "ᆽ"
									next_syllable.vowel = "ᅥ"
								end
							elseif match(syllable.final .. next_syllable.initial, "ᆮᄒ") then
								if next_syllable.vowel == "ᅵ" then
									syllable.final = "ᆾ"
									next_syllable.initial = "ᄋ"
								elseif next_syllable.vowel == "ᅧ" then
									syllable.final = "ᆾ"
									next_syllable.initial = "ᄋ"
									next_syllable.vowel = "ᅥ"
								end
							elseif match(syllable.final .. next_syllable.initial .. next_syllable.vowel, "[ᆬᆽᆾ][ᄋᄒ]ᅧ") then
								next_syllable.vowel = "ᅥ"
							end
						end
						if syllable.final .. next_syllable.initial == "ᆺᄋ" and not match(next_syllable_text, "[아았어었에으은을음읍의이인일임입있]") then
							syllable.final = "ᆮ"
						end
					end

					local bound = syllable.final .. "-" .. next_syllable.initial
					if not m_data.boundary[bound] then
						require("Module:debug").track("ko-pron/no boundary data")
						mw.log("No boundary data for " .. bound .. ".")
						return nil
					end
					local junction = m_data.boundary[bound][system_index]
					if system_index == 2 then
						-- Re-insert the formatting tokens removed from the word
						-- at this position into the junction.
						local pos_format_start = index + formatting_insert_count + 1
						local pos_format_end = pos_format_start
						while formatting_position[pos_format_end] do
							pos_format_end = pos_format_end + 1
							formatting_insert_count = formatting_insert_count + 1
						end
						if pos_format_end > pos_format_start then
							local a, b = match(junction, "^(ng%-?)(.?)$")
							if not a or not b then
								a, b = match(junction, "^(.?%-?)(.*)$")
							end
							local formatting = concat(formatting_position, "", pos_format_start, pos_format_end - 1)
							-- With no final consonant the formatting goes in
							-- front of the junction, otherwise between its
							-- two parts.
							junction = match(syllable.final .. next_syllable.initial, "^Ø?[ᄀ-ᄒ]$")
								and formatting .. (a or "") .. (b or "")
								or (a or "") .. formatting .. (b or "")
						end
					end
					if p.l[index] or (p.l["y"] and index == 1) then
						-- FIXME, verify this code still works with final/initial cons changes
						if system_index == 1 then
							if #junction == 0 then
								junction = junction .. "ː"
							else
								junction = gsub(junction, "^(.)(.?)$", function(a, b)
									return match(a, "[ᆨ-ᇂ]") and a .. "ː" .. b or "ː" .. a .. b
								end)
							end
						elseif system_index == 5 then
							vowel = gsub(vowel, "([aeiou])", "%1̄")
						elseif system_index == 6 then
							vowel = vowel .. "ː"
							if index == 1 then
								insert(categories, "Korean terms with long vowels in the first syllable")
							end
						end
					end
					if (p.l["y"] or p.l[1]) and index == 0 and system_index == 6 and #decomposed_syllables > 1 then
						vowel = vowel .. "ˈ"
					end
					if p.com[index] then
						-- FIXME, verify this code still works with final/initial cons changes
						-- NOTE(review): p.cap["y"] holds the number 1 when set, so
						-- the first lookup key is e.g. "s1" and the "sy" entry of
						-- com_mc cannot be matched here; confirm the intent.
						junction = gsub(junction, "(.)$", function(next_letter)
							return
								(system_index == 5 and "q" or "") ..
								(system_index == 4 and (com_mc[next_letter..(p.cap["y"] or "")] or com_mc[next_letter] or next_letter) or next_letter)
						end)
					end
					if p.ni[next_index] and system_index == 5 then
						-- FIXME, verify this code still works with final/initial cons changes
						-- NOTE(review): as written this replaces the match with
						-- itself, i.e. it changes nothing; confirm what the
						-- replacement was meant to add.
						junction = gsub(junction, "([nl])$", "%1")
					end

					local final_cons, initial_cons = match(junction, "^(.*);(.*)$")
					if not final_cons then
						if system_index == 2 then
							error("Need a semicolon in the boundary value for " .. bound)
						end
						-- FIXME, throw an error for all systems once we've added semicolons everywhere
						final_cons = junction
						initial_cons = ""
					end

					if system_index == 2 then
						insert(romanisation, vowel .. final_cons .. (saw_hyphen_after and "-" or "") .. initial_cons)
					else
						insert(romanisation, vowel .. junction)
					end
				end
			end

			local temp_romanisation = concat(romanisation)
			if system_index == 1 then
				temp_romanisation = tidy_phonetic(primitive_word, toNFC(temp_romanisation))
			elseif match(system_index, "[23]") then
				-- Two passes, because matches of "(.)…(.)" cannot overlap.
				for i = 1, 2 do
					temp_romanisation = gsub(temp_romanisation, "(.)…(.)", function(a, b)
						return a .. (ambiguous_intersyllabic_rr[a .. b] and "'" or "") .. b
					end)
					temp_romanisation = gsub(temp_romanisation, "wo'e", "woe")
					temp_romanisation = gsub(temp_romanisation, "yo'e", "yoe")
					temp_romanisation = gsub(temp_romanisation, "we'o", "weo")
					temp_romanisation = gsub(temp_romanisation, "we'u", "weu")
					temp_romanisation = gsub(temp_romanisation, "ye'u", "yeu")
					temp_romanisation = gsub(temp_romanisation, "yu'i", "yui")
				end
			elseif system_index == 4 then
				for i = 1, 2 do
					temp_romanisation = gsub(temp_romanisation, "(.)…(.)", function(a, b)
						return a .. (ambiguous_intersyllabic_mr[a .. b] and "'" or "") .. b
					end)
					temp_romanisation = gsub(temp_romanisation, "yo'e", "yoe")
					temp_romanisation = gsub(temp_romanisation, "a'e", "aë")
					temp_romanisation = gsub(temp_romanisation, "o'e", "oë")
					temp_romanisation = gsub(temp_romanisation, "n'k", "nk")
					temp_romanisation = gsub(temp_romanisation, "swi", "shwi")
				end
			elseif system_index == 5 then
				for i = 1, 2 do
					temp_romanisation = gsub(temp_romanisation, "(.)…(.)", function(a, b)
						return a .. (ambiguous_intersyllabic_yr[a .. b] and "." or "") .. b
					end)
					temp_romanisation = gsub(temp_romanisation, "[.]q", "q")
				end
			elseif system_index == 6 then
				temp_romanisation = "[" .. temp_romanisation .. "]"
			end

			if match(system_index, "[16]") then
				temp_romanisation = gsub(temp_romanisation, "ː", "(ː)")
			end
			if p.cap["y"] and match(system_index, "[234]") then
				temp_romanisation = upper(sub(temp_romanisation, 1, 1)) .. sub(temp_romanisation, 2, -1)
			end

			insert(word_set_romanisations, temp_romanisation)
		end

		-- Replace the first occurrence of the source word with its
		-- transcription(s).
		text_param = gsub(
			text_param,
			pattern_escape(the_original),
			concat(word_set_romanisations, system_list[system_index].separator),
			1
		)
	end

	if system_index == 6 then
		text_param = tidy_ipa(text_param)
	end

	if #categories > 0 then
		text_param = text_param .. require("Module:utilities").format_categories(categories, m_ko_utilities.lang)
	end
	return text_param
end

-- Template entry point: builds the pronunciation section for the words given
-- in the parent frame's arguments.
--   frame:  the invoking frame; arguments are read from frame:getParent().
--   scheme: unused.
-- Returns a string made of a <ul> (IPA and phonetic hangul lines, with optional
-- audio and vowel-length note), a collapsible table holding systems 2-5, and
-- the TemplateStyles tag.
function export.make(frame, scheme)
	local params = {
		[1] = { default = PAGENAME2, list = true },
		["a"] = {},
		["audio"] = { alias_of = "a" },
		["nn"] = {},
		["l"] = {},
		["com"] = {},
		["cap"] = {},
		["ui"] = {},
		["uie"] = {},
		["nobc"] = {},
		["ni"] = {},
		["bcred"] = {},
		["svar"] = {},
		["iot"] = {},
	}
	local args = require("Module:parameters").process(frame:getParent().args, params)

	-- output_result[system_index] lists that system's transcription of every
	-- input word. Results are stored by system index, so a nil from
	-- export.romanise cannot shift later systems into the wrong slot.
	local output_result = {}
	for system_index = 1, #system_list do
		output_result[system_index] = {}
	end
	for _, text_param in ipairs(args[1]) do
		for system_index = 1, #system_list do
			local romanised = export.romanise(text_param, system_index, args)
			if romanised then
				insert(output_result[system_index], romanised)
			end
		end
	end

	local html_ul = mw.html.create( "ul" )
		:done()
	local html_li_ipa = mw.html.create( "li" )
		:wikitext( system_list[6].display )
		:tag( "span" )
			:addClass( "IPA" )
			:wikitext( concat(output_result[6], system_list[6].separator) )
			:done()
		:done()
	local html_li_ph = mw.html.create( "li" )
		:addClass( "ko-pron__ph" )
		:wikitext( system_list[1].display )
		:tag( "span" )
			:addClass( "Kore" )
			:attr( "lang", "ko" )
			:wikitext( "[" .. concat(output_result[1], system_list[1].separator) .. "]" )
			:done()
		:done()

	if args.a then
		html_li_ipa
			:tag( "ul" )
				:tag( "li" )
					:wikitext( require("Module:audio").format_audio {
						lang = m_ko_utilities.lang,
						-- a=y selects the conventional file name for the page.
						file = args.a == "y" and "Ko-" .. PAGENAME .. ".ogg" or args.a,
					})
					:done()
				:done()
			:done()
	end

	if args.l then
		html_li_ph
			:tag( "ul" )
				:tag( "li" )
					:addClass( "ko-pron__note-vowel-length" )
					:wikitext( 'Though still prescribed in Standard Korean, most speakers in both Koreas no longer distinguish vowel length.' )
					:done()
				:done()
			:done()
	end

	html_ul
		:node( html_li_ipa )
		:node( html_li_ph )
		:done()

	local html_table = mw.html.create( "table" )
		:addClass( "ko-pron" )
		:addClass( "mw-collapsible" )
		:addClass( "mw-collapsed" )
		:tag( "tr" )
			:tag( "th" )
				:attr( "colspan", 2 )
				:wikitext( "Romanizations" )
				:done()
			:done()
		:done()
	-- One row per romanisation system (indices 2 to 5 of system_list).
	for roman_index = 2, 5 do
		html_table
			:tag( "tr" )
				:tag( "th" )
					:wikitext( system_list[roman_index].display )
					:done()
				:tag( "td" )
					:addClass( "IPA" )
					:wikitext( concat(output_result[roman_index], system_list[roman_index].separator) )
					:done()
				:done()
			:done()
	end
	return tostring(html_ul) .. tostring(html_table) .. require("Module:TemplateStyles")("Template:ko-IPA/style.css")
end

-- Template entry point producing only the IPA and phonetic hangul lines
-- (systems 6 and 1) for the words given in the parent frame's arguments.
--   frame:  the invoking frame; arguments are read from frame:getParent().
--   scheme: unused.
-- Returns the <ul> markup followed by the TemplateStyles tag.
function export.make_hanja(frame, scheme)
	local params = {
		[1] = { list = true },

		["l"] = {},
	}

	local args = require("Module:parameters").process(frame:getParent().args, params)

	-- results[system_index] lists that system's transcription of every word.
	local results = {
		[1] = {},
		[6] = {},
	}
	for _, text_param in ipairs(args[1]) do
		for _, system_index in ipairs({1, 6}) do
			local romanised = export.romanise(text_param, system_index, args)
			insert(results[system_index], romanised)
		end
	end

	local html_ul = mw.html.create( "ul" )
		:done()
	local html_li_ipa = mw.html.create( "li" )
		:wikitext( system_list[6].display )
		:tag( "span" )
			:addClass( "IPA" )
			:wikitext( concat(results[6], system_list[6].separator) )
			:done()
		:done()
	local html_li_ph = mw.html.create( "li" )
		:addClass( "ko-pron__ph" )
		:wikitext( system_list[1].display )
		:tag( "span" )
			:addClass( "Kore" )
			:attr( "lang", "ko" )
			:wikitext( "[" .. concat(results[1], system_list[1].separator) .. "]" )
			:done()
		:done()

	if args.l then
		html_li_ph
			:tag( "ul" )
				:tag( "li" )
					:addClass( "ko-pron__note-vowel-length" )
					:wikitext( 'Though still prescribed in Standard Korean, most speakers in both Koreas no longer distinguish vowel length.' )
					:done()
				:done()
			:done()
	end

	html_ul
		:node( html_li_ipa )
		:node( html_li_ph )
		:done()

	return tostring(html_ul) .. require("Module:TemplateStyles")("Template:ko-IPA/style.css")
end

return export