-- Module:cmn-pron-Xian

-- Public interface of the module: functions attached to this table are the
-- ones reachable by code that loads the module.
local export = {}

-- Maps an encoded pinyin initial to its IPA value.
-- Keys use the one-letter internal encoding (see digraph_encode below):
-- B = bv, P = pf, N = ng, Z = zh, C = ch, S = sh.
-- The empty-string key covers syllables that have no initial.
local initials = {
	b = "p", p = "pʰ", m = "m", f = "f", v = "v",
	B = "pf", P = "pfʰ",
	d = "t", t = "tʰ", n = "n", l = "l",
	g = "k", k = "kʰ", N = "ŋ", h = "x",
	j = "t͡ɕ", q = "t͡ɕʰ", x = "ɕ",
	Z = "t͡ʂ", C = "t͡ʂʰ", S = "ʂ", r = "ʐ",
	z = "t͡s", c = "t͡sʰ", s = "s",
	[""] = "",
}

-- Maps an encoded pinyin final to its IPA value.
-- Keys use the internal encoding (see digraph_encode below): U = ü, N = ng.
-- Keys ending in "r" are the r-suffixed counterparts of the preceding group.
local finals = {
	-- a / o / e series
	a = "a", ia = "ia", ua = "ua",
	ar = "ɐr", iar = "iɐr", uar = "uɐr",
	o = "o", uo = "uo", Uo = "yo",
	er = "ər", uor = "uər",
	e = "ɤ", ue = "ɯ", ie = "iɛ", Ue = "yɛ",
	ier = "iɛr", Uer = "yɛr",

	-- syllabic consonants and high vowels
	-- ("ii" follows z/c/s and "ih" follows zh/ch/sh/r after py_normalize)
	ii = "z̩", ih = "ʐ̩", i = "i", u = "u", U = "y",
	iir = "ər", ihr = "ər", ir = "iər", ur = "uər", Ur = "yər",

	-- ai / ei series
	ai = "æ", iai = "iæ", uai = "uæ",
	air = "ær", iair = "iær", uair = "uær",
	ei = "ei", ui = "uei",
	eir = "er", uir = "uer",

	-- ao / ou series
	ao = "au", iao = "iau",
	aor = "ɔr", iaor = "iɔr",
	ou = "ɤu", iu = "iɤu",
	our = "ər", iur = "iər",

	-- finals written with -n
	an = "ã", ian = "iã", uan = "uã", Uan = "yã",
	anr = "ɐ̃r", ianr = "iɐ̃r", uanr = "uɐ̃r", Uanr = "yɐ̃r",
	en = "ẽ", ["in"] = "iẽ", un = "uẽ", Un = "yẽ",
	enr = "ə̃r", inr = "iə̃r", unr = "uə̃r", Unr = "yə̃r",

	-- finals written with -ng (encoded as N)
	aN = "aŋ", iaN = "iaŋ", uaN = "uaŋ",
	aNr = "ɐ̃r", iaNr = "iɐ̃r", uaNr = "uɐ̃r",
	eN = "əŋ", iN = "iŋ", oN = "uəŋ", ioN = "yoŋ",
	eNr = "ə̃r", iNr = "iə̃r", oNr = "uə̃r", ioNr = "yə̃r",
}

-- Maps a tone number (as a one-character string) to its superscript tone
-- letters. Tone "5" is the toneless case and contributes nothing.
local tones = {
	["1"] = "²¹", -- yinping, "dark level" (T1)
	["2"] = "²⁴", -- yangping, "light level" (T2)
	["3"] = "⁵³", -- shang, "rising" (T3)
	["4"] = "⁵⁵", -- qu, "departing" (T4)
	["5"] = "", -- toneless (T0)
}

-- internal use, encode and decode digraphs
--
-- Internally every pinyin digraph and every combining tone mark is collapsed
-- to a single byte so that the Lua patterns in this module can treat each
-- unit as one character:
--   bv/pf/ng/zh/ch/sh -> B/P/N/Z/C/S, ü -> U,
--   combining tone marks -> control bytes \1-\4 (the byte value is the tone
--   number looked up in `tones`).
local digraph_encode = {
	bv = "B", pf = "P", ng = "N", zh = "Z", ch = "C", sh = "S",
	["\204\140"] = "\1", -- U+030C combining caron
	["\204\129"] = "\2", -- U+0301 combining acute
	["\204\128"] = "\3", -- U+0300 combining grave
	["\204\132"] = "\4", -- U+0304 combining macron
}

-- Inverse of digraph_encode, plus U -> ü and the two marker bytes \5 / \6
-- that py_process wraps around a syllable carrying a phonetic tone.
local digraph_decode = {
	B = "bv", P = "pf", N = "ng", Z = "zh", C = "ch", S = "sh", U = "ü",
	["\1"] = "\204\140",
	["\2"] = "\204\129",
	["\3"] = "\204\128",
	["\4"] = "\204\132",
	["\5"] = ' ',
	["\6"] = " ",
}

-- Converts user-facing pinyin to the internal one-byte-per-unit encoding.
-- @param text  pinyin string (UTF-8)
-- @return      the encoded string
local function encode(text)
	-- Decompose first so that tone marks and the diaeresis become separate
	-- combining characters that the byte patterns below can see.
	text = mw.ustring.toNFD(text)
		-- u + U+0308 (combining diaeresis) is ü
		:gsub("u\204\136", "U")
		-- Any two-byte candidate that is not a key of digraph_encode
		-- (e.g. "ph") yields nil from the table and is left unchanged.
		:gsub("[bpnzcs\204][vfgh\128\129\132\140]", digraph_encode)
	return text
end

-- Converts the internal encoding back to displayable pinyin (NFC).
-- @param text  encoded string
-- @return      the decoded, recomposed string
local function decode(text)
	-- \7 is inside the byte range but has no table entry, so it is kept as is.
	-- The extra parentheses drop gsub's match count so that only the string
	-- is handed to toNFC.
	text = mw.ustring.toNFC((text:gsub("[BPNZCSU\1-\7]", digraph_decode)))
	return text
end

-- Removes syllable-dividing apostrophes that are redundant in pinyin.
-- An apostrophe is dropped when it directly precedes a consonant letter
-- (optionally with a \5 marker byte in between); any "ng" that results is
-- then folded back into its one-letter form N.
-- @param text  encoded pinyin, possibly containing apostrophes
-- @return      the joined string
local function py_join_syllables(text)
	local joined = text:gsub("'(\5?[bpmfvBPdtnlgkhjqxZCSrzcsyw])", "%1")
	joined = joined:gsub("ng", "N")
	return joined
end

-- Inserts an apostrophe at every syllable boundary of an encoded reading and
-- verifies that the input's own apostrophes were exactly the required ones.
-- @param text  encoded pinyin (output of encode)
-- @return      the same reading with "'" between all syllables
-- Raises an error when joining the divided text back together does not
-- reproduce the input.
local function py_divide_syllables(text)
	-- N directly followed by a vowel is really n + g-initial: split it.
	local divided = text:gsub("([aeiouU\1-\4])N%f[aeiouU]", "%1n'g")
	-- Every consonant + vowel pair starts a new syllable.
	divided = divided:gsub("[bpmfvBPdtnlgkNhjqxZCSrzcsyw][aeiouU]", "'%0")
	-- Collapse runs of apostrophes created by the step above.
	divided = divided:gsub("''+", "'")
	-- Drop an apostrophe at the start of the text or right after a space.
	divided = divided:gsub("%f[^ %z]'", "")

	local rejoined = py_join_syllables(divided)
	if rejoined ~= text then
		error("Xi'an: error with apostrophes, "..decode(text).." should be "..decode(rejoined)..".")
	end
	return divided
end

-- Places a tone mark byte in a toneless encoded syllable.
-- The mark goes right after the first vowel, or after the vowel following a
-- leading i/u/U medial. Tone "5" adds no mark.
-- @param syllable  encoded syllable without a tone mark
-- @param tone      tone number as a string, "1".."5"
-- @return          the syllable with the tone byte inserted
local function py_put_tone(syllable, tone)
	local mark = ""
	if tone ~= "5" then
		-- The tone byte's value equals the tone number (\1-\4).
		mark = string.char(tone)
	end
	local marked = syllable:gsub("[iuU]?[aeiouU]", "%0" .. mark, 1)
	return marked
end

-- Moves the tone of an encoded syllable to the front as a digit.
-- E.g. a syllable carrying tone byte \3 becomes "3" followed by the toneless
-- syllable; a syllable with no tone byte gets the prefix "5".
-- @param syllable  one encoded syllable
-- @return          tone digit .. toneless syllable
-- Raises an error when the syllable has more than one tone byte or when the
-- tone byte is not where py_put_tone would place it.
local function py_transf(syllable)
	local mark = syllable:match("[\1-\4]") or "\5"
	local tone = tostring(mark:byte(1))

	local bare, marks_found = syllable:gsub("[\1-\4]", "")
	if marks_found > 1 then
		error("Xi'an: two tones in one syllable: " .. decode(syllable))
	end

	-- Re-insert the tone in the canonical position and compare.
	local expected = py_put_tone(bare, tone)
	if expected ~= syllable then
		error("Xi'an: error with tone placement, "..decode(syllable).." should be "..decode(expected)..".")
	end

	return tone .. bare
end

-- canonize to adhere to pinyin rules, e.g. jü -> ju
-- Turns the internal initial+final form back into standard pinyin spelling.
-- "At the start of a syllable" below means: at a letter that is not preceded
-- by another letter (%f[%l%u]).
-- @param text  encoded text in initial+final form
-- @return      the same text in pinyin spelling (still encoded)
local function py_canonize(text)
	text = text
		-- ü is written u after j, q, x
		:gsub("([jqx])U", "%1u")
		-- syllable-initial u: u -> w, ui -> wei, un -> wen
		:gsub("%f[%l%u]u[in]?", {u="w", ui="wei", un="wen"})
		-- syllable-initial oN -> weN
		:gsub("%f[%l%u]oN", "weN")
		-- a w left alone (optionally before r) is the syllable "wu"
		:gsub("w(r?)%f[^%l%u]", "wu%1")
		-- syllable-initial i: i -> y, ih -> ri, iu -> you
		:gsub("%f[%l%u]i[hu]?", {i="y", ih="ri", iu="you"})
		-- a y followed only by n/N and/or r needs its vowel back: yi, yin, yiN
		:gsub("y([nN]?r?)%f[^%l%u]", "yi%1")
		-- syllable-initial ü -> yu
		:gsub("%f[%l%u]U", "yu")
		-- the syllabic-consonant finals ii / ih are both spelled i
		:gsub("i[ih]", "i")
	return text
end

-- normalize to initial+final, e.g. ju -> jü
-- Rewrites standard pinyin spelling into the initial+final form used as keys
-- of `initials` and `finals`, then checks the result by converting it back.
-- @param text  encoded text in pinyin spelling
-- @return      the text in initial+final form
-- Raises an error when py_canonize of the result differs from the input,
-- i.e. when the input was not spelled the canonical way.
local function py_normalize(text)
	local res = text
		-- u after j, q, x stands for ü
		:gsub("([jqx])u", "%1U")
		-- w-spellings: wu -> u, wei -> ui, wen -> un, weN -> oN
		-- (combinations missing from the table are left for the next step)
		:gsub("w[ue][inN]?", {wu="u", wei="ui", wen="un", weN="oN"})
		-- any remaining w is the medial u
		:gsub("w", "u")
		-- y-spellings: yi -> i, yu -> ü, you -> iu
		:gsub("y[iuo]u?", {yi="i", yu="U", you="iu"})
		-- any remaining y is the medial i
		:gsub("y", "i")
		-- i after z/c/s is the syllabic final ii
		:gsub("([zcs])i", "%1ii")
		-- i after zh/ch/sh/r is the syllabic final ih
		:gsub("([ZCSr])i", "%1ih")
		-- a syllable ending in "rih" keeps only the final, with no initial
		:gsub("rih%f[^%l%u]", "ih")

	local check = py_canonize(res)
	if text ~= check then
		error("Xi'an: invalid syllable: "..decode(text).." should be "..decode(check))
	end
	return res
end

-- Converts normalized, tone-prefixed syllables to an IPA transcription.
-- Each space-separated syllable must have the shape
--   <tone digit><optional initial><final><optional second tone digit>
-- where the trailing digit, if present, is rendered after a "⁻" separator.
-- @param text  space-separated syllables in initial+final form
-- @return      the IPA string wrapped in slashes
-- Raises an error for a syllable of the wrong shape or with an unknown
-- initial or final.
local function py_to_ipa(text)
	local function syllable_to_ipa(syllable)
		local tone, initial, final, second_tone = syllable:match(
			"^([12345])([bpmfvBPdtnlgkNhjqxZCSrzcs]?)([aeiouU][%lN]*)([12345]?)$")
		if not tone then
			error("Xi'an: Invalid syllable: " .. decode(syllable))
		end

		local initial_ipa = initials[initial]
		if not initial_ipa then
			error("Xi'an: Invalid initial: " .. decode(initial))
		end

		local final_ipa = finals[final]
		if not final_ipa then
			error("Xi'an: Invalid final: " .. decode(final))
		end

		local ipa = initial_ipa .. final_ipa .. tones[tone]
		if second_tone ~= "" then
			ipa = ipa .. "⁻" .. tones[second_tone]
		end
		return ipa
	end

	local converted = text:gsub("[^ ]+", syllable_to_ipa)
	return "/" .. converted .. "/"
end

-- returns (display_text, phonetic_text, ipa)
-- Processes one or more Xi'an pinyin readings separated by "/".
-- A digit 1-5 written after a syllable gives the tone it is actually
-- pronounced with, in addition to the tone shown by its diacritic.
-- @param text  the readings, e.g. several alternatives joined with "/"
-- @return display_text   readings with the digits removed, joined by " / "
-- @return phonetic_text  like display_text, but a reading that contained
--                        digits is followed by " [Phonetic: ...]" showing the
--                        reading re-marked with the phonetic tones
-- @return ipa            IPA transcriptions of all readings, joined by ", "
-- Errors raised by the helper functions (bad apostrophes, tone placement,
-- invalid syllables) propagate to the caller.
function export.py_process(text)
	local conv_display = {}
	local conv_hidden = {}
	local conv_ipa = {}
	local i = 0
	for reading in mw.text.gsplit(text, "/", true) do
		i = i + 1
		conv_display[i] = reading:gsub("[12345]", "")
		-- no check is done for things like "xUān", any capitalisation is valid
		reading = mw.ustring.lower(reading)
		reading = encode(reading)
		reading = py_divide_syllables(reading)
		if reading:match("[12345]") then
			-- For every syllable followed by a digit: drop its own tone byte,
			-- insert the byte for the digit's tone instead (none for 5), and
			-- wrap the syllable in the \5 ... \6 marker bytes.
			local phonetic = reading
				:gsub("([bpmfvBPdtnlgkNhjqxZCSrzcsyw]?[iuU]?[aeiouU])[\1-\4]?([%lN]*)([1-5])",
				function(a, b, c)
					return "\5" .. a .. (c~="5" and string.char(c) or "") .. b .. "\6"
				end)
			phonetic = py_join_syllables(phonetic)
			conv_hidden[i] = conv_display[i] .. " [Phonetic: " .. decode(phonetic) .. "]"
		else
			conv_hidden[i] = conv_display[i]
		end
		-- One syllable per space-separated word, tone moved to the front.
		reading = reading:gsub("'", " "):gsub("[^ ]+", py_transf)
		reading = py_normalize(reading)
		conv_ipa[i] = py_to_ipa(reading)
	end
	return table.concat(conv_display, " / "), table.concat(conv_hidden, " / "), table.concat(conv_ipa, ", ")
end

-- Hand the public table to whoever loads this module.
return export