-- Module:User:Theknightwho/cmn-pron

local concat = table.concat local error = error local explode = require("Module:string utilities").explode_utf8 local insert = table.insert local rawget = rawget local rawset = rawset local select = select local toNFC = mw.ustring.toNFC local toNFD = mw.ustring.toNFD local type = type local _lower = string.lower local _upper = string.upper local ulower = string.ulower local uupper = string.uupper

local data = mw.loadData("Module:User:Theknightwho/cmn-pron/data") local num_to_segment = data.num_to_segment local segment_to_num = data.segment_to_num

local export = {}

-- Lower-cases `str`. Strings longer than one byte go through the
-- Unicode-aware `ulower`; single-byte (or empty) strings use the plain
-- byte-based `_lower`. Returns nil when `str` is nil or false.
local function lower(str)
	if not str then
		return nil
	end
	local fold = _lower
	if #str > 1 then
		fold = ulower
	end
	return fold(str) or nil
end

-- Upper-cases `str`. Strings longer than one byte go through the
-- Unicode-aware `uupper`; single-byte (or empty) strings use the plain
-- byte-based `_upper`. Returns nil when `str` is nil or false.
local function upper(str)
	if not str then
		return nil
	end
	local fold = _upper
	if #str > 1 then
		fold = uupper
	end
	return fold(str) or nil
end

-- Nil-safe wrapper around string.match: returns the first result of
-- `str:match(pat)` when `str` is a string, and nil for any other input
-- (including nil) or when there is no match.
local function match(str, pat)
	if type(str) ~= "string" then
		return nil
	end
	local found = str:match(pat)
	return found or nil
end

local m_parser = require("Module:parser") local Parser = m_parser.Parser

-- Replacement for Parser:read that adds a `keep_capital` parameter.
-- Returns the item of `self.text` at `self.head + delta` (delta defaults to
-- 0), or "" when there is none. The item is lower-cased unless
-- `keep_capital` is truthy.
function Parser:read(delta, keep_capital)
	local offset = delta or 0
	local char = self.text[self.head + offset] or ""
	if keep_capital then
		return char
	end
	return lower(char)
end

local Node = m_parser.Node local Wikitext = m_parser.Wikitext

-- Walks this node with `self:__pairs("next")` and registers every plain
-- string and every node whose `type` is "syllable" with a fresh proxy.
-- Returns `proxy.iter, proxy, 0`, i.e. a triple usable in a generic `for`.
function Wikitext:iterate()
	local proxy = self:new_proxy()
	for item, owner, slot in self:__pairs("next") do
		local wanted = type(item) == "string" or item.type == "syllable"
		if wanted then
			proxy:build(item, owner, slot)
		end
	end
	return proxy.iter, proxy, 0
end

-- Node class for a single syllable, created under the type name "syllable".
-- Its `next` method is an alias of Node.next_node.
local Syllable = Node:new_class("syllable") Syllable.next = Node.next_node

-- Field lookup for syllable objects. Class members on `Syllable` win;
-- otherwise the key is translated through `segment_to_num` (falling back
-- to the key itself) and read raw from the object. An empty string is
-- reported as nil, so unset segments look absent to callers.
function Syllable:__index(k)
	local value = Syllable[k]
	if not value then
		value = rawget(self, segment_to_num[k] or k)
	end
	if value == "" then
		return nil
	end
	return value or nil
end

-- Field assignment for syllable objects. Keys known to `segment_to_num` are
-- stored under their numeric slot, other keys under themselves. Assigning a
-- falsy value to a segment stores "" (not nil), which keeps the numeric
-- slots free of holes; assigning a falsy value to any other key stores nil.
function Syllable:__newindex(k, v)
	local slot = segment_to_num[k]
	local stored
	if v then
		stored = v
	elseif slot then
		stored = ""
	else
		stored = nil
	end
	rawset(self, slot or k, stored)
end

-- String form of a syllable: slots 1 to 6 joined with no separator.
function Syllable:__tostring()
	local first_slot, last_slot = 1, 6
	return concat(self, "", first_slot, last_slot)
end

-- Distributes the raw segments `raw_syl[1 .. raw_syl.len]` over the named
-- slots of this syllable and copies the tone (defaulting to 5).
-- Each segment is classified in order of precedence:
--   "o"               -> nucleus, or glide2 if a nucleus is already set
--   ends in i/u/w/y/ü -> glide1, or glide2 if glide1 or nucleus is already set
--   a / e / ê         -> nucleus
--   starts with m/n   -> nasal, but only if it is not the first segment, or
--                        is the only segment, or is followed solely by an
--                        "r" segment
--   contains "r"      -> erhua, if it is not the first segment
--   anything else     -> initial
-- Returns self so that calls can be chained.
function Syllable:normalize_keys(raw_syl)
	local len = raw_syl.len
	for i = 1, len do
		local v = raw_syl[i]
		local key
		if v == "o" then
			key = self.nucleus and "glide2" or "nucleus"
		elseif match(v, "[iuwy\195]\188?$") then -- iuüwy (final char)
			key = (self.glide1 or self.nucleus) and "glide2" or "glide1"
		elseif match(v, "^[ae\195]\170?$") then -- aeê
			key = "nucleus"
		elseif match(v, "^[mn]") and (
			i > 1 or
			len == 1 or
			(len == 2 and match(raw_syl[2], "r"))
		) then
			key = "nasal"
		elseif i > 1 and match(v, "r") then
			key = "erhua"
		else
			key = "initial"
		end
		self[key] = v
	end
	self.tone = raw_syl.tone or 5
	return self
end

-- Rejects segment combinations that cannot be pinyin.
-- Raises an error when:
--   * the initial is "gn" and glide1 is "u";
--   * there is no glide1 and the initial is "gn" or contains j/q/x;
--   * the nucleus is "o" with glide2 "i";
--   * the nucleus is e/ê with a glide2 containing o/u.
-- Returns self when the syllable passes, so calls can be chained.
function Syllable:check_invalid()
	local initial, glide1 = self.initial, self.glide1
	local nucleus, glide2 = self.nucleus, self.glide2
	if (
		initial == "gn" and glide1 == "u" or
		not glide1 and (
			initial == "gn" or
			match(initial, "[jqx]")
		)
	) then
		-- The nucleus (and, in the second case, the glide) may be absent, e.g.
		-- for a bare "x" or "jn"; concatenating nil would replace this
		-- validation error with "attempt to concatenate a nil value".
		error("'" .. initial .. (glide1 or "") .. (nucleus or "") .. "' is not valid in pinyin.")
	elseif (
		nucleus == "o" and glide2 == "i" or
		match(nucleus, "^[e\195]\170?$") and match(glide2, "[ou]")
	) then
		-- Both operands are known to be strings here.
		error("'" .. nucleus .. glide2 .. "' is not valid in pinyin.")
	end
	return self
end

-- Canonicalises glide1 to one of "i", "u" or "ü".
--   * No glide1: a "u" glide is supplied when the nucleus is "o", there is
--     no glide2 or nasal, and the initial contains one of b/f/m/p/v.
--   * "y", "i", "yi"   -> "i"
--   * "w", "u", "wu"   -> "u" ("ü" when the initial contains j/q/x)
--   * "yu", "ü", "yü"  -> "ü"
-- Any other spelling raises an error. Returns self for chaining.
function Syllable:normalize_glide1()
	local glide = self.glide1
	if not glide then
		local bare_o_after_labial = self.nucleus == "o"
			and not (self.glide2 or self.nasal)
			and match(self.initial, "[bfmpv]")
		if bare_o_after_labial then
			self.glide1 = "u"
		end
		return self
	end
	if match(glide, "^y?i?$") then
		self.glide1 = "i"
	elseif match(glide, "^w?u?$") then
		self.glide1 = match(self.initial, "[jqx]") and "ü" or "u"
	elseif match(glide, "^y?[u\195]?\188?$") then
		self.glide1 = "ü"
	else
		-- "wi" and "wü" are too weird to try to correct
		error("'" .. glide .. "' is not valid in pinyin.")
	end
	return self
end

-- Rewrites an "e" nucleus as "ê" when the syllable has a glide2, or has a
-- glide1 without a nasal. Called only when the nucleus is "e".
function Syllable:normalize_e_nucleus()
	local glide_without_nasal = self.glide1 and not self.nasal
	if self.glide2 or glide_without_nasal then
		self.nucleus = "ê"
	end
end

-- Rewrites an "o" nucleus that is followed by a nasal (and no glide2) as
-- glide + "e": the glide becomes "ü" when glide1 is "i" or "ü", otherwise
-- "u" when the syllable has an initial or a glide1. A syllable with neither
-- initial nor glide1 (bare "on(g)") is left unchanged.
function Syllable:normalize_o_nucleus()
	if self.glide2 or not self.nasal then
		return
	end
	local new_glide
	if match(self.glide1, "^[i\195]\188?$") then
		new_glide = "ü"
	elseif self.initial or self.glide1 then -- not "on(g)"
		new_glide = "u"
	end
	if new_glide then
		self.glide1 = new_glide
		self.nucleus = "e"
	end
end

-- Supplies a nucleus for a syllable that has a glide1 but none written:
--   * glide2 and nasal: glide1 becomes "ü", nucleus "e", glide2 is cleared;
--   * glide2 only: nucleus is "o" when glide1 is "i", otherwise "ê";
--   * nasal only: nucleus is "e";
--   * neither, glide1 "i" and an initial containing c/r/s/z: the glide is
--     cleared and the nucleus becomes "ɨ".
function Syllable:normalize_implicit_nucleus()
	local has_glide2, has_nasal = self.glide2, self.nasal
	if has_glide2 and has_nasal then
		self.glide1 = "ü"
		self.nucleus = "e"
		self.glide2 = nil
	elseif has_glide2 then
		if self.glide1 == "i" then
			self.nucleus = "o"
		else
			self.nucleus = "ê"
		end
	elseif has_nasal then
		self.nucleus = "e"
	elseif self.glide1 == "i" and match(self.initial, "[crsz]") then
		self.glide1 = nil
		self.nucleus = "ɨ"
	end
end

-- Renders this syllable with a converter.
-- `funcs[1]` .. `funcs[7]` are called in order as
-- `funcs[i](self, self[i], output)`; each may append to or rewrite the
-- shared `output` list. The pieces are then concatenated and returned.
-- When the syllable is flagged `capitalize`, the first character of the
-- first real piece is upper-cased.
function Syllable:convert(funcs)
	local output = {}
	for i = 1, 7 do
		funcs[i](self, self[i], output)
	end
	if self.capitalize then
		-- The pinyin tone handler may have prepended an apostrophe as
		-- output[1]; upper-casing that would silently drop the capital, so
		-- skip over it and capitalise the piece that follows.
		local idx = output[1] == "'" and 2 or 1
		local piece = output[idx]
		if piece then -- nothing to capitalise if the handlers emitted nothing
			-- The pattern matches exactly one leading UTF-8 character.
			output[idx] = (piece:gsub("^[%z\1-\127\194-\244][\128-\191]*", upper))
		end
	end
	return concat(output)
end

-- Builds a normalised syllable object from a raw parsed syllable.
-- The six segment slots start out as empty strings; the raw segments are
-- then classified, validated and the first glide canonicalised. After that
-- exactly one of the following applies, in this order of precedence:
--   * glide2 "o" is rewritten to "u";
--   * an "e" nucleus is normalised;
--   * an "o" nucleus is normalised;
--   * a missing nucleus after a glide1 is filled in.
-- The `capitalize` flag of the raw syllable is carried over.
function Syllable:new(raw_syl)
	local syl = setmetatable({"", "", "", "", "", ""}, Syllable)
	syl:normalize_keys(raw_syl)
	syl:check_invalid()
	syl:normalize_glide1()

	if syl.glide2 == "o" then
		syl.glide2 = "u"
	elseif syl.nucleus == "e" then
		syl:normalize_e_nucleus()
	elseif syl.nucleus == "o" then
		syl:normalize_o_nucleus()
	elseif syl.glide1 and not syl.nucleus then
		syl:normalize_implicit_nucleus()
	end

	if raw_syl.capitalize then
		syl.capitalize = true
	end
	return syl
end

-- Syllable tokenizer. A syllable layer starts with `handle_initial` as its
-- handler; each handler installs its successor before doing anything else,
-- so the chain runs initial -> glide1 -> nucleus -> glide2 -> nasal ->
-- erhua -> number. A handler that does not recognise the current character
-- passes it on with `self:consume()`; a handler that detects the end of the
-- syllable steps back one character and pops the layer.
do
	local tones = data.raw_tones

	local handle_initial, handle_glide1, handle_nucleus, handle_glide2
	local handle_nasal, handle_erhua, handle_number

	-- Truthy when `char` is a key of the raw tone table or contains a digit 0-5.
	local function marks_tone(char)
		return tones[char] or match(char, "[0-5]")
	end

	-- If `char` (the character after the current one) is a key of the raw
	-- tone table, steps onto it and records the tone on the current layer.
	local function take_tone_mark(self, char)
		local tone = tones[char]
		if tone then
			self:advance()
			self.n.tone = tone
		end
	end

	-- Initial consonant, including the digraphs ch/sh/zh (or c/s/z followed
	-- by a combining circumflex), "ng" and "gn". A lone m/n/ŋ that is
	-- directly followed by a tone is handed to the nasal handler instead.
	function handle_initial(self, this)
		self.n.handler = handle_glide1
		if not match(this, "^[bcdfghjklmnpqrstvxz\197]\139?$") then -- bcdfghjklmnŋpqrstvxz
			return self:consume()
		end
		local ahead = self:read(1)
		if match(this, "^[mn\197]\139?$") and marks_tone(ahead) then
			self.n.handler = handle_nasal
			return self:consume()
		elseif match(this, "[csz]") and match(ahead, "^[h\204]\130?$") then -- h + circumflex
			self:advance()
			this = this .. "h"
			ahead = self:read(1)
		elseif (
			this == "n" and ahead == "g" or
			(
				this == "g" and ahead == "n" and
				match(self:read(2), "^[aeimnou\197]\139?$") -- aeimnŋou
			) -- not *gng etc.
		) then
			self:advance()
			this = this .. ahead
			ahead = self:read(1)
		elseif this == "ŋ" then
			this = "ng"
		end
		self:emit(this)
		if marks_tone(ahead) then
			self:advance()
			self.n.tone = tones[ahead] or tonumber(ahead == "0" and 5 or ahead)
			return self:pop()
		end
	end

	-- First glide: i, u, ü (u + combining diaeresis), or a w/y spelling
	-- optionally followed by i/u/ü. A tone mark right after it is recorded.
	function handle_glide1(self, this)
		self.n.handler = handle_nucleus
		if not match(this, "[iuwy]") then
			return self:consume()
		end
		local ahead = self:read(1)
		if match(this, "[wy]") then
			if match(ahead, "[iu]") then
				self:advance()
				if ahead == "u" and self:read(1) == "\204\136" then -- diaeresis
					this = this .. "ü"
					self:advance()
				else
					this = this .. ahead
				end
				ahead = self:read(1)
			end
			self:emit(this)
		elseif this == "i" then
			self:emit(this)
		elseif this == "u" then
			if ahead == "\204\136" then -- diaeresis
				this = "ü"
				self:advance()
				ahead = self:read(1)
			end
			self:emit(this)
		end
		take_tone_mark(self, ahead)
	end

	-- Nucleus: a, e, o, or ê (e + combining circumflex). If a tone has
	-- already been recorded and another tone follows, the syllable ends here.
	function handle_nucleus(self, this)
		self.n.handler = handle_glide2
		local ahead = self:read(1)
		if self.n.tone and (
			tones[ahead] or
			ahead == "\204\130" and tones[self:read(2)] or -- circumflex
			match(ahead, "[0-5]")
		) then
			self:advance(-1)
			return self:pop()
		elseif not match(this, "[aeo]") then
			return self:consume()
		elseif this == "e" and self:read(1) == "\204\130" then -- circumflex
			this = "ê"
			self:advance()
		end
		self:emit(this)
		take_tone_mark(self, self:read(1))
	end

	-- Second glide: i, o or u. The syllable ends instead if a second tone
	-- would follow, or if the glide would repeat what was just emitted.
	function handle_glide2(self, this)
		self.n.handler = handle_nasal
		local ahead = self:read(1)
		if (
			self.n.tone and marks_tone(ahead) or
			this == "i" and match(self:emitted(), "i$") or
			this == "u" and match(self:emitted(), "[u\195]\188?$") -- uü
		) then
			self:advance(-1)
			return self:pop()
		elseif match(this, "[iou]") then
			self:emit(this)
		else
			return self:consume()
		end
		take_tone_mark(self, ahead)
	end

	-- Nasal coda: m, n, ng or ŋ (emitted as "ng"). The nasal is left for
	-- the next syllable when the checks below say it does not belong here.
	function handle_nasal(self, this)
		self.n.handler = handle_erhua
		if not match(this, "^[mn\197]\139?$") then
			return self:consume()
		end
		local emitted = self:emitted()
		local ahead = self:read(1)
		if (
			match(emitted, "^[mn]") or
			match(ahead, "[aeiou]") or
			(tones[ahead]) and match(emitted, "^[aeiou\195][\170\188]?$") or -- aeêiouü
			self.n.tone and marks_tone(ahead)
		) then
			self:advance(-1)
			return self:pop()
		elseif tones[ahead] then
			self:advance()
			self.n.tone = tones[ahead]
			ahead = self:read(1)
		end
		if (
			this == "n" and ahead == "g" and
			not match(self:read(2), "^[aeiou\204][\128\129\132\140]?$") -- aeiou + tones
		) then
			this = "ng"
			self:advance()
		elseif this == "ŋ" then
			this = "ng"
		end
		self:emit(this)
	end

	-- Erhua: "r" or "'r". A plain "r" is emitted as "'r" unless the only
	-- thing emitted so far in this syllable is "e".
	function handle_erhua(self, this)
		self.n.handler = handle_number
		local ahead = self:read(1)
		if this ~= "r" and (this ~= "'" or ahead ~= "r") then
			return self:consume()
		elseif (
			self.n.tone and marks_tone(ahead) or
			match(
				this == "'" and self:read(2) or ahead,
				"^[aeiou\204][\128\129\132\140]?$" -- aeiou + tones
			)
		) then
			self:advance(-1)
			return self:pop()
		elseif this == "r" and (#self.n ~= 1 or self.n[1] ~= "e") then
			this = "'r"
		elseif this == "'" and ahead == "r" then
			this = "'r"
			self:advance()
		end
		self:emit(this)
	end

	-- Trailing tone digit 0-5 (0 is stored as 5). Always ends the syllable;
	-- a non-digit is handed back to the enclosing layer.
	function handle_number(self, this)
		if not match(this, "^[0-5]$") then
			self:advance(-1)
			return self:pop()
		end
		self.n.tone = tonumber(this == "0" and 5 or this)
		return self:pop()
	end

	-- Route that sets up a syllable layer: records whether the syllable is
	-- to be capitalised and installs the first handler.
	function Parser:do_syllable(capitalize)
		self.n.capitalize = capitalize and true or nil
		rawset(self.n, "handler", handle_initial)
	end
end

-- Top-level parse: splits the input into syllables and separators.
do
	-- Override handler installed after a separator: swallows any further
	-- spaces, hyphens and apostrophes, then removes itself and re-dispatches
	-- the first other character.
	local function handle_syllable_break(self, this)
		if this ~= " " and this ~= "-" and this ~= "'" then
			self.n.override = nil
			return self:consume()
		end
	end

	-- Main handler.
	--   * A lower-case letter (or ŋ) starts a syllable, which is parsed via
	--     the "do_syllable" route and emitted as a Syllable. The syllable is
	--     flagged as capitalised when capitals are currently allowed and the
	--     raw, un-lowered character is an upper-case letter (or Ŋ).
	--   * A space or hyphen is emitted as-is and re-enables capitals.
	--   * An apostrophe is dropped.
	--   * The empty string (end of input) pops the layer.
	--   * Anything else raises an error.
	local function main_handler(self, this)
		if match(this, "^[%l\197]\139?$") then
			-- (A stray assignment to an undeclared global `a` was removed
			-- here; nothing in this file reads it.)
			self:emit(Syllable:new(self:get(
				"do_syllable",
				self.n.allow_capital and match(self:read(0, true), "^[%u\197]\138?$")
			)))
			self.n.allow_capital = nil
		elseif this == " " or this == "-" then
			self:emit(this)
			self.n.allow_capital = true
			self.n.override = handle_syllable_break
		elseif this == "'" then
			self.n.override = handle_syllable_break
		elseif this == "" then
			return self:pop()
		else
			error("Invalid character (" .. this .. ") at position " .. self.head .. ".")
		end
	end

	-- Route that sets up the top-level layer; capitals are allowed at the
	-- very start of the text.
	function Parser:do_parse()
		self.n.allow_capital = true
		rawset(self.n, "handler", main_handler)
	end

	-- Parses `text` (decomposed to NFD and split into UTF-8 characters) and
	-- returns the second value returned by `Parser:parse`.
	function export.normalize(text)
		return (select(2, Parser:parse{
			text = explode(toNFD(text)),
			node = {Wikitext, true},
			route = {"do_parse"}
		}))
	end
end

-- Unfinished stub. Resets `self.output` to an empty table and runs the
-- iterator from `self:iterate()` to exhaustion without using the items.
-- `funcs` is not used yet and nothing is returned.
function Wikitext:convert(funcs)
	self.output = {}
	for _ in self:iterate() do
		-- TODO: per-item conversion, formerly sketched as `iteration(syl, prev)`.
	end
	-- TODO: return the collected output once the loop body exists.
end

-- Base for the output converters. A converter is a table of handlers keyed
-- by segment name; `Syllable:convert` indexes it by slot number.
local Converter = {}

do
	-- Fallback handler: appends the segment unchanged unless it is empty.
	local function passthrough(self, this, output)
		if this == "" then
			return
		end
		insert(output, this)
	end

	-- Looks a missing key up by its segment name (via `num_to_segment`),
	-- using the fallback when the converter defines no handler for it.
	function Converter:__index(k)
		local handler = rawget(self, num_to_segment[k])
		return handler or passthrough
	end
end

-- Creates an empty converter.
function Converter:new()
	local converter = {}
	return setmetatable(converter, Converter)
end

-- Pinyin output. Each handler receives the syllable (`self`), the raw slot
-- value (`this`, "" when unset) and the list of output pieces.
do
	local pinyin = Converter:new()
	local tones = data.pinyin_tones
	local tone_priority = data.pinyin_tone_priority

	-- First glide. Without an initial it is spelled with w/y; after an
	-- initial containing j/q/x, "ü" is written "u"; "u" is omitted between
	-- a b/f/m/p/v initial and a bare "o".
	function pinyin.glide1(self, this, output)
		if this == "" then
			return
		end
		local initial = self.initial
		if self.nucleus == "e" and not self.glide2 and self.nasal == "ng" then
			if this == "u" and initial then
				return
			elseif this == "ü" then
				this = "i"
			end
		end
		if not initial then
			insert(output, this == "u" and "w" or "y")
			if this == "ü" then
				insert(output, "u")
			elseif not (self.nucleus or self.glide2 or self.nasal) then
				insert(output, this)
			end
			return
		end
		if (
			this == "u" and
			self.nucleus == "o" and
			not (self.glide2 or self.nasal) and
			match(initial, "[bfmpv]")
		) then
			return
		end
		if this == "ü" and match(initial, "[jqx]") then
			this = "u"
		end
		insert(output, this)
	end

	-- Nucleus. Depending on the surrounding segments it is written as
	-- normalised, written as another letter (e/o/i), or omitted.
	function pinyin.nucleus(self, this, output)
		if this == "" then
			return
		end
		local initial, glide1 = self.initial, self.glide1
		local glide2, nasal = self.glide2, self.nasal
		if this == "e" and glide1 and not glide2 and nasal then
			if glide1 == "u" and not initial then
				this = "e"
			elseif nasal == "ng" and (glide1 == "u" or glide1 == "ü") then
				this = "o"
			elseif glide1 == "i" and not (initial or glide2) and nasal then
				this = "i"
			else
				return
			end
		elseif this == "ê" then
			if (
				(initial or glide1 == "ü") and
				glide2 == "i" and
				not nasal and
				(glide1 == "u" or glide1 == "ü")
			) then
				return
			elseif glide2 or (glide1 and not nasal) then
				this = "e"
			end
		elseif (
			this == "o" and
			initial and
			glide1 == "i" and
			glide2 == "u" and
			not nasal
		) then
			return
		elseif this == "ɨ" then
			this = "i"
		end
		insert(output, this)
	end

	-- Second glide: "u" after an "a" nucleus is written "o".
	function pinyin.glide2(self, this, output)
		if this == "" then
			return
		end
		if this == "u" and self.nucleus == "a" then
			this = "o"
		end
		insert(output, this)
	end

	-- Erhua: "'r" keeps its apostrophe only when the syllable consists of
	-- an "e" nucleus with no glides and no nasal.
	function pinyin.erhua(self, this, output)
		if this == "" then
			return
		end
		local plain = self.glide1 or self.nucleus ~= "e" or self.glide2 or self.nasal
		if this == "'r" and plain then
			this = "r"
		end
		insert(output, this)
	end

	-- Tone: appends the tone string after the first character of the piece
	-- with the highest priority (the last such piece on ties). An apostrophe
	-- is prepended when that piece is the first one, or when the first piece
	-- is "ng" or "gn".
	function pinyin.tone(self, this, output)
		local best, pos = 0, nil
		for i = 1, #output do
			local score = tone_priority[output[i]] or 0
			if score >= best then
				best, pos = score, i
			end
		end
		local mark = tones[this] or ""
		output[pos] = (output[pos]:gsub(
			"^[%z\1-\127\194-\244][\128-\191]*",
			"%0" .. mark
		))
		if pos == 1 or output[1] == "ng" or output[1] == "gn" then
			insert(output, 1, "'")
		end
	end

	-- Appends one parsed item. Strings are copied as-is; syllables are
	-- converted, and a leading apostrophe is dropped at the start of the
	-- text or after a space or hyphen.
	local function append_item(syl, output, prev)
		if type(syl) == "string" then
			insert(output, syl)
			return
		end
		local rendered = syl:convert(pinyin)
		local at_boundary = not prev or prev == " " or prev == "-"
		if rendered:sub(1, 1) == "'" and at_boundary then
			rendered = rendered:sub(2)
		end
		insert(output, rendered)
	end

	-- Renders a parsed text as pinyin, composed to NFC.
	function export.pinyin(text)
		local output = {}
		local prev
		for syl in text:iterate() do
			append_item(syl, output, prev)
			prev = syl
		end
		return toNFC(concat(output))
	end
end

-- Zhuyin output. Each handler receives the syllable (`self`), the raw slot
-- value (`this`, "" when unset) and the list of output pieces.
do
	local zhuyin = Converter:new()
	local letters = data.zhuyin_letters
	local compounds = data.zhuyin_compounds
	local tones = data.zhuyin_tones

	-- Initial (also used for the nucleus): the segment's zhuyin letter.
	function zhuyin.initial(self, this, output)
		if this == "" then
			return
		end
		insert(output, letters[this])
	end

	-- First glide; "u" is omitted between a b/f/m/p/v initial and a bare "o".
	function zhuyin.glide1(self, this, output)
		if this == "" or (
			this == "u" and
			self.nucleus == "o" and
			not (self.glide2 or self.nasal) and
			match(self.initial, "[bfmpv]")
		) then
			return
		end
		insert(output, letters[this])
	end

	zhuyin.nucleus = zhuyin.initial

	-- Second glide (also used for the nasal): merged with the preceding
	-- piece when the compound table has an entry for the pair that differs
	-- from the letter itself; otherwise appended.
	function zhuyin.glide2(self, this, output)
		if this == "" then
			return
		end
		this = letters[this]
		local prev = output[#output]
		if prev then
			local compound = compounds[prev .. this]
			if compound and compound ~= this then
				output[#output] = compound
				return
			end
		end
		insert(output, this)
	end

	zhuyin.nasal = zhuyin.glide2

	-- Erhua: "r" replaces the last piece with "ㄦ"; "'r" appends "ㄦ".
	function zhuyin.erhua(self, this, output)
		if this == "r" then
			output[#output] = "ㄦ"
		elseif this == "'r" then
			insert(output, "ㄦ")
		end
	end

	-- Tone: tone 5 is a dot at the front of the syllable; other tones go
	-- at the end, or just before the final "ㄦ" when the syllable has "'r".
	function zhuyin.tone(self, this, output)
		if this == 5 then
			insert(output, 1, "˙")
			return
		end
		local mark = tones[this]
		-- NOTE(review): if the tone table has no entry for a tone, a
		-- positional insert of nil would leave a hole in `output`, so a
		-- missing mark is skipped. Confirm against the data module.
		if not mark then
			return
		end
		if self.erhua == "'r" then
			insert(output, #output, mark)
		else
			insert(output, mark)
		end
	end

	-- Appends one parsed item. Strings are copied as-is; syllables are
	-- converted. A bare "ㄦ" directly after a tone-1 syllable without erhua
	-- gets an explicit "ˉ".
	local function iteration(syl, output, prev)
		if type(syl) == "string" then
			insert(output, syl)
			return
		end
		syl = syl:convert(zhuyin)
		-- `prev` is nil for the first item of the text; indexing it
		-- unguarded raised "attempt to index a nil value".
		if syl == "ㄦ" and prev and prev.tone == 1 and not prev.erhua then
			syl = "ㄦˉ"
		end
		insert(output, syl)
	end

	-- Renders a parsed text as zhuyin, composed to NFC.
	function export.zhuyin(text)
		local output, prev = {}
		for syl in text:iterate() do
			iteration(syl, output, prev)
			prev = syl
		end
		return toNFC(concat(output))
	end
end

return export