-- Module:wuu-pron

local export = {} local data = mw.loadData("Module:wuu-pron/data")

--[=[
TODO:
- do IPA for glottalised nasal initials (currently the glottal stop is dropped)
- FIND DATA FOR 3+ SYLLABLE SANDHI AND RPS!!! (we can settle for trisyllabics, right?)
- northern wu data: FIRST PRIORITY should be
	- CHUANSHA 川沙 (sandhi found in 當代, needs understanding)
	- NEW KUNSHAN 崑山新派 (trisyllabic sandhi needed)
	- HUZHOU 湖州 (trisyllabic sandhi needed)
	- XIAOSHAN 蕭山 (大西博子 source doesn't exactly line up with wugniu 市志)
	- CIXI 慈溪 (慈溪方言研究 PDF exists but currently unretrieved, please inform @ND381 if a working PDF is in your possession ASAP!! zia-zia)
	- any other northern wu points are good!! pls note the source u used in the EDIT SUMMARY if u do add one zia-zia
- MODULES for TAIZHOU & WENZHOU → and then we're done (probably)
]=]--

-- Display names of the supported lects, keyed by their two-letter code.
-- Entries that are commented out are not enabled.
-- The trailing columns record, per lect, the status of:
--   Basic (initials/finals/tones), Disyll LPS, Trisyll LPS, Quad+ LPS, RPS, Logic
local loc_names = {
	--                    Basic   Disyll LPS   Trisyll LPS   Quad+ LPS   RPS   Logic
	sh = 'Shanghai',   --   +         +            +             +        +      +
	jd = 'Jiading',    --   +         +            +             -        -      ?
	sj = 'Songjiang',  --   +         +            +             -        -      +
	-- cs = 'Chuansha',
	cm = 'Chongming',  --   +         +            +             -        +      +
	sz = 'Suzhou',     --   +         +            +             +        +      +
	-- ks = 'Kunshan',      +         +            -             -        -      -    Wugniu "new school" (新派) variety
	cz = 'Changzhou',  --   +         +            +             -        -      +
	jx = 'Jiaxing',    --   +         +            +             -        -      +
	tx = 'Tongxiang',  --   +         +            +             -        -      +
	hn = 'Haining',    --   +         +            +             -        +      +    Xiashi locality
	hy = 'Haiyan',     --   +         +            +             -        -      +
	hz = 'Hangzhou',   --   +         +            +             +        +      +
	-- xs = 'Xiaoshan',
	sx = 'Shaoxing',   --   +         +            +             -        +      +
	-- cx = 'Cixi',
	nb = 'Ningbo',     --   +         +            +             -        +      +
}

-- Name of the Wikipedia article for each lect.
-- NOTE(review): the original comment reads 'default to " dialect" if empty';
-- presumably a lect without an entry falls back to "<lect name> dialect" — confirm.
local wiki_names = {
	sh = 'Shanghainese',
	jd = 'Taihu Wu',
	sj = 'Taihu Wu',
	jx = 'Taihu Wu',
	tx = 'Taihu Wu',
	hn = 'Taihu Wu',
	hy = 'Taihu Wu',
}

-- Set of lect codes for which export.make also emits a MiniDict romanisation.
local minidict = {}
for _, code in ipairs({'sh', 'jd', 'cm', 'sz', 'cz', 'jx', 'hz', 'sx', 'nb'}) do
	minidict[code] = true
end

-- Order in which lects are listed in the output of export.make.
local order = {
	'sh', 'jd', 'sj', 'cm', 'sz', 'cz', 'jx',
	'tx', 'hn', 'hy', 'hz', 'sx', 'nb',
}

-- Default mapping from Wugniu initials to IPA, shared by every lect.
-- Per-lect deviations live in ipa_initial_override, which get_initial consults first.
local ipa_initial = {
	-- labials
	p = "p", ph = "pʰ", b = "b", m = "m", f = "f", v = "v",
	-- alveolars
	t = "t", th = "tʰ", d = "d", n = "n", l = "l",
	ts = "t͡s", tsh = "t͡sʰ", s = "s", z = "z",
	-- palatals
	c = "t͡ɕ", ch = "t͡ɕʰ",
	dz = "d͡z",
	j = "d͡ʑ", gn = "n̠ʲ", sh = "ɕ", zh = "ʑ",
	-- velars and glottals
	k = "k", kh = "kʰ", g = "ɡ", ng = "ŋ",
	h = "h", gh = "ɦ",
	-- zero initial
	[""] = "",
}

-- Per-lect initial overrides.
--  * an entry here always takes priority over ipa_initial
--  * additional lect-specific initials can also be defined here
--  * a question mark means the initial does not exist in that lect
--  * every lect must have a table here, even an empty one
--    (get_initial indexes ipa_initial_override[loc] unconditionally)
local ipa_initial_override = {
	sh = {
		dz = "?",
	},
	jd = {
		dz = "?", zh = "?",
	},
	sj = {
		p = "ɓ", t = "ɗ",
		f = "ɸ", v = "β",
		ch = "cʰ", c = "c", j = "ɟ",
		sh = "ç",
		zh = "?", dz = "?",
	},
	cm = {
		v = "fv", z = "sz", zh = "ɕʑ", gh = "hɦ",
	},
	sz = {
		dz = "?", zh = "?",
	},
	-- ks = { zh = "?", h = "x" },
	cz = {},
	jx = {
		dz = "?", vh = "ʔv",
	},
	tx = {},
	hn = {},
	hy = {
		zh = "?",
	},
	hz = {
		zh = "?",
	},
	sx = {},
	nb = {},
}

-- Look up the IPA for a Wugniu initial in lect `loc`.
-- The lect-specific override table wins over the shared ipa_initial table.
-- Raises an error if neither table knows the initial.
local function get_initial(initial, loc)
	local ipa = ipa_initial_override[loc][initial]
	if not ipa then
		ipa = ipa_initial[initial]
	end
	if not ipa then
		error('Invalid initial: "' .. initial .. '"')
	end
	return ipa
end

-- Look up the IPA for a Wugniu final in lect `loc` from the data module.
-- Raises an error if the final is not listed for that lect.
local function get_final(final, loc)
	local ipa = data.ipa_final[loc][final]
	if not ipa then
		error('Invalid final: "' .. final .. '"')
	end
	return ipa
end

-- IPA for syllables that consist solely of a syllabic nasal.
local ipa_syllabic = {
	m = "m̩",
	n = "n̩",
	ng = "ŋ̍",
}

-- Diagnose a tone lookup failure. This function never returns: it always
-- raises, picking the most specific message it can.
--   word_length        number of syllables in the word
--   loc                lect code
--   text               the word as supplied (only used in messages)
--   tone, tone2, tone3 the tone specifiers extracted by get_tone; tone2 and
--                      tone3 are the empty string when not supplied
local function diagnose_tones(word_length, loc, text, tone, tone2, tone3)
	-- the cap on number of syllables
	local syl_cap = ({sh=5,sj=3,cm=3,sz=4,cz=3,jx=3,tx=3,hn=3,hy=3,hz=5,sx=3,nb=3})[loc]
	if syl_cap and word_length > syl_cap then
		error(("Maximum %d syllables supported for %s."):format(syl_cap, loc))
	end

	-- the cap on number of specified tones
	local tone_cap = ({sj=3,ks=2,cz=3,jx=3,tx=3,hn=3,hy=2})[loc]
	if tone_cap then
		local expected = math.min(tone_cap, word_length)
		-- The first tone is always counted; the optional ones are compared
		-- against the empty string (these literals were missing in the source).
		local received = 1 + (tone2 ~= '' and 1 or 0) + (tone3 ~= '' and 1 or 0)
		if received ~= expected then
			error(('Expected %d tones, but received %d: "%s:%s".'):format(expected, received, loc, text))
		end
	elseif loc == 'sz' or loc == 'sx' then
		-- sz: tone is 7 or 8, but second tone not provided
		error("For " .. loc .. ", second tone must be specified.")
	end

	error(('Incorrect tone notation "%s" for %s. See WT:AZH/Wu.'):format(tone..tone2..tone3, loc))
end

-- Replace the tone-letter digits 1-5 in `text` with their superscript forms.
-- Returns the converted string followed by the number of digits replaced
-- (the raw results of string.gsub).
local function tone_superscript(text)
	local superscripts = {
		['1'] = '¹',
		['2'] = '²',
		['3'] = '³',
		['4'] = '⁴',
		['5'] = '⁵',
	}
	return text:gsub('[1-5]', superscripts)
end

-- Work out the tone contour string for one word.
--   text  a word in Wugniu with tone marks; syllables are separated by spaces
--   loc   lect code
-- Returns the contour (digits converted to superscripts), looked up in
-- data.tone_contours[loc]. Raises an error if no contour can be found
-- (via diagnose_tones) or the notation is malformed.
local function get_tone(text, loc)
	-- Strip everything but the spaces: one more syllable than separators.
	local separators = text:gsub("[^ ]+", "")
	local word_length = #separators + 1

	local tone, tone2, tone3 = text:match("^(.%u*)%w+ ?(%d?%u?)%w* ?(%d?%u?)")
	if not tone then
		-- Without this guard the code below fails with a nil index/concat error.
		error(('Incorrect tone notation "%s" for %s. See WT:AZH/Wu.'):format(text, loc))
	end

	if loc == "jx" and tone == "3" then
		-- jx: tone 3 is split into 3A/3B depending on the shape of the initial
		tone = text:find("^3[ptkc]s?h") and "3B" or "3A"
	elseif loc == "cm" then
		local result = nil
		if tone:find("[MP]") then
			-- Verb + Motion / Verb + Pronoun
			if word_length ~= 2 then
				error("cm: Unsupported word length.")
			end
			result = data.tone_contours[loc][tone] or error("cm: Wrong motion/pronoun format.")
		elseif tone:find("R", 1, true) then
			-- Reduplication
			local main_tone, redup_type, word, sub_tone = text:match("^(%d)R([VCN])(%l+) (%d)%3$")
			if not main_tone then
				-- Previously surfaced as "attempt to concatenate a nil value".
				error("cm: Wrong reduplication format.")
			end
			main_tone, sub_tone = tonumber(main_tone), tonumber(sub_tone)
			local conv_tone = (redup_type == "N" and main_tone % 2 == 0 and word:find("^g?[mnl]") and main_tone - 1)
				or main_tone
			if sub_tone ~= conv_tone then
				error("cm: Wrong reduplication format.")
			end
			result = data.tone_contours[loc]["R" .. redup_type .. main_tone]
		end
		if result then
			return tone_superscript(result)
		end
	elseif loc == "sx" and tone:find("^%dA$") then
		return tone_superscript(data.tone_contours[loc][tone])
	end

	-- General lookup: try the most specific key first.
	local contours = data.tone_contours[loc]
	local result = contours[word_length .. tone .. tone2 .. tone3]
		or contours[word_length .. tone .. tone2]
		or contours[word_length .. tone]
	return result and tone_superscript(result) or diagnose_tones(word_length, loc, text, tone, tone2, tone3)
end

-- Pick the contour used for the "RPS" case handled in export.wugniu_to_ipa.
-- For a one-syllable word the key is `tone .. "s"`, falling back to 's';
-- for longer words the key "multiple" is used. All keys are read from
-- data.tone_contours[loc]; the result is passed through tone_superscript.
local function RPS_tone_determ(word_length, tone, loc)
	if word_length ~= 1 then
		return tone_superscript(data.tone_contours[loc]["multiple"])
	end
	local contour = data.tone_contours[loc][tone .. "s"]
	if not contour then
		contour = data.tone_contours[loc]['s']
	end
	return tone_superscript(contour)
end

-- Sanity-check that `text` is written in Wugniu; raises on the first problem.
--   text  the romanised input
--   locs  the raw lect prefix string (may list several lects)
-- Syllables without any digit are additionally reported to the tracking
-- facility of Module:debug. Always returns nil.
local function rom_check(text, locs)
	local looks_foreign = text:match("%f[%l']['qx]")
		or text:match('ny')
		or text:match('hh')
		or text:match("h$")
	if looks_foreign then
		error('Invalid syllable: ' .. text ..'. Wugniu expected, but another romanisation is supplied.')
	end

	if locs ~= 'cm' then
		if text:match('ghi') then
			error('Invalid initial "ghi". Use "yi" instead.')
		end
		if text:match('ghu') then
			error('Invalid initial "ghu". Use "wu" instead.')
		end
	end

	if text:match('%f[%l]y%f[%L]') then
		error('Invalid syllable "y"')
	end
	if text:match('%f[%l]y[nq]') then
		error('Invalid syllable "yn" or "yq"')
	end
	if text:match('gn[aeou]') then
		error('Palatalization expected. Insert an "i" after the "gn".')
	end
	if text:match('uw') then
		error(('Invalid syllable in "%s".'):format(text))
	end

	if locs:find('cm') then
		if text:find('ueu') or text:find('uon') or text:find('ui') then
			error('cm: Mutation-only final found.')
		end
	end

	for syl in text:gmatch("[%d%l%u]+") do
		local has_digit = syl:match("%d")
		if not has_digit then
			require("Module:debug").track("wuu-pron/no-tone")
			if locs ~= "sh" then
				require("Module:debug").track("wuu-pron/no-tone-other")
			end
		end
	end
	return nil
end

-- Convert a single toneless Wugniu syllable to IPA.
--   text       the syllable (lowercase letters only)
--   loc        lect code
--   initials   function(initial, loc) -> IPA of the initial
--   finals     function(final, loc)   -> IPA of the final
--   syllabics  table mapping a whole syllable to its syllabic-consonant IPA
--   i          position of the syllable within its word (1-based)
--   main_tone  first character of the word (its leading tone mark)
--   tone       contour assigned to this syllable ("0" is treated specially for cm)
function export.ipa_syl_conv(text, loc, initials, finals, syllabics, i, main_tone, tone)
	-- get ipa from tables
	local initial, final = text:match("^([td]?[pbmfvtdnlszcjghk][hng]?)(.+)$")
	local syllabic_ipa = syllabics[text]
	if loc == 'sx' and text == 'gn' then
		syllabic_ipa = "ɲ̩"
	end
	-- No recognisable initial, or a syllabic consonant: the whole syllable is the final.
	if syllabic_ipa or not initial then
		initial, final = '', text
	end

	if loc ~= 'cm' then
		return initials(initial, loc) .. (syllabic_ipa or finals(final, loc))
	end

	-- cm only: mutation. A non-initial "z" is emitted as a bare "z" instead of
	-- going through the initials lookup.
	local keeps_plain_z = i > 1 and initial == "z"
	local preglottal = ""
	if tone ~= "0" and (keeps_plain_z or initial:find("^g?[mnl]") or initial == "") then
		if i > 1 or main_tone:find("^[1357]$") then
			preglottal = "ʔ"
		else
			preglottal = "ɦ"
		end
	end

	local onset
	if keeps_plain_z then
		onset = "z"
	else
		onset = initials(initial, loc)
	end
	return preglottal .. onset .. (syllabic_ipa or finals(final, loc))
end

-- Convert a Wugniu string to IPA for lect `loc`.
-- The input is split on "," (readings), then "&" (components), then "+"
-- (independent words); each word is a space-separated list of syllables.
--   initials, finals  lookup functions handed on to export.ipa_syl_conv
--   syllabics         table handed on to export.ipa_syl_conv
--   tones             function(word, loc) -> space-separated contour string
-- Readings are joined with "/, /"; everything else with a single space.
function export.wugniu_to_ipa(original_text, loc, initials, finals, syllabics, tones)
	-- For non-initial syllables written letters-then-tone, move the tone in front.
	original_text = original_text:gsub(" (%l+)(%d%u?)", ' %2%1')

	-- Expand the y/w glide spellings; cm uses bare vowels instead of gh-.
	local y_spelling, w_spelling = "ghi", "ghu"
	if loc == 'cm' then
		y_spelling, w_spelling = "i", "u"
	end
	original_text = original_text:gsub("%f[%l]yi?", y_spelling):gsub("%f[%l]wu?", w_spelling)

	-- Convert one independent word; `is_followed` tells whether another
	-- independent word follows it inside the same component.
	local function convert_word(word, is_followed)
		local tone_number = word:sub(1, 1)
		local contour = tones(word, loc)
		local letters = word:gsub("[^ %l]+", "")
		local syllables = mw.text.split(letters, " ", true)
		local syl_tones = mw.text.split(contour, " ", true)
		local last = #syllables
		for i = 1, last do
			--RPS
			if i == last and is_followed and contour ~= "³³" then
				syl_tones[i] = RPS_tone_determ(last, tone_number, loc)
			end
			local segment = ""
			if syllables[i] ~= "" then
				segment = export.ipa_syl_conv(syllables[i], loc, initials, finals, syllabics, i, tone_number, syl_tones[i]) or ""
			end
			local tone_suffix = syl_tones[i]
			if tone_suffix == "0" then
				tone_suffix = ""
			end
			syllables[i] = segment .. tone_suffix
		end
		return table.concat(syllables, " ")
	end

	local readings = mw.text.split(original_text, ",", true)
	for reading_index = 1, #readings do
		local components = mw.text.split(readings[reading_index], "&", true)
		for component_index = 1, #components do
			local words = mw.text.split(components[component_index], "+", true)
			for word_index = 1, #words do
				words[word_index] = convert_word(words[word_index], words[word_index + 1] ~= nil)
			end
			components[component_index] = table.concat(words, " ")
		end
		readings[reading_index] = table.concat(components, " ")
	end
	return table.concat(readings, "/, /")
end

-- Ordered rewrite rules (Lua pattern, replacement) that turn the legacy
-- Wiktionary romanisation into Wugniu. Order matters: later rules see the
-- output of earlier ones.
local wikt_to_wugniu_rules = {
	--initials
	{"'+", {["'"]=""}},
	{"%f[%l][jqx][jx]?", {j="c", jj="j", q="ch", x="sh", xx="zh"}},
	{"%f[%l]ny", "gn"},
	{"%f[%l]hh", "gh"},
	--vowels
	{"un", "uen"},
	{"yoe", "ioe"},
	{"y", "iu"},
	{"aan", "aon"},
	{"%f[er]r", "y"},
	--syllabics
	{"g?h?mm", "m"},
	{"g?h?ngg", "ng"},
	--tones
	{"[2-5]", {['2']='5', ['3']='6', ['4']='7', ['5']='8'}},
	--gh rules
	{"ghi", "yi"},
	{"yi%f[aeou]", "y"},
	{"ghu", "wu"},
	{"wu%f[aeo]", "w"},
}

-- Convert legacy Wiktionary romanisation to Wugniu.
--   text  the romanised string, or a table whose `args[1]` holds it
-- Every call is recorded under the "wuu-pron/legacy" tracking key.
-- Returns exactly one value, the converted string. The original returned the
-- result of the last gsub directly, which also leaked its substitution count
-- as a second return value.
function export.wikt_to_wugniu(text)
	require("Module:debug").track("wuu-pron/legacy")
	if type(text) == "table" then
		text = text.args[1]
	end
	for _, rule in ipairs(wikt_to_wugniu_rules) do
		-- single assignment target: keeps the string, drops gsub's count
		text = text:gsub(rule[1], rule[2])
	end
	return text
end

-- Ordered rewrite rules (Lua pattern, replacement) that turn Wugniu into the
-- Wiktionary romanisation. Later rules see the output of earlier ones.
local wugniu_to_wikt_rules = {
	--initials
	{"%f[%l][cjszg][nh]?", {c="j", ch="q", j="jj", sh="x", zh="xx", gn="ny", gh="hh"}},
	{"%f[%l]yi?", "hhi"},
	{"wu?", "hhu"},
	--vowels
	{"y%f[%L]", "r"},
	{"uen", "un"},
	{"ioe", "yoe"},
	{"iu", "y"},
	{"aon", "aan"},
	--syllabics
	{"%f[%l][mn]g?%f[%L]", {m="mm",n="nn",ng="ngg"}},
	--initial hh and '
	{"([157])([mnl])", "%1'%2"},
	{"([68])([mn][mng]g?)%f[%L]", "%1hh%2"},
	--tones
	{"[5-8]", {['5']='2', ['6']='3', ['7']='4', ['8']='5'}},
}

-- Convert Wugniu to the Wiktionary romanisation and format the result with
-- export.wugniu_format.
--   text  the Wugniu string, or a table whose `args[1]` holds it
local function wugniu_to_wikt(text)
	if type(text) == "table" then
		text = text.args[1]
	end
	local count
	for _, rule in ipairs(wugniu_to_wikt_rules) do
		text, count = text:gsub(rule[1], rule[2])
	end
	-- The original passed the last gsub's results straight through, so the
	-- substitution count lands in wugniu_format's second parameter here too.
	return export.wugniu_format(text, count)
end

-- Format a Wugniu string for display.
--   text  Wugniu input; " " separates syllables, "&"/"+" separate words,
--         "," separates readings, "-" is dropped
--   loc   accepted for interface compatibility; not used by this function
-- Syllables are joined with "-", words with a space and readings with "; ".
-- A tone written before a syllable becomes a superscript and one written
-- after it a subscript; a lone leading tone on a non-initial syllable is
-- treated as a trailing one:
--   1a a 1a 1a3 a1 -> ^1a-a-a_1-^1a_3-a_1
--   1a3-3a5 -> ^1a_3-^3a_5
-- NOTE(review): the '' comparisons and the <sup>/<sub> wrappers were missing
-- from the source (the statements were syntactically incomplete) and are
-- reconstructed from the ^ / _ notation above — confirm against upstream.
-- Returns the formatted string followed by gsub's substitution count.
function export.wugniu_format(text, loc)
	return text
		:gsub("[%- &+,]", {["-"]="", [" "]="-", ["&"]=" ", ["+"]=" ", [","]="; "})
		:gsub("(%-?)(%d?%u?)('?%l+)(%d?%u?)", function(dash, tone1, main, tone2)
			if dash == '-' and tone2 == '' then
				-- non-initial syllable with only a leading tone: show it as trailing
				tone1, tone2 = tone2, tone1
			end
			if tone1 ~= '' then
				tone1 = '<sup>' .. tone1 .. '</sup>'
			end
			if tone2 ~= '' then
				tone2 = '<sub>' .. tone2 .. '</sub>'
			end
			return dash .. tone1 .. main .. tone2
		end)
end

-- Thin wrapper that formats `text` with export.wugniu_format (no lect code is
-- passed) and returns whatever that function returns.
-- Nothing in the visible part of this file calls it.
local function wikt_format(text)
	return export.wugniu_format(text)
end

-- Ordered rewrite rules (Lua pattern, replacement) used by minidict_format.
local minidict_format_rules = {
	{"-", ""},
	{"[&+]", " "},
	{",", "; "},
	{"0", ""},
	-- a tone after the syllable wins over one before it
	{"[1-8]?(%l+)([1-8])", '%1%2'},
	-- otherwise move the leading tone behind the syllable
	{"([1-8])(%l+)", '%2%1'},
	-- apostrophe before m/n/l/r-initial syllables carrying tone 1, 3, 5 or 7
	{"%f[%l]([mnlr]%l*)([1357])", "'%1%2"},
	-- tone digits to tone category characters
	{"[1-8]", {
		["1"]="平",["2"]="平",
		["3"]="上",["4"]="上",
		["5"]="去",["6"]="去",
		["7"]="入",["8"]="入",
	}},
}

-- Format a string for the MiniDict romanisation: every syllable ends up with
-- at most one tone, written after it.
--   1A3 3B5 3C D3 E -> A^3 B^5 C^3 D^3 E
--   1A B -> A^1 B
-- NOTE(review): the examples above write the tone as a superscript (^), but
-- the visible replacement strings contain no superscript markup — it may
-- have been lost before this file reached review.
-- Returns the formatted string followed by the last rule's substitution count.
local function minidict_format(text)
	local count
	for _, rule in ipairs(minidict_format_rules) do
		text, count = text:gsub(rule[1], rule[2])
	end
	return text, count
end

-- Lect-specific spelling adjustment applied before the shared MiniDict rules:
-- one (Lua pattern, replacement) pair per lect code.
local minidict_lect_rules = {
	sx = {"[ei]+[nq]", {een="en",en="eon",iq="ieq"}},
	-- are we dealing with mergers?
	hz = {"[aeiu]+q?%f[%L]", {eu="ei",ieu="iu",aq="eq",iaq="ieq",iq="ieq",uaq="ueq"}},
	-- ye > yie
	sz = {"%f[%l]yie%f[%L]", "yiie"},
	cz = {"%f[%l]yie%f[%L]", "yiie"},
	nb = {"yu%f[nq]", "oe"},
	sh = {"ie%f[%L]", "iae"},
	cm = {"→%l+", ""},
	jd = {"ue%f[%L]", "uie"},
}

-- Convert Wugniu to the MiniDict romanisation for lect `loc`.
--   text  the Wugniu string, or a table whose `args[1]` holds it
--   loc   lect code selecting the lect-specific adjustment (optional)
-- Returns the converted string followed by the last gsub's substitution count.
-- NOTE(review): in the source the statements after `return minidict_format(...)`
-- are ambiguous. This rewrite reads the "Glottal stops?" assignment as
-- commented-out code and the remaining gsub calls as a chain on the formatted
-- result, the only reading that parses — confirm against upstream.
function export.wugniu_to_minidict(text, loc)
	if type(text) == "table" then
		text = text.args[1]
	end
	text = text:gsub('%f[%l][yw]', {y = 'yi', w = 'wu'})

	local lect_rule = minidict_lect_rules[loc]
	if lect_rule then
		text = text:gsub(lect_rule[1], lect_rule[2])
	end

	--finals & syllabic
	local respelled, respelled_count = text
		:gsub("iu([nq])", "iui%1")
		:gsub("gher", "r")
		:gsub("er", "r")
		:gsub("q", "h")
	local formatted = minidict_format(respelled, respelled_count)

	--initials
	--Glottal stops? text = text:gsub("", "'")
	return formatted
		:gsub("gn", "ny")
		:gsub("nyi%f[aeou]", "ny")
		:gsub('yi([aeiou])', 'y%1')
		:gsub('wu([aeiou])', 'w%1')
end

-- various boilerplates

-- Build the wikilink shown for a lect name.
--   name  display name of the lect
--   wiki  Wikipedia article title; when nil, "<name> dialect" is used
-- NOTE(review): the source body was the literal string ' .. name.. ', which
-- ignores both parameters. That is what remains of a
-- [[w:<article>|<name>]] link once wikilink markup is stripped, so the link is
-- reconstructed here; the exact target expression is inferred from the
-- comment on wiki_names — confirm against upstream.
function export.name_boilerplate(name, wiki)
	return '[[w:' .. (wiki or name .. ' dialect') .. '|' .. name .. ']]'
end

-- Returns `text` followed by a single space.
-- NOTE(review): the name suggests the text was once wrapped in monospace
-- markup; the empty leading literal may be what is left of an opening tag.
-- The original markup cannot be recovered from this file — confirm upstream.
function export.consolas(text)
	return '' .. text .. ' '
end

-- Output line for the Wugniu romanisation: a third-level list item whose
-- value is `text` passed through export.consolas.
function export.wugniu_boilerplate(text)
	local value = export.consolas(text)
	return '\n*** Wugniu : ' .. value
end

-- Output line for the MiniDict romanisation: a third-level list item whose
-- value is `text` passed through export.consolas.
function export.minidict_boilerplate(text)
	local value = export.consolas(text)
	return '\n*** MiniDict : ' .. value
end

-- Output line for the Wiktionary romanisation (Shanghai): a third-level list
-- item whose value is `text` passed through export.consolas.
function export.wikt_boilerplate(text)
	local value = export.consolas(text)
	return '\n*** Wiktionary Romanisation (Shanghai) : ' .. value
end

-- Output line for the IPA transcription of one lect.
--   text  IPA string; readings are expected to be separated by "/, /"
--   name  lect display name, handed to export.name_boilerplate
--   wiki  Wikipedia article title, handed to export.name_boilerplate
-- Every slash-containing run without spaces or commas is padded with a space
-- on each side before the whole text is wrapped in slashes.
function export.IPA_boilerplate(text, name, wiki)
	local padded = text:gsub("(/?[^ /,]*/[^ /,]*/?)", ' %1 ')
	local lect_label = export.name_boilerplate(name, wiki)
	return '\n*** Sinological IPA' .. ' (' .. lect_label .. ') : ' .. ' /' .. padded .. '/ '
end

-- sx only: rewrite one "<tone><mode?><word>+<tone><word>" chain.
-- Mode "A" keeps the A mark on the first word and joins with "&"; any other
-- mode ("P", or "O" when none is given) is moved onto the second word.
local function sx_rewrite_chain(chain)
	local tone1, mode, word1, tone2, word2 = chain:match("^(%d)([AP]?)(%l+)%+(%d)(%l+)$")
	if not tone1 then
		error("sx: Wrong chain format.")
	end
	if mode == 'A' then
		return tone1..'A'..word1..'&'..tone2..word2
	end
	if mode == '' then
		mode = 'O'
	end
	return tone1..word1..' '..tone2..mode..word2
end

-- Lect-specific rewriting of the input before it is converted to IPA.
-- Lects other than hz, sx and cm get the text back unchanged (a single
-- return value); for those three the raw gsub results (string, count) are
-- returned.
local function preprocess_IPA(text, loc)
	if loc == 'hz' then
		-- insert "w" after (t/d)s/z(h)u when one of a, e, o, n, q follows
		return text:gsub("%f[%l]([td]?[sz]h?u)%f[aeonq]", "%1w")
	end
	if loc == 'sx' then
		local chained = text:gsub("[^,&]+%+[^,&]+", sx_rewrite_chain)
		return chained:gsub("#(%d)","%1N")
	end
	if loc == 'cm' then
		-- keep only the bracketed (mutated) initial
		return text:gsub("%f[%l]%l+<(%l*)>","%1")
	end
	return text
end

-- Lect-specific clean-up of the input before it is shown as Wugniu: strips
-- the extra uppercase/# markers that only matter for tone lookup.
-- Lects other than jx, cm and sx get the text back unchanged (a single
-- return value); for those three the raw gsub results (string, count) are
-- returned.
local function preprocess_wugniu(text, loc)
	if loc == 'jx' then
		return text:gsub("3[AB]","3")
	end
	if loc == 'cm' then
		local unmarked = text:gsub("[CMPR][VCN]?","")
		-- "x<y>z" becomes "xz→yz": the written form, an arrow, the mutated form
		return unmarked:gsub("%f[%l](%l*)<(%l*)>(%l*)(%d?)","%1%3%4→%2%3")
	end
	if loc == 'sx' then
		return text:gsub("[#CAP]","")
	end
	return text
end

-- cm: default replacement for an initial that mutates in non-initial position.
local cm_default_mutation = {v="u", zh="", gh=""}

-- cm: annotate one non-initial syllable with its mutated initial, written
-- as "<initial><<mutated>><rest>". Syllables that already carry an
-- annotation, or whose initial does not mutate, are returned unchanged.
local function cm_annotate_mutation(tone, initial, final)
	local unchanged = " "..tone..initial..final
	if final:find("<") then
		return unchanged
	end
	local mutated = cm_default_mutation[initial]
	if mutated == "u" and final:find("^u") then
		mutated = ""
	elseif initial == "d" and final:find("^i") then
		mutated = "l"
	end
	if not mutated then
		return unchanged
	end
	return " "..tone..initial.."<"..mutated..">"..final
end

-- Add cm initial-mutation annotations when the lect prefix mentions cm.
--   text  the Wugniu input (after the "prefix:" part)
--   locs  the raw lect prefix string
-- Raises if the text contains "<" while the prefix is anything but exactly "cm".
local function preprocess_mutation(text, locs)
	if locs:find('cm') then
		text = text:gsub(" (%d?C?)([vzgd]h?)([%w<>]+)", cm_annotate_mutation)
	end
	if locs ~= "cm" and text:find("<") then
		error("cm: Mutation is incompatible with collapsing.")
	end
	return text
end

-- Build the pronunciation output for a Wu entry.
--   text     one or more "prefix:romanisation" segments separated by ";",
--            where prefix is a comma-separated list of lect codes; input
--            without any ":" is treated as Shanghainese
--   w_count  optional number; when given and not greater than 1, the
--            compact inline form of the visible line is produced
-- Returns two strings: `show` (the always-visible line) and `hide` (the
-- expandable details: lect names, romanisations and IPA per segment).
-- Raises on a malformed segment or an unknown lect code.
function export.make(text, w_count)
	if not text:match(':') then
		-- assume Shanghainese
		text = 'sh:'..text
	end
	local show = ""
	local hide = ""
	local roms = {}
	local input_seen, duplicated = {}, false
	text = mw.text.split(text, ';', true)

	local show_name = "Northern"
	if #text == 1 and text[1]:find("^..:") then
		-- single locality
		local loc = text[1]:sub(1,2)
		-- Only build the label for a known code; an unknown one is reported
		-- by the prefix validation below rather than failing here.
		if loc_names[loc] then
			show_name = export.name_boilerplate(loc_names[loc], wiki_names[loc])
		end
	end

	for i = 1, #text do
		local s = mw.text.split(text[i], ':', true)
		-- Validate the segment first: without a ":" s[2] is nil, and using it
		-- as a table key below would raise "table index is nil" instead of
		-- this message.
		if #s ~= 2 or #s[1] == 0 then
			error("Wugniu: prefix is required or too many prefixes")
		end
		if not duplicated then
			if input_seen[s[2]] then
				duplicated = true
			end
			input_seen[s[2]] = true
		end

		local locs, t = mw.text.split(s[1], ',', true), s[2]
		t = preprocess_mutation(t, s[1])
		local list = {}
		local format_text = t
		for _, loc in ipairs(locs) do
			if loc_names[loc] then
				list[loc] = true
			else
				error('Wugniu: prefix "' .. loc .. '" is not recognized')
			end
			format_text = preprocess_wugniu(format_text, loc)
		end
		rom_check(t, s[1])

		local wugniu_text = export.wugniu_format(format_text, locs[1])
		table.insert(roms, wugniu_text)

		local names = {}
		local minidicts = {}
		local minidicts_seen = {}
		local IPAs = {}
		-- Walk the lects in canonical order, not in the order they were given.
		for _, loc in ipairs(order) do
			if list[loc] then
				table.insert(names, export.name_boilerplate(loc_names[loc], wiki_names[loc]))
				if minidict[loc] then
					local minidict_result = export.wugniu_to_minidict(format_text, loc)
					-- identical MiniDict forms from several lects are shown once
					if not minidicts_seen[minidict_result] then
						table.insert(minidicts, minidict_result)
						minidicts_seen[minidict_result] = true
					end
				end
				local ipa_text = preprocess_IPA(t, loc)
				ipa_text = export.wugniu_to_ipa(ipa_text, loc, get_initial, get_final, ipa_syllabic, get_tone)
				table.insert(IPAs, export.IPA_boilerplate(ipa_text, loc_names[loc], wiki_names[loc]))
			end
		end

		hide = hide .. '\n** (Northern: ' .. table.concat(names,', ') .. ') '
		hide = hide .. export.wugniu_boilerplate(wugniu_text)
		for _, minidict_text in ipairs(minidicts) do
			hide = hide .. export.minidict_boilerplate(minidict_text)
		end
		if list.sh then
			hide = hide .. export.wikt_boilerplate(wugniu_to_wikt(format_text))
		end
		hide = hide .. table.concat(IPAs, '')
	end

	if not w_count or w_count > 1 then
		show = '\n** ('..show_name..') : ' .. export.consolas(table.concat(roms, ' / '))
	else
		show = ' ('..show_name..', Wugniu) : ' .. export.consolas(table.concat(roms, ' / '))
	end
	if duplicated then
		require("Module:debug").track("wuu-pron/duplicated")
	end
	return show, hide
end

return export