User:Arafsymudwr/Sandbox

Module:cy-IPA
local export = {}

local lang = require("Module:languages").getByCode("cy")

local m_IPA = require("Module:IPA") local m_a = require("Module:accent qualifier") local m_table = require("Module:table")

local parse_utilities_module = "Module:parse utilities" local patut_module = "Module:pattern utilities"

local listToSet = require("Module:table").listToSet

--[=[
FIXME:

1. Some words in ng have /ŋ/ and others have /ŋg/; Wiktionary already sorts these separately.
2. Consonant clusters assimilate by losing voice, not by regressive or progressive assimilation.
3. Some common words in y and u are /i/ or /ɪ/ in North Wales.
4. North Wales colloquial: unstressed /ɛ/ as /a/.
5. South Wales colloquial: lots of monophthongisation.
]=]

local usub = mw.ustring.sub local rfind = mw.ustring.find local rmatch = mw.ustring.match local rsplit = mw.text.split local rsubn = mw.ustring.gsub local ulower = mw.ustring.lower local u = mw.ustring.char local ugcodepoint = mw.ustring.gcodepoint

-- Dialect codes exposed by this module.
export.dialects = {"NW", "SW"}
-- Human-readable name for each dialect code listed in `export.dialects`.
export.dialects_to_names = {
	NW = "North Wales",
	SW = "South Wales",
}

-- Character inventories used to build pattern classes. Names ending in `_l` are bare lists of characters meant to be
-- wrapped in [...]; `V` is the ready-made vowel class.

-- Plain (unaccented) written vowels, both cases.
local written_unstressed_vowel_l = "aeiouwyAEIOUWY"
-- Accented written vowels, both cases. The uppercase half now includes Â and Ä, which were missing even though the
-- lowercase half has â/ä and `written_stressed_to_plain_vowel` below maps Â/Ä.
local written_long_vowel_l = "àáâäèéêëìíîïòóôöùúûẁẃŵẅüỳýŷÿÀÁÂÄÈÉÊËÌÍÎÏÒÓÔÖÙÚÛÜẀẂŴẄỲÝŶŸ"
-- Grave-accented vowels, both cases.
local written_stressed_not_long_vowel_l = "àèìòùẁỳÀÈÌÒÙẀỲ"
-- All accented vowels. The grave vowels appear twice, which is harmless inside a character class.
local written_stressed_vowel_l = written_long_vowel_l .. written_stressed_not_long_vowel_l
-- Vowel symbols that only occur in IPA output (this string also contains a combining diacritic).
local ipa_vowel_l = "ɪɨ̞ʊɛəɔ"
local written_vowel_l = written_unstressed_vowel_l .. written_stressed_vowel_l
local vowel_l = written_vowel_l .. ipa_vowel_l
-- Pattern class matching any single vowel character (written or IPA).
local V = "[" .. vowel_l .. "]"
-- Maps each accented written vowel to its unaccented counterpart, preserving case.
local written_stressed_to_plain_vowel = {
	["à"] = "a", ["á"] = "a", ["â"] = "a", ["ä"] = "a",
	["è"] = "e", ["é"] = "e", ["ê"] = "e", ["ë"] = "e",
	["ì"] = "i", ["í"] = "i", ["î"] = "i", ["ï"] = "i",
	["ò"] = "o", ["ó"] = "o", ["ô"] = "o", ["ö"] = "o",
	["ù"] = "u", ["ú"] = "u", ["û"] = "u", ["ü"] = "u",
	["ẁ"] = "w", ["ẃ"] = "w", ["ŵ"] = "w", ["ẅ"] = "w",
	["ỳ"] = "y", ["ý"] = "y", ["ŷ"] = "y", ["ÿ"] = "y",
	["À"] = "A", ["Á"] = "A", ["Â"] = "A", ["Ä"] = "A",
	["È"] = "E", ["É"] = "E", ["Ê"] = "E", ["Ë"] = "E",
	["Ì"] = "I", ["Í"] = "I", ["Î"] = "I", ["Ï"] = "I",
	["Ò"] = "O", ["Ó"] = "O", ["Ô"] = "O", ["Ö"] = "O",
	["Ù"] = "U", ["Ú"] = "U", ["Û"] = "U", ["Ü"] = "U",
	["Ẁ"] = "W", ["Ẃ"] = "W", ["Ŵ"] = "W", ["Ẅ"] = "W",
	["Ỳ"] = "Y", ["Ý"] = "Y", ["Ŷ"] = "Y", ["Ÿ"] = "Y",
}

-- Grapheme-sequence table keyed by first letter. A value is either a single IPA string, a list of IPA segments, or a
-- sub-table mapping each spelling that starts with that letter to its IPA string or segment list.
-- NOTE(review): nothing in the visible part of the file reads this table, and many entries (ö, ü, ß, sch, tsch, tion,
-- "Uhr"/"oder" notes) look carried over from a German module -- confirm whether it is still needed.
local sequences = {
	["a"] = {
		["a"  ] = "a";
		["à"  ] = "a";
		["á"  ] = "a";
		["â"  ] = "a";
		["ä"  ] = "a";
		["ae" ] = "ɑːɨ̯";
		["ai" ] = "ai̯";
		["au" ] = "aɨ̯";
		["aw" ] = "ɑːu̯";
	};
	["c"] = {
		["c"  ] = "k";
		["ch" ] = "χ";
	};
	["d"] = {
		["d"  ] = "d";
		["dd" ] = "ð";
	};
	["e"] = {
		["e"  ] = "ɛ";
		["è"  ] = "ɛ";
		["é"  ] = "eː";
		["ê"  ] = "eː";
		["ë"  ] = "e";
		["ei" ] = "ɛi̯";
		["eu" ] = "əɨ̯";
		["ew" ] = "eːu̯";
		["ey" ] = "aɨ̯";
	};
	["f"] = {
		["f"  ] = "v";
		["ff" ] = "f";
	};
	["g"] = "ɡ";
	["h"] = "h";
	-- NOTE(review): î/ï are short/long the opposite way round from ê/ë above, and "iw" has the same value as "ei" --
	-- confirm these are intended.
	["i"] = {
		["i"  ] = "ɪ";
		["ì"  ] = "ɪ";
		["í"  ] = "iː";
		["î"  ] = "i";
		["ï"  ] = "iː";
		["iw" ] = "ɛi̯";
	};
	["j"] = "d͡ʒ";
	-- NOTE(review): the spelling "c" is filed under first letter "k" and there is no plain "k" entry -- confirm.
	["k"] = {
		["c"  ] = "k";
		["ck" ] = "k";
	};
	["l"] = {
		["l"  ] = "l";
		["ll" ] = "ɬ";
	};
	["m"] = {
		["m"  ] = "m";
		["mh" ] = "m̥";
	};
	["n"] = {
		["n"  ] = "n";
		["nh" ] = "n̥";
		["ng" ] = "ŋ";
		["nn" ] = "n";
	};
	["o"] = {
		["oo" ] = "oː";
		["os" ] = { "ɔ", "s" };
		["o"  ] = "ɔ";
	};
	["ö"] = { -- XXX: sometimes /øː/
		["ö"  ] = "œ";
		["ös" ] = { "œ", "s" };
	};
	["p"] = {
		["ph" ] = "f";
		["pp" ] = "p";
		["p"  ] = "p";
	};
	["q"] = {
		["qu" ] = { "k", "f" };
		["q"  ] = "k"; -- XXX
	};
	["r"] = { -- XXX: /ʀ/? /r/?; sometimes /ɐ/ ("Uhr"); also /ər/ ("oder")
		["r"  ] = "r";
		["rr" ] = "r";
	};
	["s"] = {
		["s"  ] = "s";
		["sch" ] = "ʃ";
		["sp" ] = { "ʃ", "p" };
		["ss" ] = "s";
		["st" ] = { "ʃ", "t" };
	};
	["t"] = {
		["t"  ] = "t";
		["tsch"] = "t͡ʃ";
		["tt" ] = "t";
		["tion"] = { "t͡s", "i̯", "o", "n" };
	};
	["u"] = {
		["u"  ] = "ʊ";
		["uch" ] = { "ʊ", "x" };
	};
	["ü"] = {
		["ü"  ] = "yː";
		["üh" ] = "yː";
	};
	["v"] = "f";
	["w"] = "ʋ";
	["x"] = { "k", "s" }; -- XXX
	["y"] = "i";
	["z"] = "z"; -- already converted from s
	["ß"] = "s";
	["́"] = "ˈ"; -- FIXME (key is the bare combining acute accent)
	["-"] = {};
}

-- Combining diacritics, built from their code points so they can be concatenated into patterns and respellings.
local AC = u(0x0301) -- combining acute accent
local GR = u(0x0300) -- combining grave accent
local CFLEX = u(0x0302) -- combining circumflex
local DOTOVER = u(0x0307) -- combining dot above
local DIA = u(0x0308) -- combining diaeresis
local LINEUNDER = u(0x0331) -- combining macron below ("line under")

-- Separator inventories. Names ending in `_l` are bare character lists for use inside [...]; names ending in `_c` are
-- the corresponding ready-made pattern classes.

-- Combining stress marks (acute and grave).
local stress_l = AC .. GR
local stress_c = "[" .. stress_l .. "]"
-- IPA primary and secondary stress marks.
local ipa_stress_l = "ˈˌ"
local ipa_stress_c = "[" .. ipa_stress_l .. "]"
-- Syllable separators; the hyphen is pattern-escaped because this string is spliced into character classes.
local sylsep_l = "%-." -- hyphen included for syllabifying from spelling; FIXME: formerly included SYLDIV
local sylsep_c = "[" .. sylsep_l .. "]"
-- Tie characters (undertie and apostrophe).
local tie_l = "‿'"
local tie_c = "[" .. tie_l .. "]"
-- Everything that can separate characters within a word: syllable separators, ties and both kinds of stress mark.
local charsep_l = sylsep_l .. tie_l .. stress_l .. ipa_stress_l
local charsep_c = "[" .. charsep_l .. "]"
-- Word separators: the # boundary marker and space.
local wordsep_l = "# "
local wordsep_c = "[" .. wordsep_l .. "]"
-- Any separator, within or between words.
local separator_l = charsep_l .. wordsep_l
local separator_c = "[" .. separator_l .. "]"
-- A consonant is defined negatively: anything that is neither a vowel nor a separator.
local neg_guts_of_cons = vowel_l .. separator_l
local C = "[^" .. neg_guts_of_cons .. "]" -- consonant class including h

-- Accented e/o characters exported as "mid vowel hints", plus the pattern class that matches any one of them.
export.mid_vowel_hints = "éèêëóòô"
export.mid_vowel_hint_c = "[" .. export.mid_vowel_hints .. "]"

-- Placeholder characters: word_fixes() substitutes TEMP_PAREN_R for a literal "(r)" and TEMP_PAREN_RR for a literal
-- "(rr)" in the respelling.
local TEMP_PAREN_R = u(0xFFF1)
local TEMP_PAREN_RR = u(0xFFF2)
-- Pseudo-consonant at the edge of prefixes ending in a vowel and suffixes beginning with a vowel; FIXME: not currently
-- used.
local PSEUDOCONS = u(0xFFF3)
-- local PREFIX_MARKER = u(0xFFF4) -- marker indicating a prefix so we can convert primary to secondary accents

-- Set of consonant sequences accepted as a syllable onset. split_syllables() moves leading consonants of an onset into
-- the preceding coda until what remains is in this set (or is empty).
local valid_onsets = listToSet {
	"b", "bl", "br",
	"c", "cl", "cr",
	"ç",
	"d", "dj", "dr",
	"f", "fl", "fr",
	"g", "gl", "gr", "gu", "gü",
	"h",
	"i",
	"j",
	"k", "kl", "kr",
	"l", "ll",
	"m",
	"n", "ny", "ñ",
	"p", "pl", "pr",
	"qu", "qü",
	"r", "rr",
	"s", "ss",
	"t", "tg", "tj", "tr", "tx", "tz",
	"u",
	"v", "vl", "vr",
	"w",
	"x",
	"ʃ", -- e.g. 'χruʃóf' respelling of Khrusxov
	"χ", -- in case of respelling
	"y",
	"z",
}

-- Maps each precomposed dot-above letter to its base letter followed by the combining DOTOVER. Used by
-- decompose_respelling() as a gsub replacement table.
local decompose_dotover = {
	-- No composed i, u or U with DOTOVER.
	["ȧ"] = "a" .. DOTOVER,
	["ė"] = "e" .. DOTOVER,
	["ȯ"] = "o" .. DOTOVER,
	["ẏ"] = "y" .. DOTOVER,
	["Ȧ"] = "A" .. DOTOVER,
	["Ė"] = "E" .. DOTOVER,
	["İ"] = "I" .. DOTOVER,
	["Ȯ"] = "O" .. DOTOVER,
	["Ẏ"] = "Y" .. DOTOVER,
}

-- Set of words that handle_unstressed_words() marks with DOTOVER so that they receive no automatic stress.
-- NOTE(review): every entry and comment here describes Catalan function words -- confirm whether this list is
-- meant to be replaced for this module's language.
local unstressed_words = listToSet {
	-- proclitic object pronouns
	"em", "et", "es", "el", "la", "els", "les", "li", "ens", "us", "ho", "hi", "en",
	-- enclitic object pronouns usually attach with hyphen to preceding verb but not always, cf. tant me fa
	"me", "te", "se", "lo", "los", "nos", "vos", "ne",
	-- contracted object pronouns and articles attached with apostrophe so no need to include
	-- unstressed possessives
	"mon", "ma", "mos", "mes", "ton", "ta", "tos", "tes", "son", "sa", "sos", "ses",
	-- prepositions
	"a", "de", "per", "amb", "ab", -- 'en' already included as proclitic object pronouns
	-- prepositional contractions
	"al", "als", "del", "dels", "pel", "pels",
	-- articles 'el', 'la', 'els', 'les' already included as proclitic pronouns
	-- personal articles
	"na", -- 'en' already included above
	-- indefinite articles
	"un", "uns",
	-- salat articles
	"ets", "so", -- 'es' already included as proclitic object pronouns and 'ses', 'sa', 'sos' as possessives
	-- conjunctions
	"i", "o", "si", "ni", "que",
}

-- Substitute like rsubn, but hand back only the resulting string; the substitution count is dropped.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate rsubn's results to the first one.
	return (rsubn(term, foo, bar))
end

-- Substitute like rsubn, returning the resulting string plus a boolean that is true when at least one substitution
-- was made.
local function rsubb(term, foo, bar)
	local result, count = rsubn(term, foo, bar)
	local changed = count > 0
	return result, changed
end

-- Keep applying the same substitution until the string stops changing, then return it.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- Return a list with one entry per code point of `text`, each entry being that code point as a string.
local function split_into_chars(text)
	local result, n = {}, 0
	for cp in ugcodepoint(text) do
		n = n + 1
		result[n] = u(cp)
	end
	return result
end

-- Split `term` on commas. A plain split is used unless the term contains a comma followed by whitespace or a
-- backslash, in which case the work is delegated to split_on_comma in the parse utilities module (loaded lazily).
local function split_on_comma(term)
	local needs_parse_utilities = term:find(",%s") or term:find("\\")
	if not needs_parse_utilities then
		return rsplit(term, ",")
	end
	return require(parse_utilities_module).split_on_comma(term)
end

-- Concatenate all keys of `tab` into one string. Keys are visited with pairs(), so their order is unspecified; the
-- result is intended for use inside a pattern character class, where order does not matter.
local function concat_keys(tab)
	local keys = {}
	for key in pairs(tab) do
		keys[#keys + 1] = key
	end
	return table.concat(keys)
end

-- Prepare a list of word objects (tables with at least a `term` field, and optionally `pos`) for syllabification.
-- Works on a deep copy, so the caller's list is not modified. Steps:
--   1. lowercase every term;
--   2. mark every term found in `unstressed_words` with DOTOVER;
--   3. join a marked "amb" to a following word that begins with an optional h plus a vowel, using the tie "‿";
--   4. respell any remaining marked "amb" and "per" through a fixed replacement table.
-- Returns the new list of word objects.
local function handle_unstressed_words(words)
	words = m_table.deepcopy(words)

	-- Lowercase all words for ease in further processing.
	for _, wordobj in ipairs(words) do
		wordobj.term = ulower(wordobj.term)
	end

	-- Check if the word at index `i` in `words` is "amb" (already DOTOVER-marked) and the following word begins
	-- with a vowel, optionally preceded by h.
	local function is_amb_to_join(words, i)
		return i < #words and words[i].term == "a" .. DOTOVER .. "mb" and rfind(words[i + 1].term, "^h?" .. V)
	end
	-- Must start out false: it was previously initialised to true, which made the detection below pointless and
	-- caused the join pass to rebuild the list on every call.
	local saw_amb_to_join = false

	-- Mark all unstressed words with DOTOVER, so that split_syllables doesn't assign stress. We need to do this
	-- before special handling for amb, because amb may join to another unstressed word like el, in the
	-- process losing the identity of the two words. In the process, see if amb occurs before a following
	-- vowel-initial word (which may begin with h-).
	for i, wordobj in ipairs(words) do
		-- Put DOTOVER after the last vowel (to handle the case of que). It doesn't actually matter where we put
		-- it, because split_syllables just looks for DOTOVER anywhere in the word.
		if unstressed_words[wordobj.term] then
			wordobj.term = rsub(wordobj.term, "^(.*" .. V .. ")", "%1" .. DOTOVER)
		end
		if is_amb_to_join(words, i) then
			saw_amb_to_join = true
		end
	end

	-- Join amb before vowel-initial word with following word. The joined object keeps the `pos` of the second word.
	if saw_amb_to_join then
		local new_words = {}
		local i = 1
		while i <= #words do
			if is_amb_to_join(words, i) then
				table.insert(new_words, {term = words[i].term .. "‿" .. words[i + 1].term, pos = words[i + 1].pos})
				i = i + 2
			else
				table.insert(new_words, words[i])
				i = i + 1
			end
		end
		words = new_words
	end

	-- Finally, rewrite some unstressed words to get the right pronunciation: any remaining marked amb (i.e. one not
	-- before a vowel-initial word) loses its final b, and marked per is respelled with rr.
	local unstressed_word_replacement = {
		["a" .. DOTOVER .. "mb"] = "a" .. DOTOVER .. "m",
		["pe" .. DOTOVER .. "r"] = "pe" .. DOTOVER .. "rr",
	}

	for _, wordobj in ipairs(words) do
		wordobj.term = unstressed_word_replacement[wordobj.term] or wordobj.term
	end

	return words
end

-- Respell certain prefixes so that later stages pronounce them correctly. The rules run in the order listed.
local function fix_prefixes(word)
	local prefix_rules = {
		-- Voiced s in prefix roots -fons-, -dins-, -trans-
		{"^enfons([aàeèéiíoòóuú])", "enfonz%1"},
		{"^endins([aàeèéiíoòóuú])", "endinz%1"},
		{"tr([aà])ns([aàeèéiíoòóuúbdghlmv])", "tr%1nz%2"},
		-- in + ex: insert a syllable break so that the ex- part is handled on its own
		{"^inex", "in.ex"},
	}

	for _, rule in ipairs(prefix_rules) do
		word = rsub(word, rule[1], rule[2])
	end

	return word
end

-- Some word endings are conventionally written without a diaeresis or accent; put the mark back on the i/u of those
-- endings so that later stages can recognise the hiatus. The rules run in the order listed.
local function restore_diaereses(word)
	local ending_rules = {
		-- Latinisms (-ius is ambiguous but rare)
		{"([iu])um(s?)$", "%1üm%2"},
		-- suffix -isme
		{"([aeiou])isme(s?)$", "%1ísme%2"},
		-- suffix -ista
		{"([aeiou])ist([ae]s?)$", "%1íst%2"},
		-- verbs -ir
		{"([aeou])ir$", "%1ír"},
		-- present participle
		{"([aeou])int$", "%1ínt"},
		-- future
		{"([aeo])ir([éà])$", "%1ïr%2"},
		{"([^gq]u)ir([éà])$", "%1ïr%2"},
		{"([aeo])iràs$", "%1ïràs"},
		{"([^gq]u)iràs$", "%1ïràs"},
		{"([aeo])ir(e[mu])$", "%1ïr%2"},
		{"([^gq]u)ir(e[mu])$", "%1ïr%2"},
		{"([aeo])iran$", "%1ïran"},
		{"([^gq]u)iran$", "%1ïran"},
		-- conditional
		{"([aeo])iria$", "%1ïria"},
		{"([^gq]u)iria$", "%1ïria"},
		{"([aeo])ir(ie[sn])$", "%1ïr%2"},
		{"([^gq]u)ir(ie[sn])$", "%1ïr%2"},
	}

	for _, rule in ipairs(ending_rules) do
		word = rsub(word, rule[1], rule[2])
	end

	return word
end

-- Normalise written y: "ny" becomes ñ; any other y becomes i unless it sits directly between a vowel (or a
-- syllable separator, or the start of the word) and a following vowel, in which case it is left alone.
local function fix_y(word)
	-- ny > ñ first, so the rules below never see that y.
	local without_ny = rsub(word, "ny", "ñ")
	-- y before anything that is not a vowel is itself a vowel
	local before_nonvowel = rsub(without_ny, "y([^aeiouàèéêëíòóôúïü])", "i%1")
	-- y after anything that is neither a vowel nor a syllable separator is a vowel too
	local after_nonvowel = rsub(before_nonvowel, "([^aeiouàèéêëíòóôúïü·%-%.])y", "%1i")
	return after_nonvowel
end

-- Add a quality-indicating accent to e/o in a handful of word endings. Each rule that fires also records tracking
-- entries through Module:debug/track. The rules run in the order listed, each on the output of the previous one.
local function mid_vowel_fixes(word)
	local function track_mid_vowel(vowel, cont)
		require("Module:debug/track"){"cy-IPA/" .. vowel, "cy-IPA/" .. vowel .. "/" .. cont}
		return true
	end

	-- Each rule: pattern, replacement, tracked vowel, tracked context.
	-- (final -el (not -ell) usually è but not too many cases; there is no rule for it)
	local ending_rules = {
		{"e(nts?)$", "é%1", "e", "nt-nts"},
		{"e(rs?)$", "é%1", "e", "r-rs"},
		{"o(rs?)$", "ó%1", "o", "r-rs"},
		{"è(s?)$", "ê%1", "è", "s-blank"},
		{"e(s[oe]s)$", "ê%1", "e", "sos-sa-ses"},
		{"e(sa)$", "ê%1", "e", "sos-sa-ses"},
	}

	for _, rule in ipairs(ending_rules) do
		local fired
		word, fired = rsubb(word, rule[1], rule[2])
		if fired then
			track_mid_vowel(rule[3], rule[4])
		end
	end

	return word
end

-- Apply word-level respelling fixes to `word` before it is split into syllables. `dialect` is only consulted for
-- the "val" special case. Returns the respelled word.
local function word_fixes(word, dialect)
	-- Protect parenthesised (rr) and (r) behind placeholder characters; (rr) must be handled first.
	word = rsub(word, "%(rr%)", TEMP_PAREN_RR)
	word = rsub(word, "%(r%)", TEMP_PAREN_R)
	-- Double an r or s that directly follows a hyphen.
	word = rsub(word, "%-([rs]?)", "-%1%1")
	if dialect == "val" then
		word = rsub(word, "%-x", "-tx")
	end
	-- silent r only in plurals -rs
	word = rsub(word, "rç$", "rrs")

	-- In order: prefixes (internal pause after a prefix), endings written without their diaeresis, y (ny > ñ, else
	-- vowel i or consonant), and mid-vowel endings.
	local passes = {fix_prefixes, restore_diaereses, fix_y, mid_vowel_fixes}
	for _, pass in ipairs(passes) do
		word = pass(word)
	end

	-- Word-initial pn- and mn- lose their first consonant.
	word = rsub(word, "^[pm]n", "n")
	-- Respell ch + vowel as tx, before we remove other h's after consonants.
	word = rsub(word, "ch(" .. V ..")", "tx%1")
	-- Delete h after a consonant. This must happen here, before split_syllables. We don't delete h after a vowel
	-- yet because it indicates a hiatus.
	word = rsub(word, "(" .. C .. ")h", "%1")

	return word
end

-- Split a non-empty run of vowel letters into a list of syllable objects with fields `onset`, `vowel` and `coda`.
-- The first letter always becomes the vowel of the first syllable. Each later syllable takes one vowel, optionally
-- preceded by an i or u that serves as its onset. If the final syllable's vowel is i or u (and it is not the only
-- syllable), that vowel is folded into the coda of the preceding syllable instead.
--
-- `saw_dotover` / `saw_lineunder` say whether the caller stripped a DOTOVER / LINEUNDER out of this vowel run; they
-- are recorded as `has_dotover` / `has_lineunder` on the first syllable, which are the fields that
-- split_syllables and reconstitute_word_from_syllables read.
--
-- NOTE(review): the initializer of `syllables` was missing (the statement read
-- `local syllables = vowels = usub(vowels, 2)`, which does not compile). It has been reconstructed from the way the
-- rest of this function and its caller use the result -- confirm against the upstream module, in particular that
-- only the first syllable carries the two flags.
local function split_vowels(vowels, saw_dotover, saw_lineunder)
	local syllables = {{
		onset = "",
		vowel = usub(vowels, 1, 1),
		coda = "",
		has_dotover = saw_dotover,
		has_lineunder = saw_lineunder,
	}}
	vowels = usub(vowels, 2)

	while vowels ~= "" do
		local syll = {onset = "", vowel = "", coda = ""}
		-- An initial i/u is taken as the onset only when another character follows to serve as the vowel.
		syll.onset, syll.vowel, vowels = rmatch(vowels, "^([iu]?)(.)(.-)$")
		table.insert(syllables, syll)
	end

	local count = #syllables

	-- A trailing i/u forms a falling diphthong with the preceding vowel rather than its own syllable.
	if count >= 2 and (syllables[count].vowel == "i" or syllables[count].vowel == "u") then
		syllables[count - 1].coda = syllables[count].vowel
		syllables[count] = nil
	end

	return syllables
end

-- Split the word into syllables. Return a list of syllable objects, each of which contains fields `onset`, `vowel`,
-- `coda`, `separator` (a user-specified syllable divider that goes before the syllable; one of '·', '-' or '.') and
-- `stressed` (a boolean indicating that the syllable is stressed). In addition, the list has fields `stress` (the
-- index of the syllable with primary stress), `is_prefix` (true if the word is a prefix, i.e. it ends in '-') and
-- `is_suffix` (true if the word is a suffix, i.e. it begins with '-').
-- Normally, prefixes are treated as unstressed if a stressed syllable isn't explicitly marked, but this can be
-- overridden with `stress_prefixes`, which causes the automatic stress-assignment algorithm to run for these terms.
-- If `may_be_uppercase` is set, uppercase vowel letters are recognised as vowels too.
-- NOTE(review): an empty `word` (or a bare "-") yields zero syllables, and the default-stress branch then indexes
-- syllables[0] -- confirm callers never pass one.
local function split_syllables(word, stress_prefixes, may_be_uppercase)
	local syllables = {}
	-- Set once any vowel run in the word carried a DOTOVER; suppresses default stress assignment below.
	local saw_dotover = false

	-- Strip the affix hyphens and remember them as flags on the result.
	local remainder = word
	local is_prefix = false
	if remainder:find("%-$") then -- prefix
		is_prefix = true
		remainder = remainder:gsub("%-$", "")
	end
	local is_suffix = false
	if remainder:find("^%-") then -- suffix
		is_suffix = true
		remainder = remainder:gsub("^%-", "")
	end

	-- Consume the word left to right as alternating runs of non-vowels and vowels.
	while remainder ~= "" do
		local consonants, vowels

		-- FIXME: Using C and V below instead of the existing patterns slows things down TREMENDOUSLY.
		-- Not sure why.
		local vowel_list = may_be_uppercase and "aeiouàèéêëíòóôúïüAEIOUÀÈÉÊËÍÒÓÔÚÏÜ" .. DOTOVER .. LINEUNDER or
			"aeiouàèéêëíòóôúïü" .. DOTOVER .. LINEUNDER
		consonants, remainder = rmatch(remainder, "^([^" .. vowel_list .. "]*)(.-)$")
		vowels, remainder = rmatch(remainder, "^([" .. vowel_list .. "]*)(.-)$")
		-- DOTOVER and LINEUNDER count as part of the vowel run; note their presence, then strip them.
		local this_saw_dotover = not not rfind(vowels, DOTOVER)
		if this_saw_dotover then
			saw_dotover = true
			vowels = vowels:gsub(DOTOVER, "")
		end
		local this_saw_lineunder = not not rfind(vowels, LINEUNDER)
		if this_saw_lineunder then
			vowels = vowels:gsub(LINEUNDER, "")
		end

		if vowels == "" then
			-- Trailing consonants: attach to the last syllable's coda if there is one.
			if #syllables > 0 then
				syllables[#syllables].coda = syllables[#syllables].coda .. consonants
			else
				-- word without vowels, e.g. foot boundary |
				table.insert(syllables, {onset = consonants, vowel = "", coda = "", separator = ""})
			end
		else
			local onset = consonants
			local first_vowel = usub(vowels, 1, 1)

			-- Move the first vowel letter into the onset when it is (a) ü, or u followed by more vowels, after
			-- g/q, or (b) a word-initial i (optionally after h) followed by more vowels.
			if (rfind(onset, "[gqGQ]$") and (first_vowel == "ü" or (first_vowel == "u" and vowels ~= "u"))) or
				((onset == "" or onset == "h" or onset == "H") and #syllables == 0 and
				(first_vowel == "i" or first_vowel == "I") and (vowels ~= "i" and vowels ~= "I")) then
				onset = onset .. usub(vowels, 1, 1)
				vowels = usub(vowels, 2)
			end

			local vsyllables = split_vowels(vowels, this_saw_dotover, this_saw_lineunder)
			vsyllables[1].onset = onset .. vsyllables[1].onset

			for _, s in ipairs(vsyllables) do
				table.insert(syllables, s)
			end
		end
	end

	-- Shift over consonants from the onset to the preceding coda, until the syllable onset is valid
	for i = 2, #syllables do
		local current = syllables[i]
		local previous = syllables[i-1]

		-- Validity is checked with a trailing tie character (plus optional h) and any underscores removed.
		while not (current.onset == "" or valid_onsets[rsub(rsub(current.onset, tie_c .. "[hH]?$", ""), "_", "")]) do
			local letter = usub(current.onset, 1, 1)
			current.onset = usub(current.onset, 2)
			if rfind(letter, "[·%-%.]") then -- syllable separators
				current.separator = letter
				break
			else
				previous.coda = previous.coda .. letter
				-- A tie character ends the shifting once it has been moved into the coda.
				if rfind(letter, tie_c) then
					break
				end
			end
		end
	end

	-- Detect stress
	for i, syll in ipairs(syllables) do
		if rfind(syll.vowel, "^[" .. written_stressed_vowel_l .. "]$") then
			syll.stressed = true
			-- primary stress: the last one stressed without LINEUNDER
			if not syll.has_lineunder then
				syllables.stress = i
			end
		end
	end

	-- Assign default stress
	if not syllables.stress and not saw_dotover and (stress_prefixes or not is_prefix) then
		local count = #syllables

		if count == 1 then
			if syllables[1].vowel ~= "" then -- vowel-less words don't get stress
				syllables.stress = 1
			end
		else
			local final = syllables[count]

			-- Penultimate stress when the final coda is empty or only s (ties ignored), or is n after e/i/ï;
			-- otherwise final stress.
			-- Take account of tie symbols (apostrophes and ‿).
			if rfind(final.coda, "^[s" .. tie_l .. "]*$") or
				(rfind(final.coda, "^" .. tie_c .. "*n" .. tie_c .. "*$") and (
				final.vowel == "e" or final.vowel == "i" or final.vowel == "ï")) then
				syllables.stress = count - 1
			else
				syllables.stress = count
			end
		end
		if syllables.stress then
			syllables[syllables.stress].stressed = true
		end
	end

	syllables.is_prefix = is_prefix
	syllables.is_suffix = is_suffix
	return syllables
end

-- Rebuild a word string from a syllable list of the kind split_syllables returns. For each syllable the pieces are
-- emitted in the order separator, onset, vowel, DOTOVER (if `has_dotover`), LINEUNDER (if `has_lineunder`), coda.
-- A leading "-" is added when the list has `is_suffix` set and a trailing "-" when it has `is_prefix` set.
local function reconstitute_word_from_syllables(syllables)
	local pieces = {}
	local function append(piece)
		-- A nil piece (e.g. a syllable with no separator) leaves the list unchanged.
		pieces[#pieces + 1] = piece
	end

	if syllables.is_suffix then
		append("-")
	end

	for _, syllable in ipairs(syllables) do
		append(syllable.separator)
		append(syllable.onset)
		append(syllable.vowel)
		if syllable.has_dotover then
			append(DOTOVER)
		end
		if syllable.has_lineunder then
			append(LINEUNDER)
		end
		append(syllable.coda)
	end

	if syllables.is_prefix then
		append("-")
	end

	return table.concat(pieces)
end

-- Replace every precomposed dot-above letter listed in `decompose_dotover` with its base letter plus the combining
-- DOTOVER.
local function decompose_respelling(text)
	-- Character class matching any key of the replacement table.
	local composed_class = "[" .. concat_keys(decompose_dotover) .. "]"
	return rsub(text, composed_class, decompose_dotover)
end

-- Canonicalise the spacing and punctuation of a respelling: collapse whitespace, drop punctuation that carries no
-- pronunciation, and turn pause-like punctuation into the foot-boundary marker " | ".
local function canon_respelling(text)
	-- Collapse whitespace runs to one space and trim a single leading/trailing space.
	local function squeeze_spaces(str)
		str = rsub(str, "%s+", " ")
		str = rsub(str, "^ ", "")
		str = rsub(str, " $", "")
		return str
	end

	-- Applied in order after the first whitespace clean-up.
	local punctuation_rules = {
		-- eliminate upside down punctuation
		{"[¡¿]", ""},
		-- eliminate utterance-final punctuation
		{"[!?.]$", ""},
		-- eliminate double and triple quotes
		{"''+", ""},
		-- Convert commas and em/en dashes to IPA foot boundaries; require a space after commas and en dashes (for
		-- the latter, in particular, to avoid treating the en dash in 'Bose–Einstein condensate' as a foot
		-- boundary).
		{" *[,–] ", " | "},
		{" *[—] *", " | "},
		-- ... in the middle or at the end of a phrase becomes a foot boundary
		{" *%.%.%. *", " | "},
		-- remaining commas and en dashes become spaces
		{"[,–]", " "},
	}

	text = squeeze_spaces(text)
	for _, rule in ipairs(punctuation_rules) do
		text = rsub(text, rule[1], rule[2])
	end
	-- may need to eliminate extraneous spaces again, e.g. if there was a space before or after an eliminated
	-- punctuation mark
	text = squeeze_spaces(text)
	-- question mark or exclamation point in the middle of a sentence -> IPA foot boundary
	text = rsub(text, "([^ ]) *[!?] *([^ ])", "%1 | %2")
	return text
end

-- Three tables giving IPA values for the written vowels ê, ë and ô, one table per variety named in the identifier.
-- NOTE(review): nothing in the visible part of the file reads these, and the variety names (central, balearic,
-- valencian) do not match the NW/SW dialects this module exports -- confirm they are still needed.
local IPA_vowels_central = {
	["ê"] = "ɛ",
	["ë"] = "ɛ",
	["ô"] = "ɔ",
}
local IPA_vowels_balearic = {
	["ê"] = "ə",
	["ë"] = "ɛ",
	["ô"] = "ɔ",
}
local IPA_vowels_valencian = {
	["ê"] = "e",
	["ë"] = "e",
	["ô"] = "o",
}

-- IPA value for each accented written vowel.
local IPA_vowels = {
	["à"] = "a",
	["è"] = "ɛ",
	["ê"] = "ɛ",
	["ë"] = "ɛ",
	["é"] = "e",
	["í"] = "i",
	["ï"] = "i",
	["ò"] = "ɔ",
	["ô"] = "ɔ",
	["ó"] = "o",
	["ú"] = "u",
	["ü"] = "u",
}

-- Convert a consonant string from spelling to working phonemic symbols using substitutions that need no
-- surrounding context. The rules run strictly in the order listed; several depend on earlier ones.
local function replace_context_free(cons)
	local ordered_rules = {
		{"ŀ", "l"},

		-- Every r becomes ɾ first; a doubled one is then collapsed back to r.
		{"r", "ɾ"},
		{"ɾɾ", "r"},
		{"ss", "s"},
		{"ll", "ʎ"},
		{"ñ", "ɲ"}, -- hint ny > ñ

		-- NOTE: We use single-character affricate symbols during processing for ease in handling, and convert
		-- them to tied multi-character affricates at the end of join_syllables.
		{"[dt]j", "ʤ"},
		{"tx", "ʧ"},
		{"[dt]z", "ʣ"},

		{"ç", "s"},
		{"[cq]", "k"},
		{"h", ""},
		{"j", "ʒ"},
		-- Don't replace x -> ʃ yet so we can distinguish x from manually specified ʃ.

		{"i", "j"}, -- must be after j > ʒ
		{"y", "j"}, -- must be after j > ʒ and fix_y
		{"[uü]", "w"},
		{"'", "‿"},
	}

	for _, rule in ipairs(ordered_rules) do
		cons = rsub(cons, rule[1], rule[2])
	end

	return cons
end

-- Do context-sensitive phonological changes. Formerly this was all done syllable-by-syllable but that made the code -- tricky (since it often had to look at adjacent syllables) and full of subtle bugs. Now we first concatenate the -- syllables back to words and the words to the combined text and work on the text as a whole. FIXME: We should move -- more of the work done in preprocess_word, e.g. most of replace_context_free, here. local function postprocess_general(text, dialect) local function verify(cond, msg) if not cond then error(("Internal error: %s; processed respelling at this point is '%s'"):format(msg, text)) end return true end

local voiced = listToSet {"b", "d", "g", "m", "n", "ɲ", "l", "ʎ", "r", "ɾ", "v", "z", "ʒ", "ʣ", "ʤ"} local voiced_keys = concat_keys(voiced) local voiceless = listToSet {"p", "t", "k", "f", "s", "ʃ", "ʦ", "ʧ"} local voiceless_keys = concat_keys(voiceless) local voicing = {["p"] = "b", ["t"] = "d", ["k"] = "g", ["f"] = "v", ["s"] = "z", ["ʃ"] = "ʒ", ["ʦ"] = "ʤ", ["ʧ"] = "ʤ"} local voicing_keys = concat_keys(voicing) local devoicing = {} for k, v in pairs(voicing) do		devoicing[v] = k	end local devoicing_keys = concat_keys(devoicing)

-- Handle 

-- Handle ex- + vowel > -egz-. We handle -x- on either side of the syllable boundary. Note that this also handles -- inex- + vowel because in fix_prefixes we respell inex- as in.ex-, which ends up at this stage as in.e.xV. text = rsub_repeatedly(text, "([.#][eɛ]" .. stress_c .. "*)(" .. charsep_c .. "*)x(" .. charsep_c .. "*" .. V .. ")", function(e, pre, post)			-- Preserve other character separators (especially the tie character ‿).			pre = pre:gsub("%.", "")			post = post:gsub("%.", "")			return e .. pre .. "g.z" .. post		end) -- -x- at the beginning of a coda becomes [ks], e.g. annex, apèndix, extracció; but not elsewhere in -- the coda, e.g. in romanx, ponx; words with [ks] in -nx such as esfinx, linx, manx need -- respelling with [ks]; words ending in vowel + x like ídix need respelling with [ʃ] text = rsub(text, "(" .. V .. stress_c .. "*)x", "%1ks") if dialect == "val" then -- Word-initial  as well as  after a consonant other than /j/ (including in the coda, e.g. ponx) -- becomes [t͡ʃ]. text = rsub(text, "#x", "#ʧ") text = rsub(text, "([^" .. vowel_l .. separator_l .. "j]" .. charsep_c .. "*)x", "%1ʧ") end -- Other x becomes [ʃ] text = rsub(text, "x", "ʃ")

-- Doubled ss -> s e.g. in exs-, exc(e/i)-, sc(e/i)-; FIXME: should this apply across word boundaries? text = rsub(text, "s(" .. charsep_c .. "*)s", "%1s")

-- Coda consonant losses

-- In Central Catalan, coda losses happen everywhere, but otherwise they don't happen when -- absolutely word-finally before a vowel or end of utterance (e.g. blanc has /k/ in Balearic and	-- Valencian but not blancs). Must precede consonant assimilations. local boundary = dialect == "cen" and "(.)" or "([^#])" text = rsub(text, "m[pb]" .. boundary, "m%1") text = rsub(text, "([ln])[td]" .. boundary, "%1%2") text = rsub(text, "[nŋ][kg]" .. boundary, "ŋ%1") if dialect == "val" or dialect == "bal" then local before_cons = "(" .. separator_c .. "*" .. C .. ")" text = rsub(text, "m[pb]" .. before_cons, "m%1") text = rsub(text, "([ln])[td]" .. before_cons, "%1%2") text = rsub(text, "[nŋ][kg]" .. before_cons, "ŋ%1") end

-- Delete /t/ between /s/ and any consonant other than /s/ or /ɾ/. Must precede voicing assimilation and -- t + lateral/nasal assimilation. text = rsub(text, "st(" .. sylsep_c .. "*[^" .. neg_guts_of_cons .. "sɾ])", "s%1") -- Consonant assimilations

if dialect == "cen" then -- v > b in onsets (not in codas, e.g. ovni [ɔ́vni] and hafni [ávni]). This needs to precede -- assimilation of nb -> mb. text = rsub(text, "v(" .. C .. "*" .. V ..")", "b%1") end

-- t + lateral assimilation -> geminate across syllable boundary. We don't any more do t + nasal assimiation -- because there are too many exceptions, e.g. aritmètic, atmosfèric, ètnia. Instead, we require that -- cases where it does happen use respelling to effect this. FIXME: this doesn't always happen in -tl- either, -- e.g. atlàntic has [əllántik] in GDLC but [adlántik] in DNV. --	-- FIXME: Clean this up, maybe move below voicing assimilation, investigate whether it operates across words, -- move stuff below that special-cases tll in Valencian here. text = rsub(text, "t(" .. sylsep_c .. ")([lʎ])", "%2%1%2")

-- n + labial > labialized assimilation text = rsub(text, "n(" .. separator_c .. "*[mbp])", "m%1") text = rsub(text, "n(" .. separator_c .. "*[fv])", "ɱ%1")

-- n + velar > velarized assimilation text = rsub(text, "n(" .. separator_c .. "*[kg])", "ŋ%1")

-- l/n + palatal > palatalized assimilation text = rsub(text, "([ln])(" .. separator_c .. "*[ʎɲʃʒʧʤ])", function(ln, palatal)		ln = ({["l"] = "ʎ", ["n"] = "ɲ"})[ln]		return ln .. palatal	end)

-- ɲs > ɲʃ; FIXME: not sure the purpose of this; it doesn't apply in menys or derived terms like menyspreu -- NOTE: Per, it does apply in these scenarios but the result is -- somewhere between [s] and [ʃ], which is why it isn't shown in GDLC. -- text = rsub(text, "ɲs", "%1ʃ")

-- Handle 

-- In replace_context_free, we converted single r to ɾ and double rr to r.	if dialect == "cen" then text = rsub(text, TEMP_PAREN_R, "") text = rsub(text, TEMP_PAREN_RR, "r") elseif dialect == "bal" then text = rsub(text, TEMP_PAREN_R, "") text = rsub(text, TEMP_PAREN_RR, "") else verify(dialect == "val", ("Unrecognized dialect '%s'"):format(dialect)) text = rsub(text, TEMP_PAREN_R, "ɾ") text = rsub(text, TEMP_PAREN_RR, "ɾ") end if dialect ~= "val" then -- Coda /ɾ/ -> /r/ -- FIXME: This is inherited from the older code. Correct? text = rsub(text, "(" .. V .. stress_c .. "*" .. C .. "*)ɾ", "%1r") end

-- ɾ -> r word-initially or after [lns]; needs to precede voicing assimilation as will be voiced to [z] before -- /ɾ/. text = rsub(text, "([#lns]" .. sylsep_c .. "*)ɾ", "%1r")

-- Voicing assimilation

-- Voicing or devoicing; we want to proceed from right to left, and due to the limitations of patterns (in	-- particular, the lack of support for alternations), it's difficult to do this cleanly using Lua patterns, so we -- do it character by character. local chars = split_into_chars(text) -- We need to look two characters ahead in some cases, so start two characters from the end. This is safe because -- the overall respelling ends in "##". (Similarly, as an optimization, don't check the first two characters, which	-- are always "##".) for i = #chars - 2, 3, -1 do -- We are looking for two consonants next to each other, possibly separated by a syllable or word divider. -- We also handle a consonant followed by a syllable divider then a vowel, and a consonant word-finally. -- Note that only coda consonants can change voicing, so we need to check to make sure we're in the coda. local first = chars[i] -- If `second` is nil, no assimilation occurs. Otherwise, `second` should be a consonant or empty string (which		-- represents a syllable or word boundary followed by a vowel or end of string), and we assimilate to that -- consonant (empty string forces devoicing). local second -- If set to true, we're processing a consonant directly before a word boundary followed by a word beginning -- with a vowel. In this context, voiceless sibilants voice. Note that we handle voicing of word-internally -- separately, in preprocess_word [FIXME: maybe move much of the processing in preprocess_word into this -- function]. 
local word_boundary_before_vowel if not rfind(first, C) then -- leave `second` at nil; no assimilation elseif chars[i + 1] == "#" then -- word boundary if chars[i + 2] == " " then -- chars[i + 3] should always be "#" verify(chars[i + 3] == "#", "Word boundary followed by space but not #") if rfind(chars[i + 4], C) then second = chars[i + 4] else second = "" word_boundary_before_vowel = true end else second = "" end elseif rfind(chars[i + 1], sylsep_c) then -- syllable boundary if rfind(chars[i + 2], C) then second = chars[i + 2] else second = "" end elseif rfind(chars[i + 1], C) then second = chars[i + 1] else -- followed by a vowel not across a syllable or word boundary; leave `second` as nil, no assimilation end if second then -- Make sure we're in the coda. We have to look backwards until we find a vowel or syllable/word boundary. local in_coda = false local j = i - 1 while true do				verify(j > 0, "Missing word boundary at beginning of overall respelling") if rfind(chars[j], "[" .. sylsep_l .. wordsep_l .. "]") then break elseif rfind(chars[j], V) then in_coda = true break end j = j - 1 end if in_coda then if word_boundary_before_vowel and rfind(first, "[zʒʣʤ]") then -- leave alone elseif voiced[second] and voicing[first] or word_boundary_before_vowel and rfind(first, "[sʃʦʧ]") then chars[i] = voicing[first] elseif (voiceless[second] or second == "") and devoicing[first] then chars[i] = devoicing[first] end end end end text = table.concat(chars)

-- gn -> ŋn e.g. regnar (including word-initial gn- e.g. gnòmic, gneis) -- FIXME: This should be moved below voicing assimilation, and we need to investigate if it operates across words -- (here I'm guessing yes). if dialect ~= "cen" then text = rsub(text, "#gn", "#n") end text = rsub(text, "g(" .. separator_c .. "*n)", "ŋ%1")

-- gʒ > d͡ʒ -- FIXME: We need to investigate if it operates across words text = rsub(text, "g(" .. sylsep_c .. "*)ʒ", "%1ʤ") -- sʃ -> ʃ (desxifrar), zʒ -> ʒ (disjuntor) if dialect ~= "val" then text = rsub(text, "s(" .. separator_c .. "*ʃ)", "%1") text = rsub(text, "z(" .. separator_c .. "*ʒ)", "%1") end

-- Gemination of , 

if dialect ~= "val" then -- bl -> bbl, gl -> ggl after the stress when following a vowel; to avoid this, use  or . -- This must follow v > b above. To force a hard ungeminated [b] or [g], use <_b> or <_g>. text = rsub(text, "(" .. stress_c .. ")(" .. sylsep_c .. ")([bg])l", "%1%3%2%3l") else -- Valencian; undo manually written 'bbl', 'ggl' in words like poblar, reglament text = rsub(text, "([bg])(" .. sylsep_c .. ")%1l", "%2%1l") end

-- Lenition of voiced stops

-- In Central Catalan, b/d/g become fricatives (actually approximants, like in Spanish) in the onset following a	-- vowel and (except for ) after  and  (cf. GDLC cabellblanc [kəβɛ̀ʎβláŋ]). This also happens across -- word boundaries but doesn't happen after stops, nor in Central Catalan after [r], [ɾ] or [z] (and hence probably	-- not after [ʒ] either, although I can't find any examples in GDLC). --	-- In Valencian,  doesn't lenite (at least formally?), but  and  do lenite after [r], [ɾ] or [z]. --	-- Balearic is like Valencian in not leniting , and probably like Central Catalan otherwise. local lenite_bdg = {["b"] = "β", ["d"] = "ð", ["g"] = "ɣ"} if dialect == "cen" then text = rsub(text, "([" .. vowel_l .. "jwlʎv]" .. separator_c .. "*[.#]" .. separator_c .. "*)([bdg])",			function(before, bdg) return before .. lenite_bdg[bdg] end) elseif dialect == "val" then text = rsub(text, "([" .. vowel_l .. "jwlʎvrɾzʣ]" .. separator_c .. "*[.#]" .. separator_c .. "*)([dg])",			function(before, dg) return before .. lenite_bdg[dg] end) else verify(dialect == "bal", ("Unrecognized dialect '%s'"):format(dialect)) text = rsub(text, "([" .. vowel_l .. "jwlʎv]" .. separator_c .. "*[.#]" .. separator_c .. "*)([dg])",			function(before, dg) return before .. lenite_bdg[dg] end) end

-- Vowel reduction

-- Reduction of unstressed a,e in Central and Balearic (Eastern Catalan). if dialect ~= "val" then -- The following rules seem to apply, based on the old code: -- (1) Stressed a and e are never reduced. -- (2) Unstressed e directly following ə is not reduced. -- (3) Unstressed e directly before written <a> or before /ɔ/ is not reduced. -- (4) Written <ee> when both vowels precede the primary stress is reduced to [əə]. (This rule preempts #2.) -- (5) Written <ee> when both vowels follow the primary stress isn't reduced at all. -- Rule #2 in particular seems to require that we proceed left to right, which is how the old code was -- implemented. -- FIXME: These rules seem overly complex and may produce incorrect results in some circumstances. local words = rsplit(text, " ") for j, word in ipairs(words) do			local chars = split_into_chars(word) -- See above where voicing assimilation is handled. The overall respelling begins and ends in #, which we -- can ignore. We need to look ahead three chars in some circumstances, but in all those circumstances we -- shoudn't run off the end (and have assertions to check this). local seen_primary_stress = false for i = 2, #chars - 1 do				local this = chars[i] if chars[i] == AC then seen_primary_stress = true end if (this ~= "a" and this ~= "e") or rfind(chars[i + 1], stress_c) then -- Not a/e, or a stressed vowel; continue else local reduction = true local prev, prev_stress, nxt, nxt_stress if not rfind(chars[i - 1], sylsep_c) then prev = "" else prev = chars[i - 2] -- this should be non-nil as chars[i - 1] is a syllable separator (not #) verify(prev, "Missing # at word boundary") prev_stress = "" if rfind(prev, stress_c) then prev_stress = prev prev = chars[i - 3] -- As above; chars[i - 2] is a stress indicator (not #). 
verify(prev, "Missing # at word boundary") end end if not rfind(chars[i + 1], sylsep_c) then nxt = "" -- leave nxt at nil else nxt = chars[i + 2] nxt_stress = chars[i + 3] -- chars[i + 1] is a syllable separator, so chars[i + 2] should not be a word boundary, so -- chars[i + 3] should exist. verify(nxt and nxt_stress, "Syllable separator at word boundary or missing # at word boundary") end if this == "e" and rfind(prev, "ə") then reduction = false elseif this == "e" and rfind(nxt, "[aɔ]") then reduction = false elseif this == "e" and nxt == "e" and not rfind(nxt_stress, AC) then -- FIXME: Check specifically for AC duplicates previous logic but is probably wrong or unnecessary. if not seen_primary_stress then chars[i + 2] = "ə" else reduction = false end end if reduction then chars[i] = "ə" end end end words[j] = table.concat(chars) end text = table.concat(words, " ") end

if dialect == "cen" then -- Reduction of unstressed o (not before w) text = rsub(text, "o([^" .. stress_l .. "w])", "u%1") elseif dialect == "bal" then -- Reduction of unstressed o per vowel harmony: unstressed /o/ -> /u/ directly before stressed /i/ or /u/; -- as a Lua pattern, o can be followed only by consonants and/or syllable separators (no vowels, stress marks		-- or word separators). text = rsub(text, "o([^" .. vowel_l .. stress_l .. wordsep_l .. "]*[iu]" .. stress_c .. ")", "u%1") end

-- Final losses. text = rsub(text, "j(ʧs?#)", "%1") -- boigs /bɔt͡ʃ/ text = rsub(text, "([ʃʧs])s#", "%1#") -- homophone plurals -xs, -igs, -çs

if dialect ~= "val" then -- Remove j before palatal obstruents text = rsub(text, "j(" .. sylsep_c .. "*[ʃʒʧʤ])", "%1") else -- Valencian -- Fortition of palatal fricatives text = rsub(text, "ʒ", "ʤ") text = rsub(text, "(i" .. stress_c .. "*" .. sylsep_c .. ")ʣ", "%1z") end

if dialect ~= "cen" then -- No palatal gemination ʎʎ > ll or ʎ, in Valencian and Balearic. -- FIXME: These conditions seem to be targeting specific words and should probably be fixed using respelling -- instead. text = rsub(text, "([bpw]a" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l") text = rsub(text, "([mv]e" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l") text = rsub(text, "(ti" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l") text = rsub(text, "(m[oɔ]" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l") text = rsub(text, "(u" .. stress_c .. "*)ʎ(" .. sylsep_c .. "*)ʎ", "%1l%2l") text = rsub(text, "ʎ(" .. sylsep_c .. "*ʎ)", "%1") end

-- Convert pseudo-symbols to real ones.

-- Convert g to IPA ɡ. text = rsub(text, "g", "ɡ")

-- Convert pseudo-afficate symbols to full affricates. local full_affricates = { ["ʦ"] = "t͡s", ["ʣ"] = "d͡z", ["ʧ"] = "t͡ʃ", ["ʤ"] = "d͡ʒ" } text = rsub(text, "([ʦʣʧʤ])", full_affricates)

-- Generate IPA stress marks.

-- Convert acute and grave to IPA stress marks. text = rsub(text, AC, "ˈ") text = rsub(text, GR, "ˌ") -- Move IPA stress marks to the beginning of the syllable. text = rsub_repeatedly(text, "([#.])([^#.]*)(" .. ipa_stress_c .. ")", "%1%3%2")	-- Suppress syllable divider before IPA stress indicator. text = rsub(text, "%.(#?" .. ipa_stress_c .. ")", "%1")	-- Make all primary stresses but the last one in a given word be secondary. May be fed by the first rule above. -- FIXME: Currently this is handled earlier, but we might want to move it here, as is done in Module:pt-pronunc. -- text = rsub_repeatedly(text, "ˈ([^ ]+)ˈ", "ˌ%1ˈ") -- Make primary stresses in prefixes become secondary. (FIXME: Handled earlier now.) -- text = rsub_repeatedly(text, "ˈ([^#]*#" .. PREFIX_MARKER .. ")", "ˌ%1")

-- Remove # symbols at word/text boundaries, as well as _ (which forces separate interpretation), pseudo-consonant -- markers (at edges of some prefixes/suffixes), and prefix markers, and recompose. text = rsub(text, "[#_" .. PSEUDOCONS .. "]", "") text = mw.ustring.toNFC(text)

return text end

-- Convert the orthographic syllables of a single word into syllables holding (pre-postprocessing) IPA.
--
-- `syllables` is a list of syllable objects with fields `onset`, `vowel`, `coda` and `stressed`; the list itself
-- also carries the fields `stress` (index of the stressed syllable, if any), `is_prefix` and `is_suffix`.
-- `suffix_syllables` is a list of already-converted syllables that is appended unchanged to the result. `dialect`
-- selects the vowel mapping ("cen", "bal", anything else is treated as Valencian). `pos` is accepted for interface
-- compatibility but not read here. `orig_word` is used only in error messages.
--
-- Returns a new list of syllable objects; throws an error if the stressed vowel or a final -r is ambiguous.
-- NOTE: the coda of the last syllable of the *input* list may be overwritten with TEMP_PAREN_R before copying.
local function preprocess_word(syllables, suffix_syllables, dialect, pos, orig_word)
	-- Stressed vowel is ambiguous
	if syllables.stress then
		local stressed_vowel = syllables[syllables.stress].vowel
		if rfind(stressed_vowel, "[eo]") then
			local marks = {["e"] = {AC, GR, CFLEX, DIA}, ["o"] = {AC, GR, CFLEX}}
			local marked_vowels = {}
			for _, mark in ipairs(marks[stressed_vowel]) do
				table.insert(marked_vowels, stressed_vowel .. mark)
			end

			error(("In respelling '%s', the stressed vowel '%s' is ambiguous. Please mark it with an acute, " ..
				"grave, or combined accent: %s."):format(orig_word, stressed_vowel,
				m_table.serialCommaJoin(marked_vowels, {dontTag = true, conj = "or"})))
		end
	end

	-- Final -r is ambiguous in many cases.
	local final = syllables[#syllables]
	-- Stressed final r after a or i in non-monosyllables is treated as (r), i.e. verbal infinitives are assumed (NOTE:
	-- not always the case, e.g. there are many adjectives and nouns in -ar that should be marked as '(rr)', and
	-- several loanword nouns in -ir that should be marked as 'rr'). Likewise for stressed final r or rs after é in
	-- non-monosyllables (which are usually adjectives or nouns with the -er ending, but may be verbal infinitives,
	-- which should be marked as 'ê(r)'). That is, it disappears other than in Valencian. All other final r and final
	-- rs are considered ambiguous and need to be rewritten using rr, (rr) or (r).
	if #syllables > 1 and final.stressed then
		if final.coda == "r" and rfind(final.vowel, "[aàiíé]") or final.coda == "rs" and final.vowel == "é" or
			final.vowel == "ó" and rfind(final.coda, "^rs?$") and rfind(final.onset, "[stdç]") then
			final.coda = TEMP_PAREN_R
		end
	end
	if rfind(final.coda, "^rs?$") or rfind(final.coda, "[^r]rs?$") then
		error(("In respelling '%s', final -r by itself or in -rs is ambiguous except in the verbal endings -ar or " ..
			"-ir, in the nominal or adjectival endings -er(s) and -[dtsç]or(s). In all other cases it needs to be " ..
			"rewritten using one of 'rr' (pronounced everywhere), '(rr)' (pronounced everywhere but Balearic) or " ..
			"'(r)' (pronounced only in Valencian). Note that adjectives in -ar usually need rewriting using '(rr)'; " ..
			"nouns in -ar referring to places should be rewritten using '(r)'; and loanword nouns in -ir usually " ..
			"need rewriting using 'rr'."):format(orig_word))
	end

	-- Work on a copy so that the context checks below can still consult the orthographic syllables.
	local syllables_IPA = {stress = syllables.stress, is_prefix = syllables.is_prefix, is_suffix = syllables.is_suffix}

	for key, val in ipairs(syllables) do
		syllables_IPA[key] = {onset = val.onset, vowel = val.vowel, coda = val.coda, stressed = val.stressed}
	end

	-- Replace letters with IPA equivalents
	for i, syll in ipairs(syllables_IPA) do
		-- Voicing of s: a lone onset <s> is voiced when the preceding (orthographic) syllable has an empty coda or
		-- a coda consisting only of <i> or <u>.
		if syll.onset == "s" and i > 1 and rfind(syllables[i - 1].coda, "^[iu]?$") then
			syll.onset = "z"
		end

		-- Soft <c>/<g>/<tg> and silent <u> in <qu>/<gu> before a front vowel.
		if rfind(syll.vowel, "^[eèéêëií]$") then
			syll.onset = rsub(syll.onset, "tg$", "ʤ")
			syll.onset = rsub(syll.onset, "[cg]$", {["c"] = "s", ["g"] = "ʒ"})
			syll.onset = rsub(syll.onset, "[qg]u$", {["qu"] = "k", ["gu"] = "g"})
		end

		syll.coda = rsub(syll.coda, "igs?$", "iʤ")

		syll.onset = replace_context_free(syll.onset)
		syll.coda = replace_context_free(syll.coda)

		-- Dialect-specific vowel mapping first, then the mapping shared by all dialects.
		syll.vowel = rsub(syll.vowel, ".",
			dialect == "cen" and IPA_vowels_central or
			dialect == "bal" and IPA_vowels_balearic or
			IPA_vowels_valencian
		)
		syll.vowel = rsub(syll.vowel, ".", IPA_vowels)
	end

	for _, suffix_syl in ipairs(suffix_syllables) do
		table.insert(syllables_IPA, suffix_syl)
	end

	return syllables_IPA
end

-- Given a single substitution spec, `to`, figure out the corresponding value of `from` used in a complete
-- substitution spec. `pagename` is the name of the page, either the actual one or taken from the `pagename` param.
-- `whole_word`, if set, indicates that the match must be to a whole word (it was preceded by ~).
--
-- Returns the substring of `pagename` that `to` is taken to replace. Throws an error if no such substring can be
-- found, or if the substring found is identical to `to` (so the replacement would be a no-op).
local function convert_single_substitution_to_original(to, pagename, whole_word)
	-- Replace specially-handled characters with a class matching the character and possible replacements.
	local escaped_from = to
	-- Handling of '(rr)', '(r)', '.' and '-' needs to be done before calling pattern_escape; otherwise they will be
	-- escaped.
	escaped_from = escaped_from:gsub("%(rr%)", "r")
	escaped_from = escaped_from:gsub("%(r%)", "r")
	escaped_from = escaped_from:gsub("ks", "x"):gsub("Ks", "X"):gsub("gz", "x"):gsub("([bg])%1l", "%1l"):gsub("[._]", "")
	escaped_from = require(patut_module).pattern_escape(escaped_from)
	escaped_from = escaped_from:gsub("rr", "rr?")
	escaped_from = escaped_from:gsub("ss", "ss?")
	escaped_from = escaped_from:gsub("ʃ", "[xX]")
	-- This is tricky, because we already passed `escaped_from` through pattern_escape causing a hyphen to get a
	-- % sign before it, and have to double up the percent signs to match and replace a literal %.
	escaped_from = escaped_from:gsub("%%%-", "%%-?")
	-- Tie sign (‿) should match against space, hyphen or nothing in the original.
	escaped_from = escaped_from:gsub("‿", "[ %%-]?")
	-- An accented vowel in the spec may correspond to either the same accented vowel or its plain counterpart.
	escaped_from = rsub(escaped_from, "[" .. written_accented_vowel_l .. "]",
		function(v) return "[" .. v .. written_accented_to_plain_vowel[v] .. "]" end)
	escaped_from = escaped_from:gsub(DOTOVER, DOTOVER .. "?"):gsub(LINEUNDER, LINEUNDER .. "?")
	-- Wrap in a capture so that the frontier patterns added below are not part of the returned match.
	escaped_from = "(" .. escaped_from .. ")"
	if whole_word then
		escaped_from = "%f[%a]" .. escaped_from .. "%f[%A]"
	end
	local match = rmatch(pagename, escaped_from)
	if match then
		if match == to then
			error(("Single substitution spec '%s' found in pagename '%s', replacement would have no effect"):
				format(to, pagename))
		end
		return match
	end
	error(("Single substitution spec '%s' couldn't be matched to pagename '%s'"):format(to, pagename))
end

-- Apply a bracketed substitution spec such as '[ks]' or '[val:vol,ê,ks]' to `pagename` and return the resulting
-- respelling.
--
-- `respelling` is the spec including its surrounding square brackets. `pos` is the part of speech, if known (used to
-- decide whether a final -ment is split off before applying a mid vowel hint). `allow_mid_vowel_hints` must be
-- truthy for a mid vowel hint to be accepted. `parse_err` is called with a message on user errors.
local function apply_substitution_spec(respelling, pagename, pos, allow_mid_vowel_hints, parse_err)
	local subs = split_on_comma(rmatch(respelling, "^%[(.*)%]$"))
	-- From here on `respelling` is the text being rewritten, starting from the pagename.
	respelling = pagename
	local mid_vowel_hint
	local regular_subs = {}
	-- Separate the (at most one) mid vowel hint from the ordinary substitutions.
	for _, sub in ipairs(subs) do
		if rfind(sub, "^" .. export.mid_vowel_hint_c .. "$") then
			if mid_vowel_hint then
				parse_err(("Specified mid vowel hint twice, '%s' and '%s'"):format(
					mid_vowel_hint, sub))
			end
			mid_vowel_hint = sub
		else
			table.insert(regular_subs, sub)
		end
	end
	if mid_vowel_hint then
		if not allow_mid_vowel_hints then
			parse_err(("Mid vowel hint '%s' not allowed when apply one substitution spec to multiple words"):format(
				mid_vowel_hint))
		end
		local suffix = ""
		-- FIXME: This duplicates logic in to_IPA.
		if not pos or pos == "adverb" then
			local part_before_ment, ment = rmatch(respelling, "^(.*)(m[eé]nt)$")
			if part_before_ment and (pos == "adverb" or not rfind(part_before_ment, "[iï]$") and
				rfind(part_before_ment, V .. ".*" .. V)) then
				suffix = ment
				respelling = part_before_ment
			end
		end
		local syllables = split_syllables(respelling, "stress prefixes", "may be uppercase")
		local stressed_vowel = syllables[syllables.stress].vowel
		if stressed_vowel == mid_vowel_hint then
			-- do nothing
		elseif rfind(mid_vowel_hint, "[èéêë]") and rfind(stressed_vowel, "[eEèÈ]") or
			rfind(mid_vowel_hint, "[òóô]") and rfind(stressed_vowel, "[oO]") then
			syllables[syllables.stress].vowel = mid_vowel_hint
		else
			parse_err(("Stressed vowel '%s' not compatible with mid vowel hint '%s'"):format(
				stressed_vowel, mid_vowel_hint))
		end
		respelling = reconstitute_word_from_syllables(syllables) .. suffix
	end

	for _, sub in ipairs(regular_subs) do
		local from, escaped_from, to, escaped_to, whole_word
		if rfind(sub, "^~") then
			-- whole-word match
			sub = rmatch(sub, "^~(.*)$")
			whole_word = true
		end
		if sub:find(":") then
			-- Explicit FROM:TO form.
			from, to = rmatch(sub, "^(.-):(.*)$")
		else
			-- Only TO given; infer FROM by matching against the pagename.
			to = sub
			from = convert_single_substitution_to_original(to, pagename, whole_word)
		end
		if from then
			local patut = require(patut_module)
			escaped_from = patut.pattern_escape(from)
			if whole_word then
				escaped_from = "%f[%a]" .. escaped_from .. "%f[%A]"
			end
			escaped_to = patut.replacement_escape(to)
			local subbed_respelling, nsubs = rsubn(respelling, escaped_from, escaped_to)
			-- Exactly one occurrence must be replaced; anything else is reported as a user error.
			if nsubs == 0 then
				parse_err(("Substitution spec %s -> %s didn't match processed pagename '%s'"):format(
					from, to, respelling))
			elseif nsubs > 1 then
				parse_err(("Substitution spec %s -> %s matched multiple substrings in processed pagename '%s', add " ..
					"more context"):format(from, to, respelling))
			else
				respelling = subbed_respelling
			end
		end
	end

	return respelling
end

-- Map from each accepted part-of-speech prefix (abbreviated or spelled out) to its canonical name.
local canonicalize_pos = {
	n = "noun",
	noun = "noun",
	v = "verb",
	vb = "verb",
	verb = "verb",
	a = "adjective",
	adj = "adjective",
	adjective = "adjective",
	av = "adverb",
	adv = "adverb",
	adverb = "adverb",
	o = "other",
	other = "other",
}

-- Split a leading 'POS/' prefix (lowercase ASCII letters followed by a slash) off `respelling`.
--
-- Returns two values: the canonical part of speech (nil if there is no prefix, or if the prefix is unrecognized and
-- `parse_err` returns instead of throwing) and the remaining respelling. An empty remainder is returned as "+".
-- `parse_err` is called with a message listing the accepted prefixes when the prefix is not recognized.
local function parse_off_pos(respelling, parse_err)
	local prefix, remainder = respelling:match("^([a-z]+)/(.*)$")
	if not prefix then
		-- No part-of-speech prefix; hand the respelling back untouched.
		return nil, respelling
	end

	local canonical = canonicalize_pos[prefix]
	if not canonical then
		local recognized = {}
		for abbrev in pairs(canonicalize_pos) do
			recognized[#recognized + 1] = abbrev
		end
		-- pairs() order is unspecified, so sort for a stable message.
		table.sort(recognized)
		parse_err(("Unrecognized part of speech '%s', should be one of %s"):format(prefix, table.concat(recognized, ", ")))
	end

	if remainder == "" then
		remainder = "+"
	end
	return canonical, remainder
end

-- Parse a respelling given by the user, allowing for '+' for pagename, mid vowel hints in place of a respelling and
-- substitution specs like '[ks]' or [val:vol,ê,ks]. In general, return an object {words = {WORD, WORD, ...}} where
-- WORD is of the form {term = PARSED_RESPELLING, pos = POS}. Other fields are set in special cases: If a raw respelling
-- was seen, the fields `raw_phonemic` and/or `raw_phonetic` are set; if '?' is seen, the field `unknown` is set; and if
-- '-' is seen, the field `omitted` is set.
--
-- `pagename` is the page name the respelling is resolved against; `parse_err` is called with a message on user
-- errors.
local function parse_respelling(respelling, pagename, parse_err)
	if respelling == "?" then
		return {
			unknown = true
		}
	end
	if respelling == "-" then
		return {
			omitted = true
		}
	end
	-- Raw IPA: "/.../", "/.../ [...]", or (only with the "raw:" prefix) "[...]" alone, since a bare "[...]" is
	-- otherwise a substitution spec.
	local saw_raw
	local remaining_respelling = respelling:match("^raw:(.*)$")
	if remaining_respelling then
		saw_raw = true
		respelling = remaining_respelling
	end
	local raw_phonemic, raw_phonetic = respelling:match("^/(.*)/ %[(.*)%]$")
	if not raw_phonemic then
		raw_phonemic = respelling:match("^/(.*)/$")
	end
	if not raw_phonemic and saw_raw then
		raw_phonetic = respelling:match("^%[(.*)%]$")
	end
	if raw_phonemic or raw_phonetic then
		return {
			raw_phonemic = raw_phonemic,
			raw_phonetic = raw_phonetic,
		}
	end

	pagename = decompose_respelling(pagename)
	respelling = decompose_respelling(respelling)

	-- Canonicalize `respelling` and split it on spaces into word objects; if `parse_pos` is set, a 'POS/' prefix
	-- is parsed off each word.
	local function split_respelling_into_words(respelling, parse_pos)
		respelling = canon_respelling(respelling)
		local word_objs = {}
		local respelling_words = rsplit(respelling, " ")
		for _, word in ipairs(respelling_words) do
			local pos
			if parse_pos then
				pos, word = parse_off_pos(word, parse_err)
			end
			table.insert(word_objs, {term = word, pos = pos})
		end
		return {words = word_objs}
	end

	-- Resolve a single respelling word ('+', a mid vowel hint, a substitution spec or a literal respelling,
	-- optionally prefixed by 'POS/') against the corresponding pagename word.
	local function substitute_respelling_word(respelling_word, pagename_word)
		local pos
		pos, respelling_word = parse_off_pos(respelling_word, parse_err)
		if respelling_word == "+" then
			respelling_word = pagename_word
		else
			-- A bare mid vowel hint is shorthand for a substitution spec containing just that hint.
			if rfind(respelling_word, "^" .. export.mid_vowel_hint_c .. "$") then
				respelling_word = "[" .. respelling_word .. "]"
			end
			if rfind(respelling_word, "^%[.*%]$") then
				respelling_word = apply_substitution_spec(respelling_word, pagename_word, pos,
					"allow mid vowel hint", parse_err)
			end
		end
		return {term = respelling_word, pos = pos}
	end

	-- At this point, if there are multiple words in the pagename, there are three syntaxes allowed: all-at-once,
	-- replacement or word-by-word. All-at-once syntax involves either a + representing the entire pagename, or a
	-- substitution spec that applies to all words in the pagename. This syntax cannot have a prefixed part of speech
	-- because it wouldn't be clear which word to apply the part of speech to. Replacement syntax simply spells out the
	-- respelling without any substitution specs or +'s (but possibly with parts of speech prefixed to individual
	-- words), and can have a different number of words than the pagename (essentially, the pagename is disregarded).
	-- Word-by-word syntax involves a combination of respelled words, per-word substitution specs and/or a +
	-- representing an individual word, and must have the same number of words as the pagename so that substitution
	-- specs and +'s can be lined up with words in the pagename. In all cases, the return value is in the same format;
	-- see comment at top of function.
	if pagename:find(" ") or respelling:find(" ") then
		if respelling == "+" then
			return split_respelling_into_words(pagename)
		elseif rfind(respelling, "^%[.*%]$") then
			-- all-at-once syntax with substitution spec
			return split_respelling_into_words(apply_substitution_spec(respelling, pagename, nil, false, parse_err))
		elseif rfind(respelling, "^([a-z]+)/$") or rfind(respelling, "^([a-z]+)/%[[^%[%]]*%]$") then
			-- attempt to include a part of speech in all-at-once syntax ('POS/' alone, which stands for 'POS/+',
			-- or 'POS/' followed by a single bracketed substitution spec)
			parse_err(("Part of speech not allowed when pagename is multiword and all-at-once syntax is used in " ..
				"the respelling, but saw '%s'"):format(respelling))
		elseif rfind(respelling, "^" .. export.mid_vowel_hint_c .. "$") then
			-- attempt to use a mid-vowel hint in all-at-once syntax
			parse_err(("Single mid-vowel hint not allowed when pagename is multiword because it's not clear which " ..
				"word to apply it to, but saw '%s'"):format(respelling))
		elseif rfind(respelling, "[+%[%]]") or rfind(respelling, "^" .. export.mid_vowel_hint_c .. " ") or
			rfind(respelling, " " .. export.mid_vowel_hint_c .. " ") or
			rfind(respelling, " " .. export.mid_vowel_hint_c .. "$") then
			-- word-by-word syntax
			-- Look for a bracketed substitution spec that contains a space; such a spec would be torn apart by
			-- the split on spaces below.
			local sub_with_space = rmatch(respelling, "%[[^%[%]]* [^%[%]]*%]")
			if sub_with_space then
				parse_err(("When using word-by-word syntax with a multiword pagename, saw substitution spec '%s' " ..
					"with spaces, which is not allowed because it must match a single word"):format(sub_with_space))
			end
			pagename = canon_respelling(pagename)
			respelling = canon_respelling(respelling)
			local pagename_words = rsplit(pagename, " ")
			local respelling_words = rsplit(respelling, " ")
			if #pagename_words ~= #respelling_words then
				parse_err(("When using word-by-word syntax with a multiword pagename, saw %s words in pagename but " ..
					"%s word%s in respelling; they need to match"):format(#pagename_words, #respelling_words,
					#respelling_words > 1 and "s" or ""))
			end
			local word_objs = {}
			for i = 1, #pagename_words do
				table.insert(word_objs, substitute_respelling_word(respelling_words[i], pagename_words[i]))
			end
			return {words = word_objs}
		else
			-- replacement syntax; pagename ignored
			return split_respelling_into_words(respelling, "parse pos")
		end
	else
		local word_obj = substitute_respelling_word(respelling, pagename)
		word_obj.term = canon_respelling(word_obj.term)
		return {words = {word_obj}}
	end
end

-- Parse a list of comma-split runs containing one or more respellings, i.e. after calling parse_balanced_segment_run
-- or the like followed by split_alternating_runs or the like (see Module:parse utilities). `pagename` is the
-- pagename, for use when a respelling is just '+', a mid-vowel hint like 'ê' or a substitution spec like '[ks]'.
-- `original_input` is the raw input and `input_param` the name of the param containing the raw input; both are used
-- only in error messages. Return an object specifying the respellings, currently with a single field 'terms' (this
-- format is used in case other outer properties exist in the future), where 'terms' is a list of term objects. Each
-- term object contains either a field `term` with the respelling and an optional part of speech `pos`, or fields
-- `raw_phonemic` and/or `raw_phonetic` (if the user specified raw IPA using "/.../" or "/.../ [...]" or "raw:[...]"),
-- `unknown` (if the user specified "?"), or `omitted` (if the user specified "-"). In addition, there may be fields
-- `q`, `qq`, `a`, `aa`, and/or `ref` corresponding to inline modifiers. Each such field is a list; all are lists of
-- strings except for `ref`, which is a list of objects as returned by parse_references in Module:references.
function export.parse_comma_separated_groups(comma_separated_groups, pagename, original_input, input_param)
	-- If `pagename` is literally `true`, each respelling is used as its own pagename.
	local function generate_obj(respelling, parse_err)
		return parse_respelling(respelling, pagename == true and respelling or pagename, parse_err)
	end
	local put = require(parse_utilities_module)

	local outer_container = {terms = {}}
	for _, group in ipairs(comma_separated_groups) do
		-- Rejoin runs that don't involve <...>.
		-- Even-numbered elements are the delimited segments; one that is not <...> (i.e. a [...] segment) is
		-- merged with the text on either side of it, after which the same index is examined again.
		local j = 2
		while j <= #group do
			if not group[j]:find("^<.*>$") then
				group[j - 1] = group[j - 1] .. group[j] .. group[j + 1]
				table.remove(group, j)
				table.remove(group, j)
			else
				j = j + 2
			end
		end

		local param_mods = {
			-- pre = { overall = true },
			-- post = { overall = true },
			ref = { store = "insert", convert = function(arg, parse_err)
				return require("Module:references").parse_references(arg)
			end },
			q = { store = "insert" },
			qq = { store = "insert" },
			a = { store = "insert" },
			aa = { store = "insert" },
		}

		table.insert(outer_container.terms, put.parse_inline_modifiers_from_segments {
			group = group,
			arg = original_input,
			props = {
				paramname = input_param,
				param_mods = param_mods,
				generate_obj = generate_obj,
				splitchar = ",",
				outer_container = outer_container,
			},
		})
	end

	return outer_container
end

-- Generate the pronunciation of `words` (a list of word objects representing respellings, each of which is an object
-- of the form {term = RESPELLING, pos = PART_OF_SPEECH}) in `dialect` ("cen", "bal" or "val").
--
-- Returns the result of postprocess_general on the joined, '#'-delimited text. Throws an error if a respelling
-- contains one of the accented characters á ì ù (or their uppercase equivalents).
local function to_IPA(words, dialect)
	local pronuns = {}

	for _, wordobj in ipairs(words) do
		if rfind(wordobj.term, "[áìùÁÌÙ]") then
			error(("Invalid accented character in respelling '%s'; use accented à í ú, not the reversed versions"
				):format(wordobj.term))
		end
	end
	words = handle_unstressed_words(words)
	for _, wordobj in ipairs(words) do
		local word = wordobj.term
		local pos = wordobj.pos
		local suffix_syllables = {}
		local orig_word = word

		word = ulower(word)
		-- Split off adverbial -ment so that the base is processed like an independent word and the suffix is
		-- appended afterwards as a pre-built syllable.
		if not pos or pos == "adverb" then
			local word_before_ment, ment = rmatch(word, "^(.*)(m[eé]nt)$")
			if word_before_ment and (pos == "adverb" or not rfind(word_before_ment, "[iï]$") and
				rfind(word_before_ment, V .. ".*" .. V)) then
				-- NOTE(review): this table literal was missing from the page text (the statement read
				-- `suffix_syllables = pos = "adjective"`, which does not parse). It is reconstructed from the
				-- fields that preprocess_word and the combining loop below read from a syllable (onset, vowel,
				-- coda, stressed); confirm the vowel quality against the upstream module.
				suffix_syllables = {{onset = "m", vowel = "e", coda = "nt", stressed = true}}
				pos = "adjective"
				word = word_before_ment
			end
		end

		word = word_fixes(word, dialect)
		local syllables = split_syllables(word)
		syllables = preprocess_word(syllables, suffix_syllables, dialect, pos, orig_word)
		-- Combine syllables.
		local combined = {}
		local has_ment = #suffix_syllables > 0
		for i, syll in ipairs(syllables) do
			-- With a split-off -ment, the primary stress goes on the last syllable (the suffix); otherwise on the
			-- stressed syllable unless the word is a prefix. Any other stressed syllable gets secondary stress.
			local ac = (i == syllables.stress and not syllables.is_prefix and not has_ment or
				has_ment and i == #syllables) and AC or -- primary stress
				syllables[i].stressed and GR or -- secondary stress
				""
			table.insert(combined, syll.onset .. syll.vowel .. ac .. syll.coda)
		end
		table.insert(pronuns, table.concat(combined, "."))
	end

	-- Put double ## at utterance boundaries (beginning/end of string) and at foot boundaries (marked with |).
	-- Note that if the string without pound signs is 'foo bar baz | bat quux', the final string will be
	-- '##foo# #bar# #baz## #|# ##bat# #quux##'.
	local text = "##" .. table.concat(pronuns, " ") .. "##"
	text = rsub(text, " | ", "# | #")
	text = rsub(text, " ", "# #")
	return postprocess_general(text, dialect)
end

-- Generate the phonemic and phonetic pronunciations of the respellings in `parsed_respellings`, which is a table whose
-- keys are dialect identifiers (e.g. "cen" for Central Catalan, "val" for Valencian) and whose values are objects of
-- the format returned by parse_comma_separated_groups (see comment above that function). This destructively modifies
-- `parsed_respellings`, adding fields `phonemic` and `phonetic` containing the generated pronunciations and removing
-- the input fields used to generate those output fields. (FIXME: Currently only phonetic pronunciation is generated.)
-- Nothing is returned.
function export.generate_phonemic_phonetic(parsed_respellings)
	-- Convert each canonicalized respelling to phonemic/phonetic IPA.
	for dialect, respelling_spec in pairs(parsed_respellings) do
		for _, termobj in ipairs(respelling_spec.terms) do
			if termobj.unknown or termobj.omitted then
				-- leave alone, will handle later
			elseif termobj.raw_phonemic or termobj.raw_phonetic then
				-- Raw IPA supplied by the user is passed through unchanged.
				termobj.phonemic = termobj.raw_phonemic
				termobj.phonetic = termobj.raw_phonetic
				-- set to nil so by-value comparisons respect only the resulting phonemic/phonetic and qualifiers
				termobj.raw_phonemic = nil
				termobj.raw_phonetic = nil
			else
				termobj.phonetic = to_IPA(termobj.words, dialect)
				-- set to nil so by-value comparisons respect only the resulting phonemic/phonetic and qualifiers
				termobj.words = nil
			end
		end
	end
end

-- Group pronunciations by dialect, i.e. grouping pronunciations that are identical in every way (including both the
-- pronunciation(s) and any qualifiers and other inline modifiers). `parsed_respellings` contains the output from
-- generate_phonemic_phonetic, and the return value is a list of grouped pronunciations, where each object in the list
-- contains fields `dialects` (a list of dialects containing the pronunciations) and `pronuns` (a list of
-- pronunciations, where each pronunciation is specified by an object containing fields `phonemic` and `phonetic`, as
-- generated by generate_phonemic_phonetic, along with any inline modifier fields `q`, `qq`, `a`, `aa` and/or `ref`).
--
-- A dialect for which any term is marked `omitted` is left out entirely. Dialects are visited with pairs(), so the
-- order of the groups and of the dialects within a group is not specified.
function export.group_pronuns_by_dialect(parsed_respellings)
	local grouped_pronuns = {}
	for dialect, pronun_spec in pairs(parsed_respellings) do
		local saw_omitted = false
		for _, termobj in ipairs(pronun_spec.terms) do
			if termobj.omitted then
				saw_omitted = true
				break
			end
		end
		if not saw_omitted then
			-- Attach this dialect to an existing group with deep-equal pronunciations, if there is one.
			local saw_existing = false
			for _, group in ipairs(grouped_pronuns) do
				if m_table.deepEquals(group.pronuns, pronun_spec.terms) then
					table.insert(group.dialects, dialect)
					saw_existing = true
					break
				end
			end
			if not saw_existing then
				table.insert(grouped_pronuns, {dialects = {dialect}, pronuns = pronun_spec.terms})
			end
		end
	end
	return grouped_pronuns
end

-- Return the flat list of references attached to the pronunciation object `pronun`, or nil if there are none.
-- The inline modifier is parsed under the key `ref` (see `param_mods` in parse_comma_separated_groups), with one
-- entry appended per <ref:...> modifier; a `refs` field, if present, is used as-is. Entries of `ref` that are
-- themselves lists are flattened, so that the result is a single list of reference objects either way.
-- NOTE(review): presumably parse_references returns a list per modifier — confirm against Module:references.
local function get_pronun_refs(pronun)
	if pronun.refs then
		return pronun.refs
	end
	if not pronun.ref then
		return nil
	end
	local refs = {}
	for _, item in ipairs(pronun.ref) do
		if type(item) == "table" and item[1] ~= nil then
			for _, ref in ipairs(item) do
				table.insert(refs, ref)
			end
		else
			table.insert(refs, item)
		end
	end
	if refs[1] == nil then
		return nil
	end
	return refs
end

-- Format pronunciations grouped by dialect. `grouped_pronuns` contains the output of group_pronuns_by_dialect.
-- This destructively modifies `grouped_pronuns`, adding a field 'formatted' to the first-level values of
-- `grouped_pronuns` containing the formatted pronunciation(s) for a given set of dialects.
function export.format_grouped_pronunciations(grouped_pronuns)
	for _, grouped_pronun_spec in pairs(grouped_pronuns) do
		local pronunciations = {}

		-- Loop through each pronunciation. For each one, add the phonemic and phonetic versions to `pronunciations`,
		-- for formatting by Module:IPA or raw (for use in Module:cy-headword).
		for j, pronun in ipairs(grouped_pronun_spec.pronuns) do
			-- Add dialect tags to left accent qualifiers if first one
			local as = pronun.a
			if j == 1 then
				if as then
					-- Copy so that the user-specified qualifier list is not modified.
					as = m_table.deepcopy(as)
				else
					as = {}
				end
				for _, dialect in ipairs(grouped_pronun_spec.dialects) do
					table.insert(as, export.dialects_to_names[dialect])
				end
			end

			-- Index of the first entry added to `pronunciations` for this pronunciation.
			local first_pronun = #pronunciations + 1

			if pronun.unknown then
				-- FIXME: This is a massive hack but it works for now.
				table.insert(pronunciations, {
					pron = "",
					pretext = "unknown"
				})
			else
				if not pronun.phonemic and not pronun.phonetic then
					error("Internal error: Saw neither phonemic nor phonetic pronunciation")
				end

				if pronun.phonemic then -- missing if 'raw:[...]' given
					local slash_pron = "/" .. pronun.phonemic .. "/"
					table.insert(pronunciations, {
						pron = slash_pron,
					})
				end

				if pronun.phonetic then -- missing if '/.../' given
					local bracket_pron = "[" .. pronun.phonetic .. "]"
					table.insert(pronunciations, {
						pron = bracket_pron,
					})
				end
			end

			-- Index of the last entry added for this pronunciation (same as `first_pronun` if only one was added).
			local last_pronun = #pronunciations

			-- Left-side modifiers go on the first entry, right-side modifiers and references on the last.
			if pronun.q then
				pronunciations[first_pronun].q = pronun.q
			end
			if as then
				pronunciations[first_pronun].a = as
			end
			if j > 1 then
				pronunciations[first_pronun].separator = ", "
			end
			if pronun.qq then
				pronunciations[last_pronun].qq = pronun.qq
			end
			if pronun.aa then
				pronunciations[last_pronun].aa = pronun.aa
			end
			local refs = get_pronun_refs(pronun)
			if refs then
				pronunciations[last_pronun].refs = refs
			end
			-- Phonemic and phonetic versions of the same pronunciation are separated by a space, not a comma.
			if first_pronun ~= last_pronun then
				pronunciations[last_pronun].separator = " "
			end
		end

		grouped_pronun_spec.formatted = m_IPA.format_IPA_full(lang, pronunciations, nil, "")
	end
end

-- Template entry point. Reads the parent frame's arguments, works out one respelling input per dialect,
-- parses the inputs, converts them to phonemic/phonetic IPA, groups the results by dialect and returns the
-- formatted text.
--
-- Recognized arguments:
--   1=         respelling(s) applied to every dialect in `export.dialects`
--   <group>=   respelling(s) for every dialect of a group in `export.dialect_groups`; overrides 1=
--   <dialect>= respelling(s) for a single dialect; overrides 1= and group settings
--   indent=    prefix for each output line; when given it is also prepended to the first line, otherwise
--              "* " is used between lines only
--   pagename=  page name to use in place of the current page's subpage text (for testing or documentation pages)
--
-- If none of 1=, group or dialect arguments are given, every dialect uses the input "+".
--
-- Returns the formatted pronunciation lines joined with newlines. Malformed respellings raise an error that
-- includes the offending parameter and its value.
function export.show(frame)
	local params = {
		[1] = {},
		indent = {},
		pagename = {}, -- for testing or documentation pages
	}

	-- NOTE(review): the definition of `export.dialect_groups` is not visible from here; an absent table is treated
	-- as "no dialect groups" so that `pairs` does not raise on nil. Confirm whether groups are meant to exist.
	local dialect_groups = export.dialect_groups or {}

	for _, dialect in ipairs(export.dialects) do
		params[dialect] = {}
	end
	for dialect_group, _ in pairs(dialect_groups) do
		params[dialect_group] = {}
	end

	-- Both getParent and getCurrentTitle are functions and must be called before their result is indexed.
	local args = require("Module:parameters").process(frame:getParent().args, params)
	local pagename = args.pagename or mw.title.getCurrentTitle().subpageText

	-- Set inputs
	local inputs = {}
	-- If 1= specified, do all dialects.
	if args[1] then
		for _, dialect in ipairs(export.dialects) do
			inputs[dialect] = {input = args[1], param = 1}
		end
	end
	-- Then do dialect groups.
	for dialect_group, group_dialects in pairs(dialect_groups) do
		if args[dialect_group] then
			for _, dialect in ipairs(group_dialects) do
				inputs[dialect] = {input = args[dialect_group], param = dialect_group}
			end
		end
	end
	-- Then do individual dialect settings.
	for _, dialect in ipairs(export.dialects) do
		if args[dialect] then
			inputs[dialect] = {input = args[dialect], param = dialect}
		end
	end
	-- If no inputs given, set all dialects based on current pagename.
	if not next(inputs) then
		for _, dialect in ipairs(export.dialects) do
			inputs[dialect] = {input = "+", param = "(pagename)"}
		end
	end

	-- Parse the arguments.
	local parsed_respellings = {}
	for dialect, inputspec in pairs(inputs) do
		local function generate_obj(respelling, parse_err)
			return parse_respelling(respelling, pagename, parse_err)
		end

		if inputspec.input:find("[<%[]") then
			local put = require(parse_utilities_module)
			-- Parse balanced segment runs involving either [...] (substitution notation) or <...> (inline modifiers).
			-- We do this because we don't want commas inside of square or angle brackets to count as respelling
			-- delimiters. However, we need to rejoin square-bracketed segments with nearby ones after splitting
			-- alternating runs on comma. For example, if we are given
			-- "a[x]a<q:learned>,[vol:vôl,ks]<q:nonstandard>", after calling
			-- parse_multi_delimiter_balanced_segment_run we get the following output:
			--
			-- {"a", "[x]", "a", "<q:learned>", ",", "[vol:vôl,ks]", "", "<q:nonstandard>", ""}
			--
			-- After calling split_alternating_runs, we get the following:
			--
			-- {{"a", "[x]", "a", "<q:learned>", ""}, {"", "[vol:vôl,ks]", "", "<q:nonstandard>", ""}}
			--
			-- We need to rejoin stuff on either side of the square-bracketed portions.
			local segments = put.parse_multi_delimiter_balanced_segment_run(inputspec.input, {{"<", ">"}, {"[", "]"}})

			local comma_separated_groups = put.split_alternating_runs_on_comma(segments)

			-- Process each value.
			local outer_container = export.parse_comma_separated_groups(comma_separated_groups, pagename,
				inputspec.input, inputspec.param)
			parsed_respellings[dialect] = outer_container
		else
			-- Simple case: no brackets, so a plain comma split yields the individual respellings.
			local termobjs = {}
			local function parse_err(msg)
				error(msg .. ": " .. inputspec.param .. "=" .. inputspec.input)
			end
			for _, term in ipairs(split_on_comma(inputspec.input)) do
				table.insert(termobjs, generate_obj(term, parse_err))
			end
			parsed_respellings[dialect] = {
				terms = termobjs,
			}
		end
	end

	-- Convert each canonicalized respelling to phonemic/phonetic IPA.
	export.generate_phonemic_phonetic(parsed_respellings)

	-- Group the results.
	local grouped_pronuns = export.group_pronuns_by_dialect(parsed_respellings)

	-- Format the results.
	export.format_grouped_pronunciations(grouped_pronuns)

	-- Concatenate formatted results.
	local formatted = {}
	for _, grouped_pronun_spec in ipairs(grouped_pronuns) do
		table.insert(formatted, grouped_pronun_spec.formatted)
	end
	local indent = (args.indent or "*") .. " "
	local out = table.concat(formatted, "\n" .. indent)
	-- Only an explicitly requested indent is applied to the first line as well.
	if args.indent then
		out = indent .. out
	end

	return out
end

-- Helper for Module:cy-IPA/testcases: parse `respelling` in the context of `pagename` and return whatever
-- to_IPA produces for the parsed words in the given `dialect`. Parse failures are raised as errors with the
-- parser's message unchanged.
function export.test(pagename, respelling, dialect)
	local parsed_respelling = parse_respelling(respelling, pagename, function(message)
		error(message)
	end)
	local words = parsed_respelling.words
	return to_IPA(words, dialect)
end

return export