-- Module:User:AmazingJus/de

--[=====[
TODO: Vowel length and stress are largely predictable/automatable (e.g.
/ɪ, ɔ, ʊ/ > /i, o, u/ for short vowels in closed syllables; penultimate or
final stress), but there are some exceptions; use a macron (e.g. ā ē ī) to
force a long vowel, and a breve (e.g. ă ĕ ĭ) to force a short vowel.

Below are the general rules:
- vowels are long in an open syllable (no final consonant, e.g. bēten, hōlen)
- vowels are also long before a single consonant (e.g. kām), as well as
  before a silent ⟨h⟩ (e.g. gēhen, zēhn)
- vowels are short before a double (geminate) consonant (e.g. Wăsser, Mŭtter)
- however, vowels before two different consonants are not predictable (they
  can either be long, e.g. Mōnd, or short, e.g. Mŭnd)
- note that a long ⟨i⟩ is usually written as ⟨ie⟩, except word-initially
  (e.g. Īgel); an exception is the short ⟨ie⟩ in vier and its derivatives
  (e.g. vierzehn)
- vowels are usually long in a stressed final syllable before a single
  consonant (but with possible exceptions, e.g. '-eg')
- unstressed syllables do not have long vowels
- syllables with secondary stress are treated as if stressed
- syllables directly following a known prefix (aus-, zu-, über-, ge-, etc.)
  should be treated as if stressed, whether they are actually stressed or not
- when there's an explicit slash to separate compounds, all parts should be
  treated as if they were separate words for vowel-length purposes (e.g.
  '-tag' in 'Reichs/tag' should be long)
- what about other unstressed syllables?
--]=====]
-- Further TODO items:
--  * Function for final obstruent devoicing of d, g, b, s, r (ɐ̯)
--  * Function for pre-consonantal obstruent devoicing of d, g, b, s
--  * Function to reduce geminates [DONE]
--  * List of environments which trigger the palatalisation of /x/ (liquids + non-low front vowels) [DONE]
--  * Function to determine if H is word-initial (> /h/) or non-initial (> 0) [DONE]
--  * Function to put stress in general, function to check for prefixes and realign stress accordingly
--  * Function to convert ⟨e⟩ in unstressed syllables to ə > Function to reduce -ər to -r + "devoicing"
--  * Function to convert ⟨c⟩ before front vowels to /t͡s/ [DONE]
--  * Function to convert final ⟨-ehe⟩ to /eː/ (verbs only)
--  * Function to mark whether the word is Germanic or Romance - this accounts for a lot of exceptions
--  * Inseparable prefixes do not take stress > Stress on the 2nd syllable
--  * A complete list could be compiled and the process automated, instead of making the user enter the stress by hand
--  * Rules to determine when to make vowels short vs. long. These are usually predictable.
--  * Stress is usually on the first syllable, but there are some exceptions (e.g. inseparable prefixes, see above).

-- Table of functions exported by this module.
local export = {}

-- Helper module used below to build characters from code points.
local u = require("Module:string/char")

-- Short aliases for the Scribunto string helpers used throughout the file.
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ucomp = mw.ustring.toNFC
local udecomp = mw.ustring.toNFD
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub
local ulen = mw.ustring.len

-- Wrapper around rsubn that keeps only the substituted string and discards
-- every other return value (i.e. the substitution count).
local function rsub(term, foo, bar)
	-- the extra parentheses truncate the call to its first result
	return (rsubn(term, foo, bar))
end

-- Keep applying rsub with the same pattern and replacement until the string
-- stops changing, then return the final string.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- "If not empty": map the empty string to nil and return every other value
-- unchanged.
local function ine(x)
	if x ~= "" then
		return x
	end
	return nil
end

-- Combining diacritics used to mark stress, vowel length and umlaut.
local AC = u(0x0301) -- acute accent
local GR = u(0x0300) -- grave accent
local MA = u(0x0304) -- macron
local BR = u(0x0306) -- breve
local DI = u(0x0308) -- diaeresis

-- Groups of combining marks.
local stress_accent = AC .. GR
local length_accent = MA .. BR
local all_accents = stress_accent .. length_accent

-- Vowel letters. Note that `vowel` also contains every combining accent, so
-- a character class built from it matches the accents as well as the letters.
local front_vowel = "eiyäöü"
local back_vowel = "aou"
local vowel = front_vowel .. back_vowel .. all_accents

-- Character class for a consonant: anything that is not a vowel/accent,
-- a full stop, the boundary marker ⁀, a space or a hyphen.
local cons_c = "[^" .. vowel .. ".⁀ %-]"

-- includes ⁀
-- I have added /l/ & /r/ as a stopgap against Brücke -> /ˈprʏkə/, but this
-- may need a new name.
local cons_or_boundary_c = "[^" .. vowel .. "rl. %-]"

-- Voiced stops and their voiceless counterparts.
local devoiced_cons = { b = "p", d = "t", g = "k" }

-- Letter-to-phoneme table, keyed by the first letter of a sequence.
--   * A plain string value is the phoneme for that letter.
--   * An array value ({ "k", "s" }) is a list of phonemes for that letter.
--   * Otherwise the value is a lookup table: string keys are spellings that
--     start with the letter (or, for vowels, the BR/MA length marks) and
--     `false` is the default used when no longer spelling matches.
local sequences = {
	a = {
		[BR] = "a",
		[MA] = "aː",
		aach = { "aː", "χ" },
		auch = { "aʊ̯", "χ" },
		ach = { "a", "χ" },
		ai = "aɪ̯",
		au = "aʊ̯",
		ay = "aɪ̯",
	},
	["ä"] = {
		[BR] = "ɛ",
		[MA] = "ɛː",
		["äu"] = "ɔʏ̯",
	},
	b = "b",
	c = {
		chs = { "k", "s" },
		ch = "ç",
		ck = "k",
		[false] = "t͡s",
	},
	d = {
		dsch = "d͡ʒ",
		dt = "t",
		[false] = "d",
	},
	e = {
		[BR] = "ɛ",
		[MA] = "eː",
		ei = "aɪ̯",
		eu = "ɔʏ̯",
		ey = "aɪ̯",
	},
	f = "f",
	g = "ɡ",
	h = "h",
	i = {
		[BR] = "ɪ",
		[MA] = "iː",
		ieh = "iː",
		ie = "iː",
	},
	j = "j",
	k = "k",
	l = "l",
	m = "m",
	n = {
		nk = { "ŋ", "k" },
		ng = "ŋ",
		[false] = "n",
	},
	o = {
		[BR] = "ɔ",
		[MA] = "oː",
		ooch = { "oː", "χ" },
		och = { "ɔ", "χ" },
	},
	["ö"] = {
		[BR] = "œ",
		[MA] = "œː", -- sometimes /øː/
	},
	p = {
		pf = "p͡f",
		ph = "f",
		[false] = "p",
	},
	q = {
		qu = { "k", "v" }, -- FIXME: only before another vowel
		[false] = "k",
	},
	r = "r", -- phonetically [ʀ] syllable-initially; /ɐ/ syllable-finally
	s = {
		sch = "ʃ",
		[false] = "s",
	},
	t = {
		tsch = "t͡ʃ",
		tz = "t͡s",
		[false] = "t",
	},
	u = {
		[BR] = "ʊ",
		[MA] = "uː",
		uuch = { "uː", "χ" },
		uch = { "ʊ", "χ" },
	},
	["ü"] = {
		[BR] = "ʏ",
		[MA] = "yː",
	},
	v = "f",
	w = "v",
	x = { "k", "s" }, -- XXX
	y = {
		[BR] = "ʏ",
		[MA] = "yː",
	},
	z = "z", -- already converted from s
	["ß"] = "s",
	["́"] = "ˈ", -- FIXME
}

-- Normalise the input spelling:
--   1. decompose accented characters into base letter + combining marks;
--   2. move an "e" or a diaeresis in front of any accents that precede it;
--   3. turn a/o/u followed by "e" or a diaeresis into ä/ö/ü;
--   4. move a breve in front of an acute/grave accent.
-- NOTE(review): step 3 rewrites every a/o/u + "e" pair, so it presumably also
-- affects words where "ue"/"oe"/"ae" is not an umlaut spelling - confirm.
local function normalise(text)
	local umlauted = {a = "ä", o = "ö", u = "ü"}
	local umlaut_mark = "[e" .. DI .. "]"

	-- decompose accented characters into their base and combining parts
	local decomposed = udecomp(text)

	-- avoid confusion of wrongly-ordered umlauts/e's and other accents
	local reordered = rsub(decomposed, "([" .. all_accents .. "]*)([e" .. DI .. "])", "%2%1")

	-- recompose umlauted vowels (the table is looked up by the captured letter)
	local recomposed = rsub(reordered, "([aou])" .. umlaut_mark, umlauted)

	-- put breves before acute/grave accents
	return rsub(recomposed, "([" .. stress_accent .. "])" .. BR, BR .. "%1")
end

-- Respell the text more phonetically to allow easier conversion to IPA.
--
-- @param text  normalised (decomposed, lower-case) spelling
-- @param orig  currently unused
-- @param pos   currently unused
-- @return the respelt text, with ⁀ word-boundary markers still in place
local function respell(text, orig, pos)
	-- to simplify checking for word boundaries and liaison markers, add
	-- ⁀ at the beginning and end of all words (the caller strips the markers
	-- again at the end); note that the liaison marker is ‿
	text = rsub(text, "%s*,%s*", "⁀⁀ | ⁀⁀")
	text = rsub(text, "%s+", "⁀ ⁀")
	text = rsub(text, "%-+", "⁀-⁀")
	text = "⁀⁀" .. text .. "⁀⁀"

	-- Handle predictable vowel lengths; other cases must explicitly be
	-- marked by the user.
	--
	-- Only a bare vowel *letter* may receive an automatic length mark.
	-- `vowel` also contains the combining accents, so matching on it would
	-- treat a user-supplied macron/breve as a vowel and append a second
	-- length mark after it. The automatic mark is inserted directly after the
	-- letter (before any stress accent), which is where the phonemiser looks
	-- for it. A letter already followed by a length mark is never matched,
	-- because the length marks are excluded from cons_c.
	local bare_vowel = "([" .. front_vowel .. back_vowel .. "])"
	local stress = "[" .. stress_accent .. "]*"

	-- long vowel before consonant + vowel
	-- NOTE(review): the match consumes the following vowel, so in a
	-- V-C-V-C-V string only every other vowel can be marked in this pass.
	text = rsub(text, bare_vowel .. "(" .. stress .. cons_c .. "[" .. vowel .. "])", "%1" .. MA .. "%2")
	-- long vowel before a word boundary
	text = rsub(text, bare_vowel .. "(" .. stress .. "⁀)", "%1" .. MA .. "%2")
	-- short vowel before a double consonant
	text = rsub(text, bare_vowel .. "(" .. stress .. ")(" .. cons_c .. ")%3", "%1" .. BR .. "%2%3%3")

	-- handle ⟨c⟩/⟨z⟩
	text = rsub(text, "c([" .. back_vowel .. "])", "k%1") -- convert ⟨c⟩ before back vowels to /k/
	text = rsub(text, "z", "c") -- convert ⟨z⟩ to /t͡s/

	-- handle consonant voicing
	text = rsub(text, "s([" .. vowel .. "])", "z%1") -- ⟨s⟩ is voiced as z before vowels
	text = rsub(text, "([bdg])(" .. cons_or_boundary_c .. ")", -- devoice syllable-final obstruents
		function(c1, c2)
			return devoiced_cons[c1] .. c2
		end)

	return text
end

-- Find the longest spelling in `seqs` (a lookup table from `sequences`) that
-- occurs in `text` starting at character position `i`.
-- Returns its phoneme value and its length in characters, or nil if no
-- spelling matches. Choosing the longest match makes the result independent
-- of the (unspecified) iteration order of pairs, e.g. "chs" wins over "ch".
local function longest_sequence(seqs, text, i)
	local best_phone, best_len = nil, 0
	for seq, seq_phone in pairs(seqs) do
		if type(seq) == "string" then
			local len = ulen(seq)
			if len > best_len and usub(text, i, i + len - 1) == seq then
				best_phone, best_len = seq_phone, len
			end
		end
	end
	return best_phone, best_len
end

-- Convert letters to phonemes using the sequences table, then return the
-- concatenated phoneme string.
--
-- @param text  respelt text
-- @param orig  currently unused
-- @param pos   currently unused
-- @return string of phonemes; characters without an entry in `sequences`
--         (spaces, boundary markers, hyphens, ...) are copied through
--         unchanged, except stray combining accents, which are dropped
local function phonemise(text, orig, pos)
	local phones, i, n = {}, 1, ulen(text)
	local vowel_letter = "[" .. front_vowel .. back_vowel .. "]"
	local any_accent = "[" .. all_accents .. "]"
	local length_mark = "[" .. length_accent .. "]"

	while i <= n do
		local bid, bid_next = usub(text, i, i), usub(text, i + 1, i + 1)
		local value = sequences[bid]
		-- phone: string or list of strings to emit; bidl: characters consumed
		local phone, bidl = nil, 1

		if value == nil then
			-- No mapping. A combining accent that was not consumed together
			-- with a preceding vowel carries no information and is dropped;
			-- everything else is kept so that word separators survive.
			if not rfind(bid, any_accent) then
				phone = bid
			end
		elseif type(value) ~= "table" then
			-- simple one-to-one letter (or the stress accent); a doubled
			-- letter is reduced to a single phoneme
			phone = value
			if bid_next == bid then
				bidl = 2
			end
		elseif value[1] then
			-- fixed list of phonemes for a single letter, e.g. x -> k s
			phone = value
		elseif rfind(bid, vowel_letter) then
			if rfind(bid_next, length_mark) then
				-- explicit length mark directly after the vowel letter
				phone, bidl = value[bid_next], 2
			else
				-- multi-letter spellings are tried before the generic
				-- "doubled vowel / vowel + h" rule so that e.g. "aach"
				-- can match
				local seq_phone, seq_len = longest_sequence(value, text, i)
				if seq_phone then
					phone, bidl = seq_phone, seq_len
				elseif bid_next == "h" or bid_next == bid then
					phone, bidl = value[MA], 2
				else
					-- error("Vowel length is ambiguous. Please specify vowel length.")
					phone = value[BR] -- temporary solution
				end
			end
		elseif bid_next == bid then
			-- geminate consonant: a single default phoneme for both letters
			phone, bidl = value[false], 2
		else
			local seq_phone, seq_len = longest_sequence(value, text, i)
			if seq_phone then
				phone, bidl = seq_phone, seq_len
			else
				phone = value[false]
			end
		end

		if type(phone) == "table" then
			for _, p in ipairs(phone) do
				phones[#phones + 1] = p
			end
		elseif phone ~= nil then
			phones[#phones + 1] = phone
		end

		i = i + bidl
	end

	-- concatenate the phonemes into a string
	return table.concat(phones)
end

-- Entry point: convert a German spelling to a phonemic string.
--
-- May be called either directly with strings, or with a frame-like table, in
-- which case the values are read from its `args` (positional argument 1,
-- `orig` and `pos`); empty-string arguments are treated as absent.
--
-- @param text  the spelling (or a table with an `args` field); defaults to
--              the title of the current page
-- @param orig  passed through to the respelling/phonemising steps
-- @param pos   passed through to the respelling/phonemising steps
-- @return the phoneme string with hyphens and ⁀ boundary markers removed
function export.toIPA(text, orig, pos)
	if type(text) == "table" then
		local args = text.args
		text, orig, pos = ine(args[1]), ine(args.orig), ine(args.pos)
	end

	-- getCurrentTitle is a function and has to be called; indexing the
	-- function itself raises an error whenever no text was supplied
	text = text or mw.title.getCurrentTitle().text
	text = ulower(text)

	text = normalise(text)
	text = respell(text, orig, pos)
	text = phonemise(text, orig, pos)

	-- remove hyphens and word-boundary markers
	return rsub(text, "[⁀%-]", "")
end

return export