-- Module:User:AmazingJus/sco

local export = {}

local lang = require("Module:languages").getByCode("sco") local m_IPA = require("Module:IPA")

local gmatch = mw.ustring.gmatch local gsplit = mw.text.gsplit local match = mw.ustring.match local gsubn = mw.ustring.gsub local len = mw.ustring.len local lower = mw.ustring.lower local sub = mw.ustring.sub

-- wrapper around gsubn that keeps only the substituted string, dropping
-- the replacement count (extra parentheses truncate to one return value)
local function gsub(term, pattern, repl, count)
	return (gsubn(term, pattern, repl, count))
end

-- Dialect abbreviations:
--  * Insular:
--  * Orkney: or
--  * Shetland: sh
--  * Northern:
--  * North Northern: nn
--  * Mid Northern: mn
--  * South Northern: sn
--  * Central:
--  * North East Central: nec
--  * South East Central: sec
--  * West Central: wc
--  * South West Central: swc
--  * Southern: s
--  * Ulster:
--  * Western Ulster: wu
--  * Central Ulster: cu
--  * Eastern Ulster: eu

-- TODO: -- * Consider unstressed vowels (schwa) -- * Place the morpheme splitting in the main evaluation function -- * Work on consonant rules -- * Consider adding unique dialects based on word inputted -- * Consider unique pronunciation for suffixes --

-- DATA STRUCTURES --
-- pronunciations for the different vowel spellings
-- a key of {"pattern", pos} is a contextual rule: pos -1 tests the token
-- before the vowel, pos 1 the token after it; a nil pattern with pos 1
-- stands for "at the end of the word"; the key false is the default case
-- "ˑ" marks a vowel affected by the Scottish vowel length rule
local s = {
	["a"] = {
		[{"n[gd]?", 1}] = "a~ɑ",
		[{nil, 1}] = "ɑˑ,e",
		[false] = "aˑ"
	},
	["e"] = {
		[false] = "ɛˑ~æˑ"
	},
	["i"] = {
		[{"n?g", 1}] = "əi",
		[{"ch", 1}] = "əi",
		[{"wh", -1}] = "ʌ",
		[false] = "ɪ"
	},
	["o"] = {
		[{nil, -1}] = "wʌˑ",
		[{"ch", 1}] = "ʌu",
		[false] = "ɔˑ"
	},
	["u"] = {
		[false] = "ʌ"
	}
}

-- all possible multi-letter graphemes needed for tokenisation; the
-- tokeniser takes the first entry that matches, so "owe" is listed
-- before its prefix "ow"
local multigraphs = {
	"a_e", "e_e", "i_e", "o_e", "owe", "u_e", "y_e",
	"aa", "ae", "ai", "au", "aw", "ay",
	"ea", "ee", "ei", "eu", "ew", "ey",
	"ie", "oa", "oi", "oo", "ou", "ow", "oy", "ui",
	"ch", "ck", "kn", "ld", "mb", "nd", "ng", "nk",
	"qu", "sh", "th", "wh", "wr"
}

-- common morphemes
local morphemes = {
	-- unstressed particles
	unstressed = {"ae", "ane", "dae", "hae", "na", "nae", "sae", "tae", "the"},
	-- prefixes
	prefixes = {"a"},
	-- suffixes
	suffixes = {"fu", "le", "na", "the", "se"}
}

-- HELPER FUNCTIONS --
-- apply the Scottish vowel length rule to a transcription in which "ˑ"
-- marks the affected vowels
local function handle_vowel_length(word)
	-- long before /r/ and the voiced fricatives, and morpheme-finally
	word = gsub(word, "ˑ([rvzðʒ])", "ː%1")
	word = gsub(word, "ˑ$", "ː")
	-- short in every other position
	word = gsub(word, "ˑ", "")
	return word
end

-- handle stress
-- adds a stress mark to a word unless it already carries one or is an
-- unstressed particle
local function handle_stress(word)
	-- morphemes.unstressed is an array of particle strings, so membership
	-- must be tested by value; the previous `morphemes.unstressed[word]`
	-- indexed the array by string key and was therefore always nil
	local is_unstressed = false
	for _, particle in ipairs(morphemes.unstressed) do
		if word == particle then
			is_unstressed = true
			break
		end
	end

	-- apply morpheme rules if no explicit stress marker and not an
	-- unstressed particle
	if not match(word, "ˈ") and not is_unstressed then
		-- stress after the prefix "a-"
		if match(word, "^a[^aeiou][aeiou]") then
			word = "aˈ" .. sub(word, 2)
		-- otherwise add stress on the first syllable of the morpheme
		else
			word = "ˈ" .. word
		end
	end

	return word
end

-- split any potential suffixes from word
-- returns the suffixless base and the suffix, or the whole word and nil
-- when no suffix is found
local function split_suffixes(word)
	-- loop over all possible suffixes
	for _, suffix in ipairs(morphemes.suffixes) do
		-- require a non-empty base so that a word which *is* a suffix
		-- (e.g. "na") is not reduced to the empty string
		if len(word) > len(suffix) and sub(word, -len(suffix)) == suffix then
			return sub(word, 1, -len(suffix) - 1), suffix
		end
	end

	-- return suffixless word otherwise
	return word, nil
end

-- MAIN FUNCTIONS --
-- tokenise word into individual graphemes and affixes, returning an
-- array of tokens
local function tokenise(word)
	local graphemes = {}

	-- separate a trailing suffix, if any, from the base word
	local stem, suffix = split_suffixes(word)

	-- respell vowel + consonant + e as vowel + _e + consonant so the
	-- silent-e digraphs can be picked up as single tokens
	stem = gsub(stem, "([^aeiou][aeiouy])([^aeiouwy])e([^aeiou])", "%1_e%2%3")
	stem = gsub(stem, "([^aeiou][aeiouy])([^aeiouwy])e$", "%1_e%2")

	-- scan the stem left to right, preferring multigraphs over single
	-- letters at each position
	local pos = 1
	local stem_len = len(stem)
	while pos <= stem_len do
		local consumed = nil
		for _, mg in ipairs(multigraphs) do
			if sub(stem, pos, pos + len(mg) - 1) == mg then
				consumed = mg
				break
			end
		end
		-- no multigraph matched: take a single grapheme
		consumed = consumed or sub(stem, pos, pos)
		graphemes[#graphemes + 1] = consumed
		pos = pos + len(consumed)
	end

	-- the suffix, when present, becomes the final token
	if suffix then
		graphemes[#graphemes + 1] = suffix
	end

	return graphemes
end

-- process phonemes for tokens: map each grapheme token through the rule
-- table `s` (using the neighbouring tokens as context) and concatenate
-- the result into a single string
local function to_phonemes(tokens)
	local phonemes = {}

	for i = 1, #tokens do
		local tok = tokens[i]
		-- ensure the token is not nil and has rules in the table
		local rules = tok and s[tok]

		if rules then
			-- determine surrounding context
			local before = i > 1 and tokens[i - 1] or nil
			local after = i < #tokens and tokens[i + 1] or nil

			-- try the context-specific rules first; pairs() iterates in
			-- unspecified order, so the [false] fallback must not be
			-- allowed to win before every specific context has been tried
			-- (the old single-pass loop could hit [false] first)
			local replacement = nil
			for pattern, repl in pairs(rules) do
				if type(pattern) == "table" then
					local context = pattern[2] == -1 and before or after
					-- a nil pattern matches only when there is no
					-- neighbouring token (word edge); otherwise match the
					-- context string against the rule's Lua pattern
					-- (mw.ustring.match takes the subject first — the old
					-- call had the arguments reversed)
					if (not context and not pattern[1])
						or (context and match(context, pattern[1])) then
						replacement = repl
						break
					end
				end
			end

			-- fall back to the unconditional rule, then to the raw token
			phonemes[#phonemes + 1] = replacement or rules[false] or tok
		else
			-- otherwise append the token as is
			phonemes[#phonemes + 1] = tok or ''
		end
	end

	return table.concat(phonemes)
end

-- generate IPA pronunciation of word
function export.toIPA(entry)
	-- accept either a frame-style table or a plain string
	if type(entry) == "table" then
		entry = entry.args[1]
	end

	-- normalise to lowercase
	entry = lower(entry)

	-- transcribe each whitespace-separated word independently
	local results = {}
	for word in gsplit(entry, "%s") do
		results[#results + 1] = to_phonemes(tokenise(word))
	end

	return table.concat(results, " ")
end

-- export function for IPA
function export.show(entry)
	-- unwrap a frame-style table argument
	if type(entry) == "table" then
		entry = entry.args[1]
	end

	-- delegate to the main transcription routine
	return export.toIPA(entry)
end

return export