-- Module:sla-common

-- Table holding every function this module makes available to callers.
local export = {}

-- Helper modules: link generation and superscripted footnote symbols.
local m_links = require("Module:links")
local m_table_tools = require("Module:table tools")

-- Language object for the code "sla-pro", handed to the link generator.
local lang = require("Module:languages").getByCode("sla-pro")

-- Short local aliases for the Unicode-aware string helpers used in this file.
local u = mw.ustring.char
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub

-- Combining diacritics, one constant per mark (code point given alongside).
local AC = u(0x0301)       -- U+0301 combining acute
local GR = u(0x0300)       -- U+0300 combining grave
local CFLEX = u(0x0302)    -- U+0302 combining circumflex
local TILDE = u(0x0303)    -- U+0303 combining tilde
local BREVE = u(0x0306)    -- U+0306 combining breve
local INVBREVE = u(0x0311) -- U+0311 combining inverted breve
local DOUBLEAC = u(0x030B) -- U+030B combining double acute
local DOUBLEGR = u(0x030F) -- U+030F combining double grave
local MACRON = u(0x0304)   -- U+0304 combining macron
local CARON = u(0x030C)    -- U+030C combining caron
local OGONEK = u(0x0328)   -- U+0328 combining ogonek

-- Accent marks. The "stressed" set marks the position of the stress; the
-- full set additionally contains the macron.
local stressed_accents = AC .. GR .. INVBREVE .. DOUBLEGR .. DOUBLEAC .. TILDE
local stressed_accents_c = "[" .. stressed_accents .. "]"
local accents = stressed_accents .. MACRON
local accents_c = "[" .. accents .. "]"

-- Vowel letters and the matching / complementary character classes.
local vowels = "aeiouyьъěęǫ"
local vowels_c = "[" .. vowels .. "]"
local non_vowels_c = "[^" .. vowels .. "]"

-- Short versus long vowel letters.
local short_vowels = "eoьъ"
local short_vowels_c = "[" .. short_vowels .. "]"
local long_vowels = "aiuyěęǫ"
local long_vowels_c = "[" .. long_vowels .. "]"

-- A consonant is anything that is neither a vowel letter nor an accent mark.
local cons_c = "[^" .. vowels .. accents .. "]"

-- Consonants that count as already iotated.
local iotated_cons = "čďjľňřšťž"
local iotated_cons_c = "[" .. iotated_cons .. "]"

-- Substitute like rsubn, but hand back only the resulting string; the
-- replacement count that rsubn also returns is dropped.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to its first return value.
	return (rsubn(term, foo, bar))
end

-- Wrap FORM, prefixed with an asterisk, in an HTML element carrying
-- lang="sla-pro" and class="Unicode".
--
-- form: the text to wrap; the empty string produces an em-dash entity.
-- tag:  optional element name; defaults to "span".
-- Returns the HTML string. The element is now properly closed; previously
-- only the opening tag was emitted, leaving unbalanced markup.
function export.tag_form(form, tag)
	if form == "" then
		return "&mdash;"
	end
	-- Resolve the default once so the opening and closing tags always agree.
	tag = tag or "span"
	return "<" .. tag .. " lang=\"sla-pro\" class=\"Unicode\">*" .. form ..
		"</" .. tag .. ">"
end

-- Make a link out of a form, or show a dash if empty.
--
-- form: one of
--   * a string: linked with a leading asterisk ("" gives an em-dash entity);
--   * a table with a `notesym` field: form[1] is linked and the note symbol
--     is appended via m_table_tools.superscript_notes;
--   * a table without `notesym`: treated as a list of forms, each linked
--     recursively and joined with ", ".
-- tag: accepted and forwarded on recursion, but otherwise not used here.
-- Returns the resulting wikitext/HTML string.
--
-- The former unused SUBPAGENAME local was removed: it indexed
-- mw.title.getCurrentTitle without calling it, which raises an error on
-- every invocation.
function export.link_form(form, tag)
	if type(form) == "table" then
		if form.notesym then
			return m_links.full_link({ lang = lang, term = "*" .. form[1] }) ..
				m_table_tools.superscript_notes(form.notesym)
		end
		local links = {}
		for _, subform in ipairs(form) do
			table.insert(links, export.link_form(subform, tag))
		end
		return table.concat(links, ", ")
	elseif form ~= "" then
		return m_links.full_link({ lang = lang, term = "*" .. form })
	else
		return "&mdash;"
	end
end

-- Lookup from "base letter + combining caron/ogonek" to the precomposed
-- letter. Used to undo decomposition for letters that are handled as units.
local recomposer = {
	-- vowels (Latin e/E and o/O as base letters)
	["e" .. CARON] = "ě", ["E" .. CARON] = "Ě",
	["e" .. OGONEK] = "ę", ["E" .. OGONEK] = "Ę",
	["o" .. OGONEK] = "ǫ", ["O" .. OGONEK] = "Ǫ",
	-- consonants with caron
	["c" .. CARON] = "č", ["C" .. CARON] = "Č",
	["d" .. CARON] = "ď", ["D" .. CARON] = "Ď",
	["l" .. CARON] = "ľ", ["L" .. CARON] = "Ľ",
	["n" .. CARON] = "ň", ["N" .. CARON] = "Ň",
	["r" .. CARON] = "ř", ["R" .. CARON] = "Ř",
	["s" .. CARON] = "š", ["S" .. CARON] = "Š",
	["t" .. CARON] = "ť", ["T" .. CARON] = "Ť",
	["z" .. CARON] = "ž", ["Z" .. CARON] = "Ž",
}

-- Split accented letters (acute, grave, etc.) into an individual base
-- character followed by a combining accent. Letters with caron or ogonek
-- that we want to keep as single units are decomposed along with everything
-- else by the normalization, so they are put back together afterwards.
function export.decompose(text)
	local nfd = mw.ustring.toNFD(text)
	-- Any character followed by a caron/ogonek is looked up in `recomposer`;
	-- pairs that are not listed there are left as they are.
	local caron_or_ogonek_pair = ".[" .. CARON .. OGONEK .. "]"
	return rsub(nfd, caron_or_ogonek_pair, recomposer)
end

-- Same as export.decompose, and additionally turn every combining
-- circumflex into a combining inverted breve, in case the circumflex was
-- typed by accident.
function export.canon_decompose(text)
	local decomposed = export.decompose(text)
	return rsub(decomposed, CFLEX, INVBREVE)
end

-- Raise an assertion failure if TEXT contains any of the precomposed
-- accented vowel letters listed in the pattern below, i.e. if it looks like
-- it has not been passed through export.decompose.
function export.assert_decomposed(text)
	local precomposed_c = "[áéíóúýàèìòùỳãẽĩõũỹāēīōūȳȃȇȋȏȗȁȅȉȍȕÁÉÍÓÚÝÀÈÌÒÙỲÃẼĨÕŨỸĀĒĪŌŪȲȂȆȊȎȖȀȄȈȌȔ]"
	local found = rfind(text, precomposed_c)
	assert(not found)
end

-- Rewrite the end of STEM according to the tables below. The last three,
-- then the last two, then the last single character are looked up in turn;
-- an ending that is not listed at a given step is left unchanged.
function export.first_palatalization(stem)
	local steps = {
		{"...$", {["zdz"]="ždž"}},
		{"..$", {["sk"]="šč", ["zg"]="ždž", ["dz"]="ž", ["sc"]="šč"}},
		{".$", {["k"]="č", ["g"]="ž", ["x"]="š", ["c"]="č", ["ś"]="š"}},
	}
	for _, step in ipairs(steps) do
		stem = rsub(stem, step[1], step[2])
	end
	return stem
end

-- Replace a stem-final k, g or x by c, dz or ś respectively; any other
-- final character is left untouched.
function export.second_palatalization(stem)
	local outcomes = {["k"]="c", ["g"]="dz", ["x"]="ś"}
	return rsub(stem, ".$", outcomes)
end

-- Iotate the end of STEM. The last three, then two, then one character are
-- looked up in the tables below (unlisted endings stay unchanged). If the
-- result does not end in one of the iotated consonants, "j" is appended.
function export.iotate(stem)
	local steps = {
		{"...$", {["zdz"]="ždž"}},
		{"..$", {["sk"]="šč", ["zg"]="ždž", ["dz"]="ž", ["sc"]="šč"}},
		{".$", {
			["b"]="bľ",
			["c"]="č",
			["d"]="ď",
			["g"]="ž",
			["k"]="č",
			["l"]="ľ",
			["m"]="mľ",
			["n"]="ň",
			["p"]="pľ",
			["r"]="ř",
			["s"]="š",
			["ś"]="š",
			["t"]="ť",
			["v"]="vľ",
			["x"]="š",
			["z"]="ž",
		}},
	}
	for _, step in ipairs(steps) do
		stem = rsub(stem, step[1], step[2])
	end
	if rfind(stem, iotated_cons_c .. "$") then
		return stem
	end
	return stem .. "j"
end

-- Tell whether WORD carries a stress accent. WORD must already be
-- decomposed. The result is whatever rfind returns: match positions
-- (truthy) when a stress accent is present, nil otherwise.
function export.is_stressed(word)
	export.assert_decomposed(word)
	local any_stress_mark = stressed_accents_c
	return rfind(word, any_stress_mark)
end

-- Return WORD with every stress accent deleted. WORD must already be
-- decomposed. Macrons are not stress accents and are kept.
function export.make_unstressed(word)
	export.assert_decomposed(word)
	local stripped = rsub(word, stressed_accents_c, "")
	return stripped
end

-- Tell whether WORD is nonsyllabic, i.e. consists solely of non-vowel
-- characters (the empty string qualifies). WORD must already be decomposed.
-- Returns rfind's match positions (truthy) or nil.
function export.is_nonsyllabic(word)
	export.assert_decomposed(word)
	local no_vowel_anywhere = "^" .. non_vowels_c .. "*$"
	return rfind(word, no_vowel_anywhere)
end

-- Tell whether WORD is monosyllabic, i.e. contains exactly one vowel letter
-- surrounded by any number of non-vowel characters. WORD must already be
-- decomposed. Returns rfind's match positions (truthy) or nil.
function export.is_monosyllabic(word)
	export.assert_decomposed(word)
	local non_vowel_run = non_vowels_c .. "*"
	local exactly_one_vowel = "^" .. non_vowel_run .. vowels_c .. non_vowel_run .. "$"
	return rfind(word, exactly_one_vowel)
end

-- Set the accent in STEM to ACCENT, replacing any stressed accent already
-- there. If there isn't such an accent already then:
-- (1) If the accent is inverse breve (= old circumflex or short accent) or
--     double grave (= old short accent), put it on the first syllable;
-- (2) If the accent is tilde (= neoacute), put it on the last syllable;
-- (3) If the accent is a single grave (= old acute), put it on the vowel if
--     there's only one, otherwise don't add it as it can go anywhere.
-- Placing the accent will replace any unstressed accent already there
-- (specifically the macron).
--
-- In addition, if the accent is tilde (= neoacute), we put the accent on the
-- last syllable of the stem, regardless of any existing accent. The logic
-- here is that, in nouns at least, a neoacute on the stem that we request
-- (i.e. not already in the stem) is always retracted from the ending, and
-- thus should go on the last syllable if there is more than one. FIXME: May
-- not apply to verbs.
--
-- Also apply certain conversions to the result:
-- (1) Original short vowels e o ь ъ can't get a macron. Per Derksen 2008,
--     this also includes liquid diphthongs, which normally behave like
--     long vowels; cf. 'borzdà' "burrow" in class b, where you expect the
--     preceding vowel to be long if possible. However, we go against
--     Derksen in this respect when the first vowel is e or o because Czech,
--     Slovak and Polish show clear length distinctions (or reflections
--     thereof) in original pre-tonic syllables in class b vs. c.
--     (Serbo-Croat reflects length in both classes but this can be a later
--     development due to analogy.) Per Kortlandt, the metathesis of liquid
--     diphthongs preceded Dybo's law and (probably) the shortening of
--     pre-tonic vowels.
-- (2) Original long vowels a i u y ě ę ǫ can't get a double grave, nor can
--     liquid diphthongs; instead, convert to inverse breve (circumflex
--     accent).
-- (3) Original short vowels e o ь ъ not in liquid diphthongs can't receive a
--     tilde (neoacute) per the May 2019 discussion in
--     Wiktionary talk:About Proto-Slavic;
--     instead we convert to single grave.
--
-- Special values of ACCENT handled by the code:
--   * "-" strips every accent, macrons included, and returns immediately;
--   * the double grave itself is rejected with an error (it is only ever
--     produced here, from an inverse breve on a short vowel).
function export.set_accent(stem, accent)
	export.assert_decomposed(stem)

	-- The hyphen is the value of UNK = unknown, and removes all accents
	-- including macrons.
	if accent == "-" then
		return rsub(stem, accents_c, "")
	end
	if accent == DOUBLEGR then
		error("Double grave should not be specified as an accent; use inverted breve instead")
	end

	-- If MARK sits on a short vowel that is not the first part of a liquid
	-- diphthong (short vowel + mark + l/r + consonant), swap every MARK in
	-- S for REPLACEMENT; otherwise return S unchanged.
	local function demote_on_short_vowel(s, mark, replacement)
		local on_short = short_vowels_c .. mark
		if rfind(s, on_short) and not rfind(s, on_short .. "[lr]" .. cons_c) then
			return rsub(s, mark, replacement)
		end
		return s
	end

	-- Decide whether a placeholder stress mark is needed. It is when the
	-- stem has no stress accent yet, except that a tilde never needs one
	-- (it is placed separately below) and a single grave only gets one on
	-- a monosyllabic stem.
	local needs_placeholder = false
	if not export.is_stressed(stem) and accent ~= TILDE then
		needs_placeholder = accent ~= GR or export.is_monosyllabic(stem)
	end
	if needs_placeholder then
		-- Mark the first vowel, dropping any accents (i.e. a macron) it
		-- carried. Which stress mark is used here is irrelevant because it
		-- is overwritten by ACCENT further down.
		local first_vowel = "^(.-" .. vowels_c .. ")" .. accents_c .. "*"
		stem = rsub(stem, first_vowel, "%1" .. INVBREVE)
	end

	if accent == TILDE then
		-- Cancel any existing stress accent and put the tilde on the last
		-- vowel, replacing any accents already on it. (FIXME, might not
		-- apply to verbs.) Further down this is converted to a single
		-- grave if it ended up on a short monophthong.
		local last_vowel = "^(.*" .. vowels_c .. ")" .. accents_c .. "*"
		stem = export.make_unstressed(stem)
		stem = rsub(stem, last_vowel, "%1" .. TILDE)
	else
		-- Otherwise just replace the stress accent, if any, with the given
		-- accent. There will always be one except in multisyllabic words
		-- where the accent is a single grave; in all other cases it was
		-- added on the first syllable above if it was missing.
		stem = rsub(stem, stressed_accents_c .. "+", accent)
	end

	if accent == MACRON then
		-- Hack for liquid diphthongs: double the macron on e/o before
		-- l/r + consonant, since the next substitution removes one macron
		-- after every short vowel.
		local liquid_diphthong = "([eo])" .. MACRON .. "([lr]" .. cons_c .. ")"
		stem = rsub(stem, liquid_diphthong, "%1" .. MACRON .. MACRON .. "%2")
		stem = rsub(stem, "(" .. short_vowels_c .. ")" .. MACRON, "%1")
	end

	-- Inverse breve on a short vowel outside a liquid diphthong becomes a
	-- double grave.
	stem = demote_on_short_vowel(stem, INVBREVE, DOUBLEGR)
	-- Tilde on a short vowel outside a liquid diphthong becomes a single
	-- grave.
	stem = demote_on_short_vowel(stem, TILDE, GR)

	return stem
end

-- Infer where the accent goes for an unstressed WORD under accent pattern
-- AP ("a", "b" or "c"; any other value yields no candidates). Returns a
-- list of possibilities: empty when nothing could be inferred, and with more
-- than one entry when several accentuations are possible (e.g. *voľa-type
-- nouns). Each entry is a three-element list {stem, desinence, final_accent}.
-- Asserts that WORD carries no stress accent.
local function infer_accent(word, ap)
	assert(not export.is_stressed(word))
	local stem, desinence, final_accent = export.split_stem_desinence(word)
	local candidates = {}

	-- Add a candidate with ACCENT set on the stem; the ending keeps the
	-- final accent that was split off the word.
	local function stress_stem(accent)
		candidates[#candidates + 1] = {export.set_accent(stem, accent), desinence, final_accent}
	end

	-- Add a candidate with ACCENT on the ending, taking the place of the
	-- final accent that was split off the word.
	local function stress_ending(accent)
		candidates[#candidates + 1] = {stem, desinence, accent}
	end

	if ap == "a" then
		-- Only inferable when the stem has a single vowel.
		if export.is_monosyllabic(stem) then
			stress_stem(GR)
		end
	elseif ap == "b" then
		if export.is_nonsyllabic(stem) then
			stress_ending(GR)
		elseif desinence == "ь" or desinence == "ъ" then
			stress_stem(TILDE)
		else
			stress_ending(GR)
			if desinence == "a" and rfind(stem, iotated_cons_c .. "$") then
				-- *voľa-type accent
				stress_stem(TILDE)
			end
		end
	elseif ap == "c" then
		if export.is_nonsyllabic(stem) then
			stress_ending(INVBREVE)
		elseif desinence == "a" then
			stress_ending(GR)
		else
			stress_stem(INVBREVE)
		end
	end

	return candidates
end

-- If WORD is unstressed, add the appropriate accent for the accent pattern
-- AP if possible (it won't be possible with accent pattern a in words with a
-- multisyllabic stem); when several accentuations are possible the first one
-- is used. If WORD is stressed, check that it matches one of the
-- accentuations inferred for AP and throw an error if it matches none.
--
-- Returns three values: STEM, DESINENCE and FINAL_ACCENT. For a stressed
-- WORD that passes the check they concatenate back to WORD. When nothing can
-- be inferred for AP, WORD is simply split without any change or check.
function export.auto_accent_and_check_accents(word, ap)
	if not export.is_stressed(word) then
		-- Unstressed input: take the first inferred accentuation, if any.
		local best = infer_accent(word, ap)[1]
		if best == nil then
			return export.split_stem_desinence(word)
		end
		return best[1], best[2], best[3]
	end

	-- Stressed input: infer from the unstressed spelling and compare.
	local candidates = infer_accent(export.make_unstressed(word), ap)
	if candidates[1] == nil then
		return export.split_stem_desinence(word)
	end

	local spellings = {}
	for i, candidate in ipairs(candidates) do
		local stem, desinence, final_accent = candidate[1], candidate[2], candidate[3]
		local spelling = stem .. desinence .. final_accent
		if spelling == word then
			return stem, desinence, final_accent
		end
		-- Remember the expected spelling for the error message.
		spellings[i] = spelling
	end

	error("For accent pattern " .. ap .. ", accented lemma should look like " ..
		table.concat(spellings, " or ") .. " but is actually " .. word)
end

-- Split WORD (which must already be decomposed) into three parts:
--   stem         - everything before the last character proper;
--   desinence    - that single character;
--   final_accent - the accent mark following it, or "" if there is none.
-- Raises an error when WORD cannot be split this way (e.g. it is empty).
function export.split_stem_desinence(word)
	export.assert_decomposed(word)
	local shape = "^(.-)(.)(" .. accents_c .. "?)$"
	local stem, desinence, final_accent = rmatch(word, shape)
	if not (stem and desinence) then
		error("Something wrong with '" .. word .. "', probably too short")
	end
	return stem, desinence, final_accent
end

return export