-- Module:User:Erutuon/grc-accent/sandbox

-- Table of functions returned by this module.
local export = {}

-- Ancient Greek data tables and generic table helpers.
local m_data = mw.loadData("Module:grc-utilities/data")
local m_table = require("Module:table")

-- Individual combining diacritics, looked up by name in the data module.
local diacritics_list = m_data.diacritics
local ACUTE = diacritics_list.acute
local GRAVE = diacritics_list.grave
local CIRCUMFLEX = diacritics_list.circum
local DIAERESIS = diacritics_list.diaeresis
local SMOOTH = diacritics_list.smooth
local ROUGH = diacritics_list.rough
local MACRON = diacritics_list.macron
local BREVE = diacritics_list.breve
local SUBSCRIPT = diacritics_list.subscript

-- Pattern fragments supplied by the data module.
local diacritic_pattern = m_data.all
local diacritic_groups = m_data.diacritic_groups
-- acute, grave, circumflex
local tonal_diacritic = diacritic_groups[3]
-- Diacritics whose presence makes get_length report a vowel as "long".
local long_diacritics = MACRON .. SUBSCRIPT .. CIRCUMFLEX

-- Vowel letters for which get_length reports "either" when they carry no
-- length-determining diacritic.
local either_vowel = "[ΑαΙιΥυ]"

-- Unicode-aware string functions, localized for brevity.
local ustring = mw.ustring
local find = ustring.find
local gsub = ustring.gsub
local match = ustring.match
local lower = ustring.lower
local decompose = ustring.toNFD

local tokenize = require('Module:grc-utilities').tokenize
-- Every caller below copies the result of tokenize before modifying it.
local copy = m_table.shallowcopy

-- Turns the empty string into nil; any other value is returned unchanged.
local function if_not_empty(var)
	if var ~= "" then
		return var
	end
	return nil
end

-- Returns the first Greek vowel letter found in the token (truthy), or nil
-- if the token contains none.
local function contains_vowel(token)
	local vowel_letter = match(token, '[ΑΕΗΙΟΥΩαεηιουω]')
	return vowel_letter
end

export.contains_vowel = contains_vowel

-- True if the token contains a vowel letter immediately followed by an iota
-- or upsilon; always returns a real boolean.
local function is_diphthong(token)
	return find(token, "[ΑαΕεΗηΙιΟοΥυΩω][ΙιΥυ]") ~= nil
end

local libraryUtil = require('libraryUtil')
local checkType = libraryUtil.checkType
local checkTypeMulti = libraryUtil.checkTypeMulti

-- Builds an argument checker bound to the function name funcName.
--
-- If expectType is a string, the returned checker has the signature
-- (argIndex, arg, nilOk) and always checks against that type.
-- Otherwise the returned checker has the signature
-- (argIndex, arg, expectType, nilOk); a table of type names is handed to
-- checkTypeMulti, anything else to checkType.
local function _check(funcName, expectType)
	if type(expectType) ~= "string" then
		return function(argIndex, arg, expected, nilOk)
			local checker = checkType
			if type(expected) == "table" then
				checker = checkTypeMulti
			end
			-- Deliberately not a tail call, so the call depth seen by the
			-- checker stays the same.
			checker(funcName, argIndex, arg, expected, nilOk)
		end
	end

	return function(argIndex, arg, nilOk)
		checkType(funcName, argIndex, arg, expectType, nilOk)
	end
end

-- A vowel with a breve or a lone epsilon or omicron is considered short.
-- Everything else is considered long, including unmarked alphas, iotas, and
-- upsilons. Sigh.
local function is_short(token)
	if match(token, BREVE) then
		return true
	end
	-- Epsilon or omicron counts as short only when no lowercase iota or
	-- upsilon is present in the token.
	local lone_short_letter = match(token, '[ΕΟεο]') and not match(token, '[ιυ]')
	return lone_short_letter and true or false
end

-- Same arguments as gsub. Returns the resulting string only if at least one
-- substitution was made; otherwise returns nil.
local function conditional_gsub(...)
	local result, replacements = gsub(...)
	if not replacements or replacements <= 0 then
		return nil
	end
	return result
end

-- Maps a combining diacritic to the function that adds it to a single vowel
-- token. Built once when the module loads instead of on every call to add().
-- All handlers assume decomposed vowels (NFD).
local add_functions = {
	-- This will not throw an error if η or ω has a macron on it.
	[CIRCUMFLEX] = function(vowel)
		return (gsub(
			vowel,
			"([ΑαΗηΙιΥυΩω])" .. MACRON .. "?(" .. diacritic_groups[2] .. "?)(" .. SUBSCRIPT .. "?)$",
			"%1%2" .. CIRCUMFLEX .. "%3"
		))
	end,
	[ACUTE] = function(vowel)
		-- An epsilon directly followed by omega gets the acute on the
		-- epsilon; otherwise the acute goes after the vowel and its other
		-- diacritics but before any iota subscript.
		return (
			conditional_gsub(vowel, "([Εε])([Ωω])", "%1" .. ACUTE .. "%2") or
			gsub(
				vowel,
				"([ΑαΕεΗηΙιΟοΥυΩω]" .. diacritic_groups[1] .. "?" .. diacritic_groups[2] .. "?)(" .. SUBSCRIPT .. "?)$",
				"%1" .. ACUTE .. "%2"
			)
		)
	end,
	[MACRON] = function(vowel)
		if find(vowel, "[" .. long_diacritics .. "]") or is_diphthong(vowel) then
			-- Already long; leave it unchanged.
			return vowel
		elseif find(vowel, "[ΕΟεο]") then
			error("The vowel " .. vowel ..
				" is short, so a macron cannot be added to it.")
		else
			return (gsub(vowel, "(" .. either_vowel .. ")", "%1" .. MACRON))
		end
	end,
	[BREVE] = function(vowel)
		if find(vowel, "[" .. long_diacritics .. "]") then
			error("The vowel " .. vowel ..
				" has a iota subscript, a macron, or a circumflex, so a breve cannot be added to it.")
		elseif is_diphthong(vowel) then
			error("The vowel " .. vowel ..
				" is a diphthong, so a breve cannot be added to it.")
		else
			return (gsub(vowel, "(" .. either_vowel .. ")", "%1" .. BREVE))
		end
	end,
	-- This will insert a diaeresis on a single iota or upsilon, or on an
	-- iota or upsilon that is the second element of a diphthong.
	-- It does nothing if the vowel has a breathing on it.
	[DIAERESIS] = function(vowel)
		return (gsub(
			vowel,
			"([ΙιΥυ]" .. diacritic_groups[1] .. "?)(" .. tonal_diacritic .. "?)$",
			"%1" .. DIAERESIS .. "%2"
		))
	end,
}

-- Adds the diacritic `accent` to the vowel token `vowel` and returns the
-- new token. Assumes decomposed vowels (NFD).
-- Raises an error if there is no handler for `accent`, or if the handler
-- rejects the vowel (macron on ε/ο, breve on a long vowel or diphthong).
local function add(vowel, accent)
	local handler = add_functions[accent]
	if type(handler) == "function" then
		return handler(vowel)
	end

	local name = m_table.keyFor(diacritics_list, accent)
	if name == "circum" then
		name = "circumflex"
	elseif name == nil then
		-- The value is not in the diacritics list at all; previously this
		-- failed with a nil-concatenation error instead of this message.
		name = "diacritic " .. tostring(accent)
	end
	error("No function for adding a " .. name .. ".")
end

-- Returns the word in decomposed form with every match of
-- diacritic_pattern removed.
function export.strip_accent(word)
	-- Assigning to a single local discards gsub's second return value,
	-- the number of substitutions.
	local stripped = gsub(decompose(word), diacritic_pattern, '')
	return stripped
end

-- Returns the decomposed word with tonal diacritics (acute, grave,
-- circumflex) removed. A circumflex on α, ι or υ is replaced by a macron.
function export.strip_tone(word)
	word = decompose(word)

	if match(word, CIRCUMFLEX) then
		local circumflexed =
			'^([αΑιΙυΥ])([' .. SMOOTH .. ROUGH .. DIAERESIS .. ']*)' .. CIRCUMFLEX .. '$'
		local with_macron = '%1' .. MACRON .. '%2'

		local tokens = copy(tokenize(word))
		for index = 1, #tokens do
			-- Add a macron to every vowel with a circumflex and remove the
			-- circumflex. Only the first result of gsub is stored.
			tokens[index] = gsub(tokens[index], circumflexed, with_macron)
		end
		word = table.concat(tokens)
	end

	local toneless = gsub(word, tonal_diacritic, '')
	return toneless
end

-- Puts an acute on the last vowel token of the word. A word that already
-- has a tonal diacritic is returned (decomposed) without further change.
function export.ult(word)
	local decomposed = decompose(word)
	if match(decomposed, tonal_diacritic) then
		return decomposed
	end

	local tokens = copy(tokenize(decomposed))
	for index, token in m_table.reverseIpairs(tokens) do
		if contains_vowel(token) then
			-- fortunately accents go last in combining order
			tokens[index] = add(token, ACUTE)
			return table.concat(tokens, '')
		end
	end
	return table.concat(tokens, '')
end

-- Accents the last vowel token: acute if is_short says it is short,
-- circumflex otherwise. A word that already has a tonal diacritic is
-- returned (decomposed) without further change.
--
-- WARNING: Given an unmarked α ι υ, this function will return a circumflex.
-- That said, if you ran into this situation in the first place, you
-- probably are doing something wrong.
function export.circ(word)
	local decomposed = decompose(word)
	if match(decomposed, tonal_diacritic) then
		return decomposed
	end

	local tokens = copy(tokenize(decomposed))
	for index, token in m_table.reverseIpairs(tokens) do
		if contains_vowel(token) then
			local mark = is_short(token) and ACUTE or CIRCUMFLEX
			tokens[index] = add(token, mark)
			return table.concat(tokens, '')
		end
	end
	return table.concat(tokens, '')
end

-- Puts an acute on the second vowel token from the end.
--  * A word that already has a tonal diacritic is returned decomposed.
--  * If a '-' token is met before the second vowel, the original argument
--    is returned untouched.
--  * If the word has fewer than two vowel tokens, falls back to export.circ.
function export.penult(orig)
	local decomposed = decompose(orig)
	if match(decomposed, tonal_diacritic) then
		return decomposed
	end

	local tokens = copy(tokenize(decomposed))
	local vowels_seen = 0
	for index, token in m_table.reverseIpairs(tokens) do
		if token == '-' then
			return orig
		end
		if contains_vowel(token) then
			vowels_seen = vowels_seen + 1
			if vowels_seen == 2 then
				tokens[index] = add(token, ACUTE)
				return table.concat(tokens, '')
			end
		end
	end

	return export.circ(orig)
end

-- Accents the second vowel token from the end: acute if that vowel is short
-- or the last vowel counts as long, circumflex otherwise. A last token that
-- is exactly 'αι' or 'οι' does not count as long.
--  * A word that already has a tonal diacritic is returned decomposed.
--  * If a '-' token is met first, the original argument is returned.
--  * If the word has fewer than two vowel tokens, falls back to export.circ.
function export.pencirc(orig)
	local decomposed = decompose(orig)
	if match(decomposed, tonal_diacritic) then
		return decomposed
	end

	local tokens = copy(tokenize(decomposed))
	local vowels_seen = 0
	local long_ult = false
	for index, token in m_table.reverseIpairs(tokens) do
		if token == '-' then
			return orig
		end
		if contains_vowel(token) then
			vowels_seen = vowels_seen + 1
			if vowels_seen == 1 then
				if not is_short(token) then
					local final_token = tokens[#tokens]
					long_ult = not (final_token == 'αι' or final_token == 'οι')
				end
			elseif vowels_seen == 2 then
				local mark = (is_short(token) or long_ult) and ACUTE or CIRCUMFLEX
				tokens[index] = add(token, mark)
				return table.concat(tokens, '')
			end
		end
	end

	return export.circ(orig)
end

-- Puts an acute on the third vowel token from the end, or on the second
-- when the last vowel counts as long. A last token that is exactly 'αι' or
-- 'οι' does not count as long.
--  * A word that already has a tonal diacritic is returned decomposed.
--  * If a '-' token is met first, the original argument is returned.
--  * If no accent could be placed, falls back to export.pencirc.
function export.antepenult(orig)
	local decomposed = decompose(orig)
	if match(decomposed, tonal_diacritic) then
		return decomposed
	end

	local tokens = copy(tokenize(decomposed))
	local vowels_seen = 0
	local long_ult = false
	for index, token in m_table.reverseIpairs(tokens) do
		if token == '-' then
			return orig
		end
		if contains_vowel(token) then
			vowels_seen = vowels_seen + 1
			if vowels_seen == 1 then
				if not is_short(token) then
					local final_token = tokens[#tokens]
					long_ult = not (final_token == 'αι' or final_token == 'οι')
				end
			elseif (vowels_seen == 2 and long_ult) or vowels_seen == 3 then
				tokens[index] = add(token, ACUTE)
				return table.concat(tokens, '')
			end
		end
	end

	return export.pencirc(orig)
end

-- Results of detect_accent, keyed first by direction (true = from the end)
-- and then by the decomposed word. Only successful detections are stored.
local accent_cache = { [true] = {}, [false] = {} }

-- Counts from the beginning or end of the word, and returns the position
-- and type of the first accent found. Position means the number of vowels
-- (syllables) that have been encountered, not the number of characters.
-- Returns nil if no accented vowel is found.
-- Arguments:
-- - word:      string  (Ancient Greek word)
-- - from_end:  boolean (whether to count from the end of the word)
function export.detect_accent(word, from_end)
	local check = _check("detect_accent")
	check(1, word, "string")
	check(2, from_end, "boolean", true)

	local direction_cache = accent_cache[from_end == true]
	local cache_key = decompose(word)
	local cached = direction_cache[cache_key]
	if cached then
		return unpack(cached)
	end

	local names = {
		[ACUTE]      = "acute",
		[GRAVE]      = "grave",
		[CIRCUMFLEX] = "circumflex",
	}

	local iterate = from_end and m_table.reverseIpairs or ipairs
	local syllable = 0
	for _, token in iterate(tokenize(word)) do
		if contains_vowel(token) then
			syllable = syllable + 1
			local accent_name = names[match(token, tonal_diacritic)]
			if accent_name then
				direction_cache[cache_key] = { syllable, accent_name }
				return syllable, accent_name
			end
		end
	end

	return nil
end

-- Returns classification based on first accent found when traveling back
-- from the end of the word: "oxytone", "paroxytone", "proparoxytone",
-- "perispomenon", "properispomenon" or "barytone".
-- Returns nil if detect_accent finds no accent.
-- Raises an error if the accent type has no term for that syllable.
function export.get_accent_term(word)
	local syllable, accent_name = export.detect_accent(word, true)
	if not (syllable and accent_name) then
		return nil
	end

	-- Terms indexed by accent type, then by syllable counted from the end.
	local terms = {
		["grave"]      = { "barytone" },
		["acute"]      = { "oxytone", "paroxytone", "proparoxytone" },
		["circumflex"] = { "perispomenon", "properispomenon" },
	}
	local term = terms[accent_name][syllable]
	if term then
		return term
	end

	local ordinals = { "first", "second", "third", "fourth", "fifth", }
	-- Beyond the fifth syllable there is no spelled-out ordinal; fall back
	-- to a numeric one so that the intended message is still produced
	-- instead of a nil-concatenation error.
	local ordinal = ordinals[syllable] or (syllable .. "th")
	error('There is no term for a word with a ' .. accent_name ..
		' accent on the ' .. ordinal ..
		' syllable from the end of the word.')
end

-- Classifies the vowel length of a single token.
-- Arguments:
-- - token:            string (one token, e.g. a vowel with its diacritics)
-- - short_diphthong:  if truthy, a token beginning with αι or οι is
--                     treated as short.
-- Returns "long", "short" or "either", or nil if the token has no vowel.
function export.get_length(token, short_diphthong)
	local lowered = lower(token)
	-- Decomposition is not needed at the moment.

	local length
	if not contains_vowel(lowered) then
		-- Not treated as an error: callers use nil to skip non-vowel tokens.
		length = nil
	elseif find(lowered, "[ηω" .. long_diacritics .. "]") then
		-- η, ω; ᾳ, ῃ, ῳ; ᾱ, ῑ, ῡ
		length = "long"
	elseif short_diphthong and find(lowered, "^[αο]ι") then
		length = "short"
	elseif is_diphthong(lowered) then
		length = "long"
	elseif find(lowered, "[εο" .. BREVE .. "]") then
		-- ε, ο; ᾰ, ῐ, ῠ
		length = "short"
	else
		-- anything else
		length = "either"
	end
	return length
end

-- Takes a table of tokens and returns a table containing tables of each
-- vowel's characteristics, ordered from the end of the word: entry 1
-- describes the last vowel token. Each entry has the fields
--   index  (position of the token in `tokens`),
--   length (result of export.get_length),
--   accent (the acute, grave or circumflex found in the token, or nil).
-- short_diphthong is passed to get_length for the last vowel only, and
-- only if the last token does not end in a consonant.
-- An empty token table yields an empty result.
-- Raises an error if `tokens` is not a table.
function export.get_vowel_info(tokens, short_diphthong)
	if type(tokens) ~= "table" then
		error("The argument to get_vowel_info must be a table.")
	end

	local vowels = {}
	local vowel_i = 1

	-- Guard against an empty token list: tokens[0] would be nil and find
	-- would raise an unrelated argument error.
	local last_token = tokens[#tokens]
	if last_token and find(last_token, m_data.consonant .. "$") then
		short_diphthong = false
	end

	for i, token in m_table.reverseIpairs(tokens) do
		if contains_vowel(token) then
			-- Only the last vowel of the word may be a short diphthong.
			if vowel_i ~= 1 then
				short_diphthong = false
			end
			local length, accent =
				export.get_length(token, short_diphthong),
				if_not_empty(match(token, "[" .. ACUTE .. GRAVE .. CIRCUMFLEX .. "]"))
			vowels[vowel_i] = {
				index = i,
				length = length,
				accent = accent,
			}
			vowel_i = vowel_i + 1
		end
	end

	return vowels
end

-- Adds the macrons and breves that follow from the position and type of
-- the accent on the last vowels of the word.
-- Arguments:
-- - word:             string
-- - return_tokens:    if truthy, return the table of tokens instead of the
--                     concatenated string
-- - short_diphthong:  passed on to export.get_vowel_info
-- The word is decomposed first; the result is in decomposed form.
function export.mark_implied_length(word, return_tokens, short_diphthong)
	word = decompose(word)

	-- Do nothing if there are no vowel letters that could be ambiguous.
	if not find(word, either_vowel) then
		if return_tokens then
			return tokenize(word)
		end
		return word
	end

	local tokens = copy(tokenize(word))
	local vowels = export.get_vowel_info(tokens, short_diphthong)

	-- Adds `diacritic` to the token at position `index`.
	local function mark(index, diacritic)
		tokens[index] = add(tokens[index], diacritic)
	end

	if #vowels >= 2 then
		local ultima, penult, antepenult = vowels[1], vowels[2], vowels[3]
		local accent = penult.accent

		if penult.length == "either" and ultima.length == "short" then
			-- The accent on the penult settles the penult's own length.
			if accent == CIRCUMFLEX then
				mark(penult.index, MACRON)
			elseif accent == ACUTE then
				mark(penult.index, BREVE)
			end
		elseif penult.length == "long" and ultima.length == "either" then
			-- The accent on a long penult settles the ultima's length.
			if accent == CIRCUMFLEX then
				mark(ultima.index, BREVE)
			elseif accent == ACUTE then
				mark(ultima.index, MACRON)
			end
		end

		-- An accented antepenult marks an ambiguous ultima as short.
		if antepenult and antepenult.accent and ultima.length == "either" then
			mark(ultima.index, BREVE)
		end
	end

	if return_tokens then
		return tokens
	end
	return table.concat(tokens)
end

-- Returns the length of a syllable specified by its position from the end
-- of the word ("long", "short" or "either", as reported by get_length).
-- Arguments:
-- - word:      string
-- - syllable:  number (1 is the last syllable)
-- Raises an error if an argument has the wrong type, or if the word has
-- fewer than `syllable` vowel tokens.
function export.length_at(word, syllable)
	-- Validate before tokenizing, so that a wrong argument type is reported
	-- with the messages below rather than from inside tokenize.
	if type(word) ~= "string" then
		error("First argument of length_at should be a string.")
	end
	if type(syllable) ~= "number" then
		error("Second argument of length_at should be a number.")
	end

	local tokens = tokenize(word)
	local syllable_count = 0
	for _, token in m_table.reverseIpairs(tokens) do
		local length = export.get_length(token)
		-- get_length returns nil for tokens without a vowel.
		if length then
			syllable_count = syllable_count + 1
			if syllable_count == syllable then
				return length
			end
		end
	end

	if syllable_count < syllable then
		error("Length for syllable " .. syllable .. " from the end of the word was not found.")
	end
end

-- Returns the rough or smooth breathing found in the token, or nil.
local function find_breathing(token)
	local breathing = match(token, "([" .. ROUGH .. SMOOTH .. "])")
	return breathing
end

-- True if both tokens have the same breathing, or if neither has one.
local function has_same_breathing_as(token1, token2)
	local breathing1 = find_breathing(token1)
	local breathing2 = find_breathing(token2)
	return breathing1 == breathing2
end

-- Make token have the length specified by the string "length": "long" adds
-- a macron, "short" adds a breve, anything else returns the token as is.
local function change_length(length, token)
	if length == "long" then
		return add(token, MACRON)
	elseif length == "short" then
		return add(token, BREVE)
	end
	return token
end

-- Take two words, mark implied length on each, then harmonize any macrons
-- and breves that disagree: walking from the start of both words, a vowel
-- whose length is "either" in one word takes the length of the matching
-- vowel in the other. The walk stops at the first position where the
-- accent-stripped tokens or the breathings differ.
-- Returns both words in decomposed form.
function export.harmonize_length(word1, word2)
	-- Decompose both words, so that the ambiguity check below sees the bare
	-- vowel letters of each and both results are in the same form.
	word1 = decompose(word1)
	word2 = decompose(word2)

	-- Do nothing if there are no vowel letters that could be ambiguous.
	if not (find(word1, either_vowel) or find(word2, either_vowel)) then
		return word1, word2
	end

	-- Copy defensively: the token tables are modified below.
	local tokens1 = copy(export.mark_implied_length(word1, true))
	local tokens2 = copy(export.mark_implied_length(word2, true))
	local strip1 = copy(tokenize(export.strip_accent(word1)))
	local strip2 = copy(tokenize(export.strip_accent(word2)))

	-- ipairs, not pairs: the loop stops at the first mismatch, so the
	-- tokens must be visited in order.
	for i, token1 in ipairs(tokens1) do
		local token2 = tokens2[i]
		if token2 == nil
			or strip1[i] ~= strip2[i]
			or not has_same_breathing_as(token1, token2) then
			break
		end

		local length1, length2 = export.get_length(token1), export.get_length(token2)
		if length1 and length2 and length1 ~= length2 then
			if length1 == "either" then
				tokens1[i] = change_length(length2, token1)
			elseif length2 == "either" then
				tokens2[i] = change_length(length1, token2)
			end
		end
	end

	return table.concat(tokens1), table.concat(tokens2)
end

-- Get weight of nth syllable from end of word. Position defaults to 1, the
-- last syllable.
-- Returns "heavy" if the vowel is long or is followed by more than one
-- token without a vowel, "light" if the vowel is short, and nil otherwise
-- (empty or missing word, no vowel at that position, or an ambiguous vowel
-- followed by at most one such token).
function export.get_weight(word, position)
	if not if_not_empty(word) then
		return nil
	end

	local tokens = tokenize(word)
	position = position or 1

	-- Find nth vowel from end of word.
	local vowel_i, vowel_length
	local vowels_seen = 0
	for i, token in m_table.reverseIpairs(tokens) do
		local length = export.get_length(token)
		if length then
			vowels_seen = vowels_seen + 1
			if vowels_seen == position then
				vowel_i, vowel_length = i, length
				break
			end
		end
	end

	if not vowel_i then
		return nil
	end
	if vowel_length == "long" then
		return "heavy"
	end

	-- Count consonants after the vowel, stopping at the next vowel token.
	local consonant_count = 0
	local next_i = vowel_i + 1
	while next_i <= #tokens and not contains_vowel(tokens[next_i]) do
		consonant_count = consonant_count + 1
		next_i = next_i + 1
	end

	if consonant_count > 1 then
		return "heavy"
	end
	if vowel_length == "short" then
		return "light"
	end
	return nil
end

-- Add accent mark at position. A positive position refers to the nth vowel
-- from the beginning of the word, a negative one to the nth vowel from the
-- end. Respects the rules of accent.
-- Examples:
-- - δημος,    1                        => δῆμος
-- - προτερᾱ,  1                        => προτέρᾱ  (position changed to 2
--                                         because ultima is long)
-- - μοιρα,    1, { circumflex = true } => μοῖρα    (circumflex can be added
--                                         because ultima is ambiguous)
-- - προτερᾱ,  2, { circumflex = true } => προτέρᾱ  (circumflex can't be
--                                         added because ultima is long)
-- - τοιουτος, 2                        => τοιοῦτος (circumflex because
--                                         ultima is short)
-- Arguments:
-- - word:               string (hopefully an Ancient Greek word or stem)
-- - syllable_position:  number (non-zero; if positive, no greater than the
--                       number of monophthongs or diphthongs in the word)
-- - options:            optional table with the fields
--     circumflex:       add a circumflex if allowed
--     synaeresis:       allow the accent on the antepenult despite a long
--                       ultima when ε is immediately followed by ω
--     short_diphthong:  passed on to export.get_vowel_info
-- A word that already has a tonal diacritic is returned decomposed and
-- otherwise unchanged. Raises an error for an invalid position.
function export.add_accent(word, syllable_position, options)
	local check = _check("add_accent")
	check(1, word, "string")
	check(2, syllable_position, "number")
	check(3, options, "table", true)

	word = decompose(word)
	if find(word, tonal_diacritic) then
		return word
	end

	options = options or {}

	local tokens = copy(tokenize(word))
	local vowels = export.get_vowel_info(tokens, options.short_diphthong)
	local vowel_count = #vowels

	-- Validate the position as given by the caller. Checking it only after
	-- the conversion below can never detect a position beyond the number
	-- of vowels, and reports a converted value instead of the caller's.
	if syllable_position == 0 then
		error("Invalid position value " .. syllable_position .. ".")
	elseif syllable_position > vowel_count then
		error("The position " .. syllable_position .. " is invalid, because the word has only " .. vowel_count .. " vowels.")
	end

	-- Convert positions in relation to the beginning of the word
	-- to positions in relation to the end of the word.
	-- The farthest back that an accent can be placed is 3 (the antepenult),
	-- so that is the greatest allowed position.
	if syllable_position > 0 then
		syllable_position = math.min(3, vowel_count - syllable_position + 1)
	-- If the position is in relation to the end of the word and it is greater
	-- than the length of the word, then reduce it to the length of the word.
	-- This is for practical reasons. Positions in relation to the beginning of
	-- the word do not need leeway.
	else
		syllable_position = math.min(-syllable_position, vowel_count)
	end

	-- Apply accent rules to change the accent's position or type.
	local accent_mark = options.circumflex and CIRCUMFLEX or ACUTE
	local ultima = vowels[1]

	if syllable_position == 3 then
		local penult = vowels[2]
		-- ε immediately followed by ω, with synaeresis requested, is the one
		-- case where a long ultima does not move the accent to the penult.
		local synaeresis = options.synaeresis
			and ("Εε"):find(tokens[penult.index], 1, true)
			and ("Ωω"):find(tokens[ultima.index], 1, true)
			and ultima.index - penult.index == 1
		if ultima.length == "long" and not synaeresis then
			syllable_position = 2
		else
			accent_mark = ACUTE
		end
	end

	if syllable_position == 2 then
		if ultima.length == "short" and vowels[2].length == "long" then
			accent_mark = CIRCUMFLEX
		elseif ultima.length == "long" then
			accent_mark = ACUTE
		end
	end

	local vowel = vowels[syllable_position]
	if not vowel then
		error('No vowel at position ' .. syllable_position ..
			' from the end of the word ' .. word .. '.')
	end

	-- A short vowel can only take an acute.
	if vowel.length == "short" then
		accent_mark = ACUTE
	end

	local i = vowel.index
	tokens[i] = add(tokens[i], accent_mark)

	return table.concat(tokens)
end

-- Runs a named test on the number of syllables (vowel tokens) in a word.
-- Arguments:
-- - word:    string
-- - func:    string, the name of the test; only "eq" is defined
-- - number:  number to compare the syllable count with
-- "eq" returns true if the word has exactly `number` vowel tokens and
-- false otherwise.
-- Raises an error if func is missing or is not the name of a known test.
function export.syllables(word, func, number)
	local check = _check('syllables')
	check(1, word, 'string')
	check(2, func, 'string', true)
	check(3, number, 'number', true)

	if not func then
		error('No function specified')
	end

	local functions = {
		eq = function (word, number)
			local vowels = 0
			for _, token in ipairs(tokenize(word)) do
				if contains_vowel(token) then
					vowels = vowels + 1
					-- Stop early once the count has passed the target.
					if vowels > number then
						return false
					end
				end
			end
			return vowels == number
		end
	}

	-- Keep the requested name in `func`: overwriting it with the lookup
	-- result made the error below concatenate nil instead of the name.
	local handler = functions[func]
	if not handler then
		error('No function ' .. func)
	end
	return handler(word, number)
end

return export