-- Module:User:Ssvb/ru-autoaccent

-- Table holding the functions exported by this module (see export.normalize)
local export = {}

-- TODO: Implement more special cases from Benwing2's
--       https://github.com/benwing2/RuNounChanges/blob/master/auto_accent_auto_bracket_ru.py
--       All of these need to get proper test coverage.
-- TODO: Maybe handle words with more than one letter "ё". The whole dictionary
--       contains just a few of them and for now we may just ignore them. See the
--       "words_with_double_jo" table returned from query_extra_info. Ignoring
--       these words is also safe.
-- TODO: Maybe have configuration knobs for enabling/disabling certain features?
--       For example, the pre-1918 orthography conversion could be turned off,
--       forceful correction of the incorrect accents in the original text could
--       be done, maybe the words could be turned into links, etc.

local decompose = require("Module:ru-common").decompose local m_dict = require("Module:User:Ssvb/ru-accentdict") local lookup_word = m_dict.lookup_word local max_stress_search_steps = m_dict.query_extra_info["max_stress_search_steps"] local max_jo_search_steps = m_dict.query_extra_info["max_jo_search_steps"]

-- All letters treated as vowels by this module (lowercase first, then uppercase)
local vowels = "аеёєэиіїоуюяыѣѵАЕЁЄЭИІЇОУЮЯЫѢѴ"
-- Pattern character classes: exactly one vowel / exactly one non-vowel
local vowel = ("[%s]"):format(vowels)
local nonvowel = ("[^%s]"):format(vowels)
-- Combining accent marks
local GR = require("Module:string/char")(0x0300) -- grave
local AC = require("Module:string/char")(0x0301) -- acute

-- Short aliases for the Unicode-aware string functions
local rsubn = mw.ustring.gsub
local rfind = mw.ustring.find

-- Same as rsubn(), but yields only the resulting string and drops the
-- substitution count
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end

-- Attempt to annotate a word with a stress accent by probing the dictionary
-- with the acute accent placed on each vowel in turn, starting from the last
-- vowel and moving backwards. The number of probes is capped by the
-- 'max_stress_search_steps' statistical parameter, which serves both as a
-- performance optimization and as an anti-DoS safeguard.
--
-- Returns:
--  unaccented word  - the word is ambiguous and can't be safely accented
--  accented word    - the suggested accent position
--  nothing (nil)    - abstain from making any accent placement decisions
local function try_to_recover_ac(word)
	-- A word that is found in the dictionary exactly as given is accepted,
	-- whether it carries an accent or not: there are multi-syllable words
	-- without accent (such as подо or обо) and plenty of single-syllable ones.
	-- An explicit accent in the input was most likely added by the author to
	-- resolve an ambiguity, so no further analysis is needed in that case.
	if lookup_word(word) then
		return word
	end

	-- An accented word that is missing from the dictionary may simply lack
	-- a "ё", or the author may have accented it wrongly. Either way it's not
	-- our call to judge, so abstain.
	if rfind(word, AC) then
		return
	end

	-- From here on the word has no acute accent at all
	local accent_last_vowel = "(" .. vowel .. ")(" .. nonvowel .. "*)$"
	local shift_accent_back = "(" .. vowel .. ")(" .. nonvowel .. "*" .. vowel .. ")" .. AC
	local accented_form = "%1" .. AC .. "%2"

	local combined = word -- accumulates every accent position that was confirmed
	local hits = 0        -- number of confirmed accent positions
	local probes = 0      -- number of dictionary probes done so far

	local candidate = rsub(word, accent_last_vowel, accented_form)
	while candidate ~= word and probes < max_stress_search_steps do
		probes = probes + 1
		word = candidate
		if lookup_word(word) then
			hits = hits + 1
			-- Merge this accent into 'combined'. This is needed for "ка́та́рсис"
			-- and similar words with several valid accent positions. Accents
			-- are probed right to left, so the bytes of 'combined' before this
			-- position are still identical to those of 'word'.
			local accent_pos = word:find(AC, 1, true)
			combined = combined:sub(1, accent_pos - 1) .. AC .. combined:sub(accent_pos)
		end
		-- Move the accent one vowel to the left for the next probe
		candidate = rsub(word, shift_accent_back, accented_form)
	end

	if hits == 1 then
		-- Exactly one possible accent position: that's our answer
		return combined
	elseif hits > 1 then
		-- Either a word with several valid accent positions like "ка́та́рсис",
		-- or an ambiguity such as "замо́к" vs. "за́мок". A lookup of the word
		-- with all the accents combined tells these two cases apart.
		if lookup_word(combined) then
			return combined
		end
		return rsub(combined, AC, "")
	end
end

-- Maps "е" to "ё" and back, for both letter cases
local flip_jejo = {
	["е"] = "ё",
	["ё"] = "е",
	["Е"] = "Ё",
	["Ё"] = "Е",
}

-- Attempt to recover a single missing letter "ё" in the word, together with
-- the accent position. Words with several "ё" letters are extremely uncommon,
-- so that edge case is not supported right now. The number of attempts is
-- capped by 'max_jo_search_steps'.
--
-- Returns:
--  unmodified word  - the word is ambiguous and can't be modified safely
--  modified word    - the suggested adjustment of the word
--  nothing (nil)    - abstain from making any word adjustment decisions
local function try_to_recover_jo(word)
	-- Turns the last "е" of the word into "ё"
	local function flip_last_je(je, suffix)
		return flip_jejo[je] .. suffix
	end
	-- Moves the "ё" one "е" to the left: the "е" becomes "ё" and the old "ё"
	-- turns back into "е"
	local function move_jo_back(je, midpart, jo)
		return flip_jejo[je] .. midpart .. flip_jejo[jo]
	end

	local attempts = 0
	local candidate = rsub(word, "([еЕ])([^еЕ]*)$", flip_last_je)
	while candidate ~= word and attempts < max_jo_search_steps do
		attempts = attempts + 1
		word = candidate
		local accented = try_to_recover_ac(word)
		if accented then
			return accented
		end
		candidate = rsub(word, "([еЕ])([^еЕёЁ]*)([ёЁ])", move_jo_back)
	end
end

-- Remove the acute or grave accent that directly follows each vowel.
-- Returns the stripped word and, as a second value, a backup table
-- { acute_positions, grave_positions } whose keys are 1-based vowel indices.
local function strip_accents(word)
	local acute_at, grave_at = {}, {}
	local vowel_idx = 0
	local stripped = rsub(word, "(" .. vowel .. ")([" .. GR .. AC .. "]?)", function (letter, stress)
		vowel_idx = vowel_idx + 1
		if stress == AC then
			acute_at[vowel_idx] = true
		elseif stress == GR then
			grave_at[vowel_idx] = true
		end
		return letter
	end)
	return stripped, { acute_at, grave_at }
end

-- Put back the acute and grave accents recorded in 'accents_backup_tbl' (as
-- produced by strip_accents), dropping any accents currently present in the word
local function restore_accents(word, accents_backup_tbl)
	local vowel_idx = 0
	return rsub(word, "(" .. vowel .. ")([" .. GR .. AC .. "]*)", function (letter)
		vowel_idx = vowel_idx + 1
		if accents_backup_tbl[1][vowel_idx] then
			return letter .. AC
		end
		if accents_backup_tbl[2][vowel_idx] then
			return letter .. GR
		end
		return letter
	end)
end

-- Put back only the grave accents recorded in 'accents_backup_tbl'. Vowels
-- without a recorded grave keep whatever accents they currently have.
local function restore_gr(word, accents_backup_tbl)
	local vowel_idx = 0
	return rsub(word, "(" .. vowel .. ")([" .. GR .. AC .. "]*)", function (letter, stress)
		vowel_idx = vowel_idx + 1
		local mark = accents_backup_tbl[2][vowel_idx] and GR or stress
		return letter .. mark
	end)
end

-- Restore previously saved accents, choosing the strategy from what was saved:
--  * an acute was saved  - override all accents in the word with the saved ones
--  * only grave was saved - restore just the grave, keeping a possibly
--                           automatically assigned acute
--  * nothing was saved    - leave the word and its accents untouched
local function smart_restore_accents(word, accents_backup_tbl)
	local had_acute = next(accents_backup_tbl[1]) ~= nil
	if had_acute then
		return restore_accents(word, accents_backup_tbl)
	end
	local had_grave = next(accents_backup_tbl[2]) ~= nil
	if had_grave then
		return restore_gr(word, accents_backup_tbl)
	end
	return word
end

-- Convert from the pre-1918 to modern Russian orthography. Check the following
-- links as reference materials for the conversion rules:
--  https://ru.wikisource.org/wiki/Декрет_о_введении_нового_правописания
--  https://ru.wikisource.org/wiki/Декрет_о_введении_новой_орфографии
--  https://arzamas.academy/materials/1164 and https://историк.рф/journal/post/6042
--
-- The idea is to apply changes to the pre-reform word until a match is found in
-- the modern dictionary. For example:
-- * the word "самыя" is not a correct modern spelling. So it's transformed
--   into "са́мые" and confirmed by a dictionary lookup.
-- * the word "любившаго" is not a correct modern spelling either and there are two
--   candidates for it: "любившого" and "любившего". The word "люби́вшего" is confirmed.
-- * the word "губернскаго" is not a correct modern spelling and there are two
--   candidates for it: "губернского" and "губернскего". The word "губе́рнского"
--   is confirmed.
-- * the word "большаго" is not a correct modern spelling and there are two candidates
--   for it: "большо́го" and "бо́льшего". Ironically, both of these candidates are
--   different correct words in modern spelling, albeit with different stress positions.
--   It's not totally clear which variant to pick, but we prioritize "бо́льшего",
--   because "большого" is also a valid word in pre-reform spelling.
--
-- Technically, the underlying dictionary can have a separate section specifically
-- for identifying genitive singular adjectives, participles and pronouns for
-- "-аго/-яго" and nominative/accusative feminine/neuter plural adjectives,
-- participles and pronouns for "-ыя/-ія". But we'll cross that bridge when we
-- get to it.
--
-- Additionally, it goes without saying that for doing correct conversion to modern
-- spelling, the dictionary MUST NOT be contaminated with pre-reform spelling forms.
-- So it's important to have all pre-reform Russian words correctly annotated in
-- English Wiktionary and this information should be machine readable.

-- Whole pre-reform words with a dedicated modern replacement
local prereform_subst_words = {
	["онѣ"] = "они",
	["однѣ"] = "одни",
	["однѣхъ"] = "одних",
	["однѣми"] = "одними",
	["ея"] = "её",
	["Онѣ"] = "Они",
	["Однѣ"] = "Одни",
	["Однѣхъ"] = "Одних",
	["Однѣми"] = "Одними",
	["Ея"] = "Её",
}

-- Pre-reform letters and their modern replacements
local prereform_subst_letters = {
	["ѣ"] = "е", ["Ѣ"] = "Е",
	["і"] = "и", ["І"] = "И",
	["ѳ"] = "ф", ["Ѳ"] = "Ф",
	["ѵ"] = "и", ["Ѵ"] = "И"
}

-- Prefixes ending in "з" and the "с" spelling they take before the consonants
-- listed in the prefix pattern below
local prereform_subst_prefix = {
	["из"] = "ис",
	["воз"] = "вос",
	["вз"] = "вс",
	["раз"] = "рас",
	["роз"] = "рос",
	["низ"] = "нис",
	["без"] = "бес",
	["чрез"] = "чрес",
	["через"] = "черес",
	["Из"] = "Ис",
	["Воз"] = "Вос",
	["Вз"] = "Вс",
	["Раз"] = "Рас",
	["Роз"] = "Рос",
	["Низ"] = "Нис",
	["Без"] = "Бес",
	["Чрез"] = "Чрес",
	["Через"] = "Черес",
}

-- Modernize the orthography of 'word' and run 'callback_function' on the
-- resulting candidate spellings until one of them is accepted (the callback
-- returns a non-nil value). The accents of the original word are stripped
-- before the conversion and restored in the returned word. If no candidate is
-- accepted, the word with the unambiguous conversions applied is returned with
-- its original accents.
local function with_different_orthography(callback_function, word)
	-- Temporarily strip acute and grave accents
	local accents_backup
	word, accents_backup = strip_accents(word)
	-- Must be checked before "і" gets replaced by "и" below
	local have_ija = rfind(word, "ія$")

	-- These transformations can be safely applied and hopefully don't cause
	-- any ambiguity. But if a counter-example is presented, then some solution
	-- can be found.
	word = prereform_subst_words[word] or word
	word = rsub(word, ".", prereform_subst_letters)
	word = rsub(word, "ъ$", "")
	word = rsub(word, "ъ(" .. nonvowel .. ")", "%1")
	word = rsub(word, "^([^з]+з)([пфѳтсшкчщцх])", function (pref, letter)
		return (prereform_subst_prefix[pref] or pref) .. letter
	end)

	-- Runs the callback on one candidate spelling. Returns the accepted word
	-- with the original accents restored, or nothing if the callback abstained.
	local function attempt(candidate)
		local accepted = callback_function(candidate)
		if accepted then
			return smart_restore_accents(accepted, accents_backup)
		end
	end

	local outcome = attempt(word)
	if outcome then
		return outcome
	end

	-- There's a possible ambiguity in "бѣлорусскія сочиненія", because the
	-- former needs to be converted to "белору́сские" and the latter needs to
	-- be converted to "сочине́ния" despite both having the "-ія" suffix. This
	-- ambiguity is resolved by doing dictionary lookups.
	local first_guess = rsub(word, "аго$", "его")
	first_guess = rsub(first_guess, "яго$", "его")
	first_guess = rsub(first_guess, "ыя$", "ые")
	if have_ija then
		first_guess = rsub(first_guess, "ия$", "ие")
	end

	-- There's another ambiguity in "независимаго отъ него, долженствовавшаго".
	-- One word needs to be converted to "незави́симого" and another to
	-- "долженствовавшего" despite both having the "-аго" suffix. This is again
	-- resolved by doing one more dictionary lookup.
	local second_guess = rsub(word, "аго$", "ого")

	for _, candidate in ipairs({ first_guess, second_guess }) do
		if candidate ~= word then
			outcome = attempt(candidate)
			if outcome then
				return outcome
			end
		end
	end

	return restore_accents(word, accents_backup)
end

-- Merge two alternative word substitutions, 'subst1' and 'subst2' (for example
-- the outcomes of accent recovery and of "ё" recovery), into a single one.
-- A nil alternative means 'no real opinion' and the other one wins. When both
-- have an opinion, they must agree with each other; on a conflict the
-- 'word' fallback is returned instead.
local function evaluate_alternatives(subst1, subst2, word)
	if subst1 and subst2 then
		if subst1 == subst2 then
			return subst1
		end
		return word
	end
	return subst1 or subst2
end

-- Run 'callback_function' on both the original word and its de-capitalized
-- variant (first letter lowercased) to see whether the two routes lead to
-- different results (such as, for example, the "Ока́" vs. "о́ка" accent
-- positions). The two results are merged by evaluate_alternatives().
local function with_different_capitalization(callback_function, word)
	local function to_lower(letter)
		return mw.ustring.lower(letter)
	end
	local function to_upper(letter)
		return mw.ustring.upper(letter)
	end

	local decap_word = rsub(word, "^.", to_lower)
	if decap_word == word then
		-- The word wasn't actually capitalized, so there's only one way to do it
		return callback_function(word)
	end

	-- Process both the original and the de-capitalized variants
	local subst1 = callback_function(word)
	local subst2 = callback_function(decap_word)
	if subst2 ~= nil then
		-- Restore the capitalization of the de-capitalized variant
		subst2 = rsub(subst2, "^.", to_upper)
	end

	-- Compare the results of the two routes with different capitalization
	return evaluate_alternatives(subst1, subst2, word)
end

-- Process a chunk of text and opportunistically add letters "ё" and accents to
-- Russian words where it is possible to do so in an unambiguous way. While doing
-- this, the pre-1918 spelling is also converted to modern Russian spelling.
--
-- The text is split into (word, separator) pairs, where a word is a run of
-- Cyrillic letters and accent marks and a separator is everything up to the
-- next word. Each word is processed by the callback below.
--
-- Parameters:
--  text  - the text to process
-- Returns:
--  the processed text, passed through mw.ustring.toNFC
function export.normalize(text)
	text = decompose(text)
	local nested_square_brackets = 0
	local prev_word, cur_word
	-- Both separator variables have to be declared here. The former
	-- 'local prev_sep = "", next_sep' declared only 'prev_sep' and turned
	-- 'next_sep' into an accidental global variable.
	local prev_sep, next_sep = "", nil
	local result = rsub(text, "([" .. AC .. GR .. "Ѐ-џҊ-ԧꚀ-ꚗѣѢѳѲѵѴ]*)([^" .. AC .. GR .. "Ѐ-џҊ-ԧꚀ-ꚗѣѢѳѲѵѴ]*)", function (word, sep)
		-- Don't do anything with the text enclosed in double square brackets (possible wikilinks or other markup).
		-- The counter is updated from the separator that follows the current word, while the decision about
		-- the current word is based on the counter value from before that update.
		local cur_nested_square_brackets = nested_square_brackets
		rsub(sep, "%[%[", function (match) nested_square_brackets = nested_square_brackets + 1 end)
		rsub(sep, "%]%]", function (match) nested_square_brackets = nested_square_brackets - 1 end)
		if cur_nested_square_brackets ~= 0 then
			-- Returning nothing keeps the matched text unchanged
			return
		end
		-- First a basic check for obviously problematic words or word combinations
		prev_sep = next_sep
		next_sep = sep
		prev_word = cur_word
		cur_word = word
		local lowercase_cur_word = mw.ustring.lower(cur_word)
		local capitalized_cur_word = rsub(cur_word, "^.", function (letter) return mw.ustring.upper(letter) end)
		-- Avoid mixed case, such as "шИзОФреНия" or other similarly weird formatting style
		if cur_word ~= lowercase_cur_word and cur_word ~= capitalized_cur_word then
			return
		end
		-- Ignore two words separated by "(", such as "галер(е́я)"
		if (prev_sep == "(" and prev_word and prev_word ~= "") or next_sep == "(" then
			return
		end
		-- Do conversion to modern orthography, accent and "ё" recovery
		if prev_word and prev_word ~= "" and rfind(prev_sep, "^[%s,]+$") then
			-- *Definitely* the middle of a sentence here. Only spaces or commas separate us from the
			-- previous word. Maybe hyphens or semicolons could be added too, but they are less reliable.
			word = with_different_orthography(function (candidate)
				-- Need to explore two routes (with and without "ё" recovery) to reliably resolve the
				-- "слёзы" vs. "слезы́" or "узна́ем" vs. "узнаём" ambiguity. And since it's the middle of
				-- a sentence here, the actual capitalization of the word does matter for proper nouns.
				local subst1 = try_to_recover_ac(candidate) or with_different_capitalization(try_to_recover_ac, candidate)
				local subst2 = try_to_recover_jo(candidate) or with_different_capitalization(try_to_recover_jo, candidate)
				return evaluate_alternatives(subst1, subst2, candidate)
			end, word)
		else
			-- *Likely* the first word of a sentence. Making a wrong guess is safe, because this only
			-- adds ambiguity and prevents us from accenting certain words. Such as "О́ка" vs. "Ока́".
			prev_word = nil
			word = with_different_orthography(function (candidate)
				-- Need to explore two routes (with and without "ё" recovery) to reliably resolve the
				-- "слёзы" vs. "слезы́" or "узна́ем" vs. "узнаём" ambiguity. And since it's the start of
				-- a sentence here, the actual capitalization of the word is completely irrelevant.
				local subst1 = with_different_capitalization(try_to_recover_ac, candidate)
				local subst2 = with_different_capitalization(try_to_recover_jo, candidate)
				return evaluate_alternatives(subst1, subst2, candidate)
			end, word)
		end
		-- Strip stress accent, but keep modern orthography
		local modernized_without_ac = rsub(word, AC, "")
		-- The current word is preceded by a single syllable preposition, which was already
		-- accented in the original text
		if prev_word and rfind(prev_word, AC) and mw.ustring.len(rsub(prev_word, nonvowel, "")) == 1 then
			if not rfind(cur_word, AC) then
				word = modernized_without_ac
			end
		end
		-- Handle "предложно-именное сочетание" constructs (suppress accent in the second part of
		-- "до́ смерти" if it's found in the dictionary even without having the explicit accent markup)
		if prev_word then
			local combined = prev_word .. " " .. modernized_without_ac
			if lookup_word(combined) then
				word = modernized_without_ac
			end
			local decap_combined = rsub(combined, "^.", function (letter) return mw.ustring.lower(letter) end)
			if decap_combined ~= combined and lookup_word(decap_combined) then
				word = modernized_without_ac
			end
		end
		return word .. sep
	end)
	return mw.ustring.toNFC(result)
end

return export