-- Module:User:Theknightwho/ru-translit

-- Table of functions exposed by this module; it is returned at the end of
-- the file.
local export = {}

--[=[

FIXME:

1. (DONE) If you write Без, it transliterates to Bjez instead of Bez, as it
   should.
2. (DONE) Convert ъ to nothing before comma or other non-letter particle, e.g.
   in Однимъ словомъ, идешь на чтеніе.
3. (DONE) Make special-casing for adjectives in -го and for что (and friends)
   be the default, and implement transformations in Cyrillic rather than after
   translit so that we can display the transformed Cyrillic in the "phonetic
   respelling" notation.
4. (DONE) Convert apostrophe to ъ before transliteration when after a
   consonant and before a vowel (requested by Atitarev).
]=]

local u = mw.ustring.char local explode = require("Module:string utilities").explode_utf8 local concat = table.concat local insert = table.insert local rfind = mw.ustring.find local rsub = mw.ustring.gsub local rsplit = mw.text.split local toNFC = mw.ustring.toNFC local toNFD = mw.ustring.toNFD local decompose = require("Module:ru-common").decompose

-- Combining diacritics referenced by the patterns in this module, plus a
-- placeholder code point.
local AC = u(0x301) -- acute =  ́
local GR = u(0x0300) -- grave =  ̀
local DI = u(0x0308) -- diaeresis =  ̈
-- Temporarily stands in for г in words that must keep their g (e.g. много)
-- while the -го -> -во respelling runs; it is turned back into г afterwards.
local TEMP_G = u(0xFFF1) -- substitute to preserve g from changing to v

-- "If not empty": pass any value through unchanged, except that the empty
-- string is turned into nil.
local function ine(x)
	if x ~= "" then
		return x
	end
	return nil
end

-- Default character-by-character transliteration table (Cyrillic -> Latin).
-- In this table, we now map Cyrillic е and э to je and e, and handle the
-- post-consonant version (plain e and ɛ) specially.
-- Characters with no entry here are emitted unchanged by the transliterator.
local tab = {
	["А"]="A", ["Б"]="B", ["В"]="V", ["Г"]="G", ["Д"]="D", ["Е"]="Je", ["Ж"]="Ž", ["З"]="Z", ["И"]="I", ["Й"]="J",
	["К"]="K", ["Л"]="L", ["М"]="M", ["Н"]="N", ["О"]="O", ["П"]="P", ["Р"]="R", ["С"]="S", ["Т"]="T", ["У"]="U", ["Ф"]="F",
	["Х"]="X", ["Ц"]="C", ["Ч"]="Č", ["Ш"]="Š", ["Щ"]="Šč", ["Ъ"]="ʺ", ["Ы"]="Y", ["Ь"]="ʹ", ["Э"]="E", ["Ю"]="Ju", ["Я"]="Ja",
	['а']='a', ['б']='b', ['в']='v', ['г']='g', ['д']='d', ['е']='je', ['ж']='ž', ['з']='z', ['и']='i', ['й']='j',
	['к']='k', ['л']='l', ['м']='m', ['н']='n', ['о']='o', ['п']='p', ['р']='r', ['с']='s', ['т']='t', ['у']='u', ['ф']='f',
	['х']='x', ['ц']='c', ['ч']='č', ['ш']='š', ['щ']='šč', ['ъ']='ʺ', ['ы']='y', ['ь']='ʹ', ['э']='e', ['ю']='ju', ['я']='ja',
	-- Russian style quotes
	['«']='“', ['»']='”',
	-- archaic, pre-1918 letters
	['І']='I', ['і']='i', ['Ѳ']='F', ['ѳ']='f',
	['Ѣ']='Jě', ['ѣ']='jě', ['Ѵ']='I', ['ѵ']='i',
	-- archaic, pre-1700 letters
	['Ѕ']='Z', ['ѕ']='z', ['Ꙃ']='Z', ['ꙃ']='z', ['Ꙁ']='Z', ['ꙁ']='z',
	['Ѡ']='O', ['ѡ']='o', ['Ѿ']='Ot', ['ѿ']='ot',
	['Ꙋ']='U', ['ꙋ']='u', ['Ꙑ']='Y', ['ꙑ']='y',
	['Ꙗ']='Ja', ['ꙗ']='ja', ['Ѥ']='Je', ['ѥ']='je',
	['Ѧ']='Ja', ['ѧ']='ja', ['Ѩ']='Ja', ['ѩ']='ja',
	['Ѫ']='U', ['ѫ']='u', ['Ѭ']='Ju', ['ѭ']='ju',
	['Ѯ']='Ks', ['ѯ']='ks', ['Ѱ']='Ps', ['ѱ']='ps',
	['Є']='E', ['є']='e', ['Ї']='I', ['ї']='i',
}

-- Transliterations for ё and for the archaic letters that carry a diaeresis.
-- The transliterator looks a letter up here after recombining it with a
-- following diaeresis (DI) via toNFC.
local tab_jo = {
	["Ё"] = "Jo", ["Ѣ̈"] = "Jǒ", ["Я̈"] = "Jǫ", ["Ѧ̈"] = "Jǫ", ["Ѩ̈"] = "Jǫ",
	["ё"] = "jo", ["ѣ̈"] = "jǒ", ["я̈"] = "jǫ", ["ѧ̈"] = "jǫ", ["ѩ̈"] = "jǫ"
}

-- Transliterations of е, ѣ and э without the leading "j" (and with ɛ for э).
-- The transliterator uses these instead of the entries in `tab` when the
-- previous character is neither a vowel nor a hard/soft sign, or when the
-- letter starts a word that directly follows a leading dash.
local plain_e = {
	["Е"] = "E", ["Ѣ"] = "Ě", ["Э"] = "Ɛ",
	["е"] = "e", ["ѣ"] = "ě", ["э"] = "ɛ"
}

-- Character sets, stored as plain strings and queried with plain-text
-- string.find on a lowercased character.

-- Lowercase vowel characters (Cyrillic as well as Latin/IPA); used to count
-- the vowels of a word and to test the neighbours of е/э and of an apostrophe.
local vowels = "аеєиіоуүꙋѡѿꙑыѣэюꙗяѥѧѫѩѭѵaæɐeəɛiɪɨoɵuyʊʉ"
-- Hushing consonants after which the initial "j" of a jo-type transliteration
-- is dropped.
local hushing_jo = "жчшщ"
-- Characters after which izhitsa (ѵ) is rendered as в.
local izhitsa_v = "аеєиіѣэꙗяѥѧѩaæɐeəɛiɪɨ"

-- Respell the Cyrillic so that it more closely matches pronunciation.
-- Returns two values: the "original" text (after decomposing composed
-- grave characters), and the respelled text. If the two differ, callers
-- should display a "phonetic respelling" notation.
-- NOADJ disables special-casing for adjectives in -го, while FORCEADJ forces
-- special-casing for adjectives, including those in -аго (pre-reform spelling)
-- and disables checking for exceptions (e.g. много, ого). NOSHTO disables
-- special-casing for что and related words.
function export.apply_tr_fixes(text, noadj, noshto, forceadj)
	-- decompose stress accents without decomposing letters we want to treat
	-- as units (e.g. й or ё)
	text = decompose(text)
	local origtext = text

	-- Pattern fragments shared by the substitutions below.
	local accents = AC .. GR
	local opt_accent = "[" .. accents .. "]?"
	local word_start = "%f[%a" .. accents .. "]"
	local word_end = "%f[^%a" .. accents .. "]"

	-- The text:find() test is only an optimization: without a literal "го"
	-- none of the work in this branch is attempted.
	if not noadj and text:find("го") then
		local g_to_v = {["г"] = "в", ["Г"] = "В"}
		local function respell_g(e, g, o, sja)
			return e .. g_to_v[g] .. o .. (sja or "")
		end
		-- Pre-reform -аго is only respelled when FORCEADJ is set.
		local extra_a = forceadj and "аА" or ""

		-- Handle какого-нибудь/-либо/-то; must be done first because of an
		-- exception made for бого-, снего-, etc.
		text = rsub(
			text,
			"([кКтТ][аА][кК][оеОЕ" .. extra_a .. "]" .. opt_accent .. ")([гГ])([оО]%-)",
			respell_g
		)

		if not forceadj then
			-- Protect one exceptional word: CASE is a word (or prefix ending
			-- in "-") written with its stress mark; it is turned into a
			-- pattern with an optional accent and a case-insensitive first
			-- letter, and the г of its final "го" is swapped for TEMP_G so
			-- that the general rule below leaves it alone.
			local function protect(str, case)
				local pattern = rsub(
					case,
					"^(.)(.*)(го" .. opt_accent .. ")(%-?)$",
					function(first, middle, ending, dash)
						first = word_start .. "([" .. mw.ustring.upper(first) .. first .. "]"
						middle = middle:gsub("\204[\128\129]", opt_accent) .. ")"
						ending = ending:gsub("\204[\128\129]", opt_accent)
							:gsub("^г(.*)", "г(%1")
						dash = dash == "-" and "%-)" or ")" .. word_end
						return first .. middle .. ending .. dash
					end
				)
				return rsub(str, pattern, "%1" .. TEMP_G .. "%2")
			end

			local adverb_exceptions = {"мно́го", "н[еа]мно́го", "до́рого", "недо́рого", "стро́го", "нестро́го", "на́строго", "убо́го", "пол[ао]́го"}
			for _, case in ipairs(adverb_exceptions) do
				text = protect(text, case)
			end

			-- check for neuter short forms of compound adjectives in -но́гий
			if rfind(text, "но" .. opt_accent .. "го" .. word_end) then
				local nogij_exceptions = {"безно́го", "босоно́го", "веслоно́го", "длинноно́го", "двуно́го", "коротконо́го", "кривоно́го", "одноно́го", "пятино́го", "трёхно́го", "трехно́го", "хромоно́го", "четвероно́го", "шестино́го"}
				for _, case in ipairs(nogij_exceptions) do
					text = protect(text, case)
				end
			end

			local other_exceptions = {"ого́", "го́го", "ваго́го", "ло́го", "п[ео]́го", "со́го", "То́го", "ле́го", "игого́", "огого́", "альбиньязего", "д[иі]е́го", "бо́лого", "гр[иі]е́го", "манче́го", "пичис[иі]е́го", "тенкодого", "хио́го", "аго-", "его-", "ого-"}
			for _, case in ipairs(other_exceptions) do
				text = protect(text, case)
			end
		end

		-- handle genitive/accusative endings, which are spelled -ого/-его/-аго
		-- (-ogo/-ego/-ago) but transliterated -ovo/-evo/-avo; only for
		-- adjectives and pronouns, excluding words like много, ого (-аго
		-- occurs in pre-reform spelling)
		local ending = "([оеОЕ" .. extra_a .. "]" .. opt_accent .. ")([гГ])([оО]" .. opt_accent .. ")"
		local reflexive = "([сС][яЯ]" .. opt_accent .. ")"
		-- Like word_end, but a following TEMP_G also counts as part of the word.
		local boundary = "%f[^%a" .. accents .. TEMP_G .. "]"
		text = rsub(text, ending .. boundary, respell_g)
		text = rsub(text, ending .. reflexive .. boundary, respell_g)
		-- handle сегодня
		text = rsub(text, word_start .. "([Сс]е)г(о" .. opt_accent .. "дня)" .. word_end, "%1в%2")
		-- handle сегодняшн-
		text = rsub(text, word_start .. "([Сс]е)г(о" .. opt_accent .. "дняшн)", "%1в%2")
		-- replace TEMP_G with g; must be done after the -go -> -vo changes
		text = rsub(text, TEMP_G, "г")
	end

	-- As above, the text:find() test is only an optimization.
	if not noshto and text:find("то") then
		local ch2sh = {["ч"] = "ш", ["Ч"] = "Ш"}
		local function respell_ch(ch, to)
			return ch2sh[ch] .. to
		end
		-- Handle что
		text = rsub(text, word_start .. "([Чч])(то" .. opt_accent .. ")" .. word_end, respell_ch)
		-- Handle чтобы, чтоб
		text = rsub(text, word_start .. "([Чч])(то" .. opt_accent .. "бы?)" .. word_end, respell_ch)
		-- Handle ничто
		text = rsub(text, word_start .. "([Нн]и)ч(то" .. opt_accent .. ")" .. word_end, "%1ш%2")
	end

	-- Handle мягкий, лёгкий, легчать, etc.
	text = rsub(text, "([МмЛл][яеё]" .. opt_accent .. ")г([кч])", "%1х%2")

	return origtext, text
end

do
	-- Starting next to index d.i of TEXT (an array of characters) and moving
	-- in direction DIR (1 or -1), return the first character that is not
	-- contained in the string SKIP, or nil if the array runs out first.
	local function get_char(text, skip, d, dir)
		local i, ch = 0
		repeat
			i = i + 1
			ch = text[d.i + (i * dir)]
		until not (ch and skip:find(ch, 1, true))
		return ch
	end

	-- Transliterate the character at text[d.i], appending the result to
	-- OUTPUT. D carries the per-word state: the index `i`, the vowel count
	-- `vowels`, `dash_before`, and the fields `primary` and `final_jo` that
	-- are set here. May advance d.i (to swallow a diaeresis) and may rewrite
	-- text[d.i] so that later characters see the substituted letter.
	local function do_iteration(output, text, d)
		-- Get current, previous and next characters, skipping over brackets,
		-- and ignoring diacritics for the previous character (which
		-- simplifies checks).
		local this = text[d.i]
		local this_lower = this:ulower()
		local prev = get_char(text, AC .. GR .. DI .. "(", d, -1)
		local nxt = get_char(text, ")", d, 1)

		-- A word is monosyllabic if it has only one vowel.
		if vowels:find(this_lower, 1, true) then
			d.vowels = d.vowels + 1
		end

		-- Convert ё (and archaic friends) here, since they need to be dealt
		-- with as a special-case. In general terms, ё becomes jo, ѣ̈ becomes
		-- jǒ and я̈, ѧ̈ & ѩ̈ become jǫ. When determining stress, they are
		-- treated as interchangeable.
		-- Jos do not implicitly take stress accents if an explicit primary
		-- stress is given. Otherwise, the final jo which doesn't have
		-- secondary stress takes primary stress.
		-- Prefixes do not take implicit primary stress.
		-- Primary stress will be shown on monosyllables if either they are a
		-- suffix or include_monosyllabic_jo_accent is true.
		if nxt == DI then
			d.i = d.i + 1
			this = toNFC(this .. DI)
			local t = tab_jo[this]
			if t then
				-- An initial lowercase j is removed from a jo transliteration
				-- if preceded by a hushing consonant (ж ч ш щ) (but not for a
				-- capital, on the assumption that a medial J is part of an
				-- initialism).
				if (
					t:sub(1, 1) == "j" and
					prev and hushing_jo:find(prev:ulower(), 1, true)
				) then
					t = t:sub(2)
				end
				insert(output, t)
				-- A jo followed by a grave (secondary stress) is not a
				-- candidate for the implicit primary stress.
				if text[d.i + 1] ~= GR then
					d.final_jo = #output
				end
				return
			end
		elseif this == AC then
			d.primary = true
		-- е after a consonant or a dash at the beginning of a word becomes e,
		-- and э becomes ɛ.
		elseif (this_lower == "е" or this_lower == "ѣ" or this_lower == "э") and (
			(prev and not (vowels .. "ъьʹʺ"):find(prev:ulower(), 1, true)) or
			(not prev and d.dash_before)
		) then
			insert(output, plain_e[this])
			return
		elseif this_lower == "ю" then
			-- ю after ж or ш is written without the j.
			local prev_lower = prev and prev:ulower()
			if prev_lower == "ж" or prev_lower == "ш" then
				insert(output, "u")
				return
			end
		-- Make izhitsa display as -v- after /a/, /e/ and /i/ (matching the
		-- equivalent Greek digraphs αυ, ευ and ηυ).
		elseif (
			this_lower == "ѵ" and
			prev and izhitsa_v:find(prev:ulower(), 1, true)
		) then
			this = this == "Ѵ" and "В" or "в"
			text[d.i] = this
		-- Convert apostrophe to the hard sign between consonant and vowel
		-- (i.e. in the places where the hard sign normally occurs in modern
		-- text). Apostrophe is sometimes used to indicate the hard sign; this
		-- may have originated from the forcible removal of the hard sign from
		-- printing offices in the 1920's, after the implementation of the
		-- Russian orthography reform.
		elseif (
			this == "'" and
			prev and not vowels:find(prev:ulower(), 1, true) and
			-- `vowels` only lists lowercase letters, so the next character
			-- has to be lowercased for the test; otherwise an apostrophe in
			-- upper-case text is never converted and the "Ъ" case below is
			-- unreachable.
			nxt and vowels:find(nxt:ulower(), 1, true)
		) then
			text[d.i] = nxt:uupper() == nxt and "Ъ" or "ъ"
			this = text[d.i]
		-- Ignore word-final hard signs.
		elseif this_lower == "ъ" and d.i == #text then
			return
		end
		insert(output, tab[this] or this)
	end

	-- Transliterate after the pronunciation-related transformations of
	-- export.apply_tr_fixes have been applied. Called from export.tr.
	-- INCLUDE_MONOSYLLABIC_JO_ACCENT is as in export.tr.
	function export.tr_after_fixes(text, include_monosyllabic_jo_accent)
		local word_chars = "%a'%(%)%[%]" .. AC .. GR .. DI
		local output = {}
		-- rsub is used purely as a tokenizer here: the callback sees each run
		-- of non-word characters followed by the word after it and appends to
		-- `output`; the string rsub itself returns is not needed.
		rsub(toNFC(text), "([^" .. word_chars .. "]*)([" .. word_chars .. "]*)", function(before, word)
			-- Non-word characters are copied through unchanged.
			for _, ch in ipairs(explode(before)) do
				insert(output, ch)
			end
			-- Decompose everything except й, so that accents and the
			-- diaeresis of ё become separate array elements. Going through a
			-- local keeps rsub's second return value (the match count) from
			-- being passed on to explode.
			local decomposed = rsub(word, "[^Йй]", toNFD)
			local chars = explode(decomposed)
			local d = {
				i = 0,
				vowels = 0
			}
			-- A word counts as dash-initial when the "-" directly before it
			-- is at the very start of the output or follows whitespace.
			if output[#output] == "-" then
				local prev = output[#output - 1]
				if not prev or rfind(prev, "%s") then
					d.dash_before = true
				end
			end
			while d.i < #chars do
				d.i = d.i + 1
				do_iteration(output, chars, d)
			end
			-- Put the implicit primary stress on the last eligible jo.
			-- NOTE(review): "-" is not in word_chars, so chars[#chars] == "-"
			-- looks like it can never be true; confirm whether the prefix
			-- exception described in do_iteration is meant to work here.
			if (
				d.final_jo and
				(not (d.primary or chars[#chars] == "-")) and
				(include_monosyllabic_jo_accent or d.vowels > 1 or d.dash_before)
			) then
				output[d.final_jo] = output[d.final_jo] .. AC
			end
		end)
		return toNFC(concat(output))
	end
end

-- Transliterate TEXT, a single word or phrase. Stress marks in TEXT are
-- carried over into the transliteration.
-- ё is a special case: it is rendered (j)ó in multisyllabic words and
-- monosyllabic words in multi-word phrases, but rendered (j)o without an
-- accent in isolated monosyllabic words, unless INCLUDE_MONOSYLLABIC_JO_ACCENT
-- is specified. (This is used in conjugation and declension tables.)
-- NOADJ disables special-casing for adjectives in -го, while FORCEADJ forces
-- special-casing for adjectives and disables checking for exceptions
-- (e.g. много). NOSHTO disables special-casing for что and related words.
-- LANG and SC are accepted but not used by this function.
function export.tr(text, lang, sc, include_monosyllabic_jo_accent, noadj, noshto, forceadj)
	-- Only the respelled text (second return value) is transliterated.
	local _, respelled = export.apply_tr_fixes(text, noadj, noshto, forceadj)
	return export.tr_after_fixes(respelled, include_monosyllabic_jo_accent)
end

-- translit with various special-case substitutions; NOADJ disables
-- special-casing for adjectives in -го, while FORCEADJ forces special-casing
-- for adjectives and disables checking for exceptions (e.g. много).
-- NOSHTO disables special-casing for что and related words. SUB is used
-- to implement arbitrary substitutions in the Cyrillic text before other
-- transformations are applied and before translit. It is of the form
-- FROM/TO,FROM/TO,...
-- Each FROM is handed to mw.ustring.gsub as a pattern and each TO as its
-- replacement string. A pair that lacks the "/" raises an error.
-- When the first argument is a table (a frame, i.e. the function was invoked
-- directly from a template), the text and options are read from its `args`.
function export.tr_sub(text, include_monosyllabic_jo_accent, noadj, noshto, sub,
	forceadj)
	if type(text) == 'table' then -- called directly from a template
		include_monosyllabic_jo_accent = ine(text.args.include_monosyllabic_jo_accent)
		noadj = ine(text.args.noadj)
		noshto = ine(text.args.noshto)
		sub = ine(text.args.sub)
		text = text.args[1]
	end

	if sub then
		for _, subpair in ipairs(rsplit(sub, ",")) do
			local subsplit = rsplit(subpair, "/")
			local from, to = subsplit[1], subsplit[2]
			-- Without a "/" there is no TO part; fail with a readable message
			-- instead of the argument error gsub would raise for a nil
			-- replacement.
			if to == nil then
				error("Substitution '" .. subpair .. "' must be of the form FROM/TO")
			end
			text = rsub(text, from, to)
		end
	end

	return export.tr(text, nil, nil, include_monosyllabic_jo_accent, noadj, noshto, forceadj)
end

-- Transliterate adjectives and pronouns. When the first argument is a table
-- (a frame, i.e. the function was invoked directly from a template), the text
-- and the include_monosyllabic_jo_accent option are read from its `args`.
function export.tr_adj(text, include_monosyllabic_jo_accent)
	local from_template = type(text) == 'table'
	if from_template then
		local args = text.args
		include_monosyllabic_jo_accent = ine(args.include_monosyllabic_jo_accent)
		text = args[1]
	end

	-- we have to include "forceadj" because typically when tr_adj is called
	-- from the noun or adjective modules, it's called with suffix ого, which
	-- would otherwise trigger the exceptional case and be transliterated as ogo
	local noadj, noshto, forceadj = false, "noshto", "forceadj"
	return export.tr(text, nil, nil, include_monosyllabic_jo_accent, noadj, noshto, forceadj)
end

return export

-- For Vim, so we get 4-space tabs -- vim: set ts=4 sw=4 noet: