// User:Erutuon/scripts/modifyRussianTranslit.js

/*

This script finds all tagged Russian transliterations on a page and modifies them so that they show vowel reduction, palatalization, and hard and soft postalveolars. There will be numerous errors in the vowel reduction, and so far assimilation of palatalization is not shown at all, but at least this makes the pronunciation easier to decode in most cases.

Examples:
- Воро́неж > Vorónež > Varón̦iž
- роди́тельный > rodítelʹnyj > rad̦íțil̦nyj
- шить > šitʹ > šyț
- щёлочь > ščóločʹ > śśólać
- хоро́ший > xoróšij > xaróšyj

To do:
- don't reduce vowels of all monosyllables

*/

/* globals $ */

"use strict";

// Collection of every element matched by the :lang(ru-Latn) selector, i.e. the
// tagged Russian transliterations that the rest of this script rewrites.
// NOTE(review): `$` is presumably jQuery (see the `globals` directive) — confirm.
var RussianTranslit = $(":lang(ru-Latn)");

// Combining diacritics and modifier letters used by the transliteration.
// These are defined before the consonant patterns because `consonant` is built
// from `comma`; with the previous ordering `comma` was still undefined at that
// point and the pattern contained the literal text "undefined".
// Top-level names stay `var` so that they remain globals in a classic script.
var u = String.fromCodePoint;
var acute = u(0x301);
var grave = u(0x300);
var caron = u(0x30C);
var comma = u(0x326);
var prime = "ʹ";
var doublePrime = "ʺ";
var dottedCircle = u(0x25CC);

// Parallel tables: voiced[i] and voiceless[i] are meant to be counterparts.
// NOTE(review): "c"/"dz" and "ć"/"dź" look swapped between the two tables, and
// "x" appears in both (so getType.x ends up "voiceless") — confirm the intended
// data before relying on getType/getIndex.
var voiced =	[ "b", "v", "g", "d", "z", "x", "c",	"ć",	"ž" ];
var voiceless =	[ "p", "f", "k", "t", "s", "x", "dz",	"dź",	"š" ];

// getIndex[sound]: position of the sound in its table.
// getType[sound]: "voiced" or "voiceless" (the later table wins on duplicates).
var getIndex = {};
var getType = {};
// All single-code-unit sounds, concatenated for use in a character class.
var consonants = "";

for ( const [ sounds, type ] of [ [ voiced, "voiced" ], [ voiceless, "voiceless" ] ] ) {
	sounds.forEach( function ( sound, index ) {
		getIndex[sound] = index;
		getType[sound] = type;
		// Multi-character sounds ("dz", "dź") cannot go into a character class.
		if ( sound.length === 1 ) {
			consonants += sound;
		}
	} );
}

// One consonant from the tables, optionally followed by the palatalization comma.
var consonant = "[" + consonants + "]" + comma + "?";
// Two or more such consonants in a row.
var consonantSequence = new RegExp(consonant + "(?:" + consonant + ")+", "g");
// Character class of the consonants that can take the palatalization mark.
var palatalizable = "[bvgdzklmnprstfx]";

// Compiled global regexes, keyed by their final pattern source. The object has
// no prototype so that pattern strings such as "constructor" cannot collide
// with inherited properties.
var regexCache = Object.create(null);

/**
 * Replaces every match of a pattern in a string, compiling each pattern once.
 *
 * When both the pattern and the replacement contain "#" (the word-boundary
 * marker), the first "#" of the pattern is widened to also capture an optional
 * HTML tag directly before it, and the first "#" of the replacement becomes
 * "$1" so that the tag is carried over into the result. This rewrite is done
 * on every call; previously it was skipped on cache hits, which dropped the
 * tag from the second call onwards.
 *
 * Limitation: the rewrite hard-codes "$1", so it assumes the pattern has no
 * capture group before the "#".
 *
 * @param {string} text - String to search in.
 * @param {string} regex - Regular expression source (compiled with the "g" flag).
 * @param {string} replacement - Replacement string ("$1"-style references allowed).
 * @returns {string} The text with every match replaced.
 */
var replace = function (text, regex, replacement) {
	var source = regex;
	var substitution = replacement;

	if ( regex.includes("#") && replacement.includes("#") ) {
		source = regex.replace("#", "((?:<[^>]+>)?#)");
		substitution = replacement.replace("#", "$1");
	}

	var compiled = regexCache[source];
	if ( !compiled ) {
		compiled = new RegExp(source, "g");
		regexCache[source] = compiled;
	}

	return text.replace(compiled, substitution);
};

/*
	Rewrites the HTML of every matched transliteration so that it shows vowel
	reduction, palatalization and hard/soft postalveolars.

	The callback is a regular function (not an arrow function) because it reads
	the current element from `this`.

	NOTE(review): the vowel and consonant rules run on the raw HTML before tags
	and entities are escaped, so letters inside tags or entities can be changed
	too — confirm whether the escaping step should happen earlier.
*/
RussianTranslit.each(
	function ()
	{
		const $this = $(this);
		const innerHTML = $this.html();
		let decomposed = innerHTML.normalize("NFD");

		const classAttr = $this.attr("class");
		const isHeadword = Boolean(classAttr) && classAttr.includes("headword-tr");
		const isAffix = /^-/.test(innerHTML) || /-$/.test(innerHTML);

		// Mark word boundaries with #.
		decomposed = "#" + decomposed + "#";
		decomposed = replace(decomposed, " ", "#");

		/*	-т(ь)ся is pronounced like -тса, at least in reflexive verbs:
			for example, каза́ться.
		*/
		decomposed = replace(decomposed, "t" + prime + "?sja#", "tsa#");

		/*	Remove prime from c, which is rarely palatalized,
			and š, ž, which are never palatalized.
		*/
		decomposed = replace(decomposed, caron + prime, caron);
		decomposed = replace(decomposed, "c" + prime, "c");

		// Remaining primes become the palatalization comma (with j before o).
		decomposed = replace(decomposed, prime + "o", comma + "jo");
		decomposed = replace(decomposed, prime, comma);

		/*	Reduce unstressed е, о, я ([j]e, o, [j]a) to i or y, a, i or y,
			except for final -е and -я. -е is pronounced as e or a, but
			unfortunately this depends on the part of speech, which JavaScript
			has no way to determine.

			Further exceptions:
			- бу́дете (second-person plural indicative)
			- бу́дьте (second-person plural imperative)

			Only done when the text carries a stress mark, or for headword affixes.
		*/
		if ( decomposed.includes(acute) || ( isHeadword && isAffix ) ) {
			const wordBoundary = "(?=#|[-\\.,])";
			const notAccent = "(?=[^" + acute + grave + "])";

			// Protect word-final je, ja with temporary placeholders.
			decomposed = replace(decomposed, "je" + wordBoundary, "%%");
			decomposed = replace(decomposed, "ja" + wordBoundary, "&&");

			decomposed = replace(decomposed, "([^\\w" + caron + "])e" + notAccent, "$1y");
			decomposed = replace(decomposed, "([^\\w" + caron + "])E" + notAccent, "$1Y");
			decomposed = replace(decomposed, "^e" + notAccent, "y");
			decomposed = replace(decomposed, "^E" + notAccent, "Y");
			decomposed = replace(decomposed, "e" + notAccent, "i");
			decomposed = replace(decomposed, "E" + notAccent, "I");
			decomposed = replace(decomposed, "o" + notAccent, "a");
			decomposed = replace(decomposed, "O" + notAccent, "A");
			decomposed = replace(decomposed, "ja" + notAccent, "ji");
			decomposed = replace(decomposed, "Ja" + notAccent, "Ji");

			// Restore the protected word-final je, ja.
			decomposed = replace(decomposed, "%%", "je");
			decomposed = replace(decomposed, "&&", "ja");
		}

		// и (i) is pronounced like ы (y) after ц, ш, ж (c, š, ž).
		decomposed = replace(decomposed, "([sz]" + caron + ")i", "$1y");
		decomposed = replace(decomposed, "ci", "cy");

		// Change č, šč to ć, śś to indicate that they are soft.
		decomposed = replace(decomposed, "s" + caron + "c" + caron, "śś");
		decomposed = replace(decomposed, "c" + caron, "ć");

		// Mark palatalization with comma below: b̦, v̦, g̦, ... .
		decomposed = replace(decomposed, "(" + palatalizable + ")([ei])", "$1" + comma + "$2");
		decomposed = replace(decomposed, "(" + palatalizable + ")j", "$1" + comma);

		// Epsilon (representing uniotated е). For example, тире́. Probably represents y when unstressed.
		decomposed = replace(decomposed, "ɛ", "e");

		// Escape HTML tags and entities. Each one is swapped for a placeholder
		// of the form U+E000 <index> U+E001. The closing delimiter keeps
		// multi-digit indices unambiguous; the earlier "%<digit>" form only
		// restored indices 0-9 correctly.
		const escaped = [];
		decomposed = decomposed.replace(
			/<[^>]+>|&[^;]+;/g,
			function ( markup ) {
				escaped.push(markup);
				return "\uE000" + ( escaped.length - 1 ) + "\uE001";
			}
		);

		// Show voicing assimilation.
		decomposed = decomposed.normalize("NFC");

		// Decompose ș, ț.
		decomposed = decomposed.replace(
			/[șț]/g,
			function ( letter ) {
				return letter.normalize("NFD");
			}
		);

		// Process sequences of obstruents.
		// NOTE(review): currently this only appends a space after each
		// sequence; it looks like an unfinished step — confirm the intent.
		decomposed = decomposed.replace(
			consonantSequence,
			function ( sequence ) {
				return "" + sequence + " ";
			}
		);

		// Put the escaped tags and entities back.
		decomposed = decomposed.replace(
			/\uE000(\d+)\uE001/g,
			function ( wholeMatch, index ) {
				return escaped[Number(index)];
			}
		);

		decomposed = replace(decomposed, doublePrime, "");

		// Remove #, then strip the spaces left by the outer boundary markers.
		decomposed = replace(decomposed, "#", " ");
		decomposed = replace(decomposed, " (.+) ", "$1");

		$this.html(decomposed);
	}
);