-- Module:aii-translit

-- Table of functions this module hands back to its callers.
local export = {}

-- U is called below with a code point and its result is concatenated as a string.
local U = require("Module:string/char")
-- Short aliases for the Unicode-aware pattern functions used throughout.
local rsub = mw.ustring.gsub
local rmatch = mw.ustring.match

-- Vowel diacritics, built from their code points.
local hbasa = U(0x73C)
local rwaha = U(0x73F)
local zlama_angular = U(0x739)
local zlama_horizontal = U(0x738)
local pthaha = U(0x732)
local zqapha = U(0x735)

-- All vowel diacritics as one string, and as a single-character capture class.
local diacritic_vowels = hbasa .. rwaha .. zlama_angular .. zlama_horizontal .. pthaha .. zqapha
local diacritic_vowels_capture = "([" .. diacritic_vowels .. "])"

local talqana_above = U(0x747)
local combining_diaeresis = U(0x308)

-- we declare consonants representing vowels (matres lectionis) as constants to mitigate differences in how mixing
-- right-to-left and left-to-right characters in the same line appears in an IDE vs wiktionary. Since matres is used in
-- concatenation via the .. operator, "ܘ" .. "ܐ" on wiktionary would render as "ܐ" .. "ܘ" in an IDE
local alaph = U(0x710)
local waw = U(0x718)
local yudh = U(0x71D)

-- Combining marks that modify the preceding consonant.
local combining_tilde_below = U(0x330)
local combining_tilde_above = U(0x303)
local combining_macron_below = U(0x331)
local combining_macron = U(0x304)
local qushshaya = U(0x741)
local rukkakha = U(0x742)
local combining_breve_below = U(0x32E)

local combining_dot_below = U(0x323)
local combining_dot_above = U(0x307)

-- constants for transliterated snippets which are used in later substitutions
local TR_THIRD_PERSON_FEM_SUFFIX = 'ōh'
local TR_WAW_PLUS_RVASA = 'ū'
local TR_WAW_PLUS_RVASA_SHORT = 'u'

-- Punctuation rewritten character-by-character during transliteration.
local tt_transpose_punc = {
	-- left/right single/double quotes
	["“"] = "”",
	["”"] = "“",
	["‘"] = "’",
	["’"] = "‘",
	["؟"] = "?", -- question mark
	["«"] = '“', -- quotation mark
	["»"] = '”', -- quotation mark
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
}

-- Every key of tt_transpose_punc joined into one string, for use inside a
-- pattern character class (pairs order is unspecified, which is fine there).
local tt_transpose_punc_keys = ''
for key, _ in pairs(tt_transpose_punc) do
	tt_transpose_punc_keys = tt_transpose_punc_keys .. key
end

-- Ordered {pattern, replacement} pairs applied to the input before any other rule.
local fix = {
	-- put qushshaya before a vowel diacritic when the input has them the other way round
	{ diacritic_vowels_capture .. qushshaya, qushshaya .. "%1"},

	-- under the hood mw uses NFC which preempts the following two substitutions...
	-- feel free to uncomment if there's a test case which can be added that requires them to be uncommented
	-- https://www.mediawiki.org/wiki/Unicode_normalization_considerations
	-- { diacritic_vowels_capture .. rukkakha, rukkakha .. "%1"},
	-- { diacritic_vowels_capture .. combining_tilde_below, combining_tilde_below .. "%1" },

	-- partition punctuation marks so "starts with" and "ends with" substitutions work
	{"([".. tt_transpose_punc_keys .. "!.:\"'])", "#%1#"},
}

-- One-to-one consonant map: Syriac letter -> Latin transliteration.
local tt = {
	["ܦ"] = "p",
	["ܒ"] = "b",
	["ܬ"] = "t",
	["ܛ"] = "ṭ",
	["ܕ"] = "d",
	["ܟ"] = "k",
	["ܓ"] = "g",
	["ܩ"] = "q",
	["ܣ"] = "s",
	["ܨ"] = "ṣ",
	["ܙ"] = "z",
	["ܫ"] = "š",
	["ܚ"] = "ḥ",
	["ܥ"] = "ˁ",
	["ܗ"] = "h",
	["ܡ"] = "m",
	["ܢ"] = "n",
	["ܪ"] = "r",
	["ܠ"] = "l",
}

-- local tt_keys = ''
-- for key, _ in pairs(tt) do tt_keys = tt_keys .. key end

-- All transliterated consonants joined into one string, in pairs traversal order.
local tt_values
do
	local parts = {}
	for _, latin in pairs(tt) do
		parts[#parts + 1] = latin
	end
	tt_values = table.concat(parts)
end

-- Capture classes for the letters that may carry the macron-below / macron marks
-- handled in export.tr. The Syriac letters are first mapped through tt, so the
-- classes hold transliterated consonants (plus raw alaph/yudh/waw for the first).
local mhagjana_capture = "([" .. rsub('ܗܠܡܢܥܪ', ".", tt) .. alaph .. yudh .. waw  .. "])"
local marhetana_capture = "([" .. rsub('ܦܒܬܛܕܟܓܩܣܨܙܫܚ', ".", tt) .. "])"

-- https://r12a.github.io/scripts/syrc/aii.html#single_letter_words
local bdul = 'ܒܕܘܠ'
local bdul_capture = '([' .. bdul .. '])' -- one prefix letter
local bdul_capture2 = '([' .. bdul .. '])([' .. bdul .. '])' -- two stacked prefix letters

-- local alphabet = ''
-- for letter, _ in pairs(tt) do alphabet = alphabet .. letter end
-- alphabet = alphabet .. yudh .. waw .. alaph
-- local alphabet_capture = '([' .. alphabet .. '])'

-- Second-pass character map, applied in export.tr after the glide and alaph rules.
local tt_next = {
	[waw] = "w",
	[yudh] = "y",

	[zlama_angular] = "ē",
	[zlama_horizontal] = "i",
	[pthaha] = "a",
	[zqapha] = "ā"
}

local glides = alaph .. yudh .. waw -- unvoweled, original values of matres lectionis (consonants representing vowels)
-- NOTE(review): this class lists plain "c", while export.tr emits "č", "j" and "v";
-- confirm whether those letters should be part of the class.
local consonants_minus_glides = tt_values .. "cžfḇṯḏḵḡ"
local consonants_minus_glides_cg = "([" .. consonants_minus_glides .. "])"
local consonants_minus_glides_cg_2 = "([" .. consonants_minus_glides .. "y])"

local consonants_capture = "([" .. glides .. consonants_minus_glides .. "])"
-- local consonants_capture_minus_alaph = "([" .. yudh .. waw .. consonants_minus_glides .. "])"

-- Transliterated vowels, grouped by the glide (w or y) inserted after them before another vowel.
local vowels_w = TR_WAW_PLUS_RVASA_SHORT .. TR_WAW_PLUS_RVASA .. "ō"
local vowels_y = "eiēī"
local vowels = vowels_y .. vowels_w .. "aā"

local consonants_and_vowels_capture = "([" .. glides .. consonants_minus_glides .. vowels .. "])"

-- Suffixes tried after each key of genitive_endings_special_cases in export.tr.
local genitive_endings = {
	"ܵܐ", "ܝܼ", "ܘܼܟ݂", "ܵܟ݂ܝ", "ܹܗ", "ܵܗ̇", "ܘܼܗܝ", "ܘܿܗ̇" , "ܘܼܗ̇" , "ܲܢ",
	"ܵܘܟ݂ܘܿܢ", "ܗܘܿܢ", "ܵܝܗܝ" , "ܹ̈ܐ" , "ܹܐ" , "ܲܝ" , "ܲܝ̈" , "ܲܬ݂" , "ܵܬ݂̈" , "ܵܬ݂" ,
	"ܝܼ̈", "ܘܼ̈ܟ݂", "ܵܟ݂ܝ̈", "ܘܼ̈ܗܝ", "̈ܘܿܗ̇", "ܘܼ̈ܗ̇" , "ܲܢ̈", "ܵܘ̈ܟ݂ܘܿܢ", "ܗ̈ܘܿܢ" , "ܵܝ̈ܗܝ"
}
-- per "No alternation (the | operator)" https://www.mediawiki.org/wiki/Extension:Scribunto/Lua_reference_manual#Patterns
-- lua doesn't provide support to check if a word contains this, or that, or other via (this|that|other)
-- this means we can't simply check if the word ends in any of the possible masculine genitive endings with a capture group like
-- local genitive_endings_capture = "(" .. table.concat(genitive_endings,"|") .. ")"

-- Stem (Syriac) -> corrected transliteration with a doubled final consonant,
-- substituted when the stem is followed by one of genitive_endings at word end.
local genitive_endings_special_cases = {
	["ܒܵܬ"] = "bātt",
	["ܚܘܼܒ"] = "ḥubb",
	["ܓܘܼܕ"] = "gudd",
	["ܓܘܼܕܵܢ"] = "guddān",
	["ܦܘܼܡ"] = "pumm",
	["ܪܘܼܙ"] = "ruzz",
	["ܐܸܣܦܘܼܓ"] = "ispugg",
	["ܐܸܛܪܘܼܓ"] = "iṭrugg",
	["ܐܸܛܪ̈ܘܼܓ"] = "iṭrugg",
	["ܫܹܢ݇ܬ"] = "šētt",
	["ܓܘܼܬ"] = "gutt",
}

-- due to runtime performance concerns, we should strive to limit special cases to only the most common terms for which the transliterator is way off
-- Ordered list; export.tr applies each pair with rsub, first to last.
local special_cases = {
	-- { aii_text, aii_translit_output }
	--
	-- the # symbol pads the start and end of a word, consider the following examples
	-- #float#   only float matches
	-- #float    words starting with float like float or floats match
	-- float#    words ending with float like float or afloat match
	-- float     words containing float like float, floats, afloat and refloats match
	{"#ܒܗ" .. combining_dot_above .. "ܝ#", "#b-ay#"},
	{"ܗ" .. combining_dot_above .. "ܝ#", "aya#"},
	{"ܗ" .. combining_dot_above .. "ܘ#", "awa#"},
	{"ܡ" .. combining_dot_above .. "ܢ#", "man#"},
	{"ܡ" .. combining_dot_below .. "ܢ#", "min#"},
	-- commenting this out for now until test cases can be added for it
	-- {waw .. hbasa .. "ܟ݂#", "ōḵ#"},
	{waw .. hbasa .. combining_diaeresis .. "ܟ݂#", "ōḵ#"},
	{"ܟ̰ܵܐܝ", "čāy"},
	{"ܒܵܐܝ", "bāy"},
	{"ܝܼܫܘܿܥ#", "īšōˁ#"}, -- starts with vowel but not silent alaph
	-- "to be" without initial khwasa, ì
	{"#ܝܘܸܢ#", "#ìwen#"},
	{"#ܝܘܵܢ#", "#ìwān#"},
	{"#ܝܘܲܚ#", "#ìwaḥ#"},
	{"#ܝܘܸܬ#", "#ìwet#"},
	{"#ܝܘܵܬܝ#", "#ìwāt#"},
	{"#ܝܬܘܿܢ#", "#ìtōn#"},
	{"#ܝܠܹܗ#", "#ìlēh#"},
	{"#ܝܠܵܗ̇#", "#ìlāh#"},
	{"#ܝܢܵܐ#", "#ìnā#"},
	{"#ܝܗ݇ܘܵܐ#", "#ìwā#"},
	{"#ܝܗ݇ܘܵܬ݇#", "#ìwā#"},
	{"#ܝܗ݇ܘܵܘ#", "#ìwā#"},
	-- "to be" with initial khwasa, ī
	-- https://en.wiktionary.org/wiki/Template:aii-conj-verb/hawe
	{"ܝܼܘܸܢ#", "īwen#"},
	{"ܝܼܘܵܢ", "īwān"},
	{"ܝܼܘܸܬ#", "īwet#"},
	{"ܝܼܘܵܬܝ#", "īwāt#"},
	{"ܝܼܠܹܗ#", "īlēh#"},
	{"ܝܼܠܵܗ̇#", "īlāh#"},
	{"ܝܼܘܲܚ#", "īwaḥ#"},
	{"ܝܼܬܘܿܢ#", "ītōn#"},
	{"ܝܼܢܵܐ#", "īnā#"},
	-- "to be" blends
	{"ܝܼܗ݇ܘܵܐ#", "īwā#"},
	{"ܝܼܗ݇ܘܵܘ#", "īwā#"},
	-- "to be" imperative forms
	-- following substitutions starting with '#w' are to pre-empt 'w-' prefixing rule
	{"#ܗ݇ܘܝܼ", "#wī"},
	{"#ܗ݇ܘܹܝܡܘܼܢ#", "#wēmūn#"},
	-- "to be" past particles
	{"#ܗ݇ܘܵܐ#", "#wā#"},
	{"#ܗ݇ܘܵܘ#", "#wā#"},
	{"#ܗ݇ܘܹܐ#", "#wē#"},
	-- "of" (fixed expressions)
	{"#ܕܝܵܡܵܐ#", "#d-yāmā#"},
	{"#ܕܠܵܐ#", "#d-lā#"},
	{"#ܕܕܸܒ݂ܫܵܐ#", "#d-diḇšā#"},
	{"#ܕܲܕܡܵܐ#", "#da-dmā#"},
	-- "to" (fixed expressions)
	{"ܠܒܲܕܲܪ#", "l-baddar#"},
	{"ܠܓܵܘܵܐ#", "l-gāwā#"},
	{"#ܠܡܵܐ#", "#lemā#"},
	-- "clitic demonstrative pronouns"
	{"ܒܗܵܢܵܐ#", "b-hānā#"},
	{"ܒܗܵܕ݂ܹܐ", "b-hāḏē"},
	{"ܒܗܸܢܘܿܢ#", "b-hinnōn#"},
	{"ܒܗܸܢܹܝܢ", "b-hinnēn"},
	{"ܕܗܵܢܵܐ#", "d-hānā#"},
	{"ܕܗܵܕ݂ܹܐ", "d-hāḏē"},
	{"ܕܗܸܢܘܿܢ#", "d-hinnōn#"},
	{"ܕܗܸܢܹܝܢ", "d-hinnēn"},
	{"ܠܗܵܢܵܐ#", "l-hānā#"},
	{"ܠܗܵܕ݂ܹܐ", "l-hāḏē"},
	{"ܠܗܸܢܘܿܢ#", "l-hinnōn#"},
	{"ܠܗܸܢܹܝܢ", "l-hinnēn"},
	{"ܘܗܵܢܵܐ#", "w-hānā#"},
	{"ܘܗܵܕ݂ܹܐ", "w-hāḏē"},
	{"ܘܗܸܢܘܿܢ#", "w-hinnōn#"},
	{"ܘܗܸܢܹܝܢ", "w-hinnēn"},
	-- "per" (fixed expressions with time)
	{"ܒܕܲܩܝܼܩܵܐ#", "b-daqqīqā#"},
	{"ܒܪܦܵܦܵܐ#", "b-rpāpā#"},
	{"ܒܫܵܥܬ݂ܵܐ#", "b-šāˁṯa#"},
	{"ܒܫܵܥܲܬ݂#", "b-šāˁaṯ#"},
	{"ܒܝܵܘܡܵܐ#", "b-yāwmā#"},
	{"ܒܝܘܿܡ#", "b-yōm#"},
	{"ܒܫܵܒ݂ܘܿܥܵܐ#", "b-šāḇōˁā#"},
	{"ܒܡܸܬ݂ܚܵܐ#", "b-miṯḥā#"},
	{"ܒܝܲܪܚܵܐ#", "b-yarḥā#"},
	{"ܒܝܼܪܲܚ#", "b-yraḥ#"},
	{"ܒܫܹܢ݇ܬܵܐ#", "b-šēttā#"},
	{"ܒܫܹܢ݇ܬ#", "b-šēt#"},
	{"ܒܕܵܘܪܵܐ#", "b-dāwrā#"},
	{"ܒܠܲܝܠܹܐ#", "b-laylē#"},
	{"ܒܨܲܦܪܵܐ#", "b-ṣaprā#"},
	{"ܒܪܲܡܫܵܐ#", "b-ramšā#"},
	-- adverbs with clitics (fixed expressions)
	{"ܠܩܘܼܪܒܵܐ#", "l-qurbā#"},
	{"ܒܩܘܼܪܒܵܐ#", "b-qurbā#"},
	{"ܒܚܲܪܬܵܐ#", "b-ḥartā#"},
	{"ܒܟܠ#", "b-kul#"},
	{"ܕܗܵܘܝܵܐ#", "d-hāwyā#"},
	{"ܕܟܹܐ#", "d-kē#"},
	-- NOTE(review): export.tr rewrites every " " as "# #" before these rules run, so the
	-- two patterns below, which contain a space, look unable to match — confirm.
	{"ܠܩܘܼܪܒܵܐ ܕ", "l-qurbā d-" .. ""},
	{"ܒܡܸܬ݂ܚܵܐ ܕ", "b-miṯḥā d-" .. ""},
	-- "all", "each", "every"
	{"ܟܠ#", "kul#"},
	{"ܟܠܵܢ#", "kullān#"},
	{"ܟܠܘܼܟ݂#", "kullōḵ#"},
	{"ܟܠܵܟ݂ܝ#", "kullāḵ#"},
	{"ܟܠܹܗ#", "kullēh#"},
	{"ܟܠܵܗ̇#", "kullāh#"},
	{"ܟܠܘܼܗܝ#", "kullūh#"},
	{"ܟܠܘܿܗ̇#", "kullōh#"},
	{"ܟܠܲܢ#", "kullan#"},
	{"ܟܠܵܘܟ݂ܘܿܢ#", "kullāwḵōn#"},
	{"ܟܠܵܝܗܝ#", "kullāyh#"},
	{"ܟܠܗܘܿܢ#", "kullhōn#"},
	{"ܟܠܵܢܵܐܝܼܬ#", "kullānāˀīt#"},
	{"ܟܠܵܢܵܐܝܼܬ݂#", "kullānāˀīṯ#"},
	{"ܟܠܵܢܵܝ", "kullānāy"},
	{"ܟܘܿܠܵܝ", "kollāy"},
	{"ܟܠܚܲܕ݇#", "kulḥa#"},
	{"ܟܠܚܕ݂ܵܐ#", "kulḥḏā#"},
	{"ܟܠܫܲܢ݇ܬ#", "kulšat#"},
	-- "classical because"
	{"ܡܸܛܠ#", "miṭṭul#"},
	{"ܡܸܛܠܬ", "miṭṭult"},
	{"ܡܸܛܠܵܐܝܼܬ݂#", "miṭṭullāˀīṯ#"},
	-- "houses"
	{"ܒܵܬܹ̈ܐ#", "bāttē#"},
	{"ܒܵܬܲܝ̈#", "bāttay#"},
	{"ܒܵܬܲܢ̈#", "bāttān#"},
	{"ܒܵܬܘܼ̈ܟ݂#", "bāttōḵ#"},
	{"ܒܵܬ̈ܗܘܿܢ#", "bātthōn#"},
	-- masc sing construct state rvasa
	{"ܓܘܼܕ#", "gud#"},
	{"ܦܘܼܡ#", "pum#"},
	{"ܐܸܛܪܘܼܓ#", "iṭrug#"},
	{"ܐܸܣܦܘܼܓ#", "ispug#"},
	-- countries and nationalities
	{"ܒܸܠܓܝܵܐ#", "belgyā#"},
	{"ܒܸܠܓܝܼܩܵܝ", "belgīqāy"},
	{"ܣܹܝܫܸܠܝܼܣ#", "sēšellīs#"},
	{"ܣܹܝܫܸܠ#", "sēšel#"},
	{"ܣܹܝܫܸܠܵܝ", "sēšellāy"},
	-- popular slang terms
	{"ܝܲܐܠܵܗ#", "yallāh#"},
	{"ܘܲܐܠܵܗ#", "wallāh#"},
	-- feminine imperative forms
	{"ܙܹܠ݇ܝ#", "zē#"},
	{"ܬܵܐܝ#", "tā#"},
}

--- Transliterate Syriac-script text into Latin characters.
-- The text is run through an ordered chain of pattern substitutions; "#" is
-- inserted as a temporary word-boundary marker and stripped again at the end.
-- The order of the substitutions matters: later rules rely on earlier output.
-- @param text the string to transliterate
-- @param lang not used in this function
-- @param sc not used in this function
-- @return the transliterated string, or nil (after recording the tracking key
--   "aii-translit/lacking diacritics") when the result contains no hyphen,
--   no transliterated vowel and no space
function export.tr(text, lang, sc)

	-- pad every word with "#" so rules can anchor on word starts and ends
	text = rsub(text, " | ", "# | #")
	text = "##" .. rsub(text, " ", "# #") .. "##"
	text = rsub(text, "ـ", "")
	for _, sub in ipairs(fix) do
		text = rsub(text, unpack(sub))
	end

	-- Special cases
	-- print(text)
	for _, sub in ipairs(special_cases) do
		text = rsub(text, unpack(sub))
	end
	-- print(text)

	-- for every special case, apply substitutions for every masc genitive ending
	for aii_prefix, aii_prefix_corrected_tr in pairs(genitive_endings_special_cases) do
		for _, masc_genitive_ending in ipairs(genitive_endings) do
			text = rsub(text, aii_prefix .. '(' .. masc_genitive_ending .. ')' .. '#', aii_prefix_corrected_tr .. '%1#' )
		end
	end

	-- Ignore siyameh
	text = rsub(text, combining_diaeresis, "")

	text = rsub(text, "ܟ" .. combining_tilde_below, "č")
	text = rsub(text, "ܓ" .. combining_tilde_below, "j")
	text = rsub(text, "ܫ" .. combining_tilde_below, "ž")

	text = rsub(text, "ܙ" .. combining_tilde_above, "ž")
	text = rsub(text, "ܟ" .. combining_tilde_above, "č")
	text = rsub(text, "ܫ" .. combining_tilde_above, "ž")

	text = rsub(text, "ܒ" .. combining_breve_below, "v")
	text = rsub(text, "ܦ" .. combining_breve_below, "f")

	text = rsub(text, "ܦ" .. qushshaya, "p") -- needs a test case
	text = rsub(text, "ܒ" .. qushshaya, "b")
	text = rsub(text, "ܬ" .. qushshaya, "t")
	text = rsub(text, "ܕ" .. qushshaya, "d")
	text = rsub(text, "ܟ" .. qushshaya, "k")
	text = rsub(text, "ܓ" .. qushshaya, "g")

	text = rsub(text, "ܒ" .. rukkakha, "ḇ")
	text = rsub(text, "ܬ" .. rukkakha, "ṯ")
	text = rsub(text, "ܕ" .. rukkakha, "ḏ")
	text = rsub(text, "ܟ" .. rukkakha, "ḵ")
	text = rsub(text, "ܓ" .. rukkakha, "ḡ")

	-- this covers b-, d-, w-, l- prefixing for words starting with an alaph
	-- https://r12a.github.io/scripts/syrc/aii.html#standalone
	-- and ALL special_cases starting with initial_translit_char
	local initial_translit_char = 'aī' -- accounts for substituted special cases starting with vowel sound
	local initial_char_capture = "([" .. alaph .. initial_translit_char .. "])"
	text = rsub(text, "#" .. bdul_capture2 .. initial_char_capture, "#%1-%2-%3")
	text = rsub(text, "#" .. bdul_capture .. initial_char_capture, "#%1-%2")

	text = rsub(text, waw .. hbasa .. "ܗ" .. combining_dot_above .. "#", TR_THIRD_PERSON_FEM_SUFFIX .. "#")

	text = rsub(text, yudh .. hbasa, "ī")
	text = rsub(text, waw .. rwaha, "ō")
	text = rsub(text, waw .. hbasa, TR_WAW_PLUS_RVASA)

	-- first character-map pass: punctuation, then plain consonants
	text = rsub(text, ".", tt_transpose_punc)
	text = rsub(text, ".", tt)
	text = rsub(text, "#" .. alaph .. "#", "#ˀ#") -- needs a test case

	text = rsub(text, consonants_capture .. mhagjana_capture .. combining_macron_below .. consonants_capture, "%1e%2%3")
	text = rsub(text, consonants_capture .. marhetana_capture .. combining_macron .. consonants_capture, "%1%2e%3")

	-- drop any consonant marked with talqana
	text = rsub(text, consonants_capture .. talqana_above, "")

	-- doubling consonants
	text = rsub(text, "([" .. zlama_horizontal .. pthaha .. "])" .. consonants_capture .. diacritic_vowels_capture, "%1%2%2%3")
	text = rsub(text, "([" .. zlama_horizontal .. pthaha .. "])" .. consonants_capture .. TR_THIRD_PERSON_FEM_SUFFIX, "%1%2%2" .. TR_THIRD_PERSON_FEM_SUFFIX)

	text = rsub(text, combining_dot_above, "")

	text = rsub(text, consonants_capture .. zlama_angular .. yudh .. consonants_capture, "%1ē%2")
	text = rsub(text, consonants_capture .. yudh .. consonants_capture, "%1i%2")

	text = rsub(text, consonants_minus_glides_cg .. yudh .. "#", "%1#")

	text = rsub(text, alaph .. pthaha .. waw .. "#", "aw#") -- needs a test case (impossible combination of characters?)
	text = rsub(text, alaph .. pthaha .. yudh .. "#", "ay#") -- needs a test case

	text = rsub(text, "#" .. alaph .. zlama_angular .. yudh, "#ē") -- needs a test case
	text = rsub(text, "#" .. alaph .. yudh, "#ī") -- needs a test case

	text = rsub(text, "#" .. yudh .. consonants_capture, "#%1")

	text = rsub(text, pthaha .. alaph .. "#", "a#") -- needs a test case
	text = rsub(text, zlama_angular .. alaph .. "#", "ē#")
	text = rsub(text, zqapha .. alaph .. "#", "ā#") -- needs a test case
	text = rsub(text, alaph .. "#", "ā#") -- needs a test case
	text = rsub(text, "#" .. alaph, "#")
	text = rsub(text, alaph, "ˀ")

	text = rsub(text, "#" .. waw .. consonants_and_vowels_capture, "#w-%1")

	-- second character-map pass: remaining glides and vowel diacritics
	text = rsub(text, ".", tt_next)
	-- shorten waw + rvasa
	text = rsub(text, TR_WAW_PLUS_RVASA .. consonants_minus_glides_cg .. consonants_minus_glides_cg_2, TR_WAW_PLUS_RVASA_SHORT .. "%1%2")

	text = rsub(text, "([ēīā])" .. "ˀ" .. consonants_capture, "%1%2")

	text = rsub(text, "([" .. vowels_w .. "])([" .. vowels .. "])", "%1w%2") -- needs a test case
	text = rsub(text, "([" .. vowels_y .. "])([" .. vowels .. "])", "%1y%2")

	text = rsub(text, "ˁˁ", "ˁ") -- needs a test case
	text = rsub(text, "ˀˀ", "ˀ") -- needs a test case
	text = rsub(text, "-ˀ", "-")

	-- collapse doubled spirantized consonants
	text = rsub(text, "ḇḇ", "ḇ")
	text = rsub(text, "ḡḡ", "ḡ")
	text = rsub(text, "ḏḏ", "ḏ")
	text = rsub(text, "ḵḵ", "ḵ")
	text = rsub(text, "p̄p̄", "p̄")
	text = rsub(text, "ṯṯ", "ṯ")

	-- substitutions like this are preferred to be closer to the top, but it was hard to figure out how to do that
	-- shorten waw + rvasa a different way from before per verbal noun of the d stem
	local waw_cg = "([" .. consonants_minus_glides .. "wy])"
	text = rsub(text, '#' .. waw_cg .. TR_WAW_PLUS_RVASA .. waw_cg .. 'ā' .. waw_cg .. 'ā#', "%1" .. TR_WAW_PLUS_RVASA_SHORT .. '%2%2ā%3ā')

	-- local bdul_capture = '([bdwl])'
	-- text = rsub(text, "#" .. bdul_capture .. "([" .. vowels .. "])", "#%1-%2")

	-- remove the word-boundary markers added at the top
	text = rsub(text, "#", "")

	-- no hyphen, vowel or space in the output: record it and give up
	if not rmatch(text, "([-" .. vowels .. " ])") then
		require("Module:debug").track("aii-translit/lacking diacritics")
		return nil
	end

	return text
end

-- The module's value is the export table, which carries export.tr.
return export