-- Module:fa-cls-translit

-- Authors: Sameerhameedy

-- Shorthands for the Unicode-aware string helpers used throughout this file.
local U, gsub = mw.ustring.char, mw.ustring.gsub
-- Table of public functions; it is what the module returns.
local export = {}

-- Combining marks and invisible formatting characters referenced by the
-- substitution rules below. Each declaration sits on its own line so that a
-- trailing comment cannot swallow the declarations that follow it.
local fatHataan = U(0x64B) -- اً, tanvin-e nasb (تنوین نصب)
local Dammataan = U(0x64C) -- un
local kasrataan = U(0x64D) -- in
local zabar = U(0x64E) -- short "a" (see mapping)
local zer = U(0x650) -- short "i" (see mapping)
local pesh = U(0x64F) -- short "u" (see mapping)
local tashdid = U(0x651) -- also called shadda
local jazm = "ْ" -- also called sukun; marks the absence of a vowel
local he = "ه"
local zwnj = U(0x200C) -- zero-width non-joiner
local highhmz = U(0x654) -- hamza above; transliterated as "-yi" (see mapping)
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
-- Extra consonants appended to both consonant classes.
local balticons = "ڃڇڑڗݜݨݩǩ"

-- Character sets used to build the Lua pattern classes further down.
-- They are plain strings meant to be spliced between "[" and "]".
local consonants = "بپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنؤهئء" .. balticons
local consonants2 = "ءبپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنوؤهیئywة" .. balticons -- including semivowels
local vowels = "āēīōū" -- long vowels of the Latin output
local semivowel = "یو"
local hes = "هح"
local diacritics = "َُِّْٰ"
local ZZP = "َُِ" -- zabar, zer, pesh
local alif_wasla = "ٱ"
-- Whitespace, apostrophe and double quote: characters treated like a space.
local space_like = "%s'" .. '"'
local space_like_class = "[" .. space_like .. zwnj .. "]"

--- The characters ټ ٹ ډ ڈ ے are included only for Mughal Persian and Hazaragi.

-- Per-character lookup table handed to gsub(text, ".", mapping) as the last
-- conversion step of export.tr: every character that has a key here is
-- replaced by the associated Latin string.
local mapping = {
	["آ"] = "ā",
	["ب"] = "b",
	["پ"] = "p",
	["ت"] = "t",
	["ث"] = "s",
	["ج"] = "j",
	["چ"] = "č",
	["ح"] = "h",
	["خ"] = "x",
	["د"] = "d",
	["ذ"] = "z",
	["ر"] = "r",
	["ز"] = "z",
	["ژ"] = "ž",
	["س"] = "s",
	["ش"] = "š",
	["ص"] = "s",
	["ض"] = "z",
	["ط"] = "t",
	["ظ"] = "z",
	["غ"] = "ğ",
	["ف"] = "f",
	["ق"] = "q",
	["ک"] = "k",
	["گ"] = "g",
	["ل"] = "l",
	["م"] = "m",
	["ن"] = "n",
	["و"] = "ō",
	["ی"] = "ē",
	["۔"] = ".",

	["ه"] = "h",

	["ع"] = "'",
	["ء"] = "'",
	["ئ"] = "'",
	["ؤ"] = "'",
	["أ"] = "'",

	-- diacritics
	[zabar] = "a",
	[zer] = "i",
	[pesh] = "u",
	[jazm] = "", -- also sukun - no vowel
	[zwnj] = "-", -- ZWNJ (zero-width non-joiner)
	[highhmz] = "-yi",

	-- ligatures
	["ﻻ"] = "lā",
	["ﷲ"] = "allāh",

	-- kashida
	["ـ"] = "-", -- kashida, no sound

	-- alif_wasla
	[alif_wasla] = "", -- nothing

	-- numerals
	["۱"] = "1",
	["۲"] = "2",
	["۳"] = "3",
	["۴"] = "4",
	["۵"] = "5",
	["۶"] = "6",
	["۷"] = "7",
	["۸"] = "8",
	["۹"] = "9",
	["۰"] = "0",

	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = "“", -- quotation mark
	["»"] = "”", -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimals
	["٬"] = ",", -- thousands separator

	-- regional characters (FOR VERY SPECIFIC USECASES)
	["ټ"] = "ṭ",
	["ٹ"] = "ṭ",
	["ډ"] = "ḍ",
	["ڈ"] = "ḍ",
	-- balti
	-- cant do anything about ژ because it conflicts with persian
	["ڃ"] = "ž",
	["ڇ"] = "č̣",
	["ڑ"] = "ṛ",
	["ڗ"] = "dz",
	["ݜ"] = "ṣ",
	["ݨ"] = "ng",
	["ݩ"] = "ny",
	["ھ"] = "h",
	["ے"] = "e",
}

-- Punctuation characters, already escaped for use inside a "[...]" pattern set.
local punctuation = ":%(%)%[%]*&٫؛؟،ـ«\".'!»٪؉۔`,/–—"
-- Extended Arabic-Indic digits.
local numbers = "۱۲۳۴۵۶۷۸۹۰"

-- Individual letters referenced by name in the substitution rules.
local ain = "ع"
local alif = "ا"
local malif = "آ" -- alif with madda
local hamza = "ء"
local ye = "ی"
local ye2 = "ئ"
local vao = "و"
local dagger_alif = U(0x670)
local marbuta = U(0x629)
local te = "ت"
local ye3 = "ے"
local laam = "ل"
-- Pattern class: a long Latin vowel, a short-vowel mark, jazm, a semivowel
-- letter or alif madda.
local vowel = "[" .. vowels .. ZZP .. jazm .. semivowel .. malif .. "]"
local sun_letters = "تثدذرزسشصضطظلن"

-- Ordered list of { pattern, replacement } pairs. export.tr applies them one
-- after another with gsub before it checks whether the text is fully marked
-- with diacritics, so the order of the entries matters.
local before_diacritic_checking_subs = {
	-- transformations prior to checking for diacritics
	-- NOTE(review): in the collapsed source the next two entries followed a
	-- "--" marker; they are restored here as active rules on the assumption
	-- that the marker was the tail of a dashed header line - confirm.
	{ U(0x06E5), "و" },
	{ U(0x06E6), "ی" },
	{ "ہ", he }, -- get rid of balti he (allows balti to transliterate)
	{ "ک" .. highhmz, "ǩ" },
	{ "([" .. fatHataan .. ZZP .. dagger_alif .. "])" .. tashdid, tashdid .. "%1" },
	{ alif .. fatHataan, zabar .. "ن" },
	{ fatHataan .. alif, zabar .. "ن" },
	{ jazm .. ye .. dagger_alif, jazm .. ye .. zabar .. alif },
	{ zabar .. ye .. dagger_alif, zabar .. alif },
	{ ye .. dagger_alif, zabar .. alif }, -- the first letter is U+06CC
	{ ye3, ye },
	{ "[أإ]", ye2 },
	-- kashiida
	{ "^" .. "ـ" .. zabar .. alif, "ـ" .. malif },
	{ "^" .. "ـ" .. "([" .. ZZP .. "])", "ـ" .. alif .. "%1" },
	{ zabar .. dagger_alif, zabar .. alif },
	{ dagger_alif, zabar .. alif },
	{ fatHataan, zabar .. "ن" }, -- fatḥatan
	{ Dammataan, pesh .. "ن" }, -- ḍammatan
	{ kasrataan, zer .. "ن" }, -- kasratan

	-- allah ligatures and arabic al
	{ alif_wasla .. laam, "l-" },
	{ alif_wasla, "" },
	{
		"([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "])"
			.. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])",
		"%1-l-%2",
	},
	{
		"([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "]" .. "[" .. vao .. ye .. "])"
			.. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])",
		"%1-l-%2",
	},
	{
		"([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. ZZP .. "]" .. space_like_class .. ")"
			.. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])",
		"%1l-%2",
	},
	{
		"([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "]" .. "[" .. vao .. ye .. "]" .. space_like_class .. ")"
			.. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])",
		"%1l-%2",
	},
	{ marbuta .. "([" .. ZZP .. "])" .. alif .. laam, te .. "%1-" .. laam .. "%-" },
	{ "l%-" .. "([" .. sun_letters .. "])" .. tashdid, "%1" .. jazm .. "-%1" },
	{ "l%-" .. laam .. tashdid, laam .. laam },
	{ "l%-" .. laam, laam .. laam },
	{ "l%-", laam .. "-" },
	{ marbuta .. "([" .. ZZP .. "])" .. alif, te .. "%1-" },
	{ marbuta .. "([" .. ZZP .. jazm .. "])", te .. "%1" },
	{ marbuta, he },
	{
		"(["
			.. consonants2
			.. "]["
			.. ZZP
			.. "])("
			.. space_like_class
			.. ")" .. alif .. laam .. "(["
			.. jazm
			.. laam
			.. "])",
		"%1%2" .. laam .. "%3",
	},
	{ laam .. laam .. tashdid, laam .. tashdid },
	-- use jazm/sukoon to prevent this conversion
	{ "(خ)" .. vao .. zabar .. alif, "%1" .. zabar .. alif },
	{ "(خ)" .. vao .. zabar, "%1" .. pesh },
	{ "(خ)" .. vao .. ye .. "([^" .. ZZP .. jazm .. "])", "%1" .. ye .. "%2" },
	-- izāfa
	{ zwnj, "-" },
	{ jazm .. alif, jazm .. "-" .. alif }, -- vowel killing, invisible ZWNJ
	{ zabar .. jazm, "-" }, -- vowel killing, invisible ZWNJ
}

-- Ordered list of { pattern, replacement } pairs consumed by has_diacritics.
-- The rules are applied in sequence; has_diacritics reports success only when
-- nothing at all is left of the text afterwards.
local has_diacritics_subs = {
	-- this ensure allah ligatures and al- work
	{ "l%-", "" },
	{ "[" .. sun_letters .. "]" .. jazm .. "%-", "" },
	{ "[" .. consonants2 .. "]" .. "([" .. ZZP .. "])" .. space_like_class .. alif .. laam, "" },
	-- remove punctuation and tashdid
	{ "[" .. punctuation .. tashdid .. highhmz .. numbers .. fatHataan .. "]", "" },
	{ "[" .. consonants .. "]$", "" },
	{ "[" .. consonants .. "](" .. space_like_class .. ")", "%1" },
	{ "[" .. consonants .. "]%-", "-" },
	-- these are required for arabic al- to work
	{ "[" .. consonants2 .. "]" .. "([" .. zer .. pesh .. "])" .. alif .. laam, laam },
	{ "[" .. consonants2 .. "]([" .. zer .. pesh .. "])%-" .. alif .. laam, laam },
	-- remove CV pairs
	-- consonants paired to alif
	{ "[" .. consonants2 .. "]" .. jazm, "" },
	{ "[" .. consonants2 .. "]" .. jazm .. malif, "" },
	{ "[" .. consonants2 .. "]" .. zabar .. alif, "" },
	-- consonants paired to a semivowel
	{
		"[" .. consonants .. alif .. "][" .. semivowel .. ZZP .. "]([" .. semivowel .. "])([" .. semivowel .. "])",
		"%1%2",
	},
	{ "[" .. consonants2 .. alif .. "][" .. ZZP .. "][" .. semivowel .. "]", "" },
	{ "[" .. consonants2 .. alif .. "][" .. ZZP .. jazm .. semivowel .. "]", "" },
	{ "[" .. alif .. consonants2 .. "][" .. ZZP .. "][" .. semivowel .. "]", "" },
	{ malif, "" }, -- counts as a CV pair
	{ jazm .. alif .. "[" .. ZZP .. "]", "" },
	{ "[" .. consonants2 .. alif .. "][" .. ZZP .. "]", "" },
	{ "[" .. consonants2 .. alif .. semivowel .. "][" .. semivowel .. "]", "" },
	-- remove numbers, hamzatu l-waṣl, alif madda and ZWNJ
	{ "[" .. numbers .. "ٱ" .. "آ" .. "]", "" },
	{ "%s", "" },
	{ "%-", "" },
	{ "[" .. semivowel .. "]", "" },
	{ "(" .. vowel .. ")", "" },
}

--- Decide whether `text` is fully marked with diacritics.
-- Directional marks (LRM/RLM) are stripped first, and their presence is
-- reported through the tracking helper. Every rule in has_diacritics_subs is
-- then applied in order.
-- @param text string to examine
-- @return true when nothing is left after all rules have run, false otherwise
local function has_diacritics(text)
	local remaining, mark_count = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if mark_count > 0 then
		require("Module:debug").track("fa-translit/lrm or rlm")
	end

	for index = 1, #has_diacritics_subs do
		local rule = has_diacritics_subs[index]
		remaining = gsub(remaining, unpack(rule))
	end

	return #remaining == 0
end

--- Transliterate Persian-script text into Latin script.
-- The function may be called directly with a string, or with a frame-like
-- table, in which case the values are read from text.args[1] to text.args[5]
-- (text, lang, sc, omit_i3raab, force_translit) and empty strings count as
-- missing.
-- @param text string to transliterate, or a table with an `args` field
-- @param lang language code (not consulted by the body)
-- @param sc script code (not consulted by the body)
-- @return the transliteration, or nil when there is no text or when the text
--         is not fully marked with diacritics and force_translit is not set
function export.tr(text, lang, sc)
	-- Declared local so that a value passed in one call cannot leak into later
	-- calls through the global environment. omit_i3raab is accepted for
	-- argument-position compatibility but is not used below.
	local omit_i3raab, force_translit
	if type(text) == "table" then
		local function f(x)
			return (x ~= "") and x or nil
		end
		text, lang, sc, omit_i3raab, force_translit =
			f(text.args[1]), f(text.args[2]), f(text.args[3]), f(text.args[4]), f(text.args[5])
	end

	-- Nothing to transliterate (e.g. an empty first argument): return nil
	-- rather than letting gsub raise on a nil subject.
	if text == nil then
		return nil
	end

	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = gsub(text, sub[1], sub[2])
	end

	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("fa-translit/lacking diacritics")
		return nil
	end

	--define the "end" of a word
	text = gsub(text, "#", "HASHTAG") -- protect literal "#"; restored at the end
	text = gsub(text, "^", "#")
	text = gsub(text, "$", "#")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "%s", "# #")
	text = gsub(text, "\n", "#" .. "\n" .. "#")
	text = gsub(text, "([" .. punctuation .. "])", "#" .. "%1" .. "#")
	text = "##" .. gsub(text, " ", "# #") .. "##"
	text = gsub(text, "%-", "#-#")
	-- hashtags now mark the beginning and end of a word

	--character reformatting and exceptions
	text = gsub(text, highhmz, "#" .. highhmz .. "#")
	--this ensures "and" is transliterated as a short vowel
	text = gsub(text, "#" .. vao .. "#", "#u#")
	text = gsub(text, "#" .. vao .. jazm .. malif, "#w-" .. malif)
	-- prevent izafa from converting until later

	-- Tashdeed
	text = gsub(text, "([" .. consonants .. "])" .. tashdid, "%1%1")
	text = gsub(text, "([" .. consonants .. "])" .. tashdid .. "([" .. ZZP .. "])", "%1%1%2")
	text = gsub(text, "([" .. consonants .. "])" .. "([" .. ZZP .. "])" .. tashdid, "%1%1%2")
	text = gsub(text, ye .. "([" .. ZZP .. "])" .. tashdid, "yy%1")
	text = gsub(text, vao .. "([" .. ZZP .. "])" .. tashdid, "ww%1")
	text = gsub(text, ye .. tashdid .. "([" .. ZZP .. "])", "yy%1")
	text = gsub(text, vao .. tashdid .. "([" .. ZZP .. "])", "ww%1")

	-- distinguish initial alif from vowel alif
	text = gsub(text, "([" .. consonants2 .. "])" .. zabar .. alif, "%1ā")
	text = gsub(text, "([" .. consonants2 .. "])" .. alif, "%1ā")
	text = gsub(text, jazm .. malif, "'ā") -- invisible ZWNJ
	text = gsub(text, "([" .. consonants2 .. "])" .. malif, "%1'ā")
	text = gsub(text, alif .. ye, "ē")
	text = gsub(text, alif .. vao, "ō")
	text = gsub(text, alif .. zer .. ye, "ī")
	text = gsub(text, alif .. pesh .. vao, "ū")
	text = gsub(text, tashdid .. alif, tashdid .. "ā")

	-- convert semi vowels
	text = gsub(text, ye .. "ā", "yā")
	text = gsub(text, vao .. "ā", "wā")
	text = gsub(text, vao .. "([" .. diacritics .. ZZP .. "])", "w%1")
	text = gsub(text, ye .. "([" .. diacritics .. ZZP .. "])", "y%1")
	text = gsub(text, ye .. "([" .. semivowel .. "])([" .. semivowel .. "])", "ē%1%2")
	text = gsub(text, vao .. "([" .. semivowel .. "])([" .. semivowel .. "])", "ō%1%2")
	text = gsub(text, "([" .. diacritics .. ZZP .. "])" .. ye .. "([" .. semivowel .. "])", "%1y%2")
	text = gsub(text, "([" .. diacritics .. ZZP .. "])" .. vao .. "([" .. semivowel .. "])", "%1w%2")
	text = gsub(text, "([" .. consonants .. "])" .. ye .. "([" .. semivowel .. "])", "%1y%2")
	text = gsub(text, "([" .. consonants .. "])" .. vao .. "([" .. semivowel .. "])", "%1w%2")

	-- conversions for vaav/waaw/vao
	text = gsub(text, pesh .. vao, "ū")
	text = gsub(text, vao .. "([" .. diacritics .. ZZP .. "])", "w%1")
	text = gsub(text, "(" .. vowel .. ")" .. vao, "%1w")
	-- conversions for ye
	text = gsub(text, zer .. ye, "ī")
	text = gsub(text, ye .. "([" .. diacritics .. ZZP .. "])", "y%1")
	text = gsub(text, "(" .. vowel .. ")" .. ye, "%1y")

	--Alif with short vowel
	text = gsub(text, alif .. "([" .. ZZP .. "])", "%1")

	-- final changes
	-- izafa
	text = gsub(text, "ē" .. zer .. "#", "ē-yi#")
	text = gsub(text, zer .. "y" .. zer .. "#", "ī-yi#")
	text = gsub(text, "([^" .. consonants .. "])" .. "y" .. zer .. "#", "%1-yi#")
	text = gsub(text, "([" .. consonants2 .. "])" .. zer .. "#", "%1-i#")
	text = gsub(text, '("\'")' .. "##" .. zer .. "#", "%1-i#")
	-- do not count zer as izafa before silent alif
	text = gsub(
		text,
		"%-i" .. "##" .. "(" .. space_like_class .. ")" .. "##" .. "([" .. sun_letters .. "]" .. jazm .. "#%-#" .. ")",
		"i%1%2"
	)
	text = gsub(text, "%-i" .. "#%-#" .. "([" .. sun_letters .. "]" .. "#%-#" .. ")", "i-%1")
	-- he deletion
	text = gsub(text, "([" .. ZZP .. "])" .. he .. "#" .. zwnj, "%1-")
	text = gsub(text, "([" .. ZZP .. "])" .. he .. "#", "%1#")
	text = gsub(text, "#" .. ain, "#")

	-- get rid of hashtags (not needed)
	text = gsub(text, "#", "")
	text = gsub(text, "HASHTAG", "#")
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
	-- convert all characters
	text = gsub(text, ".", mapping)

	-- alif
	-- Final corrections
	text = gsub(text, "āa", "ā")
	text = gsub(text, "aaa", "ā")
	text = gsub(text, "āā", "ā")
	text = gsub(text, "aa", "ā")
	text = gsub(text, "ī" .. "([" .. vowels .. "])", "iy%1")
	text = gsub(text, "ū" .. "([" .. vowels .. "])", "uw%1")

	text = mw.ustring.toNFC(text)

	return text
end

-- Hand the table of public functions (export.tr) to whoever loads this module.
return export