-- Module:ur-translit

--[=[

FIXME:

1. support for Arabic al- (copy from fa-cls-translit)

]=] local U = require("Module:string/char") local gsub = mw.ustring.gsub local export = {}

-- Diacritics and special characters referenced by the substitution rules
-- below. Each declaration sits on its own line so that a trailing `--`
-- comment cannot swallow the declarations that follow it.
local fatHataan = U(0x64B) -- U+064B ARABIC FATHATAN
local zabar = U(0x64E) -- U+064E ARABIC FATHA
local zer = U(0x650) -- U+0650 ARABIC KASRA
local pesh = U(0x64F) -- U+064F ARABIC DAMMA
local zwnj = U(0x200C) -- Is this even used in Urdu? Why was it included in the previous version?
local highhmz = U(0x654) -- U+0654 ARABIC HAMZA ABOVE
local tashdid = U(0x651) -- also called shadda
local jazm = "ْ" -- mapped to "" in `mapping` ("also sukun - no vowel")
local he = "ہ"
local ghunna = U(0x658) -- U+0658 ARABIC MARK NOON GHUNNA
local dagger_alif = U(0x670) -- U+0670 ARABIC LETTER SUPERSCRIPT ALEF

-- Character sets spliced into Lua pattern character classes ("[" .. set .. "]")
-- by the substitution rules. One declaration per line so that the trailing
-- comments on `lrm`/`rlm` cannot hide a following declaration.
local consonants = "ببپتثجچحخدذرزژسشصضطظعغفقکگلࣇمنݨؤڷہئھٹڈڑ"
local consonantS = "ببپتثجچحخدذرزژسشصضطظعغفقکگڷلࣇمنݨہھٹڈڑ"
local consonantS2 = "یببپتثجچحخدذرزژسشصضطظعغفقکگلࣇڷمنݨوؤہھئٹڈڑ"
local semivowel = "یو"
local vowels = "āایئےۓوؤ"
local indvowels = "آایےوؤ"
local hes = "ہح"
local diacritics = "َُِّْٰ"
local ZZP = "َُِ"
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark

-- Consonant sets used by the has-diacritics check, plus the "space-like"
-- character class. One declaration per line so the explanatory comments
-- cannot swallow the declaration that follows them.
local consonants_needing_vowels = "ببپتثجچحخدذرزژسشصضطظعغفقکڷگلࣇمنںݨہئٹڈڑءﷲ"
-- consonants on the right side; includes alif madda
local rconsonants = consonants_needing_vowels .. "ویآ"
-- consonants on the left side; does not include alif madda
local lconsonants = consonants_needing_vowels
-- whitespace plus both ASCII quote characters, as a class body and as a
-- ready-made bracketed class
local space_like = "%s'" .. '"'
local space_like_class = "[" .. space_like .. "]"

-- Per-character transliteration table, applied in export.tr via
-- gsub(text, '.', mapping); characters without an entry are left unchanged.
-- not all letters here are used by urdu
local mapping = {
	["آ"] = 'ā',
	["ب"] = 'b',
	["پ"] = 'p',
	["ت"] = 't',
	["ٹ"] = 'ṭ',
	["ث"] = 's',
	["ج"] = 'j',
	["چ"] = 'c',
	["ح"] = 'h',
	["خ"] = 'x',
	["د"] = 'd',
	["ڈ"] = 'ḍ',
	["ذ"] = 'z',
	["ر"] = 'r',
	['ڑ'] = "ṛ",
	["ز"] = 'z',
	["ژ"] = 'ź',
	["س"] = 's',
	["ش"] = 'ś',
	["ص"] = 's',
	["ض"] = 'z',
	["ط"] = 't',
	["ظ"] = 'z',
	["غ"] = 'ġ',
	["ف"] = 'f',
	["ق"] = 'q',
	["ک"] = 'k',
	["گ"] = 'g',
	["ݨ"] = 'ṇ',
	["ࣇ"] = 'ḷ',
	["ڷ"] = 'ł',
	["ل"] = 'l',
	["م"] = 'm',
	["ن"] = 'n',
	["و"] = 'o',
	["ہ"] = 'h',
	["ی"] = 'e',
	["ے"] = 'e',
	["ں"] = '̃',

	["ھ"] = "h",

	["ع"] = '\'',
	["ء"] = '\'',
	["أ"] = '',
	-- diacritics
	[zabar] = "a",
	[zer] = "i",
	[pesh] = "u",
	[jazm] = "", -- also sukun - no vowel
	[zwnj] = "-", -- ZWNJ (zero-width non-joiner)
	-- ligatures
	["ﻻ"] = "lā",
	["ﷲ"] = "allāh",
	-- kashida
	["ـ"] = "-", -- kashida, no sound
	-- numerals
	["۱"] = "1",
	["۲"] = "2",
	["۳"] = "3",
	["۴"] = "4",
	["۵"] = "5",
	["۶"] = "6",
	["۷"] = "7",
	["۸"] = "8",
	["۹"] = "9",
	["۰"] = "0",
	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["۔"] = ".", -- period (previously also listed among the letters with the same value)
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = '“', -- quotation mark
	["»"] = '”', -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimals
	["٬"] = ",", -- thousand
	["ۓ"] = "-ye",
	[highhmz] = "-yi",
}

-- Punctuation characters, pre-escaped for use inside a Lua pattern
-- character class ("[" .. punctuation .. "]").
local punctuation = "%-:%(%)%[%]*&٫؛؟،ـ«\".\'!»٪؉۔"

-- Extended Arabic-Indic digits, in the order 1-9 followed by 0.
local numbers = "۱۲۳۴۵۶۷۸۹۰"

-- Individual letters referenced by name in the substitution rules.
local ain = "ع"
local alif = "ا"
local ye = "ی"
local ye2 = "ئ"
local ye3 = "ے"
local vao = "و"
local aspirate = "ھ"
-- Same code point as the earlier `highhmz`; this local shadows it from here on.
local highhmz = U(0x654)

-- Long vowels (plus alif madda) and the negated class built from them.
local aiu = "āīūآ"
local n_exceptions = "[^" .. aiu .. "]" -- for nasalization exceptions

-- Ordered {pattern, replacement} pairs that export.tr applies with gsub
-- before it checks whether the text carries enough diacritics.
local before_diacritic_checking_subs = {
	------------ transformations prior to checking for diacritics --------------
	-- NOTE(review): the collapsed source does not make it clear whether the
	-- next two entries were live or commented out; they are kept live here.
	-- Confirm against the page history.
	{U(0x06E5), "و"},
	{U(0x06E6), "ی"},
	-- ignore dagger alif placed over regular alif or alif maqṣūra
	{"([" .. alif .. ye .. "])" .. dagger_alif, alif},
	-- NOTE(review): the captured character is not re-inserted by the
	-- replacement, so it is dropped; confirm this is intended.
	{"([^" .. alif .. ye .. "])" .. fatHataan, alif .. fatHataan},
}

-- Ordered {pattern, replacement} pairs consumed by has_diacritics(): each rule
-- is applied in turn with gsub, and the text counts as fully vocalized when
-- nothing is left afterwards. Order matters (see the per-rule comments).
-- One rule per line so that a comment can never swallow the rule after it.
local has_diacritics_subs = {
	-- remove arabic ye (ruins conversions)
	{"لل" .. he, ""},
	{"لل" .. tashdid .. he, ""},
	{"لل" .. tashdid .. dagger_alif .. he, ""},
	{"ۃ", ""},
	-- aspirated consonants should count as 1 consonant not two
	{"([" .. consonants .. "][".. ZZP .. diacritics .. "?])" .. aspirate, "%1"},
	{"([" .. consonants .. "])" .. aspirate, "%1"},
	{ aspirate, ""},
	-- remove punctuation and tashdid
	{"[" .. punctuation .. tashdid .. highhmz .. zwnj .. numbers .. "]", ""},
	-- noon gunna and silent consonants can be removed
	-- NOTE(review): the first string literal below is `".. ["`, so the pattern
	-- starts with the literal text `.. [` (two any-char dots and a space);
	-- this looks like a misplaced quote. Kept unchanged -- confirm intent.
	{ ".. [".. ZZP .. indvowels .. diacritics .. "?] .. ([" .. consonantS2 .. "])" .. "([".. ghunna .. jazm .."])" .. "([" .. consonantS2 .. "])", ""},
	{ "([" .. consonants .. "])" .. ghunna, ""},
	{ "([" .. consonantS2 .. "])" .. jazm, ""},
	{ "([" .. consonantS2 .. "])" .. "یٰ", ""},
	-- must go before removing final consonants
	{"[".. ZZP .. diacritics .. "]" .. alif, alif },
	{fatHataan, "" },
	-- NOTE(review): a "?" inside a character class is a literal question
	-- mark, not a quantifier; the same applies to the other "?]" classes here.
	{ "([" .. consonantS2 .. "])" .. "[" .. ZZP .. diacritics .. indvowels .. "?]" .. "([ںۓۂۂ])", "" },
	{ "([ںۓۂۂ])", "" },
	{ "([" .. ye .. alif .. "])" .. dagger_alif, alif},
	{ dagger_alif .. ye, alif},
	{ alif .. "[".. ZZP .. diacritics .. "]", ""},
	{ "[".. ZZP .. diacritics .. "]" .. alif, alif},
	{ dagger_alif .. "([" .. ye .. alif .. "])", alif},
	-- Remove consonants at end of word or utterance, so that we're OK with
	-- words lacking iʿrāb (must go before removing other consonants).
	-- If you want to catch places without iʿrāb, comment out the next two lines.
	{"[" .. lconsonants .. "]$", ""},
	-- closed consonants
	{"([" .. consonantS2 .. "])[" .. indvowels .. ZZP .. "]", ""},
	-- remove consonants (or alif) when followed by diacritics
	-- must go after removing tashdid
	-- do not remove the diacritics yet because we need them to handle
	-- long-vowel sequences of diacritic + pseudo-consonant
	{"[" .. lconsonants .. alif .. "]([" .. fatHataan .. zabar .. pesh .. zer .. jazm .. dagger_alif .. "])", "%1"},
	-- the following must go after removing consonants w/diacritics
	-- (the rest of the original explanation is truncated in the source)
	{"([" .. rconsonants .. "])([".. ZZP .. diacritics .. "?][" .. indvowels .. "?])([" .. consonantS2 .. "])", ""},
	{"[" .. indvowels .. "]([" .. rconsonants .. "])", ""},
	{"[".. ZZP .. diacritics .. "]([" .. lconsonants .. "])", ""},
	{"([" .. consonants .. "])[" .. indvowels .. ZZP .. diacritics .. "]", ""},
	{"([" .. rconsonants .. "])(" .. space_like_class .. ")", ""},
	{"[" .. lconsonants .. "]" .. zabar .. "[".. ye .. ye3 .. vao .. "]", ""},
	-- remove vaw
	{ "[" .. lconsonants .. "]" .. vao, ""},
	{"ؤ" .. pesh, ""},
	{"ؤ", ""},
	-- remove ye
	{ "[" .. lconsonants .. "]" .. ye, ""},
	{ye3, ""},
	{"([" .. consonants .. "][" .. ZZP .. "])" .. he,""},
	-- remove fatḥa/fatḥatan + alif/alif-maqṣūra
	{"[" .. fatHataan .. zabar .. "][" .. alif .. ye .. "]", ""},
	-- remove diacritics and independent vowels
	{"[" .. fatHataan .. zabar .. pesh .. zer .. jazm .. dagger_alif .. "]", ""},
	{ "[" .. indvowels .. "]", ""},
	{ "[".. semivowel .."]" .. "[" .. indvowels .. "]" , ""},
	-- remove numbers, hamzatu l-waṣl, alif madda
	{"[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
	{"%s", ""},
}

--- Report whether `text` is fully vocalized.
-- Strips left-to-right / right-to-left marks (recording a tracking hit when
-- any were present), then applies every rule in `has_diacritics_subs` in
-- order. The text counts as having diacritics when nothing is left over.
-- @param text string to inspect
-- @return true when the reduced text is empty, false otherwise
local function has_diacritics(text)
	local count
	text, count = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if count > 0 then
		require("Module:debug").track("ur-translit/lrm or rlm")
	end
	for _, sub in ipairs(has_diacritics_subs) do
		-- every rule is a {pattern, replacement} pair; index it explicitly,
		-- as the loop in export.tr does, rather than relying on unpack()
		text = gsub(text, sub[1], sub[2])
	end
	return #text == 0
end

--- Transliterate vocalized Urdu text into Latin script.
-- Can be called directly with a string, or with a frame-like table whose
-- `args` hold: [1] text, [2] lang, [3] sc, [5] force_translit. Empty-string
-- arguments are treated as absent. args[4] was formerly stored in a global
-- named `omit_i3raab` that nothing in this module reads; it is now ignored.
-- @param text string, or table with an `args` field
-- @param lang language code (not used by the rules below)
-- @param sc script code (not used by the rules below)
-- @return the transliteration, or nil when the text lacks diacritics and
--         force_translit is not set
function export.tr(text, lang, sc)
	-- Must be local: as a global it would leak from one call into later
	-- calls made with a plain string.
	local force_translit
	if type(text) == "table" then
		local function f(x) return (x ~= "") and x or nil end
		local args = text.args
		text, lang, sc, force_translit = f(args[1]), f(args[2]), f(args[3]), f(args[5])
	end
	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = gsub(text, sub[1], sub[2])
	end

	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("ur-translit/lacking diacritics")
		return nil
	end

	--define the "end" of a word
	text = gsub(text, "#", "HASHTAG")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "\n", "#".."\n" .. "#")
	text = gsub(text, "(["..punctuation.."])", "#".."%1" .. "#")
	text = "##" .. gsub(text, " ", "# #") .. "##"
	text = gsub(text, zwnj, "#"..zwnj.."#")
	-- hashtags now mark the beginning and end of a word

	--exceptions
	text = gsub(text, "#" .. vao .. he .. "#", "#vo#")
	text = gsub(text, "#" .. vao .. pesh .. he .. "#", "#vo#")
	text = gsub(text, "#" .. "پ" .. he .. "#", "#pe#")
	text = gsub(text, "#" .. "پ" .. zer .. he .. "#", "#pe#")
	text = gsub(text, "#" .. ye .. he .. "#", "#ye#")
	text = gsub(text, "#" .. ye .. zer .. he .. "#", "#ye#")

	--character reformatting
	--to make an exceptions for a word, put hashtags on both sides
	text = gsub(text, "ۂ", he .. highhmz)
	text = gsub(text, highhmz, "#"..highhmz.."#")
	--text = gsub(text, 'ىٰ', "ā") -- the first letter is U+0649 (Arabic alif maqṣūra), it doesn't belong here
	text = gsub(text, 'یٰ', "ā") -- the first letter is U+06CC
	text = gsub(text, 'ٰ', "ā")
	text = gsub(text, 'ا' .. fatHataan, "an")
	text = gsub(text, 'لا', "ﻻ")
	text = gsub(text, "ة" , "ۃ")
	text = gsub(text, "ۃ" .. "([" .. ZZP .. jazm .. "])", "ت%1")
	text = gsub(text, "ۃ", he)

	-- Tashdeed
	text = gsub(text, '([' .. consonantS2 .. '])' .. tashdid, "%1%1")
	text = gsub(text, '([' .. consonantS2 .. '])' .. tashdid .. '([' .. ZZP .. '])', "%1%1%2")
	-- For some reason the tashdeed gets pushed after the other diacritics, so this line is necessary for tashdeed to work with other diacritics
	text = gsub(text, '([' .. consonants .. '])' .. '([' .. ZZP .. '])' .. tashdid, "%1%1%2")
	text = gsub(text, '([' .. ZZP .. '])' .. aspirate, aspirate.."%1")
	text = gsub(text, dagger_alif .. aspirate, aspirate.."%1")
	text = gsub(text, ye .. '([' .. ZZP .. '])' .. tashdid, "yy%1")
	text = gsub(text, vao .. '([' .. ZZP .. '])' .. tashdid, "vv%1")
	text = gsub(text, ye .. tashdid .. '([' .. ZZP .. '])', "yy%1")
	text = gsub(text, vao .. tashdid .. '([' .. ZZP .. '])', "vv%1")

	--initial alif
	text = gsub(text, "(["..consonantS2.."])" .. alif, "%1ā")
	--alifs paired to a consonant are a vowel
	text = gsub(text, jazm .. alif, "-") -- invisible ZWNJ
	text = gsub(text, jazm .. "آ", "-ā") -- invisible ZWNJ
	text = gsub(text, "(["..consonantS2.."])" .. "آ", "%1'ā")
	text = gsub(text, pesh .. vao .. zabar .. alif, "ūā" )
	text = gsub(text, zabar .. alif, "ā")
	text = gsub(text, "(" .. ghunna .. ")" .. alif, "%1ā")
	text = gsub(text, "(["..diacritics.."])" .. alif, "%1")
	text = gsub(text, "(["..ZZP.."])" .. alif, "%1")
	--alifs not paired to a consonant are a glottal stop (not shown currently)
	text = gsub(text, alif.."(["..diacritics.."])".. "(["..consonantS2.."])", "%1%2")
	text = gsub(text, alif..ye.."#", "ī")
	text = gsub(text, alif..ye, "e")
	text = gsub(text, alif..ye3, "e")
	text = gsub(text, alif..zabar..ye3, "ai")
	text = gsub(text, alif..vao, "o")
	text = gsub(text, alif..zer..ye, "ī")
	text = gsub(text, alif..pesh..vao, "ū")
	text = gsub(text, alif.."(["..diacritics.."])", "%1")

	-- convert semi vowels
	text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "v%1")
	text = gsub(text, ye.. "(["..diacritics..ZZP.."])", "y%1")
	text = gsub(text, ye .. "ā", "yā")
	text = gsub(text, vao.. "ā", "vā")
	text = gsub(text, ye .. "(["..zabar.."]?)" .. ye3, "y%1"..ye3.."")
	text = gsub(text, vao .. "(["..zabar.."]?)" .. ye3, "v%1"..ye3.."")
	text = gsub(text, ye .. "(["..semivowel.."])(["..semivowel.."])", "e%1%2")
	text = gsub(text, vao .. "(["..semivowel.."])(["..semivowel.."])", "o%1%2")
	text = gsub(text, ye .. "(["..semivowel.."])", "y%1")
	text = gsub(text, vao .. "(["..semivowel.."])", "v%1")

	-- conversions for vaav/vaw/vao
	text = gsub(text, pesh.. vao, "ū")
	text = gsub(text, zabar .. vao, "au")
	text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "v%1")
	text = gsub(text, "(["..diacritics..ZZP.."])" .. vao, "%1v")

	-- conversions for ye
	text = gsub(text, zer.. ye, "ī")
	text = gsub(text, ye .. "#", "ī#")
	text = gsub(text, zabar.. ye, "ai")
	text = gsub(text, zabar.. ye3, "ai")
	text = gsub(text, ye .. "(["..diacritics..ZZP.."])", "y%1")
	text = gsub(text, "(["..diacritics..ZZP.."])" .. ye, "%1y")

	-- final he and izafa/ezafe
	text = gsub(text, "e" .. zer .. "#", "e-yi#")
	text = gsub(text, "ī" .. zer .. "#", "ī-yi#")
	text = gsub(text, "y" .. zer .. "#", "-yi#")
	text = gsub(text, zer .. "#", "-i#")
	text = gsub(text, "(["..ZZP.."])" .. he .. "#" .. zwnj, "%1-")
	text = gsub(text, "(["..ZZP.."])" .. he .. "#", "%1#")
	text = gsub(text, zabar .. he .. "#", "a#")

	-- noon ghunna assimilation/nasalization
	--remove impossible nasal vowels
	text = gsub(text, "ن" .. ghunna .. "([ب])", "m%1") -- nasal vowels are impossible before b
	text = gsub(text, "ن" .. ghunna .. "ت" .. aspirate, "nth")
	text = gsub(text, "ن" .. ghunna .. "([قگ])", "ṅ%1") -- impossible before q and g
	text = gsub(text, "(" .. n_exceptions .. ")" .. "ن" .. ghunna .. "ٹ" .. aspirate, "%1ṇṭh")
	text = gsub(text, "(" .. n_exceptions .. ")" .. "ن" .. ghunna .. "پ" .. aspirate, "%1mph")
	text = gsub(text, "(" .. n_exceptions .. ")" .. "ن" .. ghunna .. "ک" .. aspirate, "%1ṅkh")
	text = gsub(text, "ن" .. ghunna .. "([ج])", "ñ%1") -- impossible before j
	text = gsub(text, "ن".. ghunna .. "ڈ" .. aspirate, "ṇḍh") -- aspirated d/D can't be nasalized
	text = gsub(text, "ن".. ghunna .. "د" .. aspirate, "ndh") -- aspirated d/D can't be nasalized
	--other nasals
	text = gsub(text, "ن" .. jazm .. "([کگق])" .. "#", "ṅ%1#")
	text = gsub(text, "ن" .. ghunna .. "([کگق])" .. jazm .. "#", "ṅ%1#")
	text = gsub(text, "ن" .. jazm .. "([دتر])", "n%1") -- dental
	text = gsub(text, "ن" .. ghunna .. "([ٹڈ])" .. jazm .. "#", "ṇ%1#")
	text = gsub(text, "ن" .. ghunna .. "([چج])" .. jazm .. "#", "ñ%1#") -- postalveolar
	text = gsub(text, "ن" .. ghunna .. "([چج]".. aspirate ..")" .. jazm .. "#", "ñ%1#")
	-- if noon ghunna cannot assimilate, it becomes a nasal vowel.
	text = gsub(text, "ن" .. ghunna, "ں")
	text = gsub(text, "ؤ" .. pesh .. "ں" .. "#", ye2 .. "ū" .. "ں" .. "#")

	-- get rid of hashtags (not needed)
	text = gsub(text, "#", "")
	text = gsub(text, "HASHTAG", "#")
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")

	-- convert all characters
	text = gsub(text, '.', mapping)

	-- vowel fixes
	-- nasalized diphthongs
	text = gsub(text, 'a([iu])̃', 'a͠%1')

	-- alif
	-- Final corrections
	text = gsub(text, "lll", "ll")
	text = gsub(text, "āa", "ā")
	text = gsub(text, "aaa", "ā")
	text = gsub(text, "āā", "ā")
	text = gsub(text, "aa", "ā")

	--now get rid of the zero consonants
	text = gsub(text, "ئ", "")
	text = gsub(text, "u" .. "ؤ", "u")
	text = gsub(text, "ؤ" .. "u" .. "$", "ū") -- ؤُ is rendered 'ū' word-finally, short 'u' otherwise
	text = gsub(text, "ؤ" .. "u" .. "([ ,.;?!-])", "ū%1")
	text = gsub(text, "ؤ" .. "u", "u")
	text = gsub(text, "ؤ", "o")

	text = mw.ustring.toNFC(text)
	return text
end

-- Hand the module table (carrying `tr`) back to whoever loaded this module.
return export