-- Module:ar-utilities

local m_links = require("Module:links") local m_utilities = require("Module:utilities") local ar_translit = require("Module:ar-translit") local m_headword = require("Module:headword")

-- Table of functions exposed by this module; it is returned at the end of the file.
local export = {}

-- Language object for code "ar" and script object for code "Arab", passed to
-- the linking, headword, transliteration and category helpers below.
local lang = require("Module:languages").getByCode("ar") local sc = require("Module:scripts").getByCode("Arab")

-- Short aliases for the mw.ustring / mw.text functions used throughout the
-- module. `u` builds a character from a code point, e.g. u(0x0621).
local rfind = mw.ustring.find local rmatch = mw.ustring.match local rsubn = mw.ustring.gsub local rsplit = mw.text.split local u = require("Module:string/char")

-- Character class used by hamzaError to decide whether a root letter is a
-- consonant.
local consonants = "[بتثجحخدذرزسشصضطظعغقفلكمنهويء]"

-- "If Not Empty": turn the empty string into nil and pass every other value
-- through unchanged.
local function ine(x)
	if x ~= "" then
		return x
	end
	return nil
end

-- Version of rsubn that discards all but the first return value, so that the
-- substituted string can be used directly inside expressions and table
-- constructors without the substitution count tagging along.
-- Declared local: it is a private helper and should not leak into the global
-- environment.
local function rsub(term, foo, bar)
	local retval = rsubn(term, foo, bar)
	return retval
end

-- Synthesize a frame so that exported functions meant to be called from
-- templates can be called from the debug console.
--   parargs: table returned as `.args` of the fake parent frame
--   args:    table exposed as `.args` of the fake frame itself
-- Returns a table with an `args` field and a `getParent` function, which is
-- the only part of the frame interface the callers in this module use.
local function debug_frame(parargs, args)
	return {
		args = args,
		getParent = function()
			return { args = parargs }
		end,
	}
end

-- Return whatever Module:utilities' catfix produces for the Arabic language
-- and script objects of this module.
function export.catfix()
	return m_utilities.catfix(lang, sc)
end

--- hamza processing ---

-- hamza variants
local HAMZA            = u(0x0621) -- hamza on the line (stand-alone hamza) = ء
local HAMZA_ON_ALIF    = u(0x0623)
local HAMZA_ON_WAW     = u(0x0624)
local HAMZA_UNDER_ALIF = u(0x0625)
local HAMZA_ON_YA      = u(0x0626)
-- character class matching any of the five hamza variants above
local HAMZA_ANY        = "[" .. table.concat({
	HAMZA, HAMZA_ON_ALIF, HAMZA_UNDER_ALIF, HAMZA_ON_WAW, HAMZA_ON_YA,
}) .. "]"
local HAMZA_PH         = u(0xFFF0) -- hamza placeholder

-- diacritics
local A  = u(0x064E) -- fatḥa
local AN = u(0x064B) -- fatḥatān (fatḥa tanwīn)
local U  = u(0x064F) -- ḍamma
local UN = u(0x064C) -- ḍammatān (ḍamma tanwīn)
local I  = u(0x0650) -- kasra
local IN = u(0x064D) -- kasratān (kasra tanwīn)
local SK = u(0x0652) -- sukūn = no vowel
local SH = u(0x0651) -- šadda = gemination of consonants
local DAGGER_ALIF = u(0x0670)
-- character class matching every diacritic above except šadda
local DIACRITIC_ANY_BUT_SH = "[" .. table.concat({
	A, I, U, AN, IN, UN, SK, DAGGER_ALIF,
}) .. "]"
-- Pattern matching short vowels
local AIU = "[" .. table.concat({ A, I, U }) .. "]"
-- Pattern matching any diacritics that may be on a consonant
-- (an optional šadda followed by one of the other diacritics)
local DIACRITIC = SH .. "?" .. DIACRITIC_ANY_BUT_SH

-- various letters and signs
local ALIF = u(0x0627) -- ʾalif = ا
local AMAQ = u(0x0649) -- ʾalif maqṣūra = ى
local AMAD = u(0x0622) -- ʾalif madda = آ
local WAW  = u(0x0648) -- wāw = و
local YA   = u(0x064A) -- yā = ي

-- Move a shadda in front of the diacritic that precedes it.
--
-- shadda+short-vowel (including tanwīn vowels, i.e. -an -in -un) gets
-- replaced with short-vowel+shadda during NFC normalisation, which
-- MediaWiki does for all Unicode strings; however, it makes the
-- detection process inconvenient, so undo it. (For example, the code in
-- remove_in would fail to detect the -in in مُتَرَبٍّ because the shadda
-- would come after the -in.)
--
-- Declared local: it is only used inside this module and should not leak
-- into the global environment.
local function reorder_shadda(text)
	return rsub(text, "(" .. DIACRITIC_ANY_BUT_SH .. ")" .. SH, SH .. "%1")
end

-- Seat of a "final" hamza, chosen from the short vowel that precedes it.
-- The captured placeholder (second argument) is ignored and replaced.
local function seat_final_hamza(v, _, diacrit)
	local seat
	if v == I then
		seat = HAMZA_ON_YA
	elseif v == U then
		seat = HAMZA_ON_WAW
	else
		seat = HAMZA_ON_ALIF
	end
	return v .. seat .. diacrit
end

-- Seat of a medial hamza preceded by a long vowel or diphthong: the
-- preceding letter is ignored unless nothing after the hamza decides the
-- seat, in which case a preceding yāʾ gives hamza-on-yāʾ and anything else
-- gives hamza on the line.
local function seat_medial_hamza_after_long(prec, _, shad, v2)
	local seat
	if v2 == I or v2 == YA then
		seat = HAMZA_ON_YA
	elseif v2 == U or v2 == WAW then
		seat = HAMZA_ON_WAW
	elseif rfind(prec, YA) then
		seat = HAMZA_ON_YA
	else
		seat = HAMZA
	end
	return prec .. seat .. shad .. v2
end

-- Seat of any other medial hamza, which relates to the vowels on one or
-- both sides.
local function seat_medial_hamza(v1, _, shad, v2)
	local seat
	if v1 == I or v2 == I or v2 == YA then
		seat = HAMZA_ON_YA
	elseif v1 == U or v2 == U or v2 == WAW then
		seat = HAMZA_ON_WAW
	elseif v2 == AN .. ALIF then
		-- Special exception for the accusative ending, in words like
		-- جُزْءًا (juzʾan). By the rules of Thackston pp. 281-282 a
		-- hamza-on-alif should appear, but that would result in two alifs
		-- in a row, which is generally forbidden. According to
		-- Haywood/Nahmad pp. 114-115, after sukūn before the accusative
		-- ending (including when a pronominal suffix follows) hamza is
		-- written on yāʾ if the previous letter is connecting, else on the
		-- line. The only examples they give involve preceding
		-- non-connecting z (juzʾan and juzʾahu) and preceding diphthongs,
		-- with the only connecting letter being yāʾ, where hamza-on-yāʾ
		-- already results from the preceding rule. Haywood/Nahmad's rule
		-- seems too complicated, and since it conflicts with Thackston,
		-- only the case where two alifs would otherwise appear with the
		-- indefinite accusative ending is implemented.
		seat = HAMZA
	else
		seat = HAMZA_ON_ALIF
	end
	return v1 .. seat .. shad .. v2
end

-- Ordered list of {pattern, replacement} pairs that turn the hamza
-- placeholder into a seated hamza. The rules are applied in this order.
local hamza_subs = {
	--- handle initial hamza ---
	-- put initial hamza on a seat according to following vowel.
	{"^" .. HAMZA_PH .. "([" .. I .. YA .. "])", HAMZA_UNDER_ALIF .. "%1"},
	{" " .. HAMZA_PH .. "([" .. I .. YA .. "])", " " .. HAMZA_UNDER_ALIF .. "%1"},
	{"^" .. HAMZA_PH, HAMZA_ON_ALIF}, -- if no vowel, assume a
	{" " .. HAMZA_PH, " " .. HAMZA_ON_ALIF}, -- if no vowel, assume a

	--- handle final hamza ---
	-- "final" hamza may be followed by a short vowel or tanwīn sequence;
	-- use a previous short vowel to get the seat
	{"(" .. AIU .. ")(" .. HAMZA_PH .. ")(" .. DIACRITIC .. "?)$", seat_final_hamza},
	{"(" .. AIU .. ")(" .. HAMZA_PH .. ")(" .. DIACRITIC .. "? )", seat_final_hamza},
	-- else hamza is on the line
	{HAMZA_PH .. "(" .. DIACRITIC .. "?)$", HAMZA .. "%1"},

	--- handle medial hamza ---
	-- if long vowel or diphthong precedes, we need to ignore it.
	{"([" .. AMAD .. ALIF .. WAW .. YA .. "]" .. SK .. "?)(" .. HAMZA_PH .. ")(" .. SH .. "?)([^ ])",
		seat_medial_hamza_after_long},
	-- otherwise, seat of medial hamza relates to vowels on one or both sides.
	{"([^ ])(" .. HAMZA_PH .. ")(" .. SH .. "?)(" .. AN .. "?[^ ])", seat_medial_hamza},

	--- handle alif madda ---
	{HAMZA_ON_ALIF .. A .. "?" .. ALIF, AMAD},

	--- catch any remaining hamzas ---
	{HAMZA_PH, HAMZA}
}

-- Replace every hamza placeholder in `term` with a seated hamza and return a
-- list of spellings: the preferred one first, followed by an alternative
-- where one exists.
function export.process_hamza(term)
	-- convert HAMZA_PH into appropriate hamza seat
	for _, rule in ipairs(hamza_subs) do
		local pattern, replacement = rule[1], rule[2]
		term = rsub(term, pattern, replacement)
	end

	-- Rewrite `before` + hamza-on-wāw + u + wāw as `before` + `seat`.
	local function respell(before, seat)
		return rsub(term, before .. "ؤُو", before .. seat)
	end

	-- sequence of hamza-on-wāw + wāw is problematic and leads to a preferred
	-- alternative with some other type of hamza, as well as the original
	-- sequence; sequence of wāw + hamza-on-wāw + wāw is especially
	-- problematic and leads to two different alternatives with the original
	-- sequence not one of them
	if rfind(term, WAW .. "ؤُو") then
		return { respell(WAW, "ئُو"), respell(WAW, "ءُو") }
	end
	if rfind(term, YA .. "ؤُو") then
		return { respell(YA, "ئُو"), term }
	end
	if rfind(term, ALIF .. "ؤُو") then
		-- Here John Mace "Arabic Verbs" is inconsistent. In past-tense
		-- parts, the preferred alternative has hamza on the line, whereas
		-- in non-past parts the preferred alternative has hamza-on-yāʾ even
		-- though the sequence of vowels is identical. It's too complicated
		-- to propagate information about tense through to here so pick one.
		return { respell(ALIF, "ئُو"), term }
	end
	if rfind(term, A .. "ؤُو") then
		return { respell(A, HAMZA_ON_ALIF .. U .. WAW), term }
	end
	-- no alternative spelling in sequence of U + hamza-on-wāw + U + wāw;
	-- sequence of I + hamza-on-wāw + U + wāw does not occur (has
	-- hamza-on-yāʾ instead)
	return { term }
end

--- miscellaneous helpers ---

-- Character U+200E. It is not referenced anywhere in the visible part of this
-- module.
local LRM = u(0x200E) -- left-to-right mark

-- Build a formatted link to an Arabic term through Module:links.
--   term:  the term to link to (may be nil when only `alt` is displayed)
--   tr:    transliteration
--   gloss: gloss text
--   face:  passed through as the second argument of full_link
--   alt:   alternative display text
-- An empty term or an "&mdash;" placeholder is returned unchanged instead of
-- being linked. The original guard compared an undefined global `word`, so
-- it could never fire; it now tests the `term` parameter.
local function link(term, tr, gloss, face, alt)
	if term == "" or term == "&mdash;" then
		return term
	end
	return m_links.full_link(
		{
			term = term,
			alt = alt,
			lang = lang,
			tr = tr,
			sc = sc,
			gloss = gloss
		},
		face
	)
end

-- Format a list of gender codes for display, preceded by a single space.
-- Returns "" when `genders` is nil or empty. The `sc` parameter is accepted
-- but not used; `lang` is forwarded to the formatter.
local function format_genders(lang, sc, genders)
	if not genders or #genders <= 0 then
		return ""
	end
	local formatter = require("Module:gender and number")
	return " " .. formatter.format_list(genders, lang)
end

-- English ordinal words indexed by letter position, used in the error
-- messages of hamzaError and validateRoot. Only positions 1 to 5 are covered.
local ordinal = { "first", "second", "third", "fourth", "fifth" }

-- Check that every entry of `rootTable` contains a consonant.
--   rootTable: list of root letters
--   output:    list of output fragments; a warning is appended to it when a
--              seated hamza is found
-- A seated hamza produces an inline warning plus a tracking call; any other
-- non-consonant raises an error.
local function hamzaError(rootTable, output)
	for i, letter in pairs(rootTable) do
		if not rfind(letter, consonants) then
			if rfind(letter, "[أإؤئ]") then
				table.insert(
					output,
					' [Seated hamzas, such as "' .. letter .. '", are not allowed in the names of roots. Use bare hamza, "&rlm; ء &lrm;".] '
				)
				require("Module:debug").track("ar-root/hamza-error")
				-- Special:WhatLinksHere/Wiktionary:Tracking/ar-root/hamza-error
			else
				-- `ordinal` only has five entries; fall back to the numeric
				-- position so that a longer root still yields this message
				-- rather than a nil-concatenation error.
				local position = ordinal[i] or ("number " .. tostring(i))
				error(letter .. ", the " .. position .. " letter in the category name, is not a consonant.")
			end
		end
	end
end

-- Check that every entry of `rootTable` is a single character.
--   rootTable:   list of root letters
--   joined_root: the space-joined root, used only in the error message
-- Raises an error (attributed to the caller) if `rootTable` is not a table,
-- and an error naming the offending letter if an entry is longer than one
-- character.
local function validateRoot(rootTable, joined_root)
	if type(rootTable) ~= "table" then
		error("rootTable is not a table", 2)
	end
	for i, letter in ipairs(rootTable) do
		if mw.ustring.len(letter) > 1 then
			-- `ordinal` only has five entries; fall back to the numeric
			-- position so that a longer root still yields this message
			-- rather than a nil-concatenation error.
			local position = ordinal[i] or ("number " .. tostring(i))
			error('"' .. letter .. '", the ' .. position .. ' letter in the root "' .. joined_root .. '" should be a single letter.')
		end
	end
end

-- Template entry point for an Arabic root.
--
-- The root letters come from parameters 1-4 (given separately, or
-- space-separated in parameter 1), or from the page title when no parameter
-- is given; in the Template namespace without parameters a sample root is
-- used. When the page title equals the root, a headword line is produced;
-- otherwise a link to the root plus the "terms belonging to the root"
-- category is produced.
--
-- Returns, depending on the flags: the bare joined root (plain), only the
-- text (nocat), only the categories (notext), or text followed by
-- categories.
function export.ar_root(frame)
	local output = {}
	local categories = {}
	-- getCurrentTitle must be called; the fields below are read from the
	-- title object it returns.
	local title = mw.title.getCurrentTitle()
	local fulltitle = title.fullText
	local pagename = title.text
	local namespace = title.nsText

	local params = {
		[1] = {},
		[2] = {},
		[3] = {},
		[4] = {},
		["nocat"] = { type = "boolean", default = false },
		["plain"] = { type = "boolean", default = false },
		["t"] = {},
		["gloss"] = { alias_of = "t" },
		["face"] = { default = "term" },
		["notext"] = { type = "boolean", default = false },
		["nolink"] = { type = "boolean", default = false },
	}
	local args = require("Module:parameters").process(frame:getParent().args, params)

	local rootLetters
	if not args[1] and namespace == "Template" then
		rootLetters = { "ك", "ت", "ب" }
	elseif args[1] and args[2] then
		rootLetters = { args[1], args[2], args[3], args[4] }
	elseif args[1] then
		rootLetters = rsplit(args[1], " ")
	else
		rootLetters = rsplit(fulltitle, " ")
	end
	hamzaError(rootLetters, output)

	local joined_root = table.concat(rootLetters, " ")
	validateRoot(rootLetters, joined_root)
	local joined_tr = ar_translit.tr(table.concat(rootLetters, "-"), lang, sc, nil, nil, "force") or "-"

	if fulltitle == joined_root then
		-- On the root's own page: show a headword line.
		table.insert(
			output,
			m_headword.full_headword({
				lang = lang,
				sc = sc,
				pos_category = "roots",
				categories = {},
				heads = { joined_root },
				translits = { joined_tr },
			})
		)
		table.insert(categories, "")
		if args[1] then
			require("Module:debug").track("ar-root")
			-- Special:WhatLinksHere/Wiktionary:Tracking/ar-root
		end
	else
		-- NOTE(review): presumably countCharacters counts the characters of
		-- the title that belong to the Arabic script, so this tracks titles
		-- with more than two other characters. Confirm against Module:scripts.
		if sc:countCharacters(pagename) < mw.ustring.len(pagename) - 2 then
			require("Module:debug").track("ar-root/title-not-ar")
			-- Special:WhatLinksHere/Wiktionary:Tracking/ar-root/title-not-ar
			if not args["nocat"] then
				require("Module:debug").track("ar-root/title-not-ar/cat")
				-- Special:WhatLinksHere/Wiktionary:Tracking/ar-root/title-not-ar/cat
			end
		end

		-- NOTE(review): "gloss" is declared as an alias of "t". Depending on
		-- how Module:parameters stores aliases, args["gloss"] may always be
		-- nil here. Confirm.
		local link_text
		if args["nolink"] then
			-- No target term: the root is only shown as display text.
			link_text = link(nil, joined_tr, ine(args["gloss"]), args["face"], joined_root)
		else
			link_text = link(joined_root, joined_tr, ine(args["gloss"]), args["face"])
		end
		table.insert(output, link_text)
		table.insert(
			categories,
			m_utilities.format_categories({ "Arabic terms belonging to the root " .. joined_root }, lang)
		)
	end

	if args["plain"] then
		return joined_root
	elseif args["nocat"] then
		return table.concat(output)
	elseif args["notext"] then
		return table.concat(categories)
	else
		return table.concat(output) .. table.concat(categories)
	end
end

-- Template entry point for the root box variant of ar_root.
--
-- The root letters are resolved the same way as in ar_root. On the root's
-- own page a headword line is returned (followed by the categories unless
-- nocat is set). On any other page the link text is returned when nocat or
-- plain is set; otherwise only a space followed by the category markup is
-- returned.
function export.ar_rootbox(frame)
	local output = {}
	local categories = {}
	-- getCurrentTitle must be called; the fields below are read from the
	-- title object it returns.
	local title = mw.title.getCurrentTitle()
	local fulltitle = title.fullText
	local namespace = title.nsText

	local params = {
		[1] = {},
		["nocat"] = { type = "boolean" },
		["plain"] = { type = "boolean" },
		["t"] = {},
		["gloss"] = { alias_of = "t" },
		["notext"] = { type = "boolean" },
		["nolink"] = { type = "boolean" },
		["sort"] = {},
		["face"] = {}
	}
	local args = require("Module:parameters").process(frame:getParent().args, params)

	-- NOTE(review): only [1] is declared in `params`, so whether args[2] to
	-- args[4] can ever be set depends on Module:parameters. Confirm.
	local rootLetters
	if not args[1] and namespace == "Template" then
		rootLetters = { "ك", "ت", "ب" }
	elseif args[1] and args[2] then
		rootLetters = { args[1], args[2], args[3], args[4] }
	elseif args[1] then
		rootLetters = rsplit(args[1], " ")
	else
		rootLetters = rsplit(fulltitle, " ")
	end
	hamzaError(rootLetters, output)

	local joined_root = table.concat(rootLetters, " ")
	validateRoot(rootLetters, joined_root)
	local joined_tr = ar_translit.tr(table.concat(rootLetters, "-"), lang, sc, nil, nil, "force") or "-"

	if fulltitle == joined_root then
		-- On the root's own page: show a headword line.
		table.insert(
			output,
			m_headword.full_headword({
				lang = lang,
				sc = sc,
				pos_category = "roots",
				categories = {},
				heads = { joined_root }
			})
		)

		table.insert(categories, "")

		if args["nocat"] then
			return table.concat(output)
		else
			return table.concat(output) .. table.concat(categories)
		end
	else
		-- NOTE(review): "gloss" is declared as an alias of "t". Depending on
		-- how Module:parameters stores aliases, args["gloss"] may always be
		-- nil here. Confirm.
		local link_text
		if args["nolink"] then
			-- No target term: the root is only shown as display text.
			link_text = link(nil, joined_tr, args["gloss"], args["face"], joined_root)
		else
			link_text = link(joined_root, joined_tr, args["gloss"], args["face"])
		end

		table.insert(output, link_text)

		table.insert(
			categories,
			m_utilities.format_categories({ "Arabic terms belonging to the root " .. joined_root }, lang)
		)

		if args["nocat"] then
			return table.concat(output)
		elseif args["plain"] then
			return table.concat(output)
		else
			-- By default only the categories are emitted, not the link text.
			return " " .. table.concat(categories)
		end
	end
end

-- Debug-console wrapper around export.ar_root: builds a fake frame whose own
-- arguments are `args` and whose parent arguments are `parargs`, then calls
-- ar_root with it.
function export.ar_root2(parargs, args)
	local fake_frame = debug_frame(parargs, args)
	return export.ar_root(fake_frame)
end

-- Strip a final -in (kasratān) from a lemma, so that a full lemma can be
-- specified rather than requiring the user to truncate the -in ending.
-- Presumably used by the ar-adj-in template named in the FIXME below; the
-- name is missing from the original comment.
-- FIXME: Move ar-adj-in into Lua.
-- Raises "Lemma required." when the first frame argument is missing.
function export.remove_in(frame)
	local lemma = frame.args[1]
	if not lemma then
		error("Lemma required.")
	end
	-- Put any shadda before the tanwīn so that the ending is really final.
	local reordered = reorder_shadda(lemma)
	return rsub(reordered, IN .. "$", "")
end

-- Strip a final -an (fatḥatān followed by ʾalif maqṣūra) from a lemma, so
-- that a full lemma can be specified rather than requiring the user to
-- truncate the -an ending.
-- Presumably used by the ar-adj-an template named in the FIXME below; the
-- name is missing from the original comment.
-- FIXME: Move ar-adj-an into Lua.
-- Raises "Lemma required." when the first frame argument is missing.
function export.remove_an(frame)
	local lemma = frame.args[1]
	if not lemma then
		error("Lemma required.")
	end
	-- Put any shadda before the tanwīn so that the ending pattern matches.
	local reordered = reorder_shadda(lemma)
	return rsub(reordered, AN .. AMAQ .. "$", "")
end

-- Intended to compare two words and find the alternation pattern between
-- them (vowel changes, prefixes, suffixes etc.).
-- Still a WIP: this is a stub that ignores both arguments and returns nil.
function export.find_pattern(word1, word2)
	return nil
end

-- Template entry point producing a short etymology label and its category.
--
-- The first argument of the invoking frame is a key into the `data` table
-- below; the parent (template) arguments control the output:
--   nocat:  suppress the categories
--   lc:     keep the label lower-case (alias: nocap)
--   notext: suppress the label text
-- Raises an error when the key is missing from `data`.
function export.etymology(frame)
	local text, categories = {}, {}
	local linkText

	local frame_params = {
		[1] = { required = true },
	}
	local frame_args = require("Module:parameters").process(frame.args, frame_params)
	local key = frame_args[1]

	local data = {
		["color adjective"] = {
			anchor = "Color or defect adjectives",
			text = "color adjective",
			categories = { "color/defect adjectives" },
		},
		["defect adjective"] = {
			anchor = "Color or defect adjectives",
			text = "defect adjective",
			categories = { "color/defect adjectives" },
		},
	}

	-- The type must be the string "boolean", as in ar_root. The bare
	-- identifier `boolean` used before is an undefined global, i.e. nil, so
	-- the flags were not declared as booleans at all.
	local params = {
		[1] = {},
		["nocat"] = { type = "boolean", default = false },
		["lc"] = { type = "boolean", default = false },
		["nocap"] = { alias_of = "lc" },
		["notext"] = { type = "boolean", default = false },
	}
	local args = require("Module:parameters").process(frame:getParent().args, params)

	local entry = key and data[key]
	if not entry then
		error('The anchor "' .. tostring(key) .. '" is not found in the list of anchors.')
	end

	-- Keep `key` intact so that both messages below name the requested entry.
	local anchor = entry.anchor or error('The data table does not include an anchor for "' .. key .. '".')
	linkText = entry.text or error('The data table does not include link text for "' .. key .. '".')

	if not args.lc then
		-- Capitalize the first letter. rsub drops the substitution count.
		linkText = rsub(linkText, "^%a", function(a) return mw.ustring.upper(a) end)
	end

	if not args.notext then
		-- NOTE(review): `anchor` is resolved above but not used in the
		-- emitted text; the empty strings around linkText suggest that link
		-- markup may have been lost here. Confirm.
		table.insert(text, "" .. linkText .. "")
	end

	if not args.nocat then
		table.insert(categories, m_utilities.format_categories(entry.categories, lang))
	end

	return table.concat(text) .. table.concat(categories)
end

-- Hand the table of exported functions to whoever loads this module.
return export