-- Module:User:Erutuon/he-translit-omit-nonconsonantal

-- Sort of a reimplementation of Module:he-translit.
-- This version omits matres lectionis and letters that otherwise don't have
-- their consonantal value, except when they indicate vowel length.

-- Remaining issues:
-- * some shvas are undecided (?)
-- * cantillation marks and meteg are all interpreted as stress marks
--   (some mark secondary stress, some don't mark stress)
-- * qamats isn't distinguished correctly in all cases

-- Table of functions exported by this module.
local export = {}

-- Wiktionary array helper: provides keys/filter/map/insert/concat over sequences.
local Array = require "Module:array"

-- Return a comma-separated list of Unicode code point names for the code
-- points in "text", for debugging. "HEBREW LETTER", "HEBREW POINT", etc. are
-- stripped from each name; "ACCENT" is kept so accents remain identifiable.
-- Returns "" for a nil argument.
local function show_code_point_names(text)
	if not text then
		return ""
	end
	-- Fix: the original assigned the Module:array table itself
	-- ("local names = Array") instead of constructing a new array,
	-- so insertions mutated the shared module table.
	local names = Array()
	for cp in mw.ustring.gcodepoint(text) do
		-- Remove HEBREW LETTER, HEBREW POINT, etc.
		local name = require "Module:Unicode data".lookup_name(cp)
			:gsub("^HEBREW (%w+) ", function(type)
				if type == "ACCENT" then
					return "ACCENT "
				else
					return ""
				end
			end)
			:lower()
		names:insert(name)
	end
	return names:concat ", "
end

-- Debugging helper: join the code point names of tokens[i..j] with " | ".
-- "i" and "j" are optional and are passed straight through to table.concat.
local function show_tokens(tokens, i, j)
	local names = Array(tokens):map(show_code_point_names)
	return table.concat(names, " | ", i, j)
end

export.show_tokens = show_tokens

-- Localize the mw.ustring functions that are used throughout the module.
local U      = mw.ustring.char
local ufind  = mw.ustring.find
local ugsub  = mw.ustring.gsub
local ulen   = mw.ustring.len
local umatch = mw.ustring.match
local usub   = mw.ustring.sub

-- Hebrew points (niqqud) and related combining signs.
local sheva = U(0x05B0)
local hataf_segol = U(0x05B1)
local hataf_patah = U(0x05B2)
local hataf_qamats = U(0x05B3)
local hiriq = U(0x05B4)
local tsere = U(0x05B5)
local segol = U(0x05B6)
local patah = U(0x05B7)
local qamats = U(0x05B8)
local qamats_qatan = U(0x05C7)
local holam = U(0x05B9)
local holam_haser_for_waw = U(0x05BA)
local qubuts = U(0x05BB)
local dagesh_mappiq = U(0x05BC)
local shin_dot = U(0x05C1)
local sin_dot = U(0x05C2)

-- Combining macrons used in the transliteration of spirantized consonants.
local macron_above = U(0x0304)
local macron_below = U(0x0331)
local macron = "[" .. macron_above .. macron_below .. "]"

-- Combining acute accent, used as the stress mark in the transliteration.
local acute = U(0x0301)

-- Letters that can serve as matres lectionis, plus shin/sin.
local alef = "א"
local he = "ה"
local waw = "ו"
local yod = "י"
local vowel_letters = alef .. he .. waw .. yod
local shin_sin = 'ש'

-- Two-code-point vowel tokens: waw with dagesh (shuruq)
-- and waw with holam (holam male).
local shuruq = waw .. dagesh_mappiq
local holam_male = waw .. holam

local schwa = 'ə'

-- Default transliteration for each vowel sign or vowel-letter digraph.
local vowel_map = {
	[sheva] = '',
	[hataf_segol] = 'ĕ',
	[hataf_patah] = 'ă',
	[hataf_qamats] = 'ŏ',
	[hiriq] = 'i',
	[tsere] = 'ē',
	[segol] = 'e',
	[patah] = 'a',
	[qamats] = 'ā',
	[qamats_qatan] = 'o',
	[qubuts] = 'u',
	[holam] = 'ō',
	[holam_male] = 'ō',
	[holam_haser_for_waw] = 'ō',
	[shuruq] = 'ū',
}

-- All single-code-point vowel diacritics concatenated into one string,
-- for use inside character classes in patterns.
local vowel_diacritics = Array.keys(vowel_map)
	:filter(function(vowel) return ulen(vowel) == 1 end)
	:concat()

-- Transliterations used when the vowel is in a doubly closed syllable.
local short_vowel_map = {
	[holam] = 'o',
	[holam_male] = 'o',
	[holam_haser_for_waw] = 'o',
	[shuruq] = 'u',
}

-- Transliterations used when the vowel is written plene (with a mater
-- lectionis).
local plene_map = {
	[hiriq] = 'ī',
	[tsere] = 'ē',
	[qamats] = 'ā',
	-- [qamats_qatan] = 'o', -- if plene, then misspelling?
}

-- The begadkefat letters, which alternate between stop and fricative
-- pronunciations depending on dagesh.
local bet = 'ב'
local gimel = 'ג'
local dalet = 'ד'
local kaf = 'כ'
local kaf_final = 'ך'
local lamed = 'ל'
local mem = 'מ'
local pe = 'פ'
local pe_final = 'ף'
local tav = 'ת'
local bgdkpt = bet .. gimel .. dalet .. kaf .. kaf_final .. pe .. pe_final .. tav

-- Gutturals not already named above.
local het = 'ח'
local ayin = 'ע'

-- Transliteration of each consonant letter. Begadkefat letters carry a
-- combining macron (spirantized form) that is stripped when they have dagesh.
local letter_map = {
	[alef] = 'ʔ',
	[bet] = 'b' .. macron_below,
	[gimel] = 'g' .. macron_above,
	[dalet] = 'd' .. macron_below,
	[he] = 'h',
	[waw] = 'w',
	['ז'] = 'z',
	[het] = 'ḥ',
	['ט'] = 'ṭ',
	[yod] = 'y',
	[kaf] = 'k' .. macron_below,
	[kaf_final] = 'k' .. macron_below,
	[lamed] = 'l',
	[mem] = 'm',
	['ם'] = 'm',
	['נ'] = 'n',
	['ן'] = 'n',
	['ס'] = 's',
	[ayin] = 'ʕ',
	[pe] = 'p' .. macron_above,
	[pe_final] = 'p' .. macron_above,
	['צ'] = 'ṣ',
	['ץ'] = 'ṣ',
	['ק'] = 'q',
	['ר'] = 'r',
	[tav] = 't' .. macron_below,
}

-- Shin/sin is transliterated according to its dot.
local shin_sin_map = {
	[shin_dot] = "š",
	[sin_dot] = "ś",
}

-- All consonant letters, concatenated for use in character classes.
local letters = shin_sin .. Array.keys(letter_map)
	:filter(function(letter) return ulen(letter) == 1 end)
	:concat()

-- Hebrew punctuation mapped to its transliteration.
local punctuation_map = {
	["־"] = "-",
	["׃"] = ".",
}

-- First and last code point called "HEBREW ACCENT ...".
local first_accent_cp, last_accent_cp = 0x0591, 0x05AE
local meteg_cp = 0x05BD
local meteg = U(meteg_cp)
local combining_grapheme_joiner_cp = 0x034F
local cgj = U(combining_grapheme_joiner_cp)

-- All accent characters, plus meteg, as a list.
local accents = { meteg }
for cp = first_accent_cp, last_accent_cp do
	table.insert(accents, U(cp))
end

-- Canonical ordering of diacritic classes used by export.normalize:
-- shin/sin dot, then dagesh or mappiq, then vowel points, then accents,
-- then the combining grapheme joiner.
local diacritic_order = {
	-- Fix: the first entry listed shin_dot twice, leaving sin_dot with no
	-- order rank (it fell through to 0 in the sort comparator).
	{ shin_dot, sin_dot },
	{ dagesh_mappiq },
	Array.keys(vowel_map):filter(function(vowel) return ulen(vowel) == 1 end),
	accents,
	{ cgj },
}

-- Character-class fragment matching any accent or meteg.
local accent_pattern = U(first_accent_cp) .. "-" .. U(last_accent_cp) .. meteg

-- Pattern matching one diacritic, and one matching a run of two or more
-- (only runs need reordering in export.normalize).
local diacritic_pattern = "[" .. shin_dot .. sin_dot .. dagesh_mappiq
	.. vowel_diacritics .. accent_pattern .. cgj .. "]"
local diacritics_pattern = diacritic_pattern .. diacritic_pattern .. "+"

-- Map from diacritic to its rank in diacritic_order.
local diacritic_order_map = {}
for rank, class in ipairs(diacritic_order) do
	for _, diacritic in ipairs(class) do
		diacritic_order_map[diacritic] = rank
	end
end

-- True if "token" is a cantillation accent or the combining grapheme joiner.
-- (Meteg is not included here, even though it appears in the accents list.)
local function is_accent(token)
	if not token then
		return false
	end
	local cp = mw.ustring.codepoint(token)
	return (first_accent_cp <= cp and cp <= last_accent_cp)
		or cp == combining_grapheme_joiner_cp
end

-- Fix illogical order of diacritics in Unicode normalization.
-- The default order:
-- letter, vowel points, dagesh or mappiq, accent, shin or sin dot.
-- The desired order:
-- letter, shin or sin dot, dagesh or mappiq, first vowel point, accent,
-- maybe second vowel point if first vowel point is sheva or hiriq.
function export.normalize(text)
	text = ugsub(text, diacritics_pattern, function(diacritics)
		local diacritics_list = mw.text.split(diacritics, "")
		table.sort(diacritics_list, function(a, b)
			return (diacritic_order_map[a] or 0) < (diacritic_order_map[b] or 0)
		end)
		-- For now remove combining grapheme joiners... though this might be wrong.
		while diacritics_list[#diacritics_list] == cgj do
			table.remove(diacritics_list)
		end
		-- If there are two vowels, put hiriq or sheva after other vowels.
		-- If there is also an accent, put it after the first vowel.
		-- Assume Unicode normalization:
		-- sheva before hiriq before patah before either qamats.
		-- This code works for combinations that are in the testcases.
		-- יְרוּשָׁלִַם, יְרוּשָׁלְַמָה
		local i = 0
		local first_vowel
		repeat
			i = i + 1
			first_vowel = diacritics_list[i]
		until not first_vowel or vowel_diacritics:find(first_vowel)
		if first_vowel then
			local second_vowel = diacritics_list[i + 1]
			if second_vowel then
				if first_vowel == hiriq or first_vowel == sheva then
					diacritics_list[i], diacritics_list[i + 1]
						= diacritics_list[i + 1], diacritics_list[i]
				end
				if is_accent(diacritics_list[i + 2]) then
					diacritics_list[i + 1], diacritics_list[i + 2]
						= diacritics_list[i + 2], diacritics_list[i + 1]
				end
			end
		end
		return table.concat(diacritics_list)
	end)
	return text
end

-- Try each pattern in "patterns" at character position "code_point_pos".
-- On the first match anchored at that position, return the first capture
-- (if any) and the end position of the match; otherwise return nothing.
local function match_alt_one(text, code_point_pos, patterns)
	for _, pattern in ipairs(patterns) do
		local start_pos, end_pos, capture = ufind(text, pattern, code_point_pos)
		if start_pos == code_point_pos then
			return capture, end_pos
		end
	end
end

-- Token alternatives, in priority order: holam male (waw + holam),
-- then a letter optionally followed by shin/sin dot and dagesh/mappiq,
-- then any single character.
local token_patterns = {
	"(" .. holam_male .. ")",
	"([" .. letters .. waw .. "][" .. shin_dot .. sin_dot .. "]?" .. dagesh_mappiq .. "?)",
	"(.)",
}

-- Read the token that starts at "code_point_pos"; returns the token text
-- and the position of its last character.
local function next_token(text, code_point_pos)
	return match_alt_one(text, code_point_pos, token_patterns)
end

-- Validate shin dot and sin dot?
-- Split "text" into a list of tokens (letters with attached dots/dagesh,
-- vowel digraphs, or single characters).
local function tokenize(text)
	local tokens = {}
	local pos = 1
	while true do
		local token, end_pos = next_token(text, pos)
		if not end_pos then
			break
		end
		pos = end_pos + 1
		table.insert(tokens, token)
	end
	return tokens
end

export.tokenize = tokenize

-- True if "token" is one of the letters that can be a mater lectionis
-- (alef, he, waw, yod) and hence potentially silent.
local function may_be_silent(token)
	return token ~= nil and vowel_letters:find(token, 1, true) ~= nil
end

-- Indicates that a token might be a vowel.
-- Use only after determining that it is not a consonant.
local function is_vowel(token)
	if token == holam_male or token == shuruq then
		return true
	end
	return token ~= nil and vowel_diacritics:find(token, 1, true) ~= nil
end

-- True if the token at position i is preceded by an unchangeable long vowel
-- written plene: shuruq, holam male, or hiriq/tsere/segol followed by yod.
local function is_preceded_by_unchangeable_vowel(tokens, i)
	local token1, token2 = tokens[i - 2], tokens[i - 1]
	return token2 == shuruq -- Don't check that this is waw with dagesh.
		or token2 == holam_male
		or token2 == yod
			and (token1 == hiriq or token1 == tsere or token1 == segol)
end

-- True for the short vowels patah, segol, hiriq, qubuts.
local function is_short_vowel(token)
	return token == patah or token == segol
		or token == hiriq or token == qubuts
end

-- True for the open (a-type) vowels patah and qamats.
local function is_open_vowel(token)
	return token == patah or token == qamats
end

-- True if the token contains a dagesh or mappiq.
local function has_dagesh(token)
	return token ~= nil and token:find(dagesh_mappiq, 1, true) ~= nil
end

-- True if the token begins with waw (includes shuruq and holam male).
local function is_waw(token)
	return token ~= nil and token:find(waw, 1, true) == 1
end

-- True if the token begins with he.
local function is_he(token)
	return token ~= nil and token:find(he, 1, true) == 1
end

-- True if the token is one of the hataf (reduced) vowels.
local function is_hataf(token)
	return token == hataf_segol
		or token == hataf_patah
		or token == hataf_qamats
end

-- Return the first character of the token — the bare letter, without its
-- attached dot or dagesh. Returns nil for a nil token.
local function get_letter(token)
	-- assert(ufind(token, "[" .. letters .. "]") == 1)
	if token ~= nil then
		return usub(token, 1, 1)
	end
end

-- True if the token's letter is a guttural (alef, he, het, ayin).
local function is_guttural(token)
	local letter = get_letter(token)
	return letter == alef or letter == he
		or letter == het or letter == ayin
end

-- True if the token begins with a begadkefat letter.
local function is_bgdkpt(token)
	return token ~= nil and ufind(token, "^[" .. bgdkpt .. "]") == 1
end

-- Bidirectional control characters should be avoided as much as possible,
-- but they are easily picked up when copying and pasting, so the module needs
-- to account for them.
-- This list is from Bidirectional control character.
local bidirectional_control_characters = U(0x061C) .. U(0x200E) .. U(0x200F)
	.. U(0x202A) .. "-" .. U(0x202E) .. U(0x2066) .. "-" .. U(0x2069)
local word_boundary_character = "^[%s%p" .. bidirectional_control_characters .. "]$"

-- True at the edges of a word: nil (start/end of token list), whitespace,
-- punctuation, or a bidi control character.
local function is_word_boundary(token)
	if token == nil then
		return true
	end
	return ufind(token, word_boundary_character) ~= nil
end

-- Return the shin or sin dot contained in the token, or nil.
local function get_dot(token)
	return token and umatch(token, "[" .. shin_dot .. sin_dot .. "]")
end

-- True if the next non-accent token after position i is a vowel.
local function is_followed_by_vowel(tokens, i)
	repeat
		i = i + 1
	until not is_accent(tokens[i])
	return is_vowel(tokens[i])
end

-- True if the previous token before position i — skipping accents and
-- possibly-silent letters — is a vowel.
local function is_preceded_by_vowel(tokens, i)
	repeat
		i = i - 1
	until not (may_be_silent(tokens[i]) or is_accent(tokens[i]))
	return is_vowel(tokens[i])
end

-- Scan backwards from position i for the nearest vowel token; return its
-- position, or nil if a word boundary is reached first.
local function get_previous_vowel_pos(tokens, i)
	while true do
		i = i - 1
		local token = tokens[i]
		if is_vowel(token) then
			return i
		elseif is_word_boundary(token) then
			return nil
		end
	end
end

-- Like get_previous_vowel_pos, but return the vowel token itself.
local function get_previous_vowel(tokens, i)
	local pos = get_previous_vowel_pos(tokens, i)
	if pos then
		return tokens[pos]
	end
end

-- Return the vowel immediately before position i, allowing only accents in
-- between; nil if any other token intervenes.
local function get_previous_neighboring_vowel(tokens, i)
	while true do
		i = i - 1
		local token = tokens[i]
		if is_vowel(token) then
			return token
		elseif not is_accent(token) then
			return nil
		end
	end
end

-- Defined below; forward declaration so is_preceded_by_consonant can call it.
local is_consonant

-- Return the position of the nearest non-accent token before position i.
local function skip_before_accent(tokens, i)
	repeat
		i = i - 1
	until not is_accent(tokens[i])
	return i
end

-- True if the nearest non-accent token before position i is a consonant.
local function is_preceded_by_consonant(tokens, i)
	return is_consonant(tokens, skip_before_accent(tokens, i))
end

-- True if the token's letter triggers furtive patah: ayin, het, or he —
-- but bare he only when it carries mappiq.
local function makes_furtive_patah(token)
	local pos = ufind(token, "([" .. ayin .. het .. he .. "])")
	return pos == 1 and (token ~= he or has_dagesh(token))
end

-- Handles silence of the possibly silent letters,
-- except for some cases of waw (holam male, shuruq).
local function is_silent(tokens, i)
	local prev_tok = tokens[skip_before_accent(tokens, i)]
	local next_tok = tokens[i + 1]
	-- special case for יִשָּׂשכָר yiśśāḵār
	if tokens[i] == shin_sin and not is_vowel(next_tok) then
		return true
	elseif may_be_silent(tokens[i]) then
		if tokens[i] == alef then
			-- Alef is pronounced when
			-- 1. initial
			-- 2. both preceded and followed by written vowels.
			return not (is_followed_by_vowel(tokens, i)
				and (is_preceded_by_vowel(tokens, i)
					or is_word_boundary(prev_tok)))
		elseif tokens[i] == yod then
			return not is_followed_by_vowel(tokens, i)
				and (prev_tok == hiriq or prev_tok == tsere or prev_tok == segol
					or not is_word_boundary(next_tok)) -- בָּנָיו bānāw vs. בָּנַי bānay
		elseif tokens[i] == waw then
			-- holam + waw is probably incorrect
			return prev_tok == holam
				or not (is_vowel(tokens[i + 1]) or is_word_boundary(tokens[i + 1]))
		else
			return not is_followed_by_vowel(tokens, i)
		end
	else
		return false
	end
end

-- Indicates that a token may be a consonant.
-- Declared as local above.
function is_consonant(tokens, i)
	local token = tokens[i]
	if is_waw(token) then
		-- Bare waw is consonantal; waw with dagesh is a consonant (geminate w)
		-- only when it does not function as shuruq after a consonant or at the
		-- start of a word.
		return token == waw
			or (token == shuruq
				and not (is_preceded_by_consonant(tokens, i)
					or is_word_boundary(tokens[i - 1])))
	end
	return token ~= nil and ufind(token, "[" .. letters .. "]", 1) == 1
end

-- True if the consonant at position i should be doubled in transliteration.
-- Don't double he.
-- Don't double bgdkpt after sheva or at beginning of word.
local function is_double(tokens, i)
	local token = tokens[i]
	return token ~= nil
		and has_dagesh(token)
		and not is_he(token)
		and not (is_bgdkpt(token)
			and (tokens[i - 1] == sheva or is_word_boundary(tokens[i - 1])))
end

-- True if the two tokens before position i form a one-letter prefix
-- (article, conjunction, preposition, relative she-, mi-).
local function is_preceded_by_prefix(tokens, i)
	local consonant, vowel = tokens[i - 2], tokens[i - 1]
	local letter = get_letter(consonant)
	local letter_is_shin = (letter == shin_sin and get_dot(consonant) == shin_dot)
	local next_cons_has_dagesh = has_dagesh(tokens[i])
	return (vowel == hiriq and letter == mem and next_cons_has_dagesh)
		or (vowel == sheva and (
			letter == bet or letter == dalet or letter == waw
			or letter == kaf or letter == lamed
		))
		or (vowel == patah and next_cons_has_dagesh and (
			letter == bet or letter == he or letter == kaf or letter == lamed
			or letter_is_shin -- very archaic, says Module:he-translit
		))
		or (vowel == segol and next_cons_has_dagesh and letter_is_shin)
end

-- True if the vowel at position i belongs to the word's last syllable
-- (no further vowel follows before the word boundary).
local function is_in_last_syllable(tokens, i)
	while true do
		local token = tokens[i + 1]
		if is_word_boundary(token)
			-- A sequence of consonant sheva consonant (sheva) does not have a vowel:
			-- וַיֵּבְךְּ wayyēḇk, וַיַּרְא wayyar
			or token == sheva and (
				is_consonant(tokens, i + 2)
				and (tokens[i + 3] == sheva or is_word_boundary(tokens[i + 3]))
			)
		then
			return true
		elseif is_vowel(token) then
			return false
		end
		i = i + 1
	end
end

-- Decide whether the sheva at position i is vocal (transliterated ə)
-- or silent.
local function is_pronounced_sheva(tokens, i)
	local previous_vowel = get_previous_vowel(tokens, i)
	if tokens[i - 2] == meteg then
		return true
	-- ignore יְרוּשָׁלְָמָה yərūšālayim, יְרוּשָׁלְַמָה yərūšālāyim
	elseif is_word_boundary(tokens[i + 1])
		or (tokens[i + 1] == alef and is_word_boundary(tokens[i + 2]))
		or has_dagesh(tokens[i + 1]) -- check for bgdkpt?
	then
		return false
	elseif
		-- after another sheva
		previous_vowel == sheva
		-- after initial consonant unless following consonant has dagesh
		or previous_vowel == nil
		-- between identical consonants
		or (get_letter(tokens[i - 1]) == get_letter(tokens[i + 1])
			and not is_silent(tokens, i + 1))
		or is_preceded_by_unchangeable_vowel(tokens, i - 1)
		or is_double(tokens, i - 1)
	then
		return true
	elseif is_short_vowel(previous_vowel) or is_guttural(tokens[i - 1]) then
		return false
	else
		-- Leave this catch-all case to make it clear what the default is.
		return false
	end
end

-- Transliterate pointed Hebrew "text", omitting matres lectionis and other
-- non-consonantal letters except where they mark vowel length.
function export.transliterate(text)
	local tokens = export.tokenize(export.normalize(text))
	local transliteration = {}
	local function add_tr(val)
		assert(type(val) == "string")
		table.insert(transliteration, val)
	end
	-- Use a manually incremented loop so we can skip
	-- furtive patah and matres lectionis tokens.
	local i = 1
	while true do
		local token = tokens[i]
		if not token then
			break
		end
		-- This catches silent letters after a consonant;
		-- silent letters after a vowel are handled below.
		if is_silent(tokens, i) then
			add_tr("")
		elseif is_consonant(tokens, i) then
			local letter = get_letter(token)
			local tr = assert(
				letter_map[letter]
					or shin_sin_map[get_dot(token)]
					or letter == shin_sin and shin_sin_map[sin_dot],
				token)
			if has_dagesh(token) then
				-- Dagesh removes spirantization; gemination doubles the letter.
				tr = ugsub(tr, macron, "")
				if is_double(tokens, i) then
					tr = tr .. tr
				end
			end
			-- Transcribe furtive patah before its consonant and skip it.
			if makes_furtive_patah(token) and tokens[i + 1] == patah
				and is_word_boundary(tokens[i + 2])
			then
				local previous_vowel_pos = get_previous_vowel_pos(tokens, i)
				if not (previous_vowel_pos
					and is_accent(tokens[previous_vowel_pos + 1]))
				then
					add_tr(acute)
				end
				add_tr(vowel_map[patah])
				i = i + 1
			end
			add_tr(tr)
		elseif is_vowel(token) then
			-- Genuine waw holam. Handle the waw and leave the holam to the next
			-- bit of code.
			-- מִצְוֹת miṣwōṯ
			if token == holam_male and tokens[i - 1] == sheva then
				add_tr(letter_map[waw])
			end
			local has_accent = is_accent(tokens[i + 1])
			local next_i = i + 1
			if has_accent then
				next_i = i + 2
			end
			-- Handle sheva.
			if token == sheva then
				-- implicit ktiv/qre from Module:he-translit/testcases:
				-- יְרוּשָׁלְָמָה yərūšālayim, יְרוּשָׁלְַמָה yərūšālāyim
				if is_open_vowel(get_previous_neighboring_vowel(tokens, i)) then
					local previous_vowel_pos = get_previous_vowel_pos(tokens, i)
					if not (previous_vowel_pos
						and is_accent(tokens[previous_vowel_pos + 1]))
					then
						add_tr(acute)
					end
					add_tr("y")
				elseif is_pronounced_sheva(tokens, i) then
					add_tr(schwa)
				else
					add_tr("")
				end
			-- implicit ktiv/qre from Module:he-translit/testcases:
			-- יְרוּשָׁלִַם yərūšālaymā, יְרוּשָׁלִָם yərūšālāymā
			elseif token == hiriq
				and is_open_vowel(get_previous_neighboring_vowel(tokens, i))
			then
				local previous_vowel_pos = get_previous_vowel_pos(tokens, i)
				if not (previous_vowel_pos
					and is_accent(tokens[previous_vowel_pos + 1]))
				then
					add_tr(acute)
				end
				add_tr("yi")
			-- qamats in possibly closed syllable,
			-- as long as following two consonants are not identical, in which
			-- case the sheva has to be pronounced, putting the qamats
			-- in an open syllable
			elseif token == qamats and not has_accent and (
				(tokens[next_i + 1] == sheva
					and not is_pronounced_sheva(tokens, next_i + 1))
				or is_double(tokens, next_i)
				or (is_guttural(tokens[next_i]) and is_hataf(tokens[next_i + 1]))
				-- כָּל kol, on its own and with prefixes
				or ((get_letter(tokens[i - 1]) == kaf
						and get_letter(tokens[next_i]) == lamed)
					and (is_word_boundary(tokens[next_i + 1])
						and (is_word_boundary(tokens[i - 2])
							or is_preceded_by_prefix(tokens, i - 1))))
			) then
				add_tr(vowel_map[qamats_qatan])
			else
				local vowel = token
				local start_i = i
				local i_after_silent_letters = next_i - 1
				while is_silent(tokens, i_after_silent_letters + 1) do
					i_after_silent_letters = i_after_silent_letters + 1
				end
				if i_after_silent_letters > start_i
					or token == shuruq or token == holam_male
				then
					if is_double(tokens, i_after_silent_letters + 1) then
						add_tr(short_vowel_map[vowel] or vowel_map[vowel])
					else
						add_tr(plene_map[vowel] or vowel_map[vowel])
					end
				else
					add_tr(vowel_map[token])
				end
				i = i_after_silent_letters
			end
			-- This is not completely correct because not all accents indicate
			-- stress. I haven't sorted out their functions though.
			if has_accent and not is_in_last_syllable(tokens, i) then
				add_tr(acute)
			end
		else
			if not (is_accent(token) or token == meteg) then
				add_tr(punctuation_map[token] or token)
			end
		end
		i = i + 1
	end
	return table.concat(transliteration)
end

-- Entry point for {{#invoke:...|tr_t|...}}: transliterate the first
-- positional argument.
function export.tr_t(frame)
	return export.transliterate(frame.args[1])
end

return export