-- Module:User:Erutuon/he-translit-circumflex

-- Currently based on
-- https://ia803104.us.archive.org/7/items/A_Students_Vocabulary_For_Biblical_Hebrew_And_Aramaic/A%20Student%27s%20Vocabulary%20for%20Biblical%20Hebrew%20and%20Aramaic_text.pdf

-- Table of public functions; returned at the end of the module.
local export = {}

local Array = require "Module:array"

-- Short aliases for the Unicode-aware string functions used below.
local U, ufind, ugsub = mw.ustring.char, mw.ustring.find, mw.ustring.gsub
local ulen, umatch, usub = mw.ustring.len, mw.ustring.match, mw.ustring.sub

-- Hebrew points, each built from its code point.
-- Sheva and the three hataf (reduced) vowels:
local sheva, hataf_segol, hataf_patah, hataf_qamats =
	U(0x05B0), U(0x05B1), U(0x05B2), U(0x05B3)
-- Full vowel points:
local hiriq, tsere, segol, patah, qamats =
	U(0x05B4), U(0x05B5), U(0x05B6), U(0x05B7), U(0x05B8)
local qamats_qatan = U(0x05C7)
local holam, holam_haser_for_waw, qubuts =
	U(0x05B9), U(0x05BA), U(0x05BB)
-- Dagesh/mappiq and the dots that distinguish shin from sin:
local dagesh_mappiq = U(0x05BC)
local shin_dot, sin_dot = U(0x05C1), U(0x05C2)

-- Combining macrons used in the letter transliterations below.
local macron_above, macron_below = U(0x0304), U(0x0331)
-- Character class matching either macron.
local macron = "[" .. macron_above .. macron_below .. "]"

-- Letters that may_be_silent() treats as possibly silent.
local alef, he, waw, yod = "א", "ה", "ו", "י"
local vowel_letters = table.concat({ alef, he, waw, yod })
-- Base letter shared by shin and sin; the dot tells them apart.
local shin_sin = 'ש'
-- local vowel_letter = "[" .. vowel_letters .. "]"

-- -- '0' represents silent sheva
-- local vowel_points = (
-- 	sheva .. hataf_segol .. hataf_patah .. hataf_qamats .. hiriq .. tsere ..
-- 	segol .. patah .. qamats .. qamats_qatan .. holam .. qubuts .. '0' ..
-- 	holam_haser_for_waw
-- )
-- local vowel_point = "[" .. vowel_points .. "]"

-- The short vowel points, and a character class matching any one of them.
local short_vowels = segol .. patah .. hiriq .. qubuts .. qamats_qatan
local short_vowel = "[" .. short_vowels .. "]"

-- Vowels written with waw: waw + dagesh (shuruq) and waw + holam (holam male).
local shuruq, holam_male = waw .. dagesh_mappiq, waw .. holam

-- Output symbols: schwa for vocalic sheva, superscript a for furtive patah.
local schwa, superscript_a = 'ə', 'ᵃ'

-- Transliteration of each vowel. Keys are single vowel points, plus the
-- two-character sequences holam male and shuruq.
local vowel_map = {
	[sheva] = '',
	[hataf_segol] = 'ĕ',
	[hataf_patah] = 'ă',
	[hataf_qamats] = 'ŏ',
	[hiriq] = 'i',
	[tsere] = 'ē',
	[segol] = 'e',
	[patah] = 'a',
	[qamats] = 'ā',
	[qamats_qatan] = 'o',
	[qubuts] = 'u',
	[holam] = 'ō',
	-- [shin_dot] = '',
	-- [sin_dot] = '',
	[holam_male] = 'ô',
	[shuruq] = 'û',
}

-- Transliteration of a vowel point that is written plene, i.e. followed by
-- a mater lectionis. Only the entries that are not commented out are used.
local plene_map = {
	-- [sheva] = '', -- ə
	-- [hataf_segol] = 'ĕ',
	-- [hataf_patah] = 'ă',
	-- [hataf_qamats] = 'ŏ',
	[hiriq] = 'î',
	[tsere] = 'ê',
	-- [segol] = 'ệ', -- Lambdin's Introduction to Biblical Hebrew uses this.
	-- [patah] = 'a',
	[qamats] = 'â',
	-- [qamats_qatan] = 'o', -- if plene, then misspelling?
	-- [qubuts] = 'u',
	-- [holam] = 'ō',
	-- [holam_male] = 'ô',
	-- [shuruq] = 'û',
}

-- String of every single-code-point key of vowel_map (this excludes holam
-- male and shuruq). The trailing call to concat() is required: without the
-- parentheses the statement is a syntax error and no string is produced.
local vowel_diacritics = Array.keys(vowel_map)
	:filter(function(vowel) return ulen(vowel) == 1 end)
	:concat()

-- The bgdkpt letters, including the final forms of kaf and pe.
local bet, gimel, dalet = 'ב', 'ג', 'ד'
local kaf, kaf_final = 'כ', 'ך'
local pe, pe_final = 'פ', 'ף'
local tav = 'ת'
local bgdkpt = table.concat({
	bet, gimel, dalet, kaf, kaf_final, pe, pe_final, tav,
})

-- Het and ayin; together with he they are checked for furtive patah.
local het, ayn = 'ח', 'ע'

-- Transliteration of each consonant letter without dagesh.
-- The bgdkpt letters carry a macron here; transliterate() strips it when
-- the letter has a dagesh. Shin/sin is handled by shin_sin_map instead.
local letter_map = {
	[alef] = 'ʾ',
	[bet] = 'b' .. macron_below,
	[gimel] = 'g' .. macron_above,
	[dalet] = 'd' .. macron_below,
	[he] = 'h',
	[waw] = 'w',
	['ז'] = 'z',
	[het] = 'ḥ',
	['ט'] = 'ṭ',
	[yod] = 'y',
	[kaf] = 'k' .. macron_below,
	[kaf_final] = 'k' .. macron_below,
	['ל'] = 'l',
	['מ'] = 'm',
	['ם'] = 'm',
	['נ'] = 'n',
	['ן'] = 'n',
	['ס'] = 's',
	[ayn] = 'ʿ',
	[pe] = 'p' .. macron_above,
	[pe_final] = 'p' .. macron_above,
	['צ'] = 'ṣ',
	['ץ'] = 'ṣ',
	['ק'] = 'q',
	['ר'] = 'r',
	[tav] = 't' .. macron_below,
}

-- Transliteration of shin/sin, keyed by the dot that distinguishes them.
local shin_sin_map = {}
shin_sin_map[shin_dot] = "š"
shin_sin_map[sin_dot] = "ś"

-- String of all consonant letters: shin/sin plus every single-code-point
-- key of letter_map. The trailing call to concat() is required: without the
-- parentheses the statement is a syntax error and no string is produced.
local letters = shin_sin .. Array.keys(letter_map)
	:filter(function(letter) return ulen(letter) == 1 end)
	:concat()

-- Hebrew punctuation and the ASCII character that replaces it in the output.
local punctuation_map = {}
punctuation_map["־"] = "-"
punctuation_map["׃"] = "."

-- Fix illogical order of diacritics in Unicode normalization.
-- Takes a string and returns it with the diacritics reordered.
function export.normalize(text)
	-- Comment from Module:he-translit:
	-- The default order is: consonant, vowel point, dagesh or mappiq, shin or sin dot.
	-- The desired order is: consonant, shin or sin dot, dagesh or mappiq, vowel point.
	text = ugsub(
		text,
		"([" .. vowel_diacritics .. ']*)(' .. dagesh_mappiq .. "*)([" .. shin_dot .. sin_dot .. "]*)",
		"%3%2%1")
	-- Swap certain pairs of vowel points on the same consonant. Returning
	-- nothing from the callback leaves the matched text unchanged.
	text = ugsub(
		text,
		"[" .. hiriq .. patah .. qamats .. qamats_qatan .. sheva .. "]+",
		function(vowels)
			if ulen(vowels) == 2 then
				local first, second = umatch(vowels, "^(.)(.)$")
				-- יְרוּשָׁלִַם
				if (first == hiriq and second ~= hiriq)
				-- יְרוּשָׁלְַמָה
				or (first == sheva and (second == patah or second == qamats or second == qamats_qatan)) then
					return second .. first
				end
			end
		end)
	return text
end

-- Try each pattern in order at position code_point_pos (counted in code
-- points) and accept the first one whose match starts exactly there.
-- Returns the first capture and the end position of the match, or nothing
-- if no pattern matches at that position.
local function match_alt_one(text, code_point_pos, patterns)
	for _, pattern in ipairs(patterns) do
		local start_pos, end_pos, capture = ufind(text, pattern, code_point_pos)
		if start_pos == code_point_pos then
			-- Return first capture (if any) and end of match
			return capture, end_pos
		end
	end
end

-- Token patterns, tried in this order by next_token().
local token_patterns = {
	-- waw followed by holam
	"(" .. holam_male .. ")",
	-- a letter with an optional shin/sin dot and an optional dagesh or mappiq
	"([" .. letters .. waw .. "][" .. shin_dot .. sin_dot .. "]?" .. dagesh_mappiq .. "?)",
	-- any other single character
	"(.)",
}

-- Read the token that starts at code_point_pos.
-- Returns the token and the position of its last code point,
-- or nothing when no token pattern matches there.
local function next_token(text, code_point_pos)
	return match_alt_one(
		text,
		code_point_pos,
		token_patterns
	)
end

-- Split text into a list of tokens by calling next_token() repeatedly,
-- each time starting just after the previous token.
-- Validate shin dot and sin dot?
local function tokenize(text)
	local tokens = {}
	local pos = 1
	local token, end_pos = next_token(text, pos)
	while end_pos do
		tokens[#tokens + 1] = token
		pos = end_pos + 1
		token, end_pos = next_token(text, pos)
	end
	return tokens
end

export.tokenize = tokenize

-- Indicates that a token may be a consonant.
-- True when the token is non-nil and begins with one of the characters
-- in letters.
local function is_consonant(token)
	return token ~= nil and ufind(token, "[" .. letters .. "]", 1) == 1
end

-- True when the token occurs as a plain substring of vowel_letters
-- (alef, he, waw, yod). A nil token gives false.
local function may_be_silent(token)
	if token == nil then
		return false
	end
	local found_at = vowel_letters:find(token, 1, true)
	return found_at ~= nil
end

-- Indicates that a token is definitely a vowel.
-- Shuruq not covered because it could be a ww.
-- True for holam male and for any token found in vowel_diacritics;
-- a nil token gives false.
local function is_vowel(token)
	if token == nil then
		return false
	end
	return token == holam_male
		or vowel_diacritics:find(token, 1, true) ~= nil
end

-- True when the token before position i is shuruq or holam male, or is a
-- yod that itself follows hiriq, tsere or segol.
local function is_preceded_by_unchangeable_vowel(tokens, i)
	local before_previous, previous = tokens[i - 2], tokens[i - 1]
	-- Don't check that this is waw with dagesh.
	if previous == shuruq or previous == holam_male then
		return true
	end
	return previous == yod
		and (before_previous == hiriq
			or before_previous == tsere
			or before_previous == segol)
end

-- True when the token contains a dagesh or mappiq.
-- A nil token gives false rather than an error: callers pass neighbouring
-- tokens such as tokens[i + 1], which do not exist at the end of the text.
local function has_dagesh(token)
	return token ~= nil and token:find(dagesh_mappiq, 1, true) ~= nil
end

-- True when the token begins with waw (so also for holam male and shuruq).
local function is_waw(token)
	local prefix = token:sub(1, #waw)
	return prefix == waw
end

-- True when the token begins with he (with or without a mappiq after it).
local function is_he(token)
	local prefix = token:sub(1, #he)
	return prefix == he
end

-- True when the token begins with one of the letters in bgdkpt.
local function is_bgdkpt(token)
	local match_start = ufind(token, "^[" .. bgdkpt .. "]")
	return match_start == 1
end

-- True when there is no token (start or end of the token list) or the
-- token is a single whitespace or punctuation character.
local function is_word_boundary(token)
	if token == nil then
		return true
	end
	local match_start = ufind(token, "^[%s%p]$")
	return match_start ~= nil
end

-- Return the first character of the token, or nil when the token is nil.
local function get_letter(token)
	-- assert(ufind(token, "[" .. letters .. "]") == 1)
	if token ~= nil then
		return usub(token, 1, 1)
	end
end

-- Return the shin dot or sin dot contained in the token, or nil if it has neither.
local function get_dot(token)
	local dot = umatch(token, "[" .. shin_dot .. sin_dot .. "]")
	return dot
end

-- True when the token after position i is a vowel or shuruq.
local function is_followed_by_vowel(tokens, i)
	local following = tokens[i + 1]
	if is_vowel(following) then
		return true
	end
	return following == shuruq
end

-- Walk backwards from position i past any tokens that may be silent, then
-- report whether the token reached is a vowel or shuruq.
local function is_preceded_by_vowel(tokens, i)
	repeat
		i = i - 1
	until not may_be_silent(tokens[i])
	local candidate = tokens[i]
	if is_vowel(candidate) then
		return true
	end
	return candidate == shuruq
end

-- True when the token begins with ayin, het or he, except that a token
-- consisting of bare he (nothing after the letter) does not qualify.
local function makes_furtive_patah(token)
	local pos = ufind(token, "([" .. ayn .. het .. he .. "])")
	if pos ~= 1 then
		return false
	end
	return token ~= he or has_dagesh(token)
end

-- Transliterate pointed Hebrew text.
-- The text is normalized and tokenized, then each token is transliterated
-- according to the tokens around it. Returns the result as one string.
function export.transliterate(text)
	local tokens = export.tokenize(export.normalize(text))
	local transliteration = {}
	local function add_tr(val)
		assert(type(val) == "string")
		table.insert(transliteration, val)
	end
	-- Use a manually incremented loop so we can skip
	-- furtive patah and matres lectionis tokens.
	local i = 1
	while true do
		local token = tokens[i]
		if not token then
			break
		end
		if is_waw(token) then
			if token == holam_male then
				-- After sheva the waw is a consonant carrying holam.
				if tokens[i - 1] == sheva then
					add_tr(letter_map[waw] .. vowel_map[holam])
				else
					add_tr(vowel_map[holam_male])
				end
			-- waw with dagesh, shuruq
			elseif has_dagesh(token) then
				if is_consonant(tokens[i - 1]) or is_word_boundary(tokens[i - 1]) then
					add_tr(vowel_map[shuruq])
				else
					add_tr("ww")
				end
			else
				add_tr("w")
			end
		elseif is_consonant(token) then
			local letter = get_letter(token)
			-- Shin/sin is not in letter_map; it is looked up by its dot and
			-- falls back to the sin transliteration when it has no dot.
			local tr = assert(
				letter_map[letter]
					or shin_sin_map[get_dot(token)]
					or letter == shin_sin and shin_sin_map[sin_dot],
				token)
			if has_dagesh(token) then
				tr = ugsub(tr, macron, "")
				-- Don't double he.
				-- Don't double bgdkpt after sheva or at beginning of word.
				if not is_he(token)
				and not (is_bgdkpt(token)
					and (tokens[i - 1] == sheva or is_word_boundary(tokens[i - 1]))) then
					tr = tr .. tr
				end
			end
			-- Transcribe furtive patah before its consonant and skip it.
			if makes_furtive_patah(token)
			and tokens[i + 1] == patah
			and is_word_boundary(tokens[i + 2]) then
				add_tr(superscript_a)
				i = i + 1
			end
			add_tr(tr)
		elseif is_vowel(token) then
			if ((token == tsere or token == hiriq) and tokens[i + 1] == yod)
			or (token == qamats and tokens[i + 1] == he and not is_vowel(tokens[i + 2])) then
				add_tr(plene_map[token])
				i = i + 1 -- Skip mater lectionis.
			-- Handle vocalic sheva
			elseif token == sheva and (
				-- after initial consonant unless following consonant has dagesh
				-- (the following token may be absent at the end of the text,
				-- so guard it before asking whether it has a dagesh)
				(is_word_boundary(tokens[i - 2])
					and not (tokens[i + 1] ~= nil and has_dagesh(tokens[i + 1])))
				-- after another sheva not at end of word
				or (tokens[i - 2] == sheva and not is_word_boundary(tokens[i + 1]))
				-- between identical consonants
				or get_letter(tokens[i - 1]) == get_letter(tokens[i + 1])
				-- after unchangeable vowel
				or is_preceded_by_unchangeable_vowel(tokens, i - 1)
			) then
				add_tr(schwa)
			elseif
				-- implicit ktiv/qre from Module:he-translit/testcases:
				-- יְרוּשָׁלְַמָה, יְרוּשָׁלְָמָה
				token == sheva
				and (tokens[i - 1] == patah or tokens[i - 1] == qamats
					or tokens[i - 1] == qamats_qatan) then
				add_tr("y")
			elseif
				-- implicit ktiv/qre from Module:he-translit/testcases:
				-- יְרוּשָׁלִַם, יְרוּשָׁלִָם
				token == hiriq
				and (tokens[i - 1] == patah or tokens[i - 1] == qamats
					or tokens[i - 1] == qamats_qatan) then
				add_tr("yi")
			-- qamats in possibly closed syllable,
			-- as long as following two consonants are not identical, in which
			-- case the sheva has to be pronounced, putting the qamats
			-- in an open syllable
			elseif token == qamats and tokens[i + 2] == sheva
			and not (is_consonant(tokens[i + 1]) and is_consonant(tokens[i + 3])
				and tokens[i + 1] == tokens[i + 3]) then
				add_tr(vowel_map[qamats_qatan])
			elseif (token == patah or token == qamats)
			and tokens[i + 1] == yod
			and is_consonant(tokens[i + 2]) then
				add_tr(vowel_map[token])
				add_tr("i") -- ???
				i = i + 1
			else
				add_tr(vowel_map[token])
			end
		else
			-- Anything else: mapped punctuation, or the token unchanged.
			add_tr(punctuation_map[token] or token)
		end
		i = i + 1
	end
	return table.concat(transliteration)
end

return export