-- Module:ja-ruby

local m_str_utils = require("Module:string utilities")

-- Helpers taken from Module:string utilities.
local codepoint, u = m_str_utils.codepoint, m_str_utils.char
local ufind, ugmatch, ugsub = m_str_utils.find, m_str_utils.gmatch, m_str_utils.gsub
local ulen, umatch, usub = m_str_utils.len, m_str_utils.match, m_str_utils.sub

-- Local aliases for the standard-library functions used throughout the module.
local concat, insert = table.concat, table.insert
local ipairs, select, type = ipairs, select, type

-- Replaces every character of `s` matched by the character class `class`
-- with the character whose codepoint is `delta` higher (or lower, when
-- `delta` is negative).
local function shift_codepoints(s, class, delta)
    local shifted = ugsub(s, class, function(ch)
        return u(codepoint(ch) + delta)
    end)
    return shifted
end

-- Converts the characters in the range ぁ-ゖ to the character 96 codepoints
-- above (the corresponding katakana); everything else is left untouched.
local function str_hira_to_kata(s)
    return shift_codepoints(s, '[ぁ-ゖ]', 96)
end

-- Converts the characters in the range ァ-ヶ to the character 96 codepoints
-- below (the corresponding hiragana); everything else is left untouched.
local function str_kata_to_hira(s)
    return shift_codepoints(s, '[ァ-ヶ]', -96)
end

-- Table collecting the module's public functions; it is returned at the end of the file.
local export = {}

-- Aligns two strings character by character with the Wagner–Fischer
-- edit-distance algorithm and splits both into parallel runs.
--
-- The dynamic-programming matrix has (len1 + 1) * (len2 + 1) cells, so the
-- work grows with the product of the two lengths. For long texts the pattern
-- matching done in parse_text is less prone to Lua timeout errors, which is
-- why `limit` exists.
--
-- @param s1    first string
-- @param s2    second string
-- @param limit optional budget; when `len1 * len2 * 20` exceeds it no
--              alignment is attempted and each string is returned as one run
-- @return `{r1, r2}`: two arrays of equal length. `r1[k]` and `r2[k]` are the
--         k-th run of `s1` and `s2`; runs alternate between stretches where
--         the strings agree and stretches where they differ.
local function str_ucompare(s1, s2, limit)
    -- Keep the untouched strings: the bail-out below has to hand back strings,
    -- not the per-character arrays, because the caller uses every run as text.
    local whole1, whole2 = s1, s2
    s1, s2 = mw.text.split(s1, ''), mw.text.split(s2, '')
    local len1, len2 = #s1, #s2
    if limit and len1 * len2 * 20 > limit then
        return {{whole1}, {whole2}}
    end

    -- m_cost[i + 1][j + 1]: edit distance between the first i characters of
    -- s1 and the first j characters of s2.
    -- m_step[i + 1][j + 1]: step that produced that cost
    --   1 = same character, 2 = substitution, 3 = deletion, 4 = insertion.
    -- m_step[1][1] stays nil and terminates the backtrace.
    local m_cost, m_step = {{0}}, {{}}
    for i = 1, len1 do
        m_cost[i + 1] = {i}
        m_step[i + 1] = {3}
    end
    for j = 1, len2 do
        m_cost[1][j + 1] = j
        m_step[1][j + 1] = 4
    end

    for i = 1, len1 do
        for j = 1, len2 do
            local b_same = s1[i] == s2[j]
            local c_sub = m_cost[i][j] + (b_same and 0 or 1)
            local c_del = m_cost[i][j + 1] + 1
            local c_ins = m_cost[i + 1][j] + 1
            if c_sub <= c_del and c_sub <= c_ins then
                m_cost[i + 1][j + 1] = c_sub
                m_step[i + 1][j + 1] = b_same and 1 or 2
            elseif c_del <= c_ins then
                m_cost[i + 1][j + 1] = c_del
                m_step[i + 1][j + 1] = 3
            else
                m_cost[i + 1][j + 1] = c_ins
                m_step[i + 1][j + 1] = 4
            end
        end
    end

    -- Walk back from the bottom-right cell, recording every step together
    -- with the character positions (in s1 and s2) it refers to.
    local i1, i2 = len1 + 1, len2 + 1
    local m_offset = {{-1, -1}, {-1, -1}, {-1, 0}, {0, -1}}
    local r_step_rev = {}
    local r_step_rev_pos1 = {}
    local r_step_rev_pos2 = {}
    local step = m_step[i1][i2]
    while step do
        insert(r_step_rev, step)
        i1 = i1 + m_offset[step][1]
        i2 = i2 + m_offset[step][2]
        insert(r_step_rev_pos1, i1)
        insert(r_step_rev_pos2, i2)
        step = m_step[i1][i2]
    end

    -- Replay the steps front to back, grouping consecutive "same" steps into
    -- one run and consecutive differing steps into the next run.
    local r1, r2 = {}, {}
    local i = #r_step_rev
    step = r_step_rev[i]
    while i > 0 do
        local r1_f, r2_f = {}, {}
        if step == 1 then
            repeat
                insert(r1_f, s1[r_step_rev_pos1[i]])
                insert(r2_f, s2[r_step_rev_pos2[i]])
                i = i - 1
                step = r_step_rev[i]
            until step ~= 1
        else
            while true do
                if step == 2 then
                    insert(r1_f, s1[r_step_rev_pos1[i]])
                    insert(r2_f, s2[r_step_rev_pos2[i]])
                elseif step == 3 then
                    insert(r1_f, s1[r_step_rev_pos1[i]])
                elseif step == 4 then
                    insert(r2_f, s2[r_step_rev_pos2[i]])
                else
                    -- step is 1 (next run starts) or nil (no steps left)
                    break
                end
                i = i - 1
                step = r_step_rev[i]
            end
        end
        insert(r1, concat(r1_f))
        insert(r2, concat(r2_f))
    end
    return {r1, r2}
end

-- Splits a string containing wiki links into a flat list of segments.
--
-- Plain stretches become `{text = ...}`; a link `[[target|label]]` becomes
-- `{text = label, linkto = target}`; a link without a usable pipe (none, or
-- one at the very start or very end of the link body) becomes
-- `{text = body, linkto = body}`. An empty link `[[]]` produces no segment,
-- and an opening `[[` without a closing `]]` is kept as plain text.
--
-- @param s the string to split
-- @return array of segment tables in source order
local function str_parse_link(s)
    local t = {}
    local lt
    local i1, i2
    local i_o = 1                     -- start of the text not yet emitted
    local i_n = s:find('%[%[', i_o)   -- start of the link being examined
    while i_n do
        -- Each find is truncated to its first result: i1 is the start of the
        -- next "[[", i2 the start of the next "]]".
        i1, i2 = s:find('%[%[', i_n + 1), s:find('%]%]', i_n + 2)
        if not i2 then break end
        -- If another "[[" opens before the "]]", the innermost one is the link.
        while i1 and i1 < i2 do
            i_n = i1
            i1 = s:find('%[%[', i_n + 1)
        end
        if i_o < i_n then
            insert(t, {
                text = s:sub(i_o, i_n - 1),
            })
        end
        if i_n + 2 < i2 then
            lt = s:sub(i_n + 2, i2 - 1)
            i1 = lt:find('|')
            if i1 and i1 > 1 and i1 < lt:len() then
                insert(t, {
                    text = lt:sub(i1 + 1),
                    linkto = lt:sub(1, i1 - 1),
                })
            else
                insert(t, {
                    text = lt,
                    linkto = lt,
                })
            end
        end
        i_o = i2 + 2
        i_n = s:find('%[%[', i_o)
    end
    if i_o <= s:len() then
        insert(t, {
            text = s:sub(i_o),
        })
    end
    return t
end

-- Merges a link segmentation and a ruby segmentation into one nested ruby
-- table.
--
-- Both inputs are walked in parallel, comparing the byte lengths of the
-- current link segment and the current ruby segment:
--   * a ruby segment that contains a link in its ruby, or that is longer than
--     the current link segment, becomes `{text = {link segments...}, ruby =
--     <parsed ruby>}`;
--   * otherwise a linked segment becomes `{text = {ruby segments...}, linkto =
--     ...}`, and unlinked text is appended to the result directly.
--
-- NOTE(review): this presumably expects both tables to spell out the same
-- plain text; nothing here verifies that -- confirm against the callers.
-- The `text` fields of the entries of both input tables are modified in place
-- while they are consumed.
--
-- @param link_table array as produced by str_parse_link
-- @param ruby_table array of `{text = string, ruby = string|nil}`
-- @return the merged ruby table
local function table_merge(link_table, ruby_table)
    local r = {}
    local r_sub, r_insert
    local len_cut
    local id_l, id_r = 1, 1
    while id_l <= #link_table and id_r <= #ruby_table do
        -- > 0: the link segment is longer; < 0: the ruby segment is longer.
        len_cut = link_table[id_l].text:len() - ruby_table[id_r].text:len()
        if ruby_table[id_r].ruby and (ruby_table[id_r].ruby:find'%[%[..-%]%]' or len_cut < 0) then
            -- The ruby segment is the outer node; link segments nest inside it.
            -- (The condition above guarantees `ruby` is set, so no fallback to
            -- inserting into `r` directly is needed here.)
            r_sub = {
                text = {},
                ruby = str_parse_link(ruby_table[id_r].ruby),
            }
            r_insert = r_sub.text
            insert(r, r_sub)
            while len_cut < 0 do
                insert(r_insert, {
                    text = link_table[id_l].text,
                    linkto = link_table[id_l].linkto
                })
                id_l = id_l + 1
                len_cut = len_cut + link_table[id_l].text:len()
            end
            insert(r_insert, {
                text = link_table[id_l].text:sub(1, -1 - len_cut),
                linkto = link_table[id_l].linkto
            })
            if len_cut == 0 then
                id_l = id_l + 1
                id_r = id_r + 1
            else
                -- Keep the unconsumed tail of the link segment for the next round.
                link_table[id_l].text = link_table[id_l].text:sub(-len_cut)
                id_r = id_r + 1
            end
        else
            -- The link segment (if it is a link at all) is the outer node.
            if link_table[id_l].linkto then
                r_sub = {
                    text = {},
                    linkto = link_table[id_l].linkto,
                }
                r_insert = r_sub.text
                insert(r, r_sub)
            else
                r_insert = r
            end
            while len_cut > 0 and not (ruby_table[id_r].ruby and ruby_table[id_r].ruby:find'%[%[..-%]%]') do
                insert(r_insert, {
                    text = ruby_table[id_r].text,
                    ruby = ruby_table[id_r].ruby,
                })
                id_r = id_r + 1
                len_cut = len_cut - ruby_table[id_r].text:len()
            end
            if len_cut == 0 then
                insert(r_insert, {
                    text = ruby_table[id_r].text,
                    ruby = ruby_table[id_r].ruby,
                })
                id_l = id_l + 1
                id_r = id_r + 1
            else
                if ruby_table[id_r].ruby then
                    -- The ruby segment straddles the link boundary: leave it
                    -- for the next round and keep only the matching tail of
                    -- the link segment.
                    link_table[id_l].text = link_table[id_l].text:sub(-(len_cut + ruby_table[id_r].text:len()))
                else
                    -- Plain text straddles the boundary: split it.
                    insert(r_insert, {
                        text = ruby_table[id_r].text:sub(1, -1 + len_cut),
                    })
                    ruby_table[id_r].text = ruby_table[id_r].text:sub(len_cut)
                    id_l = id_l + 1
                end
            end
        end
    end
    return r
end

--[==[
Joins the base texts of a ruby table into one string. Ruby and link targets are dropped;
nested tables found in a {text} field are flattened recursively.]==]
function export.to_text(ruby_table)
    local pieces = {}
    for _, item in ipairs(ruby_table) do
        local piece = item.text
        if type(piece) ~= 'string' then
            piece = export.to_text(piece)
        end
        insert(pieces, piece)
    end
    return concat(pieces)
end

--[==[
Joins a ruby table into one string, using an item's ruby instead of its base text whenever
ruby is present. Link targets are dropped; nested tables are flattened recursively.]==]
function export.to_ruby(ruby_table)
    local pieces = {}
    for _, item in ipairs(ruby_table) do
        local piece = item.ruby or item.text
        if type(piece) ~= 'string' then
            piece = export.to_ruby(piece)
        end
        insert(pieces, piece)
    end
    return concat(pieces)
end

-- Serialises a ruby table to a string using the given delimiters.
--
-- A linked item is written as `lb .. target .. lm .. text .. lf`, an item with
-- ruby as `rb .. text .. rm .. ruby .. rf`, anything else as its bare text.
-- Nested tables in `text` or `ruby` are serialised recursively.
--
-- @param ruby_table the ruby table to serialise
-- @param break_link when truthy, a link wrapping several ruby items is
--                   pushed down so that each item carries its own link
-- @param lb, lm, lf link delimiters: left border, middle, right border
-- @param rb, rm, rf ruby delimiters: left border, middle, right border
-- @return the resulting string
local function table_to_markup(ruby_table, break_link, lb, lm, lf, rb, rm, rf)
    local text = {}
    local v_text, v_ruby, v_linkto
    for _, v in ipairs(ruby_table) do
        v_linkto, v_ruby = v.linkto, v.ruby
        if type(v.text) ~= 'string' then
            if break_link and v_linkto then
                -- Rebuild the children so each one wraps its own copy of the
                -- link, then drop the outer link.
                v_text = {}
                for _, vv in ipairs(v.text) do
                    if vv.text ~= '' or vv.ruby and vv.ruby ~= '' then
                        insert(v_text, {
                            text = {{text = vv.text, linkto = v_linkto}},
                            ruby = vv.ruby,
                        })
                    end
                end
                v_linkto, v_ruby = nil, nil
                v_text = table_to_markup(v_text, break_link, lb, lm, lf, rb, rm, rf)
            else
                v_text = table_to_markup(v.text, break_link, lb, lm, lf, rb, rm, rf)
            end
        else
            v_text = v.text
        end
        if v_linkto then
            if v_linkto ~= '' then
                -- An empty label is replaced by '_' so the link keeps a visible label.
                insert(text, lb .. v_linkto .. lm .. (v_text ~= '' and v_text or '_') .. lf)
            else
                insert(text, v_text)
            end
        elseif v_ruby then
            if type(v_ruby) ~= 'string' then
                v_ruby = table_to_markup(v_ruby, break_link, lb, lm, lf, rb, rm, rf)
            end
            if v_ruby ~= '' then
                insert(text, rb .. v_text .. rm .. v_ruby .. rf)
            else
                insert(text, v_text)
            end
        else
            insert(text, v_text)
        end
    end
    return concat(text)
end

--[==[
Generates {"[]()"} markup from a ruby table: ruby is written as {"[text](ruby)"} and links as
{"[[target|text]]"}.
 * {options.break_link = true}: Push a link that wraps several ruby items down into each item,
   i.e. change {"[[A|[B](C)]]"} to {"[[[A|B]]](C)"}.
 * {options.markup}: Use custom delimiters instead of {"[...](...)"} and {"[[...]]"}. The fields
   read are {link_border_left}, {link_border_middle}, {link_border_right}, {ruby_border_left},
   {ruby_border_middle} and {ruby_border_right}. Custom markups are not recognized by
   {parse_markup} and can not be converted back.]==]
function export.to_markup(ruby_table, options)
    options = options or {}
    local omarkup = options.markup or {}

    -- table_to_markup takes six delimiters: three for links, three for ruby.
    return table_to_markup(
        ruby_table,
        options.break_link,
        omarkup.link_border_left or '[[',
        omarkup.link_border_middle or '|',
        omarkup.link_border_right or ']]',
        omarkup.ruby_border_left or '[',
        omarkup.ruby_border_middle or '](',
        omarkup.ruby_border_right or ')'
    )
end

--[==[
Generates wikitext from a ruby table: links are written as {"[[target|text]]"} and ruby as
HTML ruby markup, with the parentheses wrapped in rp elements as a fallback for renderers
without ruby support.
 * The options are the same as {function export.to_markup}.]==]
function export.to_wiki(ruby_table, options)
    options = options or {}
    local omarkup = options.markup or {}

    -- table_to_markup takes six delimiters: three for links, three for ruby.
    return table_to_markup(
        ruby_table,
        options.break_link,
        omarkup.link_border_left or '[[',
        omarkup.link_border_middle or '|',
        omarkup.link_border_right or ']]',
        omarkup.ruby_border_left or '<ruby>',
        omarkup.ruby_border_middle or '<rp>(</rp><rt>',
        omarkup.ruby_border_right or '</rt><rp>)</rp></ruby>'
    )
end

--[==[
Constructs a ruby table from {"[]()"} markup, i.e. text in which ruby is written as
{"[text](ruby)"} and links as {"[[target|text]]"}.]==]
function export.parse_markup(markup)
    -- Pass 1: strip every "[text](ruby)" down to "text", remembering the ruby
    -- strings in order, and segment what is left by its links.
    local ruby = {}
    local link_table = str_parse_link(markup:gsub('(%b[])(%b())', function(m1, m2)
        insert(ruby, m2:sub(2, -2))
        return m1:sub(2, -2)
    end))

    -- Pass 2: strip the links instead and segment what is left by its ruby.
    -- The ruby strings come from pass 1, so links inside ruby survive.
    local plain_text = export.to_text(str_parse_link(markup))
    local ruby_table = {}
    local p0 = 1
    local ruby_n = 1
    local s_text, s_ruby
    -- p1 / p2 are position captures: start of the match and first index after it.
    for p1, m1, m2, p2 in plain_text:gmatch'()(%b[])(%b())()' do
        if p0 < p1 then
            s_text = plain_text:sub(p0, p1 - 1)
            insert(ruby_table, {text = s_text})
        end
        s_text = m1:sub(2, -2)
        s_ruby = ruby[ruby_n]
        insert(ruby_table, {
            text = s_text,
            ruby = s_ruby ~= '' and s_ruby or nil,
        })
        p0 = p2
        ruby_n = ruby_n + 1
    end
    if p0 <= plain_text:len() then
        s_text = plain_text:sub(p0)
        insert(ruby_table, {text = s_text})
    end

    return table_merge(link_table, ruby_table)
end

-- Character-range data shared by the Japanese modules.
local data_range = mw.loadData('Module:ja/data/range')

-- The strings below are fragments of pattern character classes; they are
-- always used wrapped in "[...]" or "[^...]".

-- Transliteration symbols; a term character in this set is made optional when
-- matching against the kana.
local range_mute = '%^%-%.゠・'
-- Characters matched regardless of hiragana/katakana form.
local range_hirakata = data_range.hiragana .. data_range.katakana
local range_kana = data_range.kana
-- Characters treated as literal in the term: they are matched as themselves
-- rather than being given a reading.
local range_noalias = '<>^%c%p%s%z' .. range_kana
-- A captured span made only of these characters never receives ruby.
local range_noruby = range_noalias .. data_range.kana_graph
-- Characters next to which spaces are removed in the default space mode.
local range_nospace = range_kana
    .. data_range.kanji
    .. data_range.ideograph
    .. data_range.kana_graph
    .. data_range.punctuation

--[==[
Constructs a ruby table from the 2 strings passed to this function. The differences in the 2
strings are converted into ruby, with the corresponding part of {kana} becoming the ruby text
and that of {term} becoming the text under the ruby. Links in {term} will be integrated into
the result. Links in {kana} will be ignored by default. Details about the format of {term} and
{kana} can be found in Template:ja-r/documentation.

To better adapt to Japanese texts, this function assumes that all non-letters (except for 5
symbols "^", ".", "-", "゠", "・" for transliteration reasons) and all kana always represent
themselves. These literal characters when appearing in {term} should also appear unchanged or
as a hira-kata counterpart in {kana}. This behaviour can be changed by using {options.try}, or
by manually isolating a single literal character with "%".
 * {options.try == nil}: Raise an error when the assumption of literal characters fails.
 * {options.try == 'force'}: Discard the assumption of literal characters when it fails, and
   try to find any differences in the strings.
 * {options.try_force_limit}: Limit the time used by {options.try == 'force'}.
 * {options.space == nil}: Remove spaces between kana or kanji but preserve elsewhere.
 * {options.space == 'all'}: Preserve all spaces.
 * {options.space == 'none'}: Remove all spaces.
 * {options.allow_ruby_link == true}: Try to match the links in {kana}.

An error is also raised when {term} and {kana} do not contain the same number of "%"
separators.]==]
function export.parse_text(term, kana, options)
    options = options or {}

    -- _remove_space(ruby_table, ...) post-processes the merged table in place
    -- according to options.space and returns it. It calls itself on `linkto`
    -- and `ruby` values by wrapping them in a one-item table.
    local _remove_space
    if options.space == 'none' then
        _remove_space = function(_r)
            -- Steps through the items of _r, descending one level into items
            -- whose `text` is a table. p2 is the index inside such a nested
            -- table, or nil for a plain item.
            local function _next(p1, p2)
                if p2 and p2 < #_r[p1].text then
                    return p1, p2 + 1
                end
                p1 = p1 + 1
                if p1 > #_r then
                    p2 = nil
                else
                    p2 = type(_r[p1].text) ~= 'string' and 1 or nil
                end
                return p1, p2
            end
            local pos1, pos2 = _next(0, nil)
            while pos1 <= #_r do
                local _t = pos2 and _r[pos1].text[pos2] or _r[pos1]
                _t.text = _t.text:gsub(' ', '')
                if _t.linkto then
                    _t.linkto = _remove_space({{text = _t.linkto}})[1].text
                end
                if _t.ruby then
                    _t.ruby = _remove_space({{text = _t.ruby}})[1].text
                end
                if pos2 then
                    if _r[pos1].linkto then
                        _r[pos1].linkto = _remove_space({{text = _r[pos1].linkto}})[1].text
                    end
                    if _r[pos1].ruby then
                        _r[pos1].ruby = _remove_space({{text = _r[pos1].ruby}})[1].text
                    end
                end
                pos1, pos2 = _next(pos1, pos2)
            end
            return _r
        end
    elseif options.space == 'all' then
        _remove_space = function(_r)
            return _r
        end
    else
        -- context_ak / context_bk: whether the text surrounding _r ends /
        -- starts with a character of range_nospace (used by the recursive calls).
        _remove_space = function(_r, context_ak, context_bk)
            -- Same traversal helper as in the 'none' branch.
            local function _next(p1, p2)
                if p2 and p2 < #_r[p1].text then
                    return p1, p2 + 1
                end
                p1 = p1 + 1
                if p1 > #_r then
                    p2 = nil
                else
                    p2 = type(_r[p1].text) ~= 'string' and 1 or nil
                end
                return p1, p2
            end
            local pos1, pos2 = _next(0, nil)
            local pos3, pos4 = pos1, pos2   -- look-ahead cursor
            local after_k = context_ak      -- preceding text ended with a range_nospace character
            local before_k                  -- following text starts with a range_nospace character
            local _t, char
            while pos1 <= #_r do
                -- Refresh the look-ahead once the main cursor has caught up
                -- with it: find the first character after the current item
                -- that is neither a space nor an apostrophe.
                if pos3 == pos1 and (pos4 == pos2 or pos4 < pos2) or pos3 < pos1 then
                    before_k = context_bk
                    pos3, pos4 = _next(pos1, pos2)
                    while pos3 <= #_r do
                        _t = pos4 and _r[pos3].text[pos4] or _r[pos3]
                        char = ufind(_t.text, '[^ \']')
                        if char then
                            char = usub(_t.text, char, char)
                            before_k = ufind(char, '['..range_nospace..']')
                            break
                        end
                        pos3, pos4 = _next(pos3, pos4)
                    end
                end

                _t = pos2 and _r[pos1].text[pos2] or _r[pos1]
                if _t.linkto then
                    _t.linkto = _remove_space({{text = _t.linkto}}, after_k, before_k)[1].text
                end
                if _t.ruby then
                    _t.ruby = _remove_space({{text = _t.ruby}}, after_k, before_k)[1].text
                end
                if pos2 then
                    if _r[pos1].linkto then
                        _r[pos1].linkto = _remove_space({{text = _r[pos1].linkto}}, after_k, before_k)[1].text
                    end
                    if _r[pos1].ruby then
                        _r[pos1].ruby = _remove_space({{text = _r[pos1].ruby}}, after_k, before_k)[1].text
                    end
                end

                -- Rebuild the text run by run, dropping the spaces of any gap
                -- that consists only of spaces/apostrophes and sits between
                -- two range_nospace characters.
                local seg = {}
                local i0 = 1
                for i1, m1, i2 in ugmatch(_t.text, '()(['..range_nospace..']+)()') do
                    if after_k and not usub(_t.text, i0, i1 - 1):find'[^ \']' then
                        insert(seg, (usub(_t.text, i0, i1 - 1):gsub(' ', '')))
                    else
                        insert(seg, usub(_t.text, i0, i1 - 1))
                    end
                    insert(seg, m1)
                    after_k = true
                    i0 = i2
                end
                after_k = after_k and not usub(_t.text, i0):find'[^ \']'
                if after_k and before_k then
                    insert(seg, (usub(_t.text, i0):gsub(' ', '')))
                else
                    insert(seg, usub(_t.text, i0))
                end
                _t.text = concat(seg)

                pos1, pos2 = _next(pos1, pos2)
            end
            return _r
        end
    end

    -- Create the link table
    -- e.g. "[[エドガー・アラン・ポー|アラン・ポー]]の[[推理 小説]]"
    local link_table = str_parse_link(term:gsub('%%', '')) -- remove '%'
    -- link_table = {
    --     {text = 'アラン・ポー', linkto = 'エドガー・アラン・ポー'},
    --     {text = 'の'},
    --     {text = '推理 小説', linkto = '推理 小説'},
    -- }

    -- Remove romaji markup
    kana = kana:gsub('[%^%-%.]', '') -- remove '^', '-', '.', preserve '%', ' '

    -- Create the ruby table
    -- e.g. 'アラン・ポーの推理 小説', 'あらん ぽー の すいり しょうせつ'
    -- ("ぽお" is not allowed)
    local ruby_table = {}
    local plain_term_raw = export.to_text(str_parse_link(term)) -- Remove links: [[A|B]] -> B, [[C]] -> C
    local plain_kana_raw = options.allow_ruby_link and kana or export.to_text(str_parse_link(kana))
    local plain_term = mw.text.split(plain_term_raw, '%%')
    local plain_kana = mw.text.split(plain_kana_raw, '%%')
    if #plain_term ~= #plain_kana then
        mw.logObject(plain_term)
        mw.logObject(plain_kana)
        error('Separator "%" in the kanji and kana strings do not match.')
    end
    for i, plain_term_i in ipairs(plain_term) do
        if plain_term_i ~= '' or plain_kana[i] ~= '' then
            -- pattern_ruby: capture groups which, concatenated, form one
            -- pattern for the whole segment. pattern_ruby_is_ruby[n] is true
            -- for the groups that stand for a span needing a reading.
            local pattern_ruby, pattern_ruby_is_ruby = {}, {}

            -- Appends a capture group matching the literal stretch s_sub.
            local function _func_pat(s_sub)
                local in_xml_tag = false
                insert(pattern_ruby, '(' .. ugsub(s_sub, '.', function(m0)
                    if in_xml_tag then
                        if m0 == '>' then in_xml_tag = false end
                        return ''
                    else
                        if m0 == '<' then
                            -- A whole tag collapses into one lazy "<...>" match.
                            in_xml_tag = true
                            return ' ?<.->'
                        else
                            local m0_m = m0
                            -- Escape pattern magic characters.
                            if m0:find'^[%(%)%.%%%+%-%*%?%[%]%^%$]$' then m0_m = '%' .. m0_m end
                            if ufind(m0, '^['..range_mute..']$') then
                                m0_m = '[' .. m0_m .. ' -]?'
                            elseif ufind(m0, '^[ヶゖケ]$') then
                                m0_m = "[" .. str_kata_to_hira(m0_m) .. str_hira_to_kata(m0_m) .. "かがこカガコ]"
                            elseif ufind(m0, '^['..range_hirakata..']$') then
                                m0_m = "[" .. str_kata_to_hira(m0_m) .. str_hira_to_kata(m0_m) .. "]"
                            end
                            return ' ?' .. m0_m
                        end
                    end
                end) .. ' ?)')
            end

            local plain_term_noxml = plain_term_i:gsub('%b<>', '<>')
            local pos0 = 1
            -- Use a custom iterator so that we can exclude "&" and "@" from range_noalias, as they're part of %p.
            -- It yields (start, span, index after the span) for every span
            -- that needs a reading.
            for pos1, s, pos2 in (function()
                local pos2, pos1, c = 1
                local len = ulen(plain_term_noxml)
                return function()
                    if pos2 > len then
                        return nil
                    end
                    pos1 = math.min(
                        ufind(plain_term_noxml, "[^"..range_noalias.."]", pos2) or math.huge,
                        ufind(plain_term_noxml, "[&@]", pos2) or math.huge
                    )
                    if pos1 == math.huge then
                        return nil
                    end
                    pos2 = pos1
                    repeat
                        pos2, c = select(2, ufind(plain_term_noxml, "(["..range_noalias.."])", pos2 + 1))
                    until (not pos2) or (pos2 and not c:find("[&@]"))
                    pos2 = pos2 or len + 1
                    return pos1, usub(plain_term_noxml, pos1, pos2 - 1), pos2
                end
            end)() do
                if pos0 < pos1 then
                    local s_sub = usub(plain_term_noxml, pos0, pos1 - 1)
                    if not pattern_ruby_is_ruby[#pattern_ruby] or umatch(s_sub, '[^' .. range_mute .. ']') then
                        _func_pat(s_sub)
                    end
                end
                if not pattern_ruby_is_ruby[#pattern_ruby] then
                    insert(pattern_ruby, '(..-)')
                    pattern_ruby_is_ruby[#pattern_ruby] = true
                end
                pos0 = pos2
            end
            if #pattern_ruby == 0 then
                -- isolated symbol matches anything.
                insert(ruby_table, {
                    text = plain_term_i,
                    ruby = plain_kana[i] ~= plain_term_i and ulen(plain_term_i) == 1 and plain_kana[i] or nil,
                })
            else
                if pos0 <= ulen(plain_term_noxml) then
                    _func_pat(usub(plain_term_noxml, pos0))
                end
                local pat_ruby_s = concat(pattern_ruby)
                -- e.g. (illustrative) 'アラン・ポーの推理 小説' gives a pattern starting with
                -- '( ?[あア] ?[らラ] ?[んン] ?[・ -]? ?[ぽポ] ?ー ?[のノ] ?)(..-)'
                -- Execute matching
                local ruby_table_i_ruby = {umatch(plain_kana[i], '^'..pat_ruby_s..'$')}
                if #ruby_table_i_ruby > 0 then
                    local ruby_table_i_text = {umatch(plain_term_i, '^'..pat_ruby_s..'$')}
                    for n_match = 1, #pattern_ruby do
                        -- Exclude "&" and "@" from range_noruby, as they're part of %p.
                        if (
                            pattern_ruby_is_ruby[n_match]
                            and ruby_table_i_text[n_match] ~= ruby_table_i_ruby[n_match]
                            and (
                                ufind(ruby_table_i_text[n_match], '[^' .. range_noruby .. ']') or
                                ruby_table_i_text[n_match]:find("[&@]")
                            )
                        ) then
                            insert(ruby_table, {
                                text = ruby_table_i_text[n_match],
                                ruby = ruby_table_i_ruby[n_match],
                            })
                        else
                            -- No ruby needed: glue onto a preceding plain item if there is one.
                            if #ruby_table > 0 and ruby_table[#ruby_table].ruby == nil then
                                ruby_table[#ruby_table].text = ruby_table[#ruby_table].text .. ruby_table_i_text[n_match]
                            else
                                insert(ruby_table, {text = ruby_table_i_text[n_match]})
                            end
                        end
                    end
                elseif options.try == 'force' then
                    require('Module:debug').track('ja-ruby/forced match')
                    local forced_result = str_ucompare(plain_term_i, plain_kana[i], options.try_force_limit)
                    for ii, vv in ipairs(forced_result[1]) do
                        insert(ruby_table, {
                            text = vv,
                            ruby = forced_result[2][ii] ~= vv and forced_result[2][ii] or nil,
                        })
                    end
                else
                    mw.log(pat_ruby_s)
                    error('Can not match "' .. plain_term_i .. '" and "' .. plain_kana[i] .. '"')
                end
            end
        end
    end
    -- ruby_table = {
    --     {text = 'アラン・ポーの'},
    --     {text = '推理', ruby = 'すいり'},
    --     {text = ' '},
    --     {text = '小説', ruby = 'しょうせつ'},
    -- }

    -- Merge the ruby and link table
    -- result = {
    --     {text = 'アラン・ポー', linkto = 'エドガー・アラン・ポー'},
    --     {text = 'の'},
    --     {text = {
    --         {text = '推理', ruby = 'すいり'},
    --         {text = ''},
    --         {text = '小説', ruby = 'しょうせつ'},
    --     }, linkto = '推理小説'},
    -- }
    return _remove_space(table_merge(link_table, ruby_table))
end

--[==[
A shortcut for combinations like {to_wiki(parse_text(...))}. It accepts a table containing
named arguments instead of positional ones.
 * {term}, {kana}: Arguments for {parse_text}. Used when both are present.
 * {markup}: Argument for {parse_markup}. Used when {term} and {kana} are not both present.
 * {target}: Output function. Defaults to {to_wiki}. {'text'} for {to_text}; {'ruby'} for
   {to_ruby}; {'markup'} for {to_markup}.
 * {options}: A shared option table passed to all functions involved.

Raises an error when neither {term} and {kana} nor {markup} is given.]==]
function export.ruby_auto(args)
    local to_target
    if args.target == 'text' then
        to_target = export.to_text
    elseif args.target == 'ruby' then
        to_target = export.to_ruby
    elseif args.target == 'markup' then
        to_target = export.to_markup
    else
        to_target = export.to_wiki
    end

    if args.term and args.kana then
        return to_target(export.parse_text(args.term, args.kana, args.options), args.options)
    elseif args.markup then
        return to_target(export.parse_markup(args.markup, args.options), args.options)
    else
        error('Cannot find "term" and "kana" or "markup"')
    end
end

return export