-- Module:User:Gyfo/parse utilities

local export = {}

local rfind = mw.ustring.find local rsplit = mw.text.split local u = mw.ustring.char local rsubn = mw.ustring.gsub

-- Wrapper around rsubn that hands back only the substituted string; the
-- enclosing parentheses drop the replacement count that rsubn also returns.
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end

--[==[
Reimplementation of mw.ustring.split() that includes any capturing groups in the splitting pattern. This works like
Python's re.split() function, except that it has Lua's behavior when the split pattern is empty (i.e. advancing by
one character at a time; Python returns the whole remainder of the string).

`str` is the string to split and `pattern` is a Lua (ustring) pattern describing the separator. The return value is
a list consisting of the text between separators, interleaved with the values of any capturing groups in `pattern`
for each separator that was matched.
]==]
function export.capturing_split(str, pattern)
	-- This module defines no `export.match` or `export.sub`, so use the Scribunto ustring functions directly.
	local umatch, usub = mw.ustring.match, mw.ustring.sub
	-- Positions produced by the ustring functions count codepoints, so the end-of-string checks below must compare
	-- against the codepoint length rather than the byte length. mw.ustring.len() returns nil for invalid UTF-8, in
	-- which case we fall back to the byte length.
	local len = mw.ustring.len(str) or #str
	local ret = {}
	-- (.-) corresponds to (.*?) in Python or Perl; the trailing () captures the current position after matching.
	pattern = "(.-)" .. pattern .. "()"
	local start = 1
	while true do
		-- Did we reach the end of the string?
		if start > len then
			table.insert(ret, "")
			return ret
		end
		-- match() returns all captures as multiple return values; we need to insert into a table to get them all.
		local captures = {umatch(str, pattern, start)}
		-- If no match, add the remainder of the string.
		if #captures == 0 then
			table.insert(ret, usub(str, start))
			return ret
		end
		-- The last capture is the position capture appended above.
		local newstart = table.remove(captures)
		-- Special case: If we don't advance by any characters, then advance by one character; this avoids an
		-- infinite loop, and makes splitting by an empty string work the way mw.ustring.split() does. If we reach
		-- the end of the string this way, return immediately, so we don't get a final empty string.
		if newstart == start then
			table.insert(ret, usub(str, start, start))
			table.remove(captures, 1)
			start = start + 1
			if start > len then
				return ret
			end
		else
			table.insert(ret, table.remove(captures, 1))
			start = newstart
		end
		-- Insert any captures from the splitting pattern.
		for _, x in ipairs(captures) do
			table.insert(ret, x)
		end
	end
end

--[=[ In order to understand the following parsing code, you need to understand how inflected text specs work. They are intended to work with inflected text where individual words to be inflected may be followed by inflection specs in angle brackets. The format of the text inside of the angle brackets is up to the individual language and part-of-speech specific implementation. A real-world example is as follows: "меди́чна<+> сестра́<*,*#.pr>". This is the inflection of a multiword expression "меди́чна сестра́", which means "nurse" in Ukrainian (literally "medical sister"), consisting of two words: the adjective меди́чна ("medical" in the feminine singular) and the noun сестра́ ("sister"). The specs in angle brackets follow each word to be inflected; for example, <+> means that the preceding word should be declined as an adjective.

The code below works in terms of balanced expressions, which are bounded by delimiters such as < > or [ ]. The intention is to allow separators such as spaces to be embedded inside of delimiters; such embedded separators will not be parsed as separators. For example, Ukrainian noun specs allow footnotes in brackets to be inserted inside of angle brackets; something like "меди́чна<+> сестра́<pr.[this is a footnote]>" is legal, as is "меди́чна<+> сестра́<pr.[another footnote <with an inserted bracketed spec>]>", and the parsing code should not be confused by the embedded brackets, spaces or angle brackets.

The parsing is done by two functions, which work in close concert: parse_balanced_segment_run and split_alternating_runs. To illustrate, consider the following:

parse_balanced_segment_run("foo<M.proper noun> bar<F>", "<", ">") = {"foo", "<M.proper noun>", " bar", "<F>", ""}

then

split_alternating_runs({"foo", "<M.proper noun>", " bar", "<F>", ""}, " ") = {{"foo", "<M.proper noun>", ""}, {"bar", "<F>", ""}}

Here, we start out with a typical inflected text spec "foo<M.proper noun> bar<F>", call parse_balanced_segment_run on it, and call split_alternating_runs on the result. The output of parse_balanced_segment_run is a list where even-numbered segments are bounded by the bracket-like characters passed into the function, and odd-numbered segments consist of the surrounding text. split_alternating_runs is called on this, and splits *only* the odd-numbered segments, grouping all segments between the specified character. Note that the inner lists output by split_alternating_runs are themselves in the same format as the output of parse_balanced_segment_run, with bracket-bounded text in the even-numbered segments. Hence, such lists can be passed again to split_alternating_runs. ]=]

--[==[
Parse a string containing matched instances of parens, brackets or the like. Return a list of strings, alternating
between textual runs not containing the open/close characters and runs beginning and ending with the open/close
characters. For example,

{parse_balanced_segment_run("foo(x(1)), bar(2)", "(", ")") = {"foo", "(x(1))", ", bar", "(2)", ""}}

`segment_run` is the string to parse; `open` and `close` are the opening and closing delimiter characters. They are
placed directly after `%b` in the splitting pattern.
]==]
function export.parse_balanced_segment_run(segment_run, open, close)
	-- capturing_split() is a field of `export`, not a local or global, so it must be called through `export`.
	-- The balanced run is wrapped in a capturing group so that it is included in the output.
	return export.capturing_split(segment_run, "(%b" .. open .. close .. ")")
end

--[==[
Split a list of alternating textual runs of the format returned by `parse_balanced_segment_run` on `splitchar`. This
only splits the odd-numbered textual runs (the portions between the balanced open/close characters). The return value
is a list of lists, where each list contains an odd number of elements, where the even-numbered elements of the
sublists are the original balanced textual run portions. For example, if we do

{parse_balanced_segment_run("foo<M.proper noun> bar<F>", "<", ">") = {"foo", "<M.proper noun>", " bar", "<F>", ""}}

then

{split_alternating_runs({"foo", "<M.proper noun>", " bar", "<F>", ""}, " ") = {{"foo", "<M.proper noun>", ""}, {"bar", "<F>", ""}}}

Note that we did not touch the text "<M.proper noun>" even though it contains a space in it, because it is an
even-numbered element of the input list. This is intentional and allows for embedded separators inside of
brackets/parens/etc. Note also that the inner lists in the return value are of the same form as the input list (i.e.
they consist of alternating textual runs where the even-numbered segments are balanced runs), and can in turn be
passed to split_alternating_runs.

If `preserve_splitchar` is passed in, the split character is included in the output, as follows:

{split_alternating_runs({"foo", "<M.proper noun>", " bar", "<F>", ""}, " ", true) = {{"foo", "<M.proper noun>", ""}, {" "}, {"bar", "<F>", ""}}}

Consider what happens if the original string has multiple spaces between brackets, and multiple sets of brackets
without spaces between them.

{parse_balanced_segment_run("foo[dated][low colloquial] baz-bat quux xyzzy[archaic]", "[", "]") = {"foo", "[dated]", "", "[low colloquial]", " baz-bat quux xyzzy", "[archaic]", ""}}

then

{split_alternating_runs({"foo", "[dated]", "", "[low colloquial]", " baz-bat quux xyzzy", "[archaic]", ""}, "[ %-]") = {{"foo", "[dated]", "", "[low colloquial]", ""}, {"baz"}, {"bat"}, {"quux"}, {"xyzzy", "[archaic]", ""}}}

If `preserve_splitchar` is passed in, the split character is included in the output, as follows:

{split_alternating_runs({"foo", "[dated]", "", "[low colloquial]", " baz-bat quux xyzzy", "[archaic]", ""}, "[ %-]", true) = {{"foo", "[dated]", "", "[low colloquial]", ""}, {" "}, {"baz"}, {"-"}, {"bat"}, {" "}, {"quux"}, {" "}, {"xyzzy", "[archaic]", ""}}}

As can be seen, the even-numbered elements in the outer list are one-element lists consisting of the separator text.
]==]
function export.split_alternating_runs(segment_runs, splitchar, preserve_splitchar)
	local grouped_runs = {}
	local run = {}
	for i, seg in ipairs(segment_runs) do
		if i % 2 == 0 then
			-- Balanced (delimited) run: never split, just attach to the group being built.
			table.insert(run, seg)
		else
			local parts
			if preserve_splitchar then
				-- capturing_split() is a field of `export`; wrapping `splitchar` in a capturing group makes the
				-- separators show up in `parts`, where they become one-element groups of their own below.
				parts = export.capturing_split(seg, "(" .. splitchar .. ")")
			else
				parts = rsplit(seg, splitchar)
			end
			-- The text before the first separator continues the current group.
			table.insert(run, parts[1])
			-- Every further part closes the current group and starts a new one.
			for j = 2, #parts do
				table.insert(grouped_runs, run)
				run = {parts[j]}
			end
		end
	end
	if #run > 0 then
		table.insert(grouped_runs, run)
	end
	return grouped_runs
end

--[==[
Like split_alternating_runs() but additionally passes every "raw-text" segment of the result through `frob`. Raw text
means the odd-numbered segments of a run, i.e. not the stuff within balanced delimiters (footnotes, inflection specs
and the like); when `preserve_splitchar` is given, the even-numbered runs (which hold the splitchars themselves) are
also left alone. `frob` is a function of one argument (the string to frob) and should return one value (the frobbed
string). The split run set is modified in place and returned.
]==]
function export.split_alternating_runs_and_frob_raw_text(run, splitchar, frob, preserve_splitchar)
	local split_runs = export.split_alternating_runs(run, splitchar, preserve_splitchar)
	-- With preserved splitchars only every second run holds real text, so step over the separator runs.
	local run_step = preserve_splitchar and 2 or 1
	for run_index = 1, #split_runs, run_step do
		local segments = split_runs[run_index]
		-- Odd-numbered segments are raw text; even-numbered ones are balanced runs and stay untouched.
		for seg_index = 1, #segments, 2 do
			segments[seg_index] = frob(segments[seg_index])
		end
	end
	return split_runs
end

--[==[
Split the non-modifier parts of an alternating run (the output of parse_balanced_segment_run()) on a Lua pattern
while protecting certain sequences that involve characters of that pattern (e.g. comma+whitespace). `splitchar` is
the pattern to split on and `preserve_splitchar` says whether to keep the delimiters; both mean the same as in
split_alternating_runs(). `escape_fun` is applied to each raw-text segment before splitting and should return two
values: the escaped segment and whether unescaping is needed. If any call reports that unescaping is needed,
`unescape_fun` is applied to each raw-text segment after splitting. Note that the raw-text segments of `run` are
overwritten in place with their escaped form. The return value is as in split_alternating_runs().
]==]
function export.split_alternating_runs_escaping(run, splitchar, preserve_splitchar, escape_fun, unescape_fun)
	-- Escape the raw-text (odd-numbered) segments, remembering whether any of them actually changed.
	local any_escaped = false
	for idx in ipairs(run) do
		if idx % 2 ~= 0 then
			local escaped_text, was_escaped = escape_fun(run[idx])
			run[idx] = escaped_text
			any_escaped = any_escaped or was_escaped
		end
	end

	-- Nothing was escaped, so there is nothing to undo after splitting.
	if not any_escaped then
		return export.split_alternating_runs(run, splitchar, preserve_splitchar)
	end
	return export.split_alternating_runs_and_frob_raw_text(run, splitchar, unescape_fun, preserve_splitchar)
end

--[==[
Protect commas that should not act as separators by swapping them for a placeholder character. Two cases are
handled: a comma preceded by a backslash (the backslash is kept) and a comma followed by whitespace (the whitespace
is kept). `tempcomma` is the placeholder and defaults to U+FFF0. Returns two values: the possibly modified string,
and a boolean saying whether any replacement was attempted.
]==]
function export.escape_comma_whitespace(run, tempcomma)
	local placeholder = tempcomma or u(0xFFF0)
	local did_escape = false

	-- Backslash-escaped commas; parentheses keep only gsub's first return value.
	if run:find("\\,") then
		did_escape = true
		run = (run:gsub("\\,", "\\" .. placeholder))
	end
	-- Commas directly followed by whitespace; the whitespace character is re-inserted via %1.
	if run:find(",%s") then
		did_escape = true
		run = (run:gsub(",(%s)", placeholder .. "%1"))
	end
	return run, did_escape
end

--[==[
Reverse the placeholder substitution: every occurrence of `tempcomma` (default U+FFF0) in `run` becomes a comma
again. Returns the resulting string only.
]==]
function export.unescape_comma_whitespace(run, tempcomma)
	local placeholder = tempcomma or u(0xFFF0)
	-- Parentheses truncate gsub's results to the string, dropping the replacement count.
	return (run:gsub(placeholder, ","))
end

--[==[
Split the non-modifier parts of an alternating run (the output of parse_balanced_segment_run()) on comma, but not on
comma+whitespace. `tempcomma` is the placeholder character used while splitting (default U+FFF0); see
`split_on_comma` for more information. The return value is as in split_alternating_runs().
]==]
function export.split_alternating_runs_on_comma(run, tempcomma)
	local placeholder = tempcomma or u(0xFFF0)

	return export.split_alternating_runs_escaping(
		run, ",", false,
		-- Replace comma with the placeholder in comma + whitespace.
		function(seg)
			return export.escape_comma_whitespace(seg, placeholder)
		end,
		-- Undo replacement of comma with the placeholder.
		function(seg)
			return export.unescape_comma_whitespace(seg, placeholder)
		end
	)
end

--[==[
Like parse_balanced_segment_run() but accepts multiple sets of delimiters. For example,

{parse_multi_delimiter_balanced_segment_run("foo[bar(baz[bat])], quux<glorp>", {{"[", "]"}, {"(", ")"}, {"<", ">"}}) = {"foo", "[bar(baz[bat])]", ", quux", "<glorp>", ""}}.

Each element in the list of delimiter pairs is a string specifying an equivalence class of possible delimiter
characters. You can use this, for example, to allow either "[" or "&#91;" to be treated equivalently, with either one
closed by either "]" or "&#93;". To do this, first replace "&#91;" and "&#93;" with single Unicode characters such as
U+FFF0 and U+FFF1, and then specify a two-character string containing "[" and U+FFF0 as the opening delimiter, and a
two-character string containing "]" and U+FFF1 as the corresponding closing delimiter.

If `no_error_on_unmatched` is given and an unmatched delimiter is found during parsing, a string is returned
containing the error message instead of throwing an error.
]==]
local function parse_multi_delimiter_balanced_segment_run(segment_run, delimiter_pairs, no_error_on_unmatched)
	local escaped_delimiter_pairs = {}
	local open_to_close_map = {}
	local open_close_items = {}
	local open_items = {}
	for _, open_close in ipairs(delimiter_pairs) do
		-- Escape the characters that are special inside of a Lua character class.
		local open = rsub(open_close[1], "([%[%]%%%%-])", "%%%1")
		local close = rsub(open_close[2], "([%[%]%%%%-])", "%%%1")
		table.insert(open_close_items, open)
		table.insert(open_close_items, close)
		table.insert(open_items, open)
		open = "[" .. open .. "]"
		close = "[" .. close .. "]"
		open_to_close_map[open] = close
		table.insert(escaped_delimiter_pairs, {open, close})
	end
	local open_close_pattern = "([" .. table.concat(open_close_items) .. "])"
	local open_pattern = "([" .. table.concat(open_items) .. "])"
	-- capturing_split() is a field of `export`, not a local or global, so it must be called through `export`.
	-- The result alternates between plain text (odd indices) and single delimiter characters (even indices).
	local break_on_open_close = export.capturing_split(segment_run, open_close_pattern)
	local text_and_specs = {}
	local level = 0
	local seg_group = {}
	-- Character class of the opening delimiter that started the current top-level balanced run, or nil if we are
	-- currently outside of any balanced run.
	local open_at_level_zero

	for i, seg in ipairs(break_on_open_close) do
		if i % 2 == 0 then
			-- `seg` is a delimiter character.
			table.insert(seg_group, seg)
			if level == 0 then
				if not rfind(seg, open_pattern) then
					local errmsg = "Unmatched close sign " .. seg .. ": '" .. segment_run .. "'"
					if no_error_on_unmatched then
						return errmsg
					else
						error(errmsg)
					end
				end
				assert(open_at_level_zero == nil)
				for _, open_close in ipairs(escaped_delimiter_pairs) do
					local open = open_close[1]
					if rfind(seg, open) then
						open_at_level_zero = open
						break
					end
				end
				if open_at_level_zero == nil then
					error(("Internal error: Segment %s didn't match any open regex"):format(seg))
				end
				level = level + 1
			elseif rfind(seg, open_at_level_zero) then
				level = level + 1
			elseif rfind(seg, open_to_close_map[open_at_level_zero]) then
				level = level - 1
				assert(level >= 0)
				if level == 0 then
					-- The top-level balanced run is complete; emit it as a single string.
					table.insert(text_and_specs, table.concat(seg_group))
					seg_group = {}
					open_at_level_zero = nil
				end
			end
			-- Delimiters of a different kind nested inside the current run are only accumulated, not counted.
		elseif level > 0 then
			table.insert(seg_group, seg)
		else
			table.insert(text_and_specs, seg)
		end
	end
	if level > 0 then
		local errmsg = "Unmatched open sign " .. open_at_level_zero .. ": '" .. segment_run .. "'"
		if no_error_on_unmatched then
			return errmsg
		else
			error(errmsg)
		end
	end
	return text_and_specs
end

--[==[
Split text on a Lua pattern, but not on certain sequences involving characters in that pattern (e.g.
comma+whitespace). `splitchar` is the pattern to split on; `preserve_splitchar` indicates whether to preserve the
delimiter between split segments. `escape_fun` is called beforehand on the text and should return two values: the
escaped run and whether unescaping is needed. If the call to `escape_fun` indicates that unescaping is needed,
`unescape_fun` will be called on each run of text after splitting on `splitchar`. The return value is a list of runs,
interspersed with delimiters if `preserve_splitchar` is specified.
]==]
local function split_escaping(text, splitchar, preserve_splitchar, escape_fun, unescape_fun)
	if not rfind(text, splitchar) then
		return {text}
	end

	-- If there are square or angle brackets, we don't want to split on delimiters inside of them. To effect this, we
	-- use parse_multi_delimiter_balanced_segment_run() to parse balanced brackets, then do delimiter splitting on the
	-- non-bracketed portions of text using split_alternating_runs_escaping(), and concatenate back to a list of
	-- strings. When calling parse_multi_delimiter_balanced_segment_run(), we make sure not to throw an error on
	-- unbalanced brackets; in that case, we fall through to the code below that handles the case without brackets.
	if text:find("[%[<]") then
		local runs = parse_multi_delimiter_balanced_segment_run(text, {{"[", "]"}, {"<", ">"}},
			"no error on unmatched")
		if type(runs) ~= "string" then
			local split_runs = export.split_alternating_runs_escaping(runs, splitchar, preserve_splitchar, escape_fun,
				unescape_fun)
			for i = 1, #split_runs, (preserve_splitchar and 2 or 1) do
				split_runs[i] = table.concat(split_runs[i])
			end
			return split_runs
		end
	end

	-- First escape sequences we don't want to count for splitting.
	local need_unescape
	text, need_unescape = escape_fun(text)

	local parts
	if preserve_splitchar then
		-- capturing_split() is a field of `export`, not a local or global, so it must be called through `export`.
		parts = export.capturing_split(text, "(" .. splitchar .. ")")
	else
		parts = rsplit(text, splitchar)
	end
	if need_unescape then
		for i = 1, #parts, (preserve_splitchar and 2 or 1) do
			parts[i] = unescape_fun(parts[i])
		end
	end
	return parts
end

--[==[
Split text on comma, but not on comma+whitespace. This is similar to `mw.text.split(text, ",")` but will not split on
commas directly followed by whitespace, to handle embedded commas in terms (which are almost always followed by a
space). Commas inside of balanced square or angle brackets are not split on either. `tempcomma` is the Unicode
character to temporarily use when doing the splitting; normally U+FFF0, but you can specify a different character if
you use U+FFF0 for some internal purpose. Returns a list of strings.
]==]
function export.split_on_comma(text, tempcomma)
	-- Don't do anything if no comma. Note that split_escaping() has a similar check at the beginning, so if there's
	-- a comma we effectively do this check twice, but this is worth it to optimize for the common no-comma case.
	if not text:find(",") then
		return {text}
	end

	tempcomma = tempcomma or u(0xFFF0)

	-- Replace comma with a temporary char in comma + whitespace.
	local function escape_comma_whitespace(run)
		return export.escape_comma_whitespace(run, tempcomma)
	end

	-- Undo replacement of comma with a temporary char in comma + whitespace.
	local function unescape_comma_whitespace(run)
		return export.unescape_comma_whitespace(run, tempcomma)
	end

	return split_escaping(text, ",", false, escape_comma_whitespace, unescape_comma_whitespace)
end

return export