-- Module:labels/data/lang/zh/functions

-- Module table returned at the end of this file. `postprocess_handlers` is the list to which the handler
-- functions defined in this file are appended.
local export = {
	postprocess_handlers = {},
}

-- Name of the main labels module; handlers load it on demand with require().
local labels_module = "Module:labels"

-- Remove duplicated labels like 'Taiwanese' in 'Taiwanese Hokkien|and|Taiwanese Hakka'. Also remove duplicated labels
-- in things like
-- * 'Quanzhou|_|Hokkien' (which canonicalizes to 'Quanzhou Hokkien|_|Hokkien');
-- * 'Xiamen|and|Quanzhou|_|Hokkien' (which canonicalizes to 'Xiamen Hokkien|and|Quanzhou Hokkien|_|Hokkien');
-- * 'Xiamen|and|Anxi|_|Hokkien' (which canonicalizes to 'Xiamen Hokkien|and|Anxi|_|Hokkien');
-- * 'Xiamen|Zhangzhou|and|Quanzhou|_|Hokkien' (which canonicalizes to
--   'Xiamen Hokkien|Zhangzhou Hokkien|and|Quanzhou Hokkien|_|Hokkien');
-- * 'Xiamen|Zhangzhou|and|Anxi|_|Hokkien' (which canonicalizes to
--   'Xiamen Hokkien|Zhangzhou Hokkien|and|Anxi|_|Hokkien').
-- We do two passes. The first pass fixes cases like 'Quanzhou Hokkien|_|Hokkien', irrespective of whether there's an
-- "and" present. The second pass looks for a stretch of labels where (a) all of the labels have the same prefix or
-- suffix, and (b) in between the labels is at least one occurrence of "and" (which can also start out as "&" but is
-- canonicalized to "and"); but (c) we count two labels separated by "_" (which is canonicalized to a blank label) as a
-- single label.
--
-- The handler receives `data`, reads the list `data.labels` (each element is a table with a string `label` field)
-- and rewrites the `label` fields in place. It returns nothing.
table.insert(export.postprocess_handlers,
	function(data)
		local labels = data.labels
		if #labels == 1 then
			-- A single label cannot duplicate anything.
			return
		end
		local m_labels = require(labels_module)

		-- First, split the labels into `link` and `display` component parts (done only once). Note that these
		-- snapshots are not refreshed when a label is rewritten below; all matching is done against the original
		-- components.
		local split_labels = {}
		for i, label in ipairs(labels) do
			local link, display = m_labels.split_display_form(label.label)
			split_labels[i] = {link = link, display = display}
		end

		-- Then compute "label starts" (indices of label sets to consider when looking for runs with the same prefix or
		-- suffix), where a label start is either a single label or a set of two labels separated by an underscore,
		-- and where we take occurrences of "and" into consideration. Each entry records:
		-- * `start`: index into `labels` of the first label of the set;
		-- * `followed_by_and`: whether an "and" label directly follows the set;
		-- * `after_underscore`: index of the label following a blank ("_") label, or nil if there is none.
		local label_starts = {}
		local i = 1
		while i <= #labels do
			local start = i
			local followed_by_and = false
			local after_underscore
			-- Each element of `labels` is a table, so the comparisons must be made against its `label` field;
			-- comparing the table itself to a string is always false and would make the first branch unreachable.
			if i <= #labels - 4 and labels[i + 1].label == "" and labels[i + 2].label == "and"
				and labels[i + 3].label == "" then
				-- 'Foo|_|and|_|Bar'; redundant underscores
				followed_by_and = true
				i = i + 3
			elseif i <= #labels - 2 and labels[i + 1].label == "and" then
				followed_by_and = true
				i = i + 1
			elseif i <= #labels - 2 and labels[i + 1].label == "" then
				after_underscore = i + 2
				i = i + 1
			end
			table.insert(label_starts, {
				start = start,
				followed_by_and = followed_by_and,
				after_underscore = after_underscore
			})
			i = i + 1
		end

		-- Now the main loop.

		-- Each spec is {"affix", `at_beginning`}, or {{"affix", "affix"}, `at_beginning`} where "affix" is a prefix or
		-- suffix to remove and `at_beginning` indicates whether "affix" is a prefix or suffix. If more than one affix
		-- is listed, any affix counts, e.g. 'Taiwan Mandarin|and|Taiwanese Hokkien'.
		for _, affix_spec in ipairs {
			{{"Taiwanese", "Taiwan"}, true},
			{"Chinese"},
			{"Gan"},
			{"Hakka"},
			{"Hokkien"},
			{"Mandarin"},
			-- Min needs to go before Southern Min, Eastern Min, etc. because the later check for e.g. Eastern Min
			-- will overwrite the value set by Min if both match. With Min later, we'll end up with e.g.
			-- "Fuqing Eastern Eastern Min".
			{"Min"},
			{"Southern Min"},
			{"Eastern Min"},
			{"Northern Min"},
			{"Central Min"},
			{"Wu"},
			{"Xiang"}
		} do
			local affixes, at_beginning = unpack(affix_spec)
			if type(affixes) == "string" then
				affixes = {affixes}
			end

			-- Does `item` match against the prefix or suffix when both prefix/suffix and something else are
			-- present? If so, return the something else, which is what we need to set the label to if we remove
			-- the prefix/suffix. Otherwise return false.
			local function matches_affix_with_space(item)
				for _, affix in ipairs(affixes) do
					-- The affixes listed above contain only letters and spaces, so they can be embedded in a
					-- Lua pattern without escaping.
					local space_regex = at_beginning and "^" .. affix .. " (.+)$" or "^(.+) " .. affix .. "$"
					local rest = item:match(space_regex)
					if rest then
						return rest
					end
				end
				return false
			end

			-- Does `item` match against the prefix or suffix exactly? If so, return an empty string, which is what
			-- we need to set the label to if we remove the prefix/suffix. Otherwise return false. (An empty string
			-- is truthy in Lua, so callers can still test the result with `if`.)
			local function matches_affix_exactly(item)
				for _, affix in ipairs(affixes) do
					if item == affix then
						return ""
					end
				end
				return false
			end

			-- Does the link or display at `label_index` match with `match_function`? If so, return a three-element
			-- list of `label_index`, `component` (either "link" or "display") and the return value of the matching
			-- function. Otherwise return nil. The display is checked first.
			local function check_match(label_index, match_function)
				local link, display = split_labels[label_index].link, split_labels[label_index].display
				local rest = display and match_function(display)
				if rest then
					return {label_index, "display", rest}
				else
					-- NOTE(review): the link is always tested with `matches_affix_with_space`, even when
					-- `match_function` is `matches_affix_exactly`. This looks unintentional but is preserved
					-- as-is; confirm before changing.
					rest = link and matches_affix_with_space(link)
					if rest then
						return {label_index, "link", rest}
					end
				end
				return nil
			end

			-- Given {`label_index`, `component`, `value`}, set the link or display component (depending on `component`)
			-- of the label at `label_index` to `value`. An empty `value` blanks the whole label.
			local function set_component_value(to_erase)
				local label_index, component, value = unpack(to_erase)
				if value == "" then
					labels[label_index].label = ""
				else
					local link, display = split_labels[label_index].link, split_labels[label_index].display
					if component == "display" then
						display = value
					else
						link = value
					end
					labels[label_index].label = m_labels.combine_display_form_parts(link, display)
				end
			end

			-- First pass: Look for two labels separated by an underscore, with the suffix occurring on both parts.
			-- (This shouldn't happen with prefixes.)
			if not at_beginning then
				for _, label_start in ipairs(label_starts) do
					local to_erase = check_match(label_start.start, matches_affix_with_space)
					if to_erase and label_start.after_underscore and
						check_match(label_start.after_underscore, matches_affix_exactly) then
						set_component_value(to_erase)
					end
				end
			end

			-- Second pass.

			-- Check whether a prefix or suffix matches the given label start index (index of a label set in the
			-- `label_starts` list; see above). If it matches, return value is {`index`, `component`, `value`}, i.e.
			-- the label index to change, the component ("link" or "display") to change and the value to set the
			-- component to. Otherwise, return nil.
			local function affix_matches(label_start_index)
				local label_start = label_starts[label_start_index]
				-- If we're dealing with a suffix, there are two cases: (1) 'Quanzhou Hokkien';
				-- (2) 'Quanzhou|_|Hokkien'. If we're dealing with a prefix, there are similarly (1) 'Taiwanese Hakka';
				-- (2) 'Taiwanese|_|Hakka'. In addition, we have to check both the link and the display.
				local to_erase = check_match(label_start.start, matches_affix_with_space)
				if to_erase then
					return to_erase
				end
				local after_underscore = label_start.after_underscore
				if not after_underscore then
					return nil
				end
				-- Case (2): the affix stands alone on one side of the underscore; a prefix is the first label, a
				-- suffix is the label after the underscore.
				return check_match(at_beginning and label_start.start or after_underscore, matches_affix_exactly)
			end

			-- Now, try to find a run of two or more label sets with the same prefix or suffix, with at least one "and"
			-- in the middle.
			local j = 1
			while j <= #label_starts - 1 do
				local saw_and = false
				local run = {}
				local match = affix_matches(j)
				if match then
					table.insert(run, match)
					local k = j + 1
					while k <= #label_starts do
						match = affix_matches(k)
						if not match then
							break
						end
						table.insert(run, match)
						-- An "and" counts only if it sits between two members of the run.
						if label_starts[k - 1].followed_by_and then
							saw_and = true
						end
						k = k + 1
					end
					if #run > 1 and saw_and then
						-- We saw a run of two or more with at least one 'and' in the middle. Remove the prefix or
						-- suffix from all but the last (if we're dealing with a suffix) or all but the first (if we're
						-- dealing with a prefix).
						if at_beginning then
							table.remove(run, 1)
						else
							table.remove(run)
						end
						for _, to_erase in ipairs(run) do
							set_component_value(to_erase)
						end
					end
					-- Label start `k` (if it exists) is already known not to match, so resume after it.
					j = k + 1
				else
					j = j + 1
				end
			end
		end
	end
)

-- Return the module table, including its `postprocess_handlers` list.
return export