-- Module:headword/page

local require = require local m_str_utils = require("Module:string utilities")

local mw = mw local string = string local table = table local ustring = mw.ustring

local concat = table.concat local decode_entities = m_str_utils.decode_entities local encode_entities = m_str_utils.encode_entities local find = string.find local get_category = require("Module:maintenance category").get_category local get_etym_lang = require("Module:etymology languages").getByCanonicalName local insert = table.insert local ipairs = ipairs local list_to_set = require("Module:table").listToSet local load_data = mw.loadData local match = string.match local new_title = mw.title.new local remove_comments = m_str_utils.remove_comments local split = m_str_utils.split local string_sort = require("Module:collation").string_sort local sub = string.sub local toNFC = ustring.toNFC local toNFD = ustring.toNFD local type = type local type_or_class = require("Module:parser").type_or_class local u = m_str_utils.char local ugsub = ustring.gsub local uupper = m_str_utils.upper

local langnames = load_data("Module:languages/canonical names") local etym_langnames = load_data("Module:etymology languages/canonical names")

local export = {}

-- Convert a numeric list of characters and ranges into the equivalent Lua pattern fragment.
-- Plain numbers become single characters; {lo, hi} pairs become "lo-hi" range expressions.
-- WARNING: This destructively modifies the contents of `ranges`.
local function char_ranges_to_pattern(ranges)
	for i = 1, #ranges do
		local entry = ranges[i]
		if type(entry) == "table" then
			-- Convert each endpoint codepoint to its character, then join with "-".
			for pos = 1, #entry do
				entry[pos] = u(entry[pos])
			end
			ranges[i] = concat(entry, "-")
		else
			ranges[i] = u(entry)
		end
	end
	return concat(ranges)
end

-- Combining character data used when categorising unusual characters. These resolve into two patterns, used to find
-- single combining characters (i.e. character + diacritic(s)) or double combining characters (i.e. character +
-- diacritic(s) + character).
-- NOTE: this table was previously collapsed onto a few physical lines, which made the inline `--` comments swallow
-- all entries that followed them on the same line; the line structure below restores every entry as live code.
local comb_chars = {
	single = {
		{0x0300, 0x034E}, -- Exclude combining grapheme joiner.
		{0x0350, 0x035B}, {0x0363, 0x036F}, {0x0483, 0x0489}, {0x0591, 0x05BD}, 0x05BF, {0x05C1, 0x05C2},
		{0x05C4, 0x05C5}, 0x05C7, {0x0610, 0x061A}, {0x064B, 0x065F}, 0x0670, {0x06D6, 0x06DC}, {0x06DF, 0x06E4},
		{0x06E7, 0x06E8}, {0x06EA, 0x06ED}, 0x0711, {0x0730, 0x074A}, {0x07A6, 0x07B0}, {0x07EB, 0x07F3}, 0x07FD,
		{0x0816, 0x0819}, {0x081B, 0x0823}, {0x0825, 0x0827}, {0x0829, 0x082D}, {0x0859, 0x085B}, {0x0898, 0x089F},
		{0x08CA, 0x08E1}, {0x08E3, 0x0903}, {0x093A, 0x093C}, {0x093E, 0x094F}, {0x0951, 0x0957}, {0x0962, 0x0963},
		{0x0981, 0x0983}, 0x09BC, {0x09BE, 0x09C4}, {0x09C7, 0x09C8}, {0x09CB, 0x09CD}, 0x09D7, {0x09E2, 0x09E3},
		0x09FE, {0x0A01, 0x0A03}, 0x0A3C, {0x0A3E, 0x0A42}, {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, 0x0A51,
		{0x0A70, 0x0A71}, 0x0A75, {0x0A81, 0x0A83}, 0x0ABC, {0x0ABE, 0x0AC5}, {0x0AC7, 0x0AC9}, {0x0ACB, 0x0ACD},
		{0x0AE2, 0x0AE3}, {0x0AFA, 0x0AFF}, {0x0B01, 0x0B03}, 0x0B3C, {0x0B3E, 0x0B44}, {0x0B47, 0x0B48},
		{0x0B4B, 0x0B4D}, {0x0B55, 0x0B57}, {0x0B62, 0x0B63}, 0x0B82, {0x0BBE, 0x0BC2}, {0x0BC6, 0x0BC8},
		{0x0BCA, 0x0BCD}, 0x0BD7, {0x0C00, 0x0C04}, 0x0C3C, {0x0C3E, 0x0C44}, {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D},
		{0x0C55, 0x0C56}, {0x0C62, 0x0C63}, {0x0C81, 0x0C83}, 0x0CBC, {0x0CBE, 0x0CC4}, {0x0CC6, 0x0CC8},
		{0x0CCA, 0x0CCD}, {0x0CD5, 0x0CD6}, {0x0CE2, 0x0CE3}, 0x0CF3, {0x0D00, 0x0D03}, {0x0D3B, 0x0D3C},
		{0x0D3E, 0x0D44}, {0x0D46, 0x0D48}, {0x0D4A, 0x0D4D}, 0x0D57, {0x0D62, 0x0D63}, {0x0D81, 0x0D83}, 0x0DCA,
		{0x0DCF, 0x0DD4}, 0x0DD6, {0x0DD8, 0x0DDF}, {0x0DF2, 0x0DF3}, 0x0E31, {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E},
		0x0EB1, {0x0EB4, 0x0EBC}, {0x0EC8, 0x0ECE}, {0x0F18, 0x0F19}, 0x0F35, 0x0F37, 0x0F39, {0x0F3E, 0x0F3F},
		{0x0F71, 0x0F84}, {0x0F86, 0x0F87}, {0x0F8D, 0x0F97}, {0x0F99, 0x0FBC}, 0x0FC6, {0x102B, 0x103E},
		{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D},
		0x108F, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1715}, {0x1732, 0x1734}, {0x1752, 0x1753},
		{0x1772, 0x1773}, {0x17B4, 0x17D3}, 0x17DD, -- Exclude Mongolian variation selectors.
		{0x1885, 0x1886}, 0x18A9, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E},
		{0x1A60, 0x1A7C}, 0x1A7F, {0x1AB0, 0x1ACE}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73},
		{0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2}, {0x1CD4, 0x1CE8},
		0x1CED, 0x1CF4, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DCC}, {0x1DCE, 0x1DFB}, {0x1DFD, 0x1DFF}, {0x20D0, 0x20F0},
		{0x2CEF, 0x2CF1}, 0x2D7F, {0x2DE0, 0x2DFF}, {0x302A, 0x302F}, {0x3099, 0x309A}, {0xA66F, 0xA672},
		{0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, 0xA802, 0xA806, 0xA80B, {0xA823, 0xA827}, 0xA82C,
		{0xA880, 0xA881}, {0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, 0xA8FF, {0xA926, 0xA92D}, {0xA947, 0xA953},
		{0xA980, 0xA983}, {0xA9B3, 0xA9C0}, 0xA9E5, {0xAA29, 0xAA36}, 0xAA43, {0xAA4C, 0xAA4D}, {0xAA7B, 0xAA7D},
		0xAAB0, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, 0xAAC1, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6},
		{0xABE3, 0xABEA}, {0xABEC, 0xABED}, 0xFB1E, {0xFE20, 0xFE2F}, 0x101FD, 0x102E0, {0x10376, 0x1037A},
		{0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, 0x10A3F, {0x10AE5, 0x10AE6},
		{0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10EFD, 0x10EFF}, {0x10F46, 0x10F50}, {0x10F82, 0x10F85},
		{0x11000, 0x11002}, {0x11038, 0x11046}, 0x11070, {0x11073, 0x11074}, {0x1107F, 0x11082}, {0x110B0, 0x110BA},
		0x110C2, {0x11100, 0x11102}, {0x11127, 0x11134}, {0x11145, 0x11146}, 0x11173, {0x11180, 0x11182},
		{0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, 0x1123E, 0x11241,
		{0x112DF, 0x112EA}, {0x11300, 0x11303}, {0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348},
		{0x1134B, 0x1134D}, 0x11357, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446},
		0x1145E, {0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640},
		{0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938},
		{0x1193B, 0x1193E}, 0x11940, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, 0x119E4,
		{0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, 0x11A47, {0x11A51, 0x11A5B}, {0x11A8A, 0x11A99},
		{0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, 0x11D3A,
		{0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45}, 0x11D47, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97},
		{0x11EF3, 0x11EF6}, {0x11F00, 0x11F01}, 0x11F03, {0x11F34, 0x11F3A}, {0x11F3E, 0x11F42}, 0x13440,
		{0x13447, 0x13455}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, 0x16F4F, {0x16F51, 0x16F87},
		{0x16F8F, 0x16F92}, -- Exclude Khitan Small Script filler.
		{0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1CF00, 0x1CF2D}, {0x1CF30, 0x1CF46}, {0x1D165, 0x1D169},
		{0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244},
		{0x1DA00, 0x1DA36}, {0x1DA3B, 0x1DA6C}, 0x1DA75, 0x1DA84, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF},
		{0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A}, 0x1E08F,
		{0x1E130, 0x1E136}, 0x1E2AE, {0x1E2EC, 0x1E2EF}, {0x1E4EC, 0x1E4EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A},
	},
	double = {
		{0x035C, 0x0362}, 0x1DCD, 0x1DFC,
	},
	vs = { -- variation selectors; separated out so that we don't get categories for them
		{0xFE00, 0xFE0F}, {0xE0100, 0xE01EF},
	}
}
-- Resolve each codepoint list into a Lua pattern fragment.
for key, charset in pairs(comb_chars) do
	comb_chars[key] = char_ranges_to_pattern(charset)
end

-- Every combining character class concatenated: single + double + variation selectors.
local comb_chars_all = comb_chars.single .. comb_chars.double .. comb_chars.vs

-- Helper fragments for the composite patterns below.
local single_or_vs = comb_chars.single .. comb_chars.vs
local non_combining = "[^" .. comb_chars_all .. "]"

-- Replace the raw class strings with ready-to-use patterns (see the module doc comment for the key meanings).
comb_chars = {
	-- A spacing character plus one or more single combining characters / variation selectors, up to the next
	-- non-combining boundary (%f frontier).
	combined_single = non_combining .. "[" .. single_or_vs .. "]+%f[^" .. comb_chars_all .. "]",
	-- Two spacing characters joined by at least one double combining character, possibly with additional single
	-- combining characters / variation selectors on either side.
	combined_double = non_combining .. "[" .. single_or_vs .. "]*[" .. comb_chars.double .. "]+[" ..
		comb_chars_all .. "]*.[" .. single_or_vs .. "]*",
	-- Bracketed character classes for direct use with mw.ustring.find.
	diacritics_single = "[" .. comb_chars.single .. "]",
	diacritics_double = "[" .. comb_chars.double .. "]",
	diacritics_all = "[" .. comb_chars_all .. "]"
}

-- From https://unicode.org/Public/emoji/15.1/emoji-sequences.txt
-- NOTE: this list was previously collapsed onto two physical lines, which split the trailing `--` comments across a
-- newline and left fragments such as "# E0.6 [1] (⛺)" stranded as bare (invalid) tokens; restored one entry per line.
local emoji_chars = {
	{0x231A, 0x231B}, -- watch..hourglass done                                # E0.6  [2] (⌚..⌛)
	{0x23E9, 0x23EC}, -- fast-forward button..fast down button                # E0.6  [4] (⏩..⏬)
	0x23F0,           -- alarm clock                                          # E0.6  [1] (⏰)
	0x23F3,           -- hourglass not done                                   # E0.6  [1] (⏳)
	{0x25FD, 0x25FE}, -- white medium-small square..black medium-small square # E0.6  [2] (◽..◾)
	{0x2614, 0x2615}, -- umbrella with rain drops..hot beverage               # E0.6  [2] (☔..☕)
	{0x2648, 0x2653}, -- Aries..Pisces                                        # E0.6 [12] (♈..♓)
	0x267F,           -- wheelchair symbol                                    # E0.6  [1] (♿)
	0x2693,           -- anchor                                               # E0.6  [1] (⚓)
	0x26A1,           -- high voltage                                         # E0.6  [1] (⚡)
	{0x26AA, 0x26AB}, -- white circle..black circle                           # E0.6  [2] (⚪..⚫)
	{0x26BD, 0x26BE}, -- soccer ball..baseball                                # E0.6  [2] (⚽..⚾)
	{0x26C4, 0x26C5}, -- snowman without snow..sun behind cloud               # E0.6  [2] (⛄..⛅)
	0x26CE,           -- Ophiuchus                                            # E0.6  [1] (⛎)
	0x26D4,           -- no entry                                             # E0.6  [1] (⛔)
	0x26EA,           -- church                                               # E0.6  [1] (⛪)
	{0x26F2, 0x26F3}, -- fountain..flag in hole                               # E0.6  [2] (⛲..⛳)
	0x26F5,           -- sailboat                                             # E0.6  [1] (⛵)
	0x26FA,           -- tent                                                 # E0.6  [1] (⛺)
	0x26FD,           -- fuel pump                                            # E0.6  [1] (⛽)
	0x2705,           -- check mark button                                    # E0.6  [1] (✅)
	{0x270A, 0x270B}, -- raised fist..raised hand                             # E0.6  [2] (✊..✋)
	0x2728,           -- sparkles                                             # E0.6  [1] (✨)
	0x274C,           -- cross mark                                           # E0.6  [1] (❌)
	0x274E,           -- cross mark button                                    # E0.6  [1] (❎)
	{0x2753, 0x2755}, -- red question mark..white exclamation mark            # E0.6  [3] (❓..❕)
	0x2757,           -- red exclamation mark                                 # E0.6  [1] (❗)
	{0x2795, 0x2797}, -- plus..divide                                         # E0.6  [3] (➕..➗)
	0x27B0,           -- curly loop                                           # E0.6  [1] (➰)
	0x27BF,           -- double curly loop                                    # E1.0  [1] (➿)
	{0x2B1B, 0x2B1C}, -- black large square..white large square               # E0.6  [2] (⬛..⬜)
	0x2B50,           -- star                                                 # E0.6  [1] (⭐)
	0x2B55,           -- hollow red circle                                    # E0.6  [1] (⭕)
	{0x1F300, 0x1FAFF}, -- emoji in Plane 1
	-- NOTE: There are lots more emoji sequences involving non-emoji Plane 0 symbols followed by 0xFE0F, which we don't
	-- (yet?) handle.
}
emoji_chars = char_ranges_to_pattern(emoji_chars)

-- Invert the unsupported-characters map from [[Module:links/data]], so that it can be used as a gsub lookup table
-- (presumably escape sequence -> literal character; see the "`.-`" gsub in process_page — confirm against that module).
local unsupported_characters = {}
for char, escape in pairs(require("Module:links/data").unsupported_characters) do
	unsupported_characters[escape] = char
end

-- Get the list of unsupported titles and invert it (so the keys are pagenames and values are canonical titles).
local unsupported_titles = {}
for canonical, page in pairs(require("Module:links/data").unsupported_titles) do
	unsupported_titles[page] = canonical
end

--[==[
Given a pagename (or {nil} for the current page), create and return a data structure describing the page. The returned
object includes the following fields:
* `comb_chars`: A table containing various Lua character class patterns for different types of combined characters
  (those that decompose into multiple characters in the NFD decomposition). The patterns are meant to be used with
  {mw.ustring.find}. The keys are:
  * `single`: Single combining characters (character + diacritic), without surrounding brackets;
  * `double`: Double combining characters (character + diacritic + character), without surrounding brackets;
  * `vs`: Variation selectors, without surrounding brackets;
  * `all`: Concatenation of `single` + `double` + `vs`, without surrounding brackets;
  * `diacritics_single`: Like `single` but with surrounding brackets;
  * `diacritics_double`: Like `double` but with surrounding brackets;
  * `diacritics_all`: Like `all` but with surrounding brackets;
  * `combined_single`: Lua pattern for matching a spacing character followed by one or more single combining
    characters;
  * `combined_double`: Lua pattern for matching a combination of two spacing characters separated by one or more double
    combining characters, possibly also with single combining characters.
* `emoji_pattern`: A Lua character class pattern (including surrounding brackets) that matches emojis. Meant to be used
  with {mw.ustring.find}.
* `L2_list`: Ordered list of L2 headings on the page, with the extra key `n` that gives the length of the list.
* `L2_sections`: Lookup table of L2 headings on the page, where the key is the section number assigned by the
  preprocessor, and the value is the L2 heading name. Once an invocation has got its actual section number from
  get_current_section in [[Module:utilities]], it can use this table to determine its parent L2. TODO: We could expand
  this to include subsections, to check POS headings are correct etc.
* `unsupported_titles`: Map from pagenames to canonical titles for unsupported-title pages.
* `namespace`: Namespace of the pagename.
* `ns`: Namespace table for the page from mw.site.namespaces (TODO: merge with `namespace` above).
* `full_raw_pagename`: Full version of the RAW pagename (i.e. unsupported-title pages aren't canonicalized), including
  the namespace and the root (portion before the slash).
* `pagename`: Canonicalized subpage portion of the pagename (unsupported-title pages are canonicalized).
* `decompose_pagename`: Equivalent of `pagename` in NFD decomposition.
* `pagename_len`: Length of `pagename` in Unicode chars, where combinations of spacing character + decomposed diacritic
  are treated as single characters.
* `explode_pagename`: Set of characters found in `pagename`. The keys are characters (where combinations of spacing
  character + decomposed diacritic are treated as single characters).
* `encoded_pagename`: FIXME: Document me.
* `pagename_defaultsort`: FIXME: Document me.
* `raw_defaultsort`: FIXME: Document me.
* `wikitext_topic_cat`: FIXME: Document me.
* `wikitext_langname_cat`: FIXME: Document me.
]==]

function export.process_page(pagename)
	local data = {
		comb_chars = comb_chars,
		emoji_pattern = "[" .. emoji_chars .. "]",
		unsupported_titles = unsupported_titles
	}
	local cats = {}
	data.cats = cats

	-- We cannot store `raw_title` in `data` because it contains a metatable.
	local raw_title

	-- Raise a descriptive error for titles containing characters MediaWiki forbids.
	local function bad_pagename()
		if not pagename then
			error("Internal error: Something wrong, `data.pagename` not specified but current title contains illegal characters")
		else
			error(("Bad value for `data.pagename`: '%s', which must not contain illegal characters"):format(pagename))
		end
	end

	if pagename then -- for testing, doc pages, etc.
		raw_title = new_title(pagename)
		if not raw_title then
			bad_pagename()
		end
	else
		raw_title = mw.title.getCurrentTitle()
	end
	data.namespace = raw_title.nsText
	data.ns = mw.site.namespaces[raw_title.namespace]
	data.full_raw_pagename = raw_title.fullText

	local frame = mw.getCurrentFrame()
	-- WARNING: `content` may be nil, e.g. if we're substing a template on a not-yet-created page, or if the module
	-- specifies the subpage as `data.pagename` (which many modules do) and we're in an Appendix or other non-mainspace
	-- page. We used to make the latter an error but there are too many modules that do it, and substing on a
	-- nonexistent page is totally legit, and we don't actually need to be able to access the content of the page.
	local content = raw_title:getContent()
	local content_lang = mw.getContentLanguage()

	-- Get the pagename, canonicalizing unsupported titles.
	pagename = raw_title.subpageText
		:gsub("^Unsupported titles/(.*)", function(m)
			insert(cats, "Unsupported titles")
			return unsupported_titles[m] or (m:gsub("`.-`", unsupported_characters))
		end)
	-- Save pagename, as the local variable will be destructively modified.
	data.pagename = pagename
	-- Decompose the pagename in Unicode normalization form D.
	data.decompose_pagename = toNFD(pagename)
	-- Explode the current page name into a character table, taking decomposed combining characters into account.
	local explode_pagename = {}
	local pagename_len = 0
	local function explode(char)
		explode_pagename[char] = true
		pagename_len = pagename_len + 1
		return ""
	end
	-- Consume double combinations first, then single combinations, then whatever single UTF-8 sequences remain.
	pagename = ugsub(pagename, comb_chars.combined_double, explode)
	pagename = ugsub(pagename, comb_chars.combined_single, explode)
		:gsub(".[\128-\191]*", explode)

	data.explode_pagename = explode_pagename
	data.pagename_len = pagename_len

	-- Generate DEFAULTSORT.
	data.encoded_pagename = encode_entities(data.pagename)
	data.pagename_defaultsort = require("Module:languages").getByCode("mul"):makeSortKey(data.encoded_pagename)
	frame:callParserFunction("DEFAULTSORT", data.pagename_defaultsort)
	data.raw_defaultsort = uupper(raw_title.text)

	-- Make `L2_list` and `L2_sections`, then add categories if any unwanted L1 headings are found, the L2 headings are
	-- in the wrong order, or they don't match a canonical language name. Also detect raw wikitext use of DEFAULTSORT,
	-- DISPLAYTITLE and triple-brace arguments.
	-- Note: HTML comments shouldn't be removed from `content` until after this step, as they can affect the result.
	do
		local L2_list, L2_list_len, L2_sections, sort_cache, prev = {}, 0, {}, {}
		local defaultsort, displaytitle, page_has_L1, L2_wrong_order, L2_nonstandard, page_has_arg
		-- Compute a sort weight for an L2 heading: Translingual and English always come first, pure-ASCII names sort
		-- as-is, and other names are cached with diacritics/apostrophes stripped and whitespace/hyphens collapsed.
		local function get_weight(L2)
			if L2 == "Translingual" then
				return "\1"
			elseif L2 == "English" then
				return "\2"
			elseif match(L2, "^[%z\1-\b\14-!#-&(-,.-\127]+$") then
				return L2
			end
			local weight = sort_cache[L2]
			if weight then
				return weight
			end
			weight = toNFC(ugsub(ugsub(toNFD(L2), "[" .. comb_chars_all .. "'\"ʻʼ]+", ""), "[%s%-]+", " "))
			sort_cache[L2] = weight
			return weight
		end
		local function handle_heading(node)
			local level = node.level
			if level > 2 then
				return
			end
			local name = node:get_name()
			-- Check there are no newline characters in the heading, which might appear after preprocessing (e.g. from
			-- an expanded template). In such cases, the preprocessor section count still increments (since it's
			-- calculated pre-expansion), but the heading will fail, so we shouldn't increment the L2 count.
			if find(name, "\n", 1, true) then
				return
			end
			L2_list_len = L2_list_len + 1
			L2_list[L2_list_len] = name
			L2_sections[node.section] = name
			-- We also add any L1s, since they terminate the preceding L2, but add a maintenance category since it's
			-- probably a mistake.
			if level == 1 then
				page_has_L1 = true
			end
			-- Check the heading is in the right order.
			-- FIXME: we need a more sophisticated sorting method which handles non-diacritic special characters
			-- (e.g. Magɨ).
			if prev and not (
				L2_wrong_order or
				string_sort(get_weight(prev), get_weight(name))
			) then
				L2_wrong_order = true
			end
			-- Check it's a canonical language name.
			if not langnames[name] then
				L2_nonstandard = true
			end
			prev = name
		end
		local function handle_template(node)
			local name = node:get_name()
			if name == "DEFAULTSORT:" then
				defaultsort = true
			elseif name == "DISPLAYTITLE:" then
				displaytitle = true
			end
		end
		if content then
			for node in require("Module:template parser").parse(content):__pairs("next_node") do
				local node_type = type_or_class(node)
				if node_type == "heading" then
					handle_heading(node)
				elseif node_type == "template" and not (defaultsort and displaytitle) then
					handle_template(node)
				elseif node_type == "argument" then
					page_has_arg = true
				end
			end
		end
		L2_list.n = L2_list_len
		data.L2_list = L2_list
		data.L2_sections = L2_sections
		if defaultsort then
			insert(cats, get_category("Pages with DEFAULTSORT conflicts"))
		end
		if displaytitle then
			insert(cats, get_category("Pages with DISPLAYTITLE conflicts"))
		end
		if page_has_L1 then
			insert(cats, get_category("Pages with unwanted L1 headings"))
		end
		if L2_wrong_order then
			insert(cats, get_category("Pages with language headings in the wrong order"))
		end
		if L2_nonstandard then
			insert(cats, get_category("Pages with nonstandard language headings"))
		end
		if page_has_arg then
			insert(cats, get_category("Pages with raw triple-brace template arguments"))
		end
	end

	-- Parse page for maintenance categories.
	-- Use of tab characters.
	if content and find(content, "\t", 1, true) then
		insert(cats, get_category("Pages with tab characters"))
	end
	-- Unencoded character(s) in title (ideographic description characters).
	local IDS = list_to_set{"⿰", "⿱", "⿲", "⿳", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻", "⿼", "⿽", "⿾", "⿿", "㇯"}
	for char in pairs(explode_pagename) do
		if IDS[char] and char ~= data.pagename then
			insert(cats, "Terms containing unencoded characters")
			break
		end
	end

	-- Raw wikitext use of a topic or langname category. Also check if any raw sortkeys have been used.
	do
		local wikitext_topic_cat = {}
		local wikitext_langname_cat = {}
		local raw_sortkey
		-- If a raw sortkey has been found, add it to the relevant table. If there's no table (or the index is just
		-- `true`), create one first. Returns true so that callers can tell a registration happened — without this,
		-- the `done` short-circuit in process_category below could never fire, and shorter sub-names would also be
		-- (wrongly) registered.
		local function add_cat_table(t, lang, sortkey)
			local t_lang = t[lang]
			if not sortkey then
				if not t_lang then
					t[lang] = true
				end
				return true
			elseif t_lang == true or not t_lang then
				t_lang = {}
				t[lang] = t_lang
			end
			t_lang[uupper(decode_entities(sortkey))] = true
			return true
		end
		-- Try to register `name` as a (full or etymology-only) language name; returns true on success.
		-- Note: the third parameter deliberately shadows the upvalue of the same name.
		local function do_iteration(name, sortkey, wikitext_langname_cat)
			if langnames[name] then
				return add_cat_table(wikitext_langname_cat, name, sortkey)
			end
			name = etym_langnames[name] and name or content_lang:lcfirst(name)
			if etym_langnames[name] then
				name = get_etym_lang(name):getFullName()
				return add_cat_table(wikitext_langname_cat, name, sortkey)
			end
		end
		local function process_category(content, cat, colon, nxt)
			local pipe = find(cat, "|", colon + 1, true)
			-- Categories cannot end "|]]".
			if pipe == #cat then
				return
			end
			local title = new_title(pipe and sub(cat, 1, pipe - 1) or cat)
			if not (title and title.namespace == 14) then
				return
			end
			-- Get the sortkey (if any), then canonicalize the category title.
			local sortkey = pipe and sub(cat, pipe + 1) or nil
			cat = title.text
			if sortkey then
				raw_sortkey = true
				-- If the sortkey contains "[", the first "]" of a final "]]]" is treated as part of the sortkey.
				if find(sortkey, "[", 1, true) and sub(content, nxt, nxt) == "]" then
					sortkey = sortkey .. "]"
				end
			end
			local code = match(cat, "^([%w%-.]+):")
			if code then
				return add_cat_table(wikitext_topic_cat, code, sortkey)
			end
			-- Split by word. Iterate over the category name, starting with the longest possible name and shaving off
			-- the first word until we find one. We do it this way because:
			-- (a) Going from shortest to longest risks falsely matching (e.g.) German Low German categories as German.
			-- (b) Checking the start of category names first risks falsely matching (e.g.) Alsatian French as Alsatian
			--     (a variety of Alemannic German), not French.
			-- If no matches are found, then check the start of the category name, shaving off the last word each
			-- iteration.
			cat = split(cat, " ", true, true)
			local cat_len, n, name, done = #cat, 1
			repeat
				name = concat(cat, " ", n, cat_len)
				done = do_iteration(name, sortkey, wikitext_langname_cat)
				if done then
					return
				end
				n = n + 1
			until n > cat_len
			n = cat_len - 1
			if n <= 0 then
				return
			end
			repeat
				name = concat(cat, " ", 1, n)
				done = do_iteration(name, sortkey, wikitext_langname_cat)
				if done then
					return
				end
				n = n - 1
			until n == 0
		end
		if content then
			-- Remove comments, then iterate over category links.
			content = remove_comments(content, "BOTH")
			local head = find(content, "[[", 1, true)
			while head do
				local close = find(content, "]]", head + 2, true)
				if not close then
					break
				end
				-- Make sure there are no intervening "[[" between head and close.
				local open = find(content, "[[", head + 2, true)
				while open and open < close do
					head = open
					open = find(content, "[[", head + 2, true)
				end
				local cat = sub(content, head + 2, close - 1)
				-- Locate the colon, and weed out most unwanted links. "[ _\128-\244]*" catches valid whitespace, and
				-- ensures any category links using the colon trick are ignored. We match all non-ASCII characters, as
				-- there could be multibyte spaces, and mw.title.new will filter out any remaining false-positives;
				-- this is a lot faster than running mw.title.new on every link. The "()" position capture makes
				-- `colon` the numeric index of the ":" itself.
				local colon = match(cat, "^[ _\128-\244]*[Cc][Aa][Tt][EeGgOoRrYy _\128-\244]*():")
				if colon then
					process_category(content, cat, colon, close + 2)
				end
				head = open
			end
		end
		data.wikitext_topic_cat = wikitext_topic_cat
		data.wikitext_langname_cat = wikitext_langname_cat
		if raw_sortkey then
			insert(cats, get_category("Pages with raw sortkeys"))
		end
	end

	return data
end

return export