-- Module:etymon

local export = {}

local m_links = require("Module:links") local m_utilities = require("Module:utilities") local page_data = mw.loadData("Module:headword/data").page

local check_ancestor = require("Module:etymology").check_ancestor local concat = table.concat local find = string.find local findTemplates = require("Module:template parser").findTemplates local format_categories = m_utilities.format_categories local get_section = m_utilities.get_section local full_link = m_links.full_link local get_lang = require("Module:languages").getByCode local get_link_page = m_links.get_link_page local insert = table.insert local ipairs = ipairs local max = math.max local new_title = mw.title.new local process_params = require("Module:parameters").process local split = require("Module:string utilities").split local sub = string.sub local gsub = string.gsub local type = type local unpack = unpack

-- Normalize the language so that special handling of Chinese is accounted for.
-- A language that is in the Sinitic family ("zhx") and is not a creole, pidgin
-- or mixed language ("crp", "qfa-mix") is replaced by the language object for
-- "zh"; every other language object is returned unchanged.
local function getNormLang(lang)
	if lang:inFamily("zhx") and not lang:inFamily("crp", "qfa-mix") then
		return get_lang("zh")
	end
	return lang
end

-- Given an etymon param, return its parts.
-- `etymonParam` is split on ">". With exactly two parts it is read as
-- "page>id" and the language defaults to `templateLang`; otherwise it is read
-- as "langcode>page>id" and the language object is looked up from the code.
-- Returns: etymonLang, etymonPage, etymonId.
local function getParts(templateLang, etymonParam)
	local etymonLang, etymonLangcode, etymonPage, etymonId
	-- NOTE(review): the two trailing `true` flags are passed straight to the
	-- string-utilities `split`; confirm their meaning against that module.
	local parts = split(etymonParam, ">", true, true)
	if #parts == 2 then
		-- Assume language is the same as the template call if none is provided.
		etymonLang, etymonPage, etymonId = templateLang, unpack(parts)
	else
		etymonLangcode, etymonPage, etymonId = unpack(parts)
		etymonLang = get_lang(etymonLangcode, true, true)
	end

	return etymonLang, etymonPage, etymonId
end

-- Cache of scrape results, keyed by "langcode>page>id". A value is either the
-- argument table of a matching `etymon` template, "missing" or "redlink".
local argsOf = {}
-- Number of `etymon` templates found per "langcode>page" key.
local disambiguationCount = {}

-- Scrape `etymonTitle` for `etymon` templates in the given language and record
-- the outcome for `key` in `argsOf`.
--   etymonPage     page name used to build the "langcode>page" key
--   etymonTitle    title object of the page to read
--   key            cache key ("langcode>page>id") to fill in
--   etymonLangcode language code the template's first argument must equal
--   etymonId       id of the template being looked for
--   redirectedFrom "langcode>page" key of the redirect that led here, or nil
-- Nothing is returned; callers read the result from `argsOf[key]`.
local function scrapePage(etymonPage, etymonTitle, key, etymonLangcode, etymonId, redirectedFrom)
	local pageContent = etymonTitle:getContent()
	local redirectTarget = etymonTitle.redirectTarget
	local L2_key = etymonLangcode .. ">" .. etymonPage

	if pageContent == nil then
		argsOf[key] = "redlink"
		return
	end

	-- TODO: it is really necessary to convert the language object to the code and back into the object?
	local langSection = get_section(pageContent, get_lang(etymonLangcode, true, true):getCanonicalName(), 2)

	-- Search the entire entry if it is a redirect.
	if redirectTarget then
		langSection = pageContent
	elseif langSection == nil then
		-- Record the result in the cache: callers only ever read argsOf[key],
		-- so merely returning a value here would leave the entry unset.
		argsOf[key] = "missing"
		return
	end

	-- Search for the template on the page (even if this is a redirect page).
	for name, templateArgs in findTemplates(langSection) do
		if name == "etymon" and templateArgs[1] == etymonLangcode then
			local templateId = templateArgs["id"]
			-- A template without an id cannot be addressed by any key; skip it
			-- rather than failing on the concatenation below.
			if templateId ~= nil then
				argsOf[L2_key .. ">" .. templateId] = templateArgs
				disambiguationCount[L2_key] = (disambiguationCount[L2_key] or 0) + 1
			end
		end
	end

	-- Templates found on a redirect target also count towards the redirect.
	if redirectedFrom and disambiguationCount[L2_key] then
		disambiguationCount[redirectedFrom] = (disambiguationCount[redirectedFrom] or 0) + disambiguationCount[L2_key]
	end

	-- If scraping produced a result, there's nothing left to do.
	if argsOf[key] then
		return
	-- Else if we've already followed a redirect and still found nothing, record the template as missing.
	elseif redirectedFrom then
		argsOf[key] = "missing"
		return
	end

	-- Check if the page is a redirect, and if not record the template as missing.
	if not redirectTarget then
		argsOf[key] = "missing"
		return
	end

	-- Otherwise, try again with the redirect target.
	etymonPage = redirectTarget.prefixedText
	scrapePage(etymonPage, redirectTarget, L2_key .. ">" .. etymonId, etymonLangcode, etymonId, L2_key)

	-- Record the value as the same as the redirect's.
	argsOf[key] = argsOf[L2_key .. ">" .. etymonId]
end

-- Given an etymon, scrape the page and get its parameters.
-- Returns "nolink" when the etymon's id is "?"; otherwise returns whatever is
-- cached in `argsOf` for the etymon after scraping (a table of the params,
-- "missing" or "redlink").
-- Raises an error when the etymon's page name cannot be made into a title.
local function getArgs(templateLang, etymonParam)
	-- Get normalized parts of the etymon parameter.
	local etymonLang, etymonPage, etymonId = getParts(templateLang, etymonParam)
	-- "?" is a special value that unlinks the page.
	if etymonId == "?" then
		return "nolink"
	end
	etymonPage = get_link_page(etymonPage, etymonLang)
	local etymonLangcode = getNormLang(etymonLang):getFullCode()

	-- Find the parameters by scraping etymonPage.
	-- Store data in the argsOf table to save time in case the same etymon is accessed again.
	-- The key is a normalized version of etymonParam.
	local key = etymonLangcode .. ">" .. etymonPage .. ">" .. etymonId
	if argsOf[key] == nil then
		local etymonTitle = new_title(etymonPage)
		if not etymonTitle then
			-- This shouldn't happen: all unsupported titles should be resolved at this stage.
			error("Invalid page title \"" .. etymonPage .. "\" encountered.")
		end
		scrapePage(etymonPage, etymonTitle, key, etymonLangcode, etymonId)
	end

	return argsOf[key]
end

-- [tag]: {abbreviation, label glossary anchor, start text, start text plus, middle text, forms groups}
--   [1] abbreviation shown as a label in the tree, or false for no label
--   [3] sentence-initial text, [4] sentence-initial text for the "plus" style
--   [5] mid-sentence text
--   [6] true when several consecutive etymons under this keyword form one group
-- Note: the keywords `afeq`, `conf`, and `unc` are also recognized, but do not use this dictionary.
-- Please do not add any new keywords without discussion or this list will get extremely unwieldy.
-- If we decide to add keywords for each thing I will have to figure out a systematic way to organize them.
local keywordDict = {
	["from"] = {false, false, "From", "From", "from", false},
	["inh"] = {false, false, "From", "Inherited from", "from", false},
	["af"] = {false, false, "From", "From", "from", true},
	["blend"] = {"blend.", "blend", "Blend of", "Blend of", "a blend of", true},
	["bor"] = {"bor.", "borrowing", "Borrowed from", "Borrowed from", "borrowed from", false},
	["lbor"] = {"lbor.", "learned_borrowing", "Learned borrowing from", "Learned borrowing from", "borrowed from", false},
	["obor"] = {"obor.", "orthographic_borrowing", "Orthographic borrowing from", "Orthographic borrowing from", "borrowed from", false},
	["slbor"] = {"slbor.", "semi-learned_borrowing", "Semi-learned borrowing from", "Semi-learned borrowing from", "borrowed from", false},
	["der"] = {"der.", "derived_terms", "Derived from", "Derived from", "from", false},
	["calque"] = {"calq.", "calque", "Calque of", "Calque of", "a calque of", false},
	["sl"] = {"sl.", "semantic loan", "Semantic loan of", "Semantic loan of", "a semantic loan of", false},
	["influence"] = {"influ.", "contamination", "", "", "", false}
}

-- This function takes an etymon and recursively builds a tree to display in an entry.
--   currTitle   title of the term at this node
--   lang        language object of the term at this node
--   args        scraped/processed template arguments, or a status string
--               (anything that is not a table makes this node a leaf)
--   alreadySeen set of "langcode>page>id" keys already expanded in this tree
--   isTopLevel  true only for the root call
--   isUncertain true when the parent marked this etymon with `unc`
--   label       keyword the parent used for this etymon ("" at the root)
-- Returns: the markup string, the height of the tree and its width in leaves.
-- NOTE(review): most markup literals in this function are empty or blank
-- strings and look as if their HTML was stripped; they are reproduced exactly
-- as found -- confirm against the deployed module.
local function etyTree(currTitle, lang, args, alreadySeen, isTopLevel, isUncertain, label)
	local treeWidth = 0
	local treeHeight = 0
	local subtree, subtreeHeight, subtreeWidth, etymonLang, etymonPage, etymonArgs
	local subtrees = {}
	local currId = ""
	if type(args) == "table" then
		currId = args["id"]
	end
	local key = getNormLang(lang):getFullCode() .. ">" .. get_link_page(currTitle, lang) .. ">" .. currId
	local derType, confidence, ignoreEtymons = "from", "conf", false

	-- Only recurse when an etymon has params and was not included in the tree previously.
	if type(args) == "table" and alreadySeen[key] == nil then
		local templateLang = get_lang(args[1], true, true)
		-- Add the page to alreadySeen, which keeps track of what's already been added to the tree and the depth reached.
		alreadySeen[key] = true
		-- Loop over each parameter in the current template.
		for i, param in ipairs(args) do
			if i > 1 and find(param, ">") and not ignoreEtymons then
				etymonLang, etymonPage = getParts(templateLang, param)

				-- Scrape the page and get the parameters.
				etymonArgs = getArgs(templateLang, param)

				-- Recurse into the etymon and append its tree to the list of subtrees.
				subtree, subtreeHeight, subtreeWidth = etyTree(etymonPage, etymonLang, etymonArgs, alreadySeen, false, confidence == "unc", derType)
				insert(subtrees, subtree)
				treeHeight = max(treeHeight, subtreeHeight)
				treeWidth = treeWidth + subtreeWidth
			elseif i > 1 then
				-- Reached a keyword.
				if param == "conf" or param == "unc" then
					confidence = param
				elseif keywordDict[param] ~= nil then
					ignoreEtymons = false
					confidence = "conf"
					derType = param
				else
					-- Keywords without a dictionary entry hide their etymons from the tree.
					ignoreEtymons = true
				end
			end
		end
	end

	-- Create term block.
	local link = "" .. lang:getCanonicalName() .. " "
	if isTopLevel then
		link = link .. full_link({lang=lang, alt="" .. currTitle .. ""}, "term")
	elseif currId == "" then
		link = link .. full_link({lang=lang, term=currTitle}, "term")
	else
		link = link .. full_link({lang=lang, term=currTitle, id=currId}, "term")
	end
	link = link .. " "

	-- Create tree.
	local tree = ""
	if #subtrees == 1 then
		-- Add long top connector.
		tree = tree .. " "
	elseif #subtrees >= 2 then
		-- Add short top connector.
		tree = tree .. " "
	end

	-- Create term block.
	tree = tree .. "" .. link

	-- Add derivation and uncertainty labels.
	-- TODO: make the CSS less horrible.
	if (label ~= "" and keywordDict[label][1] ~= false) or isUncertain then
		tree = tree .. ""
		if label ~= "" and keywordDict[label][1] ~= false then
			tree = tree .. "" .. keywordDict[label][1] .. " "
			if isUncertain then
				-- Add uncertainty label next to the derivation label.
				tree = tree .. "? "
			end
		elseif isUncertain then
			-- Add uncertainty label in the middle.
			tree = tree .. "? "
		end
		tree = tree .. " "
	end

	tree = tree .. " "

	-- Append subtrees.
	if #subtrees == 1 then
		tree = subtrees[1] .. tree
	elseif #subtrees >= 2 then
		local subtreeString = ""
		for i, v in ipairs(subtrees) do
			if i == 1 then
				-- Add left connector.
				v = v .. " "
			elseif i == #subtrees then
				-- Add right connector.
				v = v .. " "
			else
				-- Add a short bottom connector and middle connector.
				v = v .. "  "
			end
			-- Add column div.
			v = "" .. v .. " "
			subtreeString = subtreeString .. v
		end
		tree = "" .. subtreeString .. " " .. tree
	else
		-- Reached a leaf node.
		treeWidth = treeWidth + 1
	end

	-- Add outer divs.
	if isTopLevel then
		tree = "" .. tree .. " "
		tree = "<div class=\"etytree NavFrame\" data-etytree-height=\"" .. treeHeight + 1 .. "\" data-etytree-width=\"" .. treeWidth .. "\"><div class=\"NavHead\" style=\"background:#eee\">Etymology tree <div class=\"NavContent\" style=\"overflow:auto\">" .. tree .. " "
	end

	return tree, treeHeight + 1, treeWidth
end

-- This function takes an etymon and generates some text to display in an entry.
-- Currently, it is only able to handle simple combinations of parameters.
--   title, lang       term and language object the sentence starts from
--   args              template arguments of that term (with an "id" field)
--   usePlusTemplates  pick the "start text plus" wording for the first phrase
--   maxDepth          maximum number of phrases, or a falsy value for no limit
-- Returns the sentence, or "" when nothing could be written.
local function etyText(title, lang, args, usePlusTemplates, maxDepth)
	local text = ""
	local depth = 1
	local alreadyWritten = {}
	local key, currLang, group, groupType, groupConfidence, confidence, derType, foundGroup, complexParams, ignoreEtymons, etymonLang, etymonTitle, etymonId, templateLang

	-- Loop and continuously expand the sentence until we reach the end of the chain.
	while not maxDepth or depth <= maxDepth do
		group, groupType, groupConfidence, confidence, derType, foundGroup, complexParams, ignoreEtymons, currLang = {}, "from", "conf", "conf", "from", false, false, false, lang
		key = getNormLang(lang):getFullCode() .. ">" .. get_link_page(title, lang) .. ">" .. args["id"]
		templateLang = get_lang(args[1], true, true)
		-- Stop if we encounter an already-seen term.
		if alreadyWritten[key] ~= nil then
			break
		end
		alreadyWritten[key] = true
		for i, param in ipairs(args) do
			if i > 1 and find(param, ">") and not ignoreEtymons then
				-- The text should only continue if `args` is either (not including `influence` or `afeq` etymons):
				-- A single etymon, or single `af` group. Otherwise the parameters are too "complex" and are rejected.
				-- TODO: add smarter handling for complex parameters.
				if foundGroup or (#group == 1 and not keywordDict[derType][6]) then
					complexParams = true
					break
				end
				groupType = derType
				if confidence == "unc" then
					groupConfidence = "unc"
				end
				insert(group, param)
			elseif i > 1 then
				-- Reached a keyword.
				if param == "unc" or param == "conf" then
					-- Confidence markers only change the confidence; they are
					-- not derivation types and have no keywordDict entry.
					confidence = param
				elseif param == "afeq" or param == "influence" or keywordDict[param] == nil then
					-- Etymons under these keywords (and under any keyword with
					-- no dictionary entry) are left out of the sentence.
					ignoreEtymons = true
					if #group == 1 then
						foundGroup = true
					end
				else
					ignoreEtymons = false
					confidence = "conf"
					derType = param
					if #group == 1 then
						foundGroup = true
					end
				end
			end
		end
		if complexParams or #group == 0 then
			break
		end
		if #group == 1 then
			args = getArgs(templateLang, group[1])
		end
		if text == "" then
			-- Start the sentence.
			if groupConfidence == "conf" and not usePlusTemplates then
				text = keywordDict[groupType][3]
			elseif groupConfidence == "conf" and usePlusTemplates then
				text = keywordDict[groupType][4]
			else
				text = "Possibly " .. keywordDict[groupType][5]
			end
		else
			-- Add a phrase onto the sentence.
			if groupConfidence == "conf" then
				text = text .. ", " .. keywordDict[groupType][5]
			else
				text = text .. ", possibly " .. keywordDict[groupType][5]
			end
		end
		-- Add the links.
		for i = 1, #group do
			etymonLang, etymonTitle, etymonId = getParts(templateLang, group[i])
			if etymonLang ~= currLang then
				-- Name the language only when it differs from the previous link's.
				group[i] = etymonLang:makeWikipediaLink() .. " " .. full_link({lang=etymonLang, term=etymonTitle, id=etymonId}, "term")
				currLang = etymonLang
			else
				group[i] = full_link({lang=etymonLang, term=etymonTitle, id=etymonId}, "term")
			end
		end
		text = text .. " " .. concat(group, " + ")
		depth = depth + 1
		-- A multi-etymon group ends the chain: `args` was only advanced for a
		-- single etymon, so looping again would re-emit the same group.
		if #group > 1 then
			break
		end
		lang = etymonLang
		title = etymonTitle
		if type(args) ~= "table" then
			break
		end
	end
	-- Add a period at the end of the sentence.
	if text ~= "" then
		text = text .. "."
	end
	return text
end

-- This function takes an etymon and recursively generates categories to add to the entry.
-- Currently the behaviour tries to emulate existing templates.
-- More specific and useful categories are planned pending consensus (e.g. take confidence into account).
--   title, lang               term and language object of the entry being categorised
--   args                      template arguments to walk at this level
--   isTopLevel                true only for the root call, which also builds the output
--   passedThroughOtherLanguage true once the chain has left the entry's language
--   inInhChain                true while every step so far was `from` or `inh`
-- Category names are collected as keys of the module-level `categories` table.
-- The root call returns the concatenated output string; nested calls return nothing.
local categories = {}
local alreadySeenByEtyCategories = {}
local function etyCategories(title, lang, args, isTopLevel, passedThroughOtherLanguage, inInhChain)
	local etymonLang, categoryEtymonTitle, etymonTitle, normTitle, etymonId, etymonLangName, etymonNormLangName, etymonArgs, key, L2_key, etymonPassedThroughOtherLanguage, etymonInInhChain
	local currGroupLength = 0
	local derType = "from"
	local templateLang = get_lang(args[1], true, true)
	local langName = lang:getFullName()

	for i, param in ipairs(args) do
		if i > 1 and find(param, ">") then
			currGroupLength = currGroupLength + 1
			etymonLang, etymonTitle, etymonId = getParts(templateLang, param)
			normTitle = get_link_page(etymonTitle, etymonLang)
			L2_key = getNormLang(etymonLang):getFullCode() .. ">" .. normTitle
			key = L2_key .. ">" .. etymonId

			etymonLangName = etymonLang:getCanonicalName()
			etymonNormLangName = getNormLang(etymonLang):getFullName()
			etymonInInhChain = inInhChain and (derType == "from" or derType == "inh")
			etymonPassedThroughOtherLanguage = passedThroughOtherLanguage or langName ~= etymonNormLangName
			etymonArgs = getArgs(templateLang, param)

			-- Derivation-type categories apply only to the entry's direct etymons.
			if isTopLevel then
				if derType == "bor" or derType == "lbor" or derType == "slbor" then
					categories[langName .. " terms borrowed from " .. etymonLangName] = true
				end
				if derType == "lbor" then
					categories[langName .. " learned borrowings from " .. etymonLangName] = true
				elseif derType == "calque" then
					categories[langName .. " terms calqued from " .. etymonLangName] = true
				elseif derType == "sl" then
					categories[langName .. " semantic loans from " .. etymonLangName] = true
				elseif derType == "slbor" then
					categories[langName .. " semi-learned borrowings from " .. etymonLangName] = true
				elseif derType == "blend" then
					categories[langName .. " blends"] = true
				elseif derType == "obor" then
					categories[langName .. " orthographic borrowings from " .. etymonLangName] = true
				end
			end

			-- Display reconstructed titles as "*term" instead of the full page name.
			categoryEtymonTitle = etymonTitle
			if sub(etymonTitle, 1, 15) == "Reconstruction:" then
				categoryEtymonTitle = gsub(etymonTitle, "^Reconstruction:[^/]+%/", "*")
			end

			-- The affix categories are currently disabled (`if false`); the
			-- intended condition is kept in the trailing comment.
			if false then --not etymonPassedThroughOtherLanguage then
				if etymonLangName ~= "Proto-Indo-European" and (derType == "afeq" or keywordDict[derType][6]) and sub(etymonTitle, #etymonTitle) == "-" then
					if type(etymonArgs) == "table" and disambiguationCount[L2_key] > 1 then
						categories[langName .. " terms prefixed with " .. categoryEtymonTitle .. " (" .. etymonId .. ")"] = true
					else
						categories[langName .. " terms prefixed with " .. categoryEtymonTitle] = true
					end
				elseif (derType == "afeq" or keywordDict[derType][6]) and currGroupLength > 1 and (sub(etymonTitle, 1, 1) == "-" or sub(etymonTitle, 1, 2) == "*-") then
					if type(etymonArgs) == "table" and disambiguationCount[L2_key] > 1 then
						categories[langName .. " terms suffixed with " .. categoryEtymonTitle .. " (" .. etymonId .. ")"] = true
					else
						categories[langName .. " terms suffixed with " .. categoryEtymonTitle] = true
					end
				end
				--TODO: figure out what to do about infixes, interfixes, circumfixes, etc.
			elseif etymonPassedThroughOtherLanguage and langName == etymonNormLangName then
				categories[langName .. " terms borrowed back into " .. langName] = true
			elseif etymonNormLangName ~= langName then
				categories[langName .. " terms derived from " .. etymonLangName] = true
				if etymonInInhChain then
					categories[langName .. " terms inherited from " .. etymonLangName] = true
				end
			end

			-- PIE roots and words, excluding suffixes.
			-- Also disabled; the intended condition is kept in the trailing comment.
			if false then --etymonLangName == "Proto-Indo-European" then
				if sub(normTitle, #normTitle) == "-" then
					if type(etymonArgs) == "table" and disambiguationCount[L2_key] > 1 then
						if langName == "Proto-Indo-European" then
							categories["Proto-Indo-European terms belonging to the root " .. categoryEtymonTitle .. " (" .. etymonId .. ")"] = true
						else
							categories[langName .. " terms derived from the Proto-Indo-European root " .. categoryEtymonTitle .. " (" .. etymonId .. ")"] = true
						end
					else
						if langName == "Proto-Indo-European" then
							categories["Proto-Indo-European terms belonging to the root " .. categoryEtymonTitle] = true
						else
							categories[langName .. " terms derived from the Proto-Indo-European root " .. categoryEtymonTitle] = true
						end
					end
				elseif sub(etymonTitle, 1, 2) ~= "*-" then
					if type(etymonArgs) == "table" and disambiguationCount[L2_key] > 1 then
						categories[langName .. " terms derived from the Proto-Indo-European word " .. categoryEtymonTitle .. " (" .. etymonId .. ")"] = true
					else
						categories[langName .. " terms derived from the Proto-Indo-European word " .. categoryEtymonTitle] = true
					end
				end
			end

			-- Recurse into each etymon once, skipping `afeq` and `influence` etymons.
			if derType ~= "afeq" and derType ~= "influence" and alreadySeenByEtyCategories[key] == nil and type(etymonArgs) == "table" then
				alreadySeenByEtyCategories[key] = true
				etyCategories(title, lang, etymonArgs, false, etymonPassedThroughOtherLanguage, etymonInInhChain)
			end
		elseif i > 1 and param ~= "unc" and param ~= "conf" then
			-- Any other keyword starts a new group.
			derType = param
			currGroupLength = 0
		end
	end
	if isTopLevel then
		local output = {}
		local sortkey = lang:makeSortKey(title)
		-- NOTE(review): an empty string is inserted per category and `sortkey`
		-- is never used; the category markup that presumably combined
		-- `category` and `sortkey` looks stripped -- confirm against the
		-- deployed module before relying on this output.
		for category, _ in pairs(categories) do
			insert(output, "")
		end
		return concat(output)
	end
end

-- Validate the parameters of the template call on the current page.
--   lang    language object of the entry
--   params  the list of etymon/keyword parameters (before the langcode is prepended)
--   id      the template's `id` parameter
--   title   the (possibly overridden) title of the entry
-- Raises an error for an `id` that is too short or equal to the title, an
-- unknown keyword, an etymon whose language does not fit its keyword, or an
-- `af`/`afeq` group with a single etymon.
-- Returns a string to prepend to the template output.
local function paramsSanityCheck(lang, params, id, title)
	if mw.ustring.len(id) < 2 then
		error("The `id` parameter must have at least two characters. See the documentation for more details.")
	elseif id == title or id == page_data.pagename then
		error("The `id` parameter must not be the same as the page title. Be more creative. See the documentation for more details.")
	end

	local paramLang, paramTitle, paramId
	local currKeyword = "from"
	-- Tracks `af`/`afeq` groups: "not in group", "found group" (two or more
	-- etymons seen), or the text of the only etymon seen so far.
	local singleAfParam = "not in group"
	local output = ""
	for _, param in ipairs(params) do
		if find(param, ">") then
			--In this case, `templateLang` is the same as `lang` because we are at the top level.
			paramLang, paramTitle, paramId = getParts(lang, param)

			-- Check for link errors.
			-- NOTE(review): this passes the entry's `lang`, not `paramLang` -- confirm that is intended.
			full_link({lang=lang, term=paramTitle, id=paramId}, "term")

			--Add a maintenance category if an invalid ID is provided.
			local scraped = getArgs(lang, param)
			if scraped == "missing" or scraped == "redlink" then
				-- NOTE(review): both branches assign an empty string; the
				-- category markup looks stripped -- confirm against the
				-- deployed module.
				if page_data.namespace == "" or page_data.namespace == "Reconstruction" or page_data.namespace == "Appendix" then
					output = ""
				else
					output = ""
				end
			end

			if currKeyword == "from" then
				if paramLang:getFullCode() ~= lang:getFullCode() then
					error("Error: " .. param .. " is associated with `from` (same-language derivation) but is of language `" .. paramLang:getFullCode() .. "`, which does not match the current entry language (`" .. lang:getFullCode() .. "`); see the documentation for more details.")
				end
			elseif currKeyword == "inh" then
				check_ancestor(lang, paramLang)
			elseif currKeyword == "afeq" or keywordDict[currKeyword][6] then
				if singleAfParam == "not in group" then
					singleAfParam = param
				else
					singleAfParam = "found group"
				end
			elseif (currKeyword == "bor" or currKeyword == "lbor" or currKeyword == "obor" or currKeyword == "slbor" or currKeyword == "der" or currKeyword == "calque" or currKeyword == "sl") and (paramLang:getCode() == lang:getCode()) then
				error("Error: " .. param .. " is associated with `" .. currKeyword .. "` but has the same language (`" .. paramLang:getCode() .. "`) as the current entry; see the documentation for more details.")
			end
		elseif param ~= "unc" and param ~= "conf" and param ~= "afeq" and keywordDict[param] == nil then
			error("Received unknown keyword: " .. param)
		elseif param ~= "unc" and param ~= "conf" then
			currKeyword = param
			if singleAfParam == "found group" then
				singleAfParam = "not in group"
			end
		end
	end
	if singleAfParam ~= "not in group" and singleAfParam ~= "found group" then
		error("Detected `af` or `afeq` group containing only a single etymon: `" .. singleAfParam .. "`; note that `af` and `afeq` groups must have at least two etymons. See the documentation for more details.")
	end
	return output
end

-- Template entry point. Reads the parent frame's arguments, validates them,
-- registers the current page's own arguments in the scrape cache, and returns
-- the concatenated output: sanity-check output, an anchor element, categories
-- (only in the main, Reconstruction and Appendix namespaces), and the optional
-- tree and text.
function export.main(frame)
	-- Process argument input.
	local boolean = {type = "boolean"}
	local args = process_params(frame:getParent().args, {
		[1] = {required = true, type = "language", default = "und"},
		[2] = {list = true, disallow_holes = true},
		["id"] = {required = true},
		["title"] = {},
		["tree"] = boolean,
		["text"] = boolean,
	})
	local lang = args[1]
	-- Store non-numeric parameters as locals, then treat the main numeric list as `args`.
	local id = args["id"]
	local title = args["title"]
	local text = args["text"]
	local tree = args["tree"]
	args = args[2]
	-- The `title` parameter is used for overriding the page title.
	if title == nil then
		-- Get the canonical pagename.
		title = page_data.pagename
		-- Determine if current term is reconstructed.
		if page_data.namespace == "Reconstruction" or lang:hasType("reconstructed") then
			title = "*" .. title
		end
	end

	local output = {paramsSanityCheck(lang, args, id, title)}

	-- Add the langcode and `id`, to match the format of scraped parameters.
	insert(args, 1, lang:getCode())
	args["id"] = id
	argsOf[args[1] .. ">" .. title .. ">" .. id] = args

	-- Add anchor and categories to output.
	local fulllang_name = lang:getFullName()
	insert(output, "<ul id=\"" .. fulllang_name .. ":_" .. id .. "\"></ul>")
	if page_data.namespace == "" or page_data.namespace == "Reconstruction" or page_data.namespace == "Appendix" then
		insert(output, etyCategories(title, lang, args, true, false, true))
	end

	-- Insert tree.
	if tree then
		insert(output, frame:extensionTag("templatestyles", "", {src="Module:etymon/styles.css"}))
		-- The extra parentheses keep only the first value returned by etyTree.
		insert(output, (etyTree(title, lang, args, {}, true, false, "")))
		insert(output, format_categories({fulllang_name .. " entries with etymology trees"}, lang))
	end

	-- Insert text.
	-- NOTE(review): `text` is declared with the boolean spec above yet is
	-- compared with "++", "+" and "-" here -- confirm what the parameters
	-- module yields for it.
	if text == "++" then
		insert(output, etyText(title, lang, args, true, false))
	elseif text == "+" then
		insert(output, etyText(title, lang, args, true, 1))
	elseif text == "-" then
		insert(output, etyText(title, lang, args, false, 1))
	elseif text ~= nil then
		insert(output, etyText(title, lang, args, false, false))
	end

	return concat(output)
end

return export