-- Module:User:kc kennylau/zh-usex

-- sandbox of Module:zh-usex

local export = {}

local m_zh = require("Module:zh")
local m_languages = require("Module:languages")

-- Unicode-aware string helpers.
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local match = mw.ustring.match
local sub = mw.ustring.sub
local split = mw.text.split

-- Use this when the actual title needs to be known.
-- The call parentheses matter: without them this would hold the function
-- itself and `actual_title.nsText` would fail.
local actual_title = mw.title.getCurrentTitle()

-- Use this when testcases need to be able to override the title (for bolding,
-- for instance).
local title = actual_title
local PAGENAME = PAGENAME or title.text

local data = mw.loadData("Module:zh-usex/data")
local punctuation = data.punctuation
local ref_list = data.ref_list
local pron_correction = data.pron_correction
local polysyllable_pron_correction = data.polysyllable_pron_correction

-- Closes the <span lang="zh-Han?"> wrappers that export.show opens around
-- the Chinese text.
-- NOTE(review): restored from a stripped literal (" ") so the spans balance.
local zh_format_end = "</span>"

--local Han_pattern = "[" .. require("Module:scripts").getByCode("Hani"):getCharacters() .. "]"
local Han_pattern = "[一-鿿㐀-䶿﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧-﨩𠀀-𪛟𪜀-𮹟𰀀-𲎯]"
-- Byte-level patterns matching one UTF-8 encoded character.
local UTF8_char = '[%z\1-\127\194-\244][\128-\191]*'
local UTF8_char2 = '[%z\1-Z\\^-\127\194-\244][\128-\191]*' -- not "[" or "]"

-- Per-variety settings for segmenting an example and building its
-- transcription, keyed by normalised variety code.
--   segment_c      characters (as a pattern set body) that delimit segments
--   separator_conv maps a run of delimiters in the input to the separator
--                  emitted in the transcription
--   link_ignore    characters (as a pattern set body) removed from the
--                  displayed text
--   tr_cap         whether the transcription may be capitalised
--   combine        prepares one syllable for joining to the previous one
-- `default` has no `combine`; it is only used for segmentation.
local tr_data = {
	cmn = {
		segment_c = " %-",
		separator_conv = {[""]="",[" "]=" ",["-"]="",["--"]="-"},
		link_ignore = "\1.^",
		tr_cap = true,
		combine = function(t)
			return t:gsub("^%f[aoe\195-\199]","\3") -- temporary substitute of the apostrophe
		end,
	},
	yue = {
		segment_c = " ",
		separator_conv = {[""]="",[" "]=" "},
		link_ignore = "\1",
		tr_cap = false,
		combine = function(t) return " "..t end,
	},
	["nan-hbl"] = {
		segment_c = " ~",
		separator_conv = {[""]="",[" "]=" ",["~"]="-"},
		link_ignore = "\1%%.^",
		tr_cap = true,
		combine = function(t) return "-"..t end,
	},
	hak = {
		segment_c = " ~",
		separator_conv = {[""]="",[" "]=" ",["~"]="-"},
		link_ignore = "\1.^",
		tr_cap = true,
		combine = function(t) return "-"..t end,
	},
	default = {
		segment_c = " ",
		separator_conv = {[""]="",[" "]=" "},
		link_ignore = "\1",
		tr_cap = false,
	},
}

--- Build the transcription of one display segment.
--
-- `display` may contain `<b>`/`</b>` tags, `[x]` override annotations
-- (removed here) and `{tr}` manual transcriptions attached to the preceding
-- character. In the result, `<b>` and `</b>` are represented by "\1" and
-- "\2"; callers substitute the real tags back afterwards.
--
-- @param display   segment text to transcribe
-- @param norm_code key into `tr_data` (must have a `combine` function)
-- @return the transcription string
local function get_tr(display, norm_code)
	local given, given_pos = {}, 1 -- record the characters with given transcription
	local punc, punc_pos = {}, 1 -- record the punctuations with given transcription
	local tr_datapoint = tr_data[norm_code]
	local word_regex = "[^"..tr_datapoint.link_ignore.." \2{}".."]+" -- regex that matches words
	local tr_word = display:gsub("\1", " ")
		:gsub("%["..UTF8_char2.."%]", "")
		:gsub("("..UTF8_char.."){([^{}]*)}", function(a,b) -- record given tr and replace with "{"
			given[given_pos] = a:find("^%w$") and b or tr_datapoint.combine(b)
			given_pos = given_pos + 1
			return "{"
		end)
		:gsub("%f[^ ][^ ]+%f[ ]", function(a) -- record punctuation and replace with "}"
			if punctuation[a] then
				punc[punc_pos] = punctuation[a]
				punc_pos = punc_pos + 1
				return "}"
			end
			return a
		end)
		-- substitute bold tags for further processing
		-- (patterns restored: the stripped "" would have matched between every byte)
		:gsub("<b>","\1"):gsub("</b>","\2")
		:gsub(word_regex,function(word)
			-- first attempt to get the pronunciation of the whole word
			local res = polysyllable_pron_correction[norm_code][word]
				or pron_correction[norm_code][word]
			if res then
				return res
			end
			local length = 0 -- for check_pron (a bit hacky because check_pron only checks if length == 1)
			if word:find("^"..UTF8_char.."$") then
				length = 1
			end
			res = m_zh.check_pron(word, norm_code, length)
			if res then
				-- keep only the first reading when several are separated by "/"
				return tr_datapoint.combine(res:gsub("/.+",""))
			end
			-- if it fails, get pronunciation of each character
			return word:gsub(UTF8_char, function(ch)
				local ch_res = pron_correction[norm_code][ch]
				if ch_res then
					return ch_res
				end
				ch_res = m_zh.check_pron(ch, norm_code, 1)
				return ch_res and tr_datapoint.combine(ch_res:gsub("/.+","")) or ch
			end)
		end)
	if norm_code == "cmn" then
		tr_word = tr_word:gsub("%.%.","-")
	end
	if norm_code ~= "yue" then
		tr_word = tr_word:gsub("%."," ")
	end
	-- Replay the stored values in the order they were recorded.
	given_pos, punc_pos = 0,0
	tr_word = tr_word:gsub("{",function() -- substitute back the stored results
			given_pos = given_pos + 1
			return given[given_pos]
		end)
		:gsub("}",function() -- substitute back the punctuations
			punc_pos = punc_pos + 1
			return punc[punc_pos]
		end)
	return tr_word
end

--- Build a wikilink to the Chinese section of `target`, shown as `display`.
--
-- @param target  page to link to; "" means "same as `display`"
-- @param display text shown for the link (may contain <b> tags)
-- @return the wikilink string
local function make_link(target, display)
	target = target == "" and display or target
	-- Remove bold tags from target
	target = target:gsub("</?b>","")
	-- Generate link to Chinese section
	-- NOTE(review): the link literal was stripped from the source copy (only
	-- `"" .. display .. ""` remained); reconstructed from the comment above.
	local result = "[[" .. target .. "#Chinese|" .. display .. "]]"
	-- For debugging purposes
	--if actual_title.nsText == "Module" then mw.log(display, target, "->", result) end
	return result
end

--- Run `conv_fun` over `text` while honouring manual overrides.
--
-- An override is a character immediately followed by a bracketed character,
-- `X[Y]`: `X` is dropped, `Y` is emitted as-is, and only the bracket-free
-- text before `X` is passed through `conv_fun`.
--
-- @param conv_fun function mapping a string to its converted form
-- @param text     string to convert
-- @return the converted string
local function convert(conv_fun, text)
	-- Sentinel override so the text after the last real override is converted too.
	local padded = text .. "A[A]"
	local override_pattern = "([^%[%]]*)"..UTF8_char2.."%[("..UTF8_char2..")%]"
	local converted = padded:gsub(override_pattern, function(plain, replacement)
		return conv_fun(plain)..replacement
	end)
	-- Drop the "A" contributed by the sentinel.
	return converted:sub(1,-2)
end

--- Template entry point: render a Chinese usage example or quotation.
--
-- Reads the parent frame's arguments (1 = example, 2 = translation,
-- 3 = variety, plus `lit`, `tr`, `ref`/`r`, `inline`, `audio`/`a`,
-- `collapsed`, `nocat`, `tr_nocap`, `simp`, and `pagename` when run from the
-- Template namespace) and the invoking frame's `category` and `variety`.
--
-- @param frame Scribunto frame object
-- @return a string: a `<dl class="zhusex">` block when any of the layout
--   conditions near the end holds, otherwise a single inline run
-- Raises an error when `category` or the example is missing, the variety is
-- unknown, a separator is invalid, or `simp` is used where it is not allowed.
--
-- NOTE(review): the source copy of this function had HTML/wiki markup
-- stripped out of many string literals. Literals marked "restored" below were
-- reconstructed from the surrounding code and comments and must be confirmed
-- against the live module. Literals that were still valid strings
-- (the translation placeholder, `simp_link`/`trad_link`, the "From:" line,
-- `divider`) are kept exactly as found and may also be missing markup.
function export.show(frame)
	local params = {
		[1] = { required = true },	-- example
		[2] = {},					-- translation
		[3] = {},					-- variety
		lit = {},
		tr = {},
		ref = {},
		r = { alias_of = "ref" },
		inline = {},
		audio = {},
		a = { alias_of = "audio" },
		collapsed = { type = "boolean" },
		-- Allow specifying pagename in testcases on documentation page.
		pagename = actual_title.nsText == "Template" and {} or nil,
		nocat = { type = "boolean" },
		tr_nocap = { type = "boolean" },
		simp = { type = "boolean" }
	}

	-- Module:debug is only loaded when something actually needs tracking.
	local function track(key)
		require("Module:debug").track(key)
	end

	local category = frame.args["category"] or error("Please specify the category.")
	local args, unrecognized_args = require("Module:parameters").process(frame:getParent().args, params, true)
	if args.pagename then -- Override title in Module namespace.
		title = mw.title.new(args.pagename)
		PAGENAME = title.text
	end
	-- PAGENAME is spliced into Lua patterns below, so escape its magic characters.
	local pagename_pattern = PAGENAME:gsub("%p", "%%%0")

	local example = args[1] or error("Example unspecified.")
	local translation = args[2]
	local literal = args["lit"]
	local reference = args["ref"]
	local manual_tr = args["tr"]
	local inline = args["inline"]
	local audio_file = args["audio"]
	local collapsed = args["collapsed"]
	local simp = args["simp"]
	-- Number of bytes in \194-\244, i.e. of non-ASCII characters, in the example.
	local original_length = example:gsub("[^\194-\244]+",""):len()
	local variety = args[3] or frame.args["variety"] or (ref_list[reference] and ref_list[reference][1] or false) or "cmn"
	local variety_data = data.varieties_by_code[variety] or data.varieties_by_old_code[variety] or
		error("Variety " .. variety .. " not recognized.")
	-- unpack doesn't work here because the data was loaded using mw.loadData
	local std_code, norm_code, desc, tr_desc = variety_data[2], variety_data[3], variety_data[4], variety_data[5]
	norm_code = norm_code or std_code
	variety = std_code
	local lang_obj_wikt = m_languages.getByCode(variety, 3, "allow etym")

	if next(unrecognized_args) then
		--Special:WhatLinksHere/Template:tracking/zh-usex/unrecognized arg
		require("Module:debug").track_unrecognized_args(unrecognized_args, "zh-usex")
	end
	if reference then
		track("zh-usex/ref")
	end
	if example:find("[%(%)]") then
		track("zh-usex/parentheses")
	end
	if example:find("&#") then
		track("zh-usex/html")
	end
	-- future escape character?
	if example:find("`") then
		track("zh-usex/backtick")
	end
	if example:find("  ") then -- restored: two spaces (collapsed to one in the source copy)
		track("zh-usex/double-space")
	end
	if (norm_code == "nan-hbl" or norm_code:find("^hak")) and example:find("%-") then
		track("zh-usex/hyphen")
	end
	if example:find("%w%{") then
		track("zh-usex/rom-text")
	end

	if not translation or translation == '' then -- per standard Module:usex
		translation = ' (please add an English translation of this ' ..
			(category == "quotations" and "quotation" or "usage example") .. ') '
	end

	-- should we generate the other (simp/trad) form
	-- (in the end, only actually display if the converted text is different)
	local do_conv = true
	if norm_code == "vi" or norm_code == "ko" then
		do_conv = false
	end
	local conv_fun = m_zh.ts
	if simp then
		if category ~= "quotations" then
			error("parameter simp cannot be true in Template:zh-x or Template:zh-co.")
		end
		if norm_code == "vi" or norm_code == "ko" or norm_code == "lzh" or variety == "yue-HK" or variety == "cmn-TW" or
				variety == "nan-hbl-TW" or variety == "lzh-cmn-TW" or variety == "hak-hai" or variety == "hak-dab" or
				variety == "hak-zha" then
			error(("Parameter simp= cannot be specified for variety '%s'"):format(variety))
		end
		conv_fun = m_zh.st
	end

	-- should we generate the transcription
	local generate_tr = false
	if tr_data[norm_code] then
		if manual_tr then
			track("zh-usex/manual-tr")
		else
			generate_tr = true
		end
	end

	-- automatically boldify pagetitle if nothing is in bold
	local boldify = false
	if not example:find("'''") and not punctuation[PAGENAME] then
		boldify = true
	end

	-- tidying up the example, making it ready for transcription
	-- restored: "。" (appeared as ". " in the source copy, here and in the two
	-- other punctuation sets below)
	example = gsub(example, "[？！，。、“”…；：‘’|（）「」『』—《》〈〉【】·　．～]", " %0 ")
	example = example:gsub("— —", "——") -- double em-dash (to be converted to single em-dash later)
		:gsub("<br */?>"," <br> ") -- process linebreaks (restored)
		:gsub("^ *",""):gsub(" *$",""):gsub(" +"," ") -- process spaces
		:gsub("%[%[(.-)%]%]%f[^%]]",function(a) -- process double-bracketed spans
			return a:gsub(" ","\1")
		end)
		:gsub("'''([^']+)'''", "<b>%1</b>") -- normalise bold syntax (restored)
		:gsub("<b>%^","^<b>") -- restored
		:gsub("</b>(%["..UTF8_char2.."%])","%1</b>") -- restored; same step as after boldifying below
		:gsub("</b>({[^{}]*})","%1</b>") -- restored

	-- parsing: convert "-", "--", "---" to "-", "..", "--" respectively
	-- so that "-" is the character that delimits links
	-- further explanation will use the replacement result to refer to the commands
	if norm_code == "cmn" then
		example = example:gsub("%-+",{["--"]="..",["---"]="--"})
		if example:find("%-[^%-%s]+\\") then
			track("zh-usex/extra-pinyin")
		end
	end

	local regex_data = tr_data[norm_code] or tr_data.default
	local segment_c = regex_data.segment_c -- the characters that delimit links
	local separator_conv = regex_data.separator_conv -- the table for separator mapping
	local link_ignore = regex_data.link_ignore -- the characters that do not affect links
	local tr_cap = regex_data.tr_cap -- transliteration can be capitalised
	local segment_regex = "(["..segment_c.."]*)([^"..segment_c.."]+)" -- the regex that matches each segment and the separator before it
	local cache = {} -- store the result of each segment
	local trad_text = ""
	local simp_text = ""

	-- generate the transliteration
	-- but store the results in the cache
	-- and also build up trad_text and simp_text
	local tr_text = example:gsub(segment_regex, function(separator, seg)
		separator = separator_conv[separator] or error('Invalid separator: "'..separator..'"')
		-- Cache under the segment exactly as written. The original stored the
		-- entry under the "@"-stripped text, so "@x" (no link) and "x" (link)
		-- overwrote each other.
		local cache_key = seg
		local cached = cache[cache_key]
		if cached then
			trad_text = trad_text .. cached.trad
			simp_text = simp_text .. cached.simp
			return separator..cached.tr
		end
		if punctuation[seg] then
			cache[cache_key] = {
				trad = seg,
				simp = seg,
				tr = punctuation[seg]
			}
			trad_text = trad_text .. seg
			simp_text = simp_text .. seg
			return separator..punctuation[seg]
		end

		-- "@" anywhere in the segment suppresses the link
		local at_count
		seg, at_count = seg:gsub("@","")
		local generate_link = (at_count == 0)

		local target, display = "", seg
		local pos = seg:find("\\",1,true)
		if generate_link and pos then
			-- move formatting from start of target to display
			-- e.g. ^甲\乙 --> 甲\^乙
			local bold = ""
			local caret = ""
			local start = 1
			if seg:sub(1,3) == "<b>" then -- restored
				bold,start = "<b>",4
			end
			-- Compare a single character: `seg:sub(start)` runs to the end of
			-- the segment and so could never equal "^".
			if tr_cap and seg:sub(start,start) == "^" then
				caret,start = "^",start+1
			end
			target, display = seg:sub(start,pos-1), bold..caret..seg:sub(pos+1,-1)
			if target:find("</?b>") then -- Check for bold tags in target. (restored)
				track("zh-usex/bold-target")
			end
		end

		local target_trad = target:gsub("%["..UTF8_char2.."%]","")
		local target_simp = do_conv and convert(conv_fun, target)

		local occurrences = 0
		if boldify then
			display, occurrences = display:gsub(pagename_pattern, function(m)
				return "<b>"..m.."</b>" -- restored
			end)
		end
		if occurrences > 0 then
			-- restored: un-bold the page name inside a [x] annotation, then
			-- apply the same tag normalisation as in the tidy-up step
			display = display:gsub("%[<b>"..pagename_pattern.."</b>%]", function()
					return "["..PAGENAME.."]"
				end)
				:gsub("<b>%^","^<b>")
				:gsub("</b>(%["..UTF8_char2.."%])","%1</b>")
				:gsub("</b>({[^{}]*})","%1</b>")
		end

		local display_derom = display:gsub("{[^{}]*}","")
			:gsub("["..link_ignore.."]+","")
		local display_trad = display_derom:gsub("%["..UTF8_char2.."%]","")
		local display_simp = do_conv and convert(conv_fun, display_derom) or ""
		local seg_tr = generate_tr and get_tr(display, norm_code) or ""

		-- never link the page to itself
		if display_trad:gsub("</?b>","") == PAGENAME or target_trad == PAGENAME then
			generate_link = false
			if boldify and occurrences == 0 then
				-- restored: <b> is what the "no-bold" check and the
				-- transcription code look for
				display_trad = "<b>" .. display_trad .. "</b>"
				display_simp = "<b>" .. display_simp .. "</b>"
				seg_tr = "<b>" .. seg_tr .. "</b>"
			end
		end

		local seg_trad = generate_link and make_link(target_trad, display_trad) or display_trad
		local seg_simp = generate_link and do_conv and make_link(target_simp, display_simp) or display_simp
		cache[cache_key] = {
			trad = seg_trad,
			simp = seg_simp,
			tr = seg_tr
		}
		trad_text = trad_text .. seg_trad
		simp_text = simp_text .. seg_simp
		return separator..seg_tr
	end)

	if trad_text == simp_text then
		do_conv = false
		simp_text = nil
	end
	if not trad_text:find("</?b>") then
		track("zh-usex/no-bold")
	end

	-- format generated tr
	-- at this point we have three temporary substitutions:
	-- <b>:\1, </b>:\2, ':\3
	if generate_tr then
		if norm_code == "cmn" then -- format apostrophe
			tr_text = tr_text
				:gsub("%f[^%z -]([\1\2^]*)\3", "%1")
				:gsub("\1\3","\3\1") -- <b>' → '<b>
				-- ^'   → '^ (shouldn't occur); the caret is escaped so it is a
				-- literal and not a start-of-string anchor
				:gsub("%^\3","\3^")
		elseif norm_code == "nan-hbl" or norm_code == "hak" then -- format hyphens
			mw.log(tr_text)
			tr_text = tr_text
				:gsub("%^%-","-^")
				:gsub("\1%-","-\1") -- <b>-  → -<b>
				:gsub("%-\2","\2-") -- -</b> → </b>-
				:gsub("%f[^%z ]%-%f[^%z %-]","") -- "-chhek" at beginning -> "chhek"
				:gsub("%f[%z %-]%-%f[%z ]","") -- "shi-" at the end -> "shi"
				:gsub("%-+","-")
				:gsub("%-?%%%-?", "--")
			mw.log(tr_text)
		end
		tr_text = tr_text:gsub("[\1\2\3]",{["\1"]="<b>",["\2"]="</b>",["\3"]="&#39;"}) -- restored
		-- Han_pattern is a set of UTF-8 characters, so it needs the
		-- Unicode-aware find; string.find would treat it as a set of bytes.
		if find(tr_text, Han_pattern) then
			track("zh-usex/character without transliteration")
		end
	end

	local tag_start = " <span style=\"color:darkgreen; font-size:x-small;\">&#91;" -- HTML entity since "MSC" is interpreted poorly
	local tag_end = "&#93;</span>" -- restored: closes the span opened by tag_start
	local simp_link = "simp."
	local trad_link = "trad."
	if simp then
		simp_link, trad_link = trad_link, simp_link
	end

	-- Insert a space between adjacent links that would otherwise join two
	-- Latin-script words.
	local auto_spaces
	trad_text, auto_spaces = trad_text:gsub("([a-zA-Z]%]%])(%[%[[a-zA-Z])", "%1 %2")
	simp_text = do_conv and simp_text:gsub("([a-zA-Z]%]%])(%[%[[a-zA-Z])", "%1 %2") or false
	local phonetic = manual_tr or (generate_tr and tr_text)

	if auto_spaces > 0 then
		track("zh-usex/auto-spaces")
	end

	-- overall transcription formatting
	if phonetic then
		phonetic = gsub(phonetic, " </b>", "</b> ")
		phonetic = gsub(phonetic, "  ", " ") -- restored: collapse double spaces
		if norm_code == "yue" or norm_code == "zhx-tai" or norm_code == "nan-tws" or norm_code == "nan-hnm" or
			norm_code == "zhx-sic" or norm_code == "cjy" or norm_code == "hsn" or norm_code == "gan" or
			variety == "hak-mei" then
			phonetic = gsub(phonetic, "([a-zê]+)([1-9%-]+)", "%1<sup>%2</sup>") -- superscript tones (restored)
		end
		phonetic = gsub(phonetic, " ([,%.?!;:’”)])", "%1") -- remove excess spaces from punctuation
		phonetic = gsub(phonetic, "([‘“(]) ", "%1")
		phonetic = phonetic:gsub(" <br> ", "<br>") -- restored
		if not manual_tr then
			if norm_code == "nan-hbl" then
				phonetic = gsub(phonetic, " +%-%-", "--")
			end
		end

		-- capitalisation ("^" marks the next character for upper-casing)
		if not manual_tr then
			if norm_code == "yue" or norm_code == "zhx-tai" or norm_code == "cjy" or norm_code == "hsn" or
				norm_code == "cmn-wuh" or norm_code == "nan-tws" or norm_code == "wxa" or norm_code == "wuu" or
				variety == "hak-mei" then
				args.tr_nocap = true
			end
			if not args.tr_nocap and match(example, "[。？！]") then
				phonetic = "^" .. gsub(phonetic, "([%.?!]) ", "%1 ^")
			end
			if not args.tr_nocap then
				phonetic = gsub(phonetic, "([%.%?%!][”’]) (.)", "%1 ^%2")
				phonetic = gsub(phonetic, "<br>(.)", "<br>^%1") -- restored
				phonetic = gsub(phonetic, ": ([“‘])(.)", ": %1^%2")
			end
			phonetic = gsub(phonetic, "%^<b>", "<b>^")
			phonetic = gsub(phonetic, "%^+.", mw.ustring.upper)
			phonetic = gsub(phonetic, "%^", "")
		end

		if norm_code == "wuu" then
			local wuu_pron = require("Module:wuu-pron")
			if phonetic:find(":") then
				phonetic = "''" .. wuu_pron.wugniu_format(phonetic:sub(4)) .. "''"
			else
				phonetic = "''" .. wuu_pron.wugniu_format(wuu_pron.wikt_to_wugniu(phonetic)) .. "''"
			end
		elseif norm_code == "cmn-wuh" or norm_code == "wxa" then
			phonetic = "<span class=\"IPA\">[" .. phonetic .. "]</span>" -- closing tag restored
		elseif norm_code == "cdo" then
			local cdo_pron = require("Module:cdo-pron")
			phonetic = "''" .. phonetic .. "''" ..
				(not match(phonetic, "-[^ ]+-[^ ]+-[^ ]+-")
					and " / <span class=\"IPA\"> [" .. cdo_pron.sentence(phonetic) .. "]</span>" -- closing tag restored
					or "")
		else
			phonetic = "''" .. phonetic .. "''"
		end
		phonetic = "<span lang=\"zh-Latn\" style=\"color:#404D52\">" .. phonetic .. "</span>" -- closing tag restored
	end

	-- Output fragments. These were implicit globals in the original, so a
	-- value set by one call could leak into the next call in the same
	-- invocation.
	local cat, tr_tag, trad_tag, simp_tag, ts_tag, audio, divider

	local collapse_start, collapse_end, collapse_tag, collapse_border_div, collapse_border_div_end = '', '', '', '', ''
	local simplified_start = '<br>' -- restored
	if collapsed then
		-- restored: markup expected by the vsSwitcher toggle that
		-- collapse_border_div enables
		collapse_start = '<span class="vsHide">'
		collapse_end = '</span>'
		collapse_tag = '<span class="vsToggleElement" style="color:darkgreen; font-size:x-small;padding-left:10px"></span>'
		collapse_border_div = '<div class="vsSwitcher" data-toggle-category="usage examples" style="border-left: 1px solid #930; border-left-width: 2px; padding-left: 0.8em;">'
		collapse_border_div_end = '</div>'
		simplified_start = '<hr>'
	end

	if actual_title.nsText == '' and (not args.nocat) then -- fixme: probably categorize only if text contains the actual word
		-- restored: both branches were "" in the source copy, which also left
		-- lang_obj_wikt unused
		if reference then
			cat = "[[Category:" .. lang_obj_wikt:getFullName() .. " terms with quotations]]"
		else
			cat = "[[Category:" .. lang_obj_wikt:getFullName() .. " terms with usage examples]]"
		end
	end

	local zh_format_start_simp = "<span lang=\"zh-Hans\" class=\"Hans\">"
	local zh_format_start_trad = "<span lang=\"zh-Hant\" class=\"Hant\">"
	if simp then
		zh_format_start_simp, zh_format_start_trad = zh_format_start_trad, zh_format_start_simp
	end

	-- indentation, font and identity tags
	-- The last clause parses as `inline or ("" ~= "")`, i.e. it is true
	-- whenever `inline` is set; it is kept exactly as written.
	if ((norm_code == "cmn" and original_length > 7)
			or (norm_code ~= "cmn" and original_length > 5)
			or reference
			or collapsed
			or (match(example, "[，。？！、：；　]") and norm_code == "wuu")
			or (norm_code == "cdo" and original_length > 3)
			or (inline or "" ~= "")) then

		-- block layout: one <dd> per line inside a definition list
		trad_text = zh_format_start_trad .. trad_text .. zh_format_end

		if not phonetic then
			translation = "''" .. translation .. "''"
		end

		if phonetic then
			phonetic = "<dd>" .. collapse_start .. phonetic
			translation = "<dd>" .. translation .. "</dd>"
			tr_tag = tag_start .. tr_desc .. tag_end .. collapse_end .. "</dd>"
		else
			translation = "<dd>" .. translation .. "</dd>"
		end

		if audio_file then
			audio = "<dd>[[File:" .. audio_file .. "]]</dd>" -- restored
		end

		if do_conv then
			trad_tag = collapse_start .. tag_start .. desc .. ", " .. trad_link .. tag_end .. collapse_end .. collapse_tag
			simp_text = simplified_start .. collapse_start .. zh_format_start_simp .. simp_text .. zh_format_end
			simp_tag = tag_start .. desc .. ", " .. simp_link .. tag_end .. collapse_end
		elseif norm_code == "vi" or norm_code == "ko" then
			trad_tag = collapse_start .. tag_start .. desc ..", " .. trad_link .. tag_end .. collapse_end .. collapse_tag
		else
			trad_tag = collapse_start .. tag_start .. desc ..", " .. trad_link .. " and " .. simp_link .. tag_end .. collapse_end .. collapse_tag
		end

		if reference then
			reference = "<dd>" .. collapse_start .. " From: " ..
				(ref_list[reference] and ref_list[reference][2] or reference) .. " " .. collapse_end .. "</dd>"
		end

		return collapse_border_div
			.. "<dl class=\"zhusex\">"
			.. trad_text
			.. trad_tag
			.. (simp_text or "")
			.. (simp_tag or "")
			.. (reference or "")
			.. (phonetic and phonetic .. tr_tag or "")
			.. (audio or "")
			.. translation
			.. "</dl>"
			.. (cat or "")
			.. collapse_border_div_end
	end

	-- inline layout: everything on one line, separated by dashes
	trad_text = zh_format_start_trad .. trad_text .. zh_format_end
	divider = " ―  "

	if variety ~= "cmn" then
		ts_tag = tag_start .. desc .. tag_end
		tr_tag = tag_start .. tr_desc .. tag_end
	end

	if not phonetic then
		translation = "''" .. translation .. "''"
	end

	if do_conv then
		simp_text = "<span lang=\"zh-Hani\" class=\"Hani\">／</span>" .. zh_format_start_simp .. simp_text .. zh_format_end -- closing tag restored
	end

	if audio_file then
		audio = " [[File:" .. audio_file .. "|25px]]" -- restored
	end

	return trad_text
		.. (simp_text or "")
		.. (ts_tag or "")
		.. divider
		.. (phonetic and phonetic .. (tr_tag or "") .. (audio or "") .. divider or "")
		.. translation
		.. (literal and " (literally, “" .. literal .. "”)" or "")
		.. (cat or "")
end

-- Disabled helper, kept for reference. Line breaks restored from the embedded
-- comment markers; the two `return ''` literals appear to have lost their
-- content in this copy and are left as found.
-- function export.migrate(text, translation, ref)
-- 	if type(text) == "table" then
-- 		if not text.args or not text.args[1] then
-- 			text = text:getParent()
-- 		end
-- 		if text.args[2] and text.args[2] ~= '' then
-- 			ref = text.args[1]
-- 			translation = text.args[3]
-- 			text = text.args[2]
-- 		else
-- 			text = text.args[1]
-- 		end
-- 	end
-- 	text = text:gsub('^[%*#: \n]+', ''):gsub('[ \n]+$', ''):gsub(' +', '　'):gsub('\n+', ' '):gsub('|', '\\'):gsub('\'\'\'%[%[', ' '):gsub('%]%]\'\'\'', ' '):gsub('%]%]%[%[', ' '):gsub('%]%]', ''):gsub('%[%[', '')
-- 		:gsub('\'\'\'', ''):gsub(',', '，'):gsub('!', '！'):gsub('%?', '？')
-- 	if translation then
-- 		if ref and ref ~= '' then
-- 			return ''
-- 		else
-- 			return ''
-- 		end
-- 	else
-- 		return text
-- 	end
-- end

-- Module result: the table holding the public entry point `show`.
return export