-- Module:User:Suzukaze-c/zh-usex

-- Table of functions exported by this module (returned at the end of the file).
local export = {}

-- Project modules used below: M.ts is applied to the simplified-script copy of
-- each word, m_links.full_link builds the final tagged links.
local M = require("Module:zh")
local m_links = require("Module:links")

-- Per-variety data; var_gen reads the 'var', 'iso' and 'rom' fields of each entry.
local varinfo = mw.loadData("Module:User:Suzukaze-c/zh/data/info").data

-- Short local aliases for the Scribunto string helpers used throughout.
local gsub = mw.ustring.gsub
local split = mw.text.split
local match = mw.ustring.match
local find = mw.ustring.find
local trim = mw.text.trim

-- Character-class patterns used to tell Han characters from everything else.
-- Each definition sits on its own line: when they shared a line, the first
-- trailing comment swallowed the definitions of match_nonHan_mod1 and
-- match_nonHan_mod2, which later code concatenates into patterns.
local match_Han = '[㐀-鿕𠀀-𬺡]'
local match_nonHan = '[^㐀-鿕𠀀-𬺡]'
local match_Han_mod1 = "['㐀-鿕𠀀-𬺡]" -- picks up hanzi + bold
local match_nonHan_mod1 = "[^'@㐀-鿕𠀀-𬺡]" -- takes special syntax into account
local match_nonHan_mod2 = "[^'@^㐀-鿕𠀀-𬺡]" -- include capitalization syntax too

--- Build one variety_list entry from the shared variety info data.
-- @param abbr           key into varinfo (e.g. 'm', 'c', 'mn-t')
-- @param var_page       optional link target for the variety name; nil/false uses varinfo[abbr]['var']
-- @param var_link_name  optional display text for the variety name; nil/false uses varinfo[abbr]['var']
-- @param rom_page       optional link target for the romanisation name; nil/false uses varinfo[abbr]['rom']
-- @param rom_link_name  optional display text for the romanisation name; nil/false uses varinfo[abbr]['rom']
-- @return sequence { variety link, ISO code, romanisation link }
local function var_gen(abbr, var_page, var_link_name, rom_page, rom_link_name)
	local info = varinfo[abbr]

	-- NOTE(review): the string literals around these concatenations were missing
	-- from the pasted source (it read `=  .. (var_link_name or ...) ..  `), which
	-- is a syntax error and left var_page/rom_page unused. They are reconstructed
	-- here as piped links of the form [[w:page|label]]; the "w:" prefix in
	-- particular is an assumption. Confirm against the page history.
	local link_var = '[[w:' .. (var_page or info['var']) .. '|' .. (var_link_name or info['var']) .. ']]'
	local iso = info['iso']
	local link_rom = '[[w:' .. (rom_page or info['rom']) .. '|' .. (rom_link_name or info['rom']) .. ']]'

	return { link_var, iso, link_rom }
end

-- Supported varieties, keyed by the code accepted as the third argument of
-- export.show. Each value is a sequence:
--   [1] variety name, [2] ISO code, [3] romanisation name.
-- Entries built with var_gen take their data from varinfo; the rest are
-- spelled out by hand.
local variety_list = {
	m = var_gen('m'),

	c = var_gen('c'),
	["c-gz"] = var_gen('c', false, 'Guangzhou Cantonese'),
	["c-lit"] = var_gen('c', false, 'Literary Cantonese'),

	md = { "Min Dong", "cdo", "Bàng-uâ-cê / IPA" },

	mn = var_gen('mn'),
	["mn-tw"] = { "Taiwanese", "nan", "Pe̍h-ōe-jī" },
	["mn-t"] = var_gen('mn-t'),
	w = { "Wu", "wuu", "IPA" },
	["w-sh"] = { "Shanghainese", "wuu", "IPA" },
	h = { "Hakka", "hak", "Pha̍k-fa-sṳ" },

	cl = var_gen('cl'),
}

-- Punctuation helper module; its space, convChar and main functions are called
-- further down.
local m_punctuation = require('Module:User:Suzukaze-c/punctuation')

-- Result of langRegexRange for 'zh'. Not referenced in the visible part of
-- this file.
local punctuationZhRegexRange = m_punctuation.langRegexRange('zh')

-- Known source works, keyed by the value accepted for the ref/r argument of
-- export.show. Each value is a sequence:
--   [1] variety code (a key of variety_list), [2] citation text.
local ref_list = {
	Analects  = { "cl", "The Analects of Confucius, circa 475 – 221 BCE" },
	Hanfeizi  = { "cl", "Han Feizi, circa 2nd century BCE" },
	Hanshu    = { "cl", "The Book of Han, circa 1st century CE" },
	Liji      = { "cl", "The Book of Rites, circa 4th – 2nd century BCE" },
	Mengzi    = { "cl", "Mengzi (Mencius), circa 4th century BCE" },
	Mozi      = { "cl", "Mozi (book), circa 4th century BCE" },
	Shangshu  = { "cl", "The Book of Documents, circa 4th – 3rd century BCE" },
	Shiji     = { "cl", "The Records of the Grand Historian, by Sima Qian, circa 91 BCE" },
	Shijing   = { "cl", "The Classic of Poetry, circa 11th – 7th centuries BCE" },
	Shujing   = { "cl", "The Book of Documents, circa 7th – 4th centuries BCE" },
	Shuowen   = { "cl", "Shuowen Jiezi, circa 2nd century CE" },
	Houhanshu = { "cl", "The Book of the Later Han, circa 5th century CE" },
	Yijing    = { "cl", "I Ching, 3rd – 2nd millennia BCE" },
	Zhanguoce = { "cl", "Zhan Guo Ce, circa 5th – 3rd centuries BCE" },
	Zhuangzi  = { "cl", "Zhuangzi, circa 3rd – 2nd centuries BCE" },
}

-- TODO: finish this, for use within export.show and within testcases
-- NOTE: don't concat d.ex or d.tr into a string (remember the idea for tidying up pinyin using an external function that takes a table, or the possibility of sending d.ex to an external function for conversion into romanization?)
-- IDEAS:
-- combining with zh-l?
-- allowing manual wikilinks?

--- Pre-process the raw example text of a usex (work in progress).
-- Marks the bolded (or page-name) portions with an escape token, collapses
-- runs of spaces, protects spaces between non-Han characters, and pads
-- punctuation and '\n' markers with spaces.
-- @param d  table with at least d.ex (example text) and d.pagename; d.ex is
--           rewritten in place
-- @return the same table d
function export.process(d)
	-- $d [data]
	-- $p [processed]
	-- =require('Module:debug').dump(p.process({ex='lorem ipsum 牛腩飯',pagename='飯'}))

	-- d.ex
	-- d.tr
	-- d.variety
	-- d.pagename

	local b_esc = '㊟⒝㊟'
	local sp_esc = '㊟⒮㊟'

	-- Escape pattern magic characters so a page title is matched literally.
	local function escape_pattern(text)
		return (gsub(text, '([%^%$%(%)%%%.%[%]%*%+%-%?])', '%%%1'))
	end

	-- 「'''美國 華盛頓州'''」→「㊟⒝㊟美國㊟⒝㊟ ㊟⒝㊟華盛頓州㊟⒝㊟」
	-- NOTE(review): the ''' delimiters in the pattern (and in the example above)
	-- were missing from the pasted source, which read "([^']+)" and would have
	-- wrapped every apostrophe-free run. Reconstructed; confirm against the
	-- page history.
	if find(d.ex, "'''") then
		d.ex = gsub(d.ex, "'''([^']+)'''", function(text)
			return b_esc .. gsub(text, ' ', b_esc .. ' ' .. b_esc) .. b_esc
		end)
	else
		-- no manual bold: mark every literal occurrence of the page name
		d.ex = gsub(d.ex, escape_pattern(d.pagename), function(hit)
			return b_esc .. hit .. b_esc
		end)
	end

	-- space cleanup
	d.ex = gsub(d.ex, ' +', ' ')

	-- preserve spaces between Latn words
	-- 「lorem ipsum」→「lorem㊟⒮㊟ipsum」
	d.ex = gsub(d.ex, "(" .. match_nonHan .. ") (" .. match_nonHan .. ")", "%1" .. sp_esc .. "%2")

	-- pad punctuation with spaces
	d.ex = m_punctuation.space(d.ex, 'zh')
	-- pad '\n' with spaces (never part of a word)
	d.ex = gsub(d.ex, '\\n', ' \\n ')

	return d
end

--- Render a Chinese usage example (experimental version with a debug dump).
-- Reads from frame.args:
--   [1]       example text (required; raises 'Example unspecified.' if absent)
--   [2]       translation
--   [3]       variety code, a key of variety_list; defaults to the variety of
--             a known `ref`, otherwise 'm'
--   tr        manual romanisation, replaces the generated one
--   ref / r   key of ref_list, or free text used as the reference
--   TEST-01   page name override (otherwise the current page title is used)
-- @param frame  Scribunto frame object
-- @return wikitext: the formatted example, the romanisation/translation part,
--         and a preprocessed debug dump of the intermediate values
--
-- NOTE(review): the pasted source had its line breaks collapsed (so `--`
-- comments swallowed code) and some string literals emptied. Line structure
-- is restored here; every reconstructed literal is marked with its own
-- NOTE(review) below.
function export.show(frame)
	local example = frame.args[1] or error('Example unspecified.')
	local manual_tr = frame.args['tr'] or false
	local translation = frame.args[2] or ' Lacking translation. '
	local ref = frame.args['ref'] or frame.args['r'] or false
	local variety = frame.args[3] or (ref_list[ref] and ref_list[ref][1] or false) or 'm'
	-- getCurrentTitle is a function: it has to be called before reading .text
	local pagename = frame.args['TEST-01'] or mw.title.getCurrentTitle().text

	-- fail with a readable message instead of "attempt to index a nil value"
	local variety_data = variety_list[variety] or error('Unknown variety: ' .. tostring(variety))
	local variety_name = variety_data[1]
	local iso = variety_data[2]
	local variety_rom_name = variety_data[3]
	local trad_example, simp_example, tr_example = {}, {}, {}

	if ref_list[ref] then
		ref = ref_list[ref][2]
	end

	-- Escape pattern magic characters so the page title is matched literally.
	local function escape_pattern(text)
		return (gsub(text, '([%^%$%(%)%%%.%[%]%*%+%-%?])', '%%%1'))
	end

	-- save approximate usex length for later
	local function length(text)
		text = gsub(text, ' ', '') -- syntax
		text = gsub(text, '{[^}]+}', '') -- syntax
		text = gsub(text, '[/^_.%-]', '') -- syntax
		text = gsub(text, '[^㐀-鿕𠀀-𬺡　-ㄭ！-～][^㐀-鿕𠀀-𬺡　-ㄭ！-～]', '兩') -- more-or-less account for half-width characters
		return mw.ustring.len(text)
	end
	local len = length(example)

	-- 「'''美國 華盛頓州'''」→「'''美國''' '''華盛頓州'''」
	-- NOTE(review): every ''' in this step except the one inside find() was
	-- missing from the pasted source (the literals read "" and " "). They are
	-- reconstructed by analogy with export.process, which does the same split
	-- using an escape token. Confirm against the page history.
	if find(example, "'''") then
		example = gsub(example, "'''([^']+)'''", function(text)
			return "'''" .. gsub(text, " ", "''' '''") .. "'''"
		end)
	else
		-- no manual bold: embolden every literal occurrence of the page name
		example = gsub(example, escape_pattern(pagename), function(hit)
			return "'''" .. hit .. "'''"
		end)
	end

	-- space cleanup
	example = gsub(example, ' +', ' ')

	-- preserve spaces between Latn words
	example = gsub(example, "(" .. match_nonHan_mod1 .. ") (" .. match_nonHan_mod1 .. ")", "%1㍊㍖%2") -- 「lorem ipsum」→「lorem㍊㍖ipsum」
	example = gsub(example, "(" .. match_nonHan_mod1 .. "'*) ('*" .. match_nonHan_mod1 .. ")", "%1㍊㍖%2") -- 「lorem ipsum」→「lorem㍊㍖ipsum」

	-- pad punctuation with spaces
	example = m_punctuation.space(example, 'zh')
	-- pad '\n' with spaces (never part of a word)
	example = gsub(example, '\\n', ' \\n ')

	-- un-split xiehouyu that has been split (only works on the page of the xiehouyu itself...)
	example = gsub(example, escape_pattern((gsub(pagename, '，', ' ， '))), function()
		return pagename
	end)

	-- internal POJ double hyphen markup
	example = gsub(example, '%-%-', '￥')

	-- space cleanup
	example = gsub(example, ' +', ' ')
	example = trim(example)

	local words = split(example, ' ')
	for i, word in ipairs(words) do
		local trad_word, simp_word, tr_word = word, word, word

		-- NOTE(review): `lang` is not defined at this point (the local of that
		-- name is declared only after the loop), so nil is passed to convChar.
		-- Left as in the source; a language code such as 'zh' may have been
		-- intended. Confirm against Module:User:Suzukaze-c/punctuation.
		if word == '\\n' then
			-- NOTE(review): these ' ' literals may have held line-break markup
			-- lost in the paste; confirm.
			trad_word, simp_word, tr_word = ' ', ' ', ' '
		elseif m_punctuation.convChar(word, lang) then
			tr_word = m_punctuation.convChar(word, lang)
		else
			-- { }	change roman, part 1
			if find(trad_word, '%{') then
				trad_word = gsub(trad_word, '{'..'([^{}]+)'..'}', '')
				simp_word = gsub(simp_word, '{'..'([^{}]+)'..'}', '')
			end

			-- [ ]	change simplified
			if find(trad_word, '%[') then
				trad_word = gsub(trad_word, '%['..'('..match_Han_mod1..'+)'..'%]', '')
				simp_word = gsub(simp_word, '('..match_Han..')' .. '%['..'('..match_Han_mod1..'+)'..'%]', '%2')
				tr_word = gsub(tr_word, '%['..'('..match_Han_mod1..'+)'..'%]', '')
			else
				simp_word = M.ts(simp_word)
			end

			-- { }	change roman, part 2
			if find(tr_word, '%{') then
				if iso == 'cmn' then
					tr_word = gsub(tr_word, '('..match_Han..')' .. '{'..'([^{}]+)'..'}', '%2') -- 「要{jiu1}」→「jiu1」
					tr_word = gsub(tr_word, '('..match_nonHan_mod2..'+)' .. '{'..'([^{}]+)'..'}', '%2') -- 「size{saai1 si2}」→「saai1 si2」
				elseif iso == 'nan' or iso == 'hak' or iso == 'cdo' then
					tr_word = gsub(tr_word, '('..match_Han..')' .. '{'..'([^{}]+)'..'}', '%2❖')
					tr_word = gsub(tr_word, '('..match_nonHan_mod2..'+)' .. '{'..'([^{}]+)'..'}', '%2❖')
				else
					tr_word = gsub(tr_word, '('..match_Han..')' .. '{'..'([^{}]+)'..'}', '%2❧')
					tr_word = gsub(tr_word, '('..match_nonHan_mod2..'+)' .. '{'..'([^{}]+)'..'}', '%2❧')
				end
			end

			-- auto roman
			-- TODO: replace with actual auto roman lol
			if iso == 'cmn' then
				tr_word = gsub(tr_word, match_Han, "vē") -- TODO: process with Module:cmn-pron (see current Module:zh-usex)
			elseif iso == 'nan' or iso == 'hak' then
				tr_word = gsub(tr_word, match_Han, 've❖')
			elseif iso == 'yue' then
				tr_word = gsub(tr_word, match_Han, 've1❧')
			end

			-- \	change link text
			trad_word = gsub(trad_word, '\\', '|')
			simp_word = gsub(simp_word, '\\', '|')
			tr_word = gsub(tr_word, '.+\\', '')

			-- ^	capitalize roman
			trad_word = gsub(trad_word, '%^', '')
			simp_word = gsub(simp_word, '%^', '')
			tr_word = gsub(tr_word, '%^(.)', mw.ustring.upper)

			-- _	split link, join roman
			trad_word = gsub(trad_word, '_', ']][[') -- if I replace "_" with " " the space remains after processing
			simp_word = gsub(simp_word, '_', ']][[')
			tr_word = gsub(tr_word, '_', '')

			-- .	join link, split roman
			trad_word = gsub(trad_word, '%.', '')
			simp_word = gsub(simp_word, '%.', '')
			tr_word = gsub(tr_word, '%.', ' ')

			-- ￥	min nan poj double hyphen
			trad_word = gsub(trad_word, '￥', '')
			simp_word = gsub(simp_word, '￥', '')
			tr_word = gsub(tr_word, '￥', '--')

			-- return spaces
			trad_word = gsub(trad_word, '㍊㍖', ' ')
			simp_word = gsub(simp_word, '㍊㍖', ' ')
			tr_word = gsub(tr_word, '㍊㍖', ' ')

			-- linking
			-- NOTE(review): in the pasted source the bold test read find(x, "")
			-- and all four replacement functions read `return  .. text ..  `
			-- (a syntax error). Reconstructed as a "'''" test and a plain
			-- [[...]] wrapper. The bold branch may originally have produced
			-- something different from the plain branch; confirm against the
			-- page history.
			if find(trad_word, "@") or find(simp_word, "@") then
				-- "@" suppresses linking: just drop the marker
				trad_word = gsub(trad_word, '@', '')
				simp_word = gsub(simp_word, '@', '')
				tr_word = gsub(tr_word, '@', '')
			elseif find(trad_word, "'''") or find(simp_word, "'''") then -- with bold formatting
				trad_word = gsub(trad_word, '([^ ]+)', function(text) return '[[' .. text .. ']]' end)
				simp_word = gsub(simp_word, '([^ ]+)', function(text) return '[[' .. text .. ']]' end)
			else
				trad_word = gsub(trad_word, '([^ ]+)', function(text) return '[[' .. text .. ']]' end)
				simp_word = gsub(simp_word, '([^ ]+)', function(text) return '[[' .. text .. ']]' end)
			end
		end

		trad_example[i], simp_example[i], tr_example[i] = trad_word, simp_word, tr_word
	end

	trad_example = table.concat(trad_example, '')
	simp_example = table.concat(simp_example, '')
	tr_example = table.concat(tr_example, ' ')

	-- romanization hyphen substitute
	tr_example = gsub(tr_example, '❖([^A-Za-zÀ-ʬ])', '%1') -- dispose of before a non-letter
	tr_example = gsub(tr_example, '❖$', '') -- dispose of at very end
	tr_example = gsub(tr_example, '❖', '-')

	-- romanization space substitute
	-- NOTE(review): the two substitutions below are identical as pasted, so the
	-- second one cannot match anything; the first pattern may have lost
	-- characters. Both are kept unchanged.
	tr_example = gsub(tr_example, "❧", " ") -- 「一. 」→「jat1❧◆.◇」→「jat1 ◆.◇」
	tr_example = gsub(tr_example, "❧", ' ')

	-- punctuation spacing
	tr_example = m_punctuation.main(tr_example)

	tr_example = trim(tr_example)

	if manual_tr then
		tr_example = manual_tr
	end

	-- roman beautifying
	if iso == 'cmn' then
		-- TODO: format? cmn-pron
	end
	if iso == 'yue' then
		-- TODO: super
	end
	if variety == 'mn-t' then
		-- TODO: super
	end
	if iso == 'cdo' then
		-- TODO: rom + ipa
	end
	if iso == 'wuu' then
		-- TODO: rom > ipa
	end

	-- trad/simp on different lines
	-- NOTE(review): the ' ' searched for here may have been line-break markup
	-- lost in the paste; confirm.
	if ref or find(trad_example, ' ') then
		len = 99
	end

	-- fancy links and language tagging stuff
	local lang, sc = require("Module:languages").getByCode(iso), require("Module:scripts").getByCode('Hani')
	trad_example = m_links.full_link({ lang = lang, term = trad_example .. '//', sc = sc })
	simp_example = m_links.full_link({ lang = lang, term = simp_example, sc = sc })

	-- identical output means there is no separate simplified form to show
	if trad_example == simp_example then
		simp_example = false
	end

	-- tags
	local function tag(text)
		return ' &#91;' .. text .. '&#93; ' -- HTML entity since "MSC" is interpreted poorly
	end
	local tag_text = {
		['ts'] = 'trad. and simp.',
		['t'] = 'trad.',
		['s'] = 'simp.',
	}

	-- add structure
	-- NOTE(review): the pasted source read `tr_example =  .. tr_example ..  ..
	-- tag(...)` (a syntax error): two literals wrapping the romanisation were
	-- lost and cannot be recovered from the visible text, so the romanisation
	-- is left unwrapped. The '' literals in the long branch below may likewise
	-- have held markup. Restore from the page history.
	tr_example = tr_example .. tag(variety_rom_name)
	tr_example = gsub(tr_example, ' +', ' ')
	if len > 10 then
		-- long example: trad and simp separated by a space, translation appended
		if simp_example then
			example = trad_example .. tag(variety_name..', '..tag_text['t']) .. ' ' .. simp_example .. tag(variety_name..', '..tag_text['s'])
		else
			example = trad_example .. tag(variety_name..', '..tag_text['ts'])
		end
		tr_example = '' .. tr_example .. '' .. (ref and ' From: ' .. ref .. ' ' or '') .. '' .. translation .. ''
	else
		-- short example: everything inline, separated by '／' and ' ― '
		if simp_example then
			example = trad_example .. tag(variety_name..', '..tag_text['t']) .. '／' .. simp_example .. tag(variety_name..', '..tag_text['s'])
		else
			example = trad_example .. tag(variety_name..', '..tag_text['ts'])
		end
		tr_example = ' ― ' .. tr_example .. ' ― ' .. translation
	end

	-- debug dump of the intermediate values, appended to the normal output
	local testing = frame:preprocess(' ' .. example .. '\n◆◆◆◆\n' .. tr_example .. '\n◆◆◆◆\n' .. translation .. '\n◆◆◆◆\n' .. len .. '  ') .. ' '
	return example .. tr_example .. testing
end

return export