-- Module:User:Theknightwho/recognition data/make

local append = require("Module:table").append local concat = table.concat local copy = require("Module:table").shallowcopy local deep_equals = require("Module:table").deepEquals local char = string.char local insert = table.insert local sort = table.sort local split = mw.text.split local u = mw.ustring.char

-- Lookup from script name (the spelling looked up for each token of a data
-- line in process_data) to the script code stored in the output. "Latinx",
-- "polytonic", "Japanese" and "Korean" are extra entries that process_data
-- appends itself for certain scripts.
local script_codes = {
	Adlam = "Adlm",
	Ahom = "Ahom",
	Anatolian_Hieroglyphs = "Hluw",
	Arabic = "Arab",
	Armenian = "Armn",
	Avestan = "Avst",
	Balinese = "Bali",
	Bamum = "Bamu",
	Bassa_Vah = "Bass",
	Batak = "Batk",
	Bengali = "Beng",
	Bhaiksuki = "Bhks",
	Bopomofo = "Bopo",
	Brahmi = "Brah",
	Braille = "Brai",
	Buginese = "Bugi",
	Buhid = "Buhd",
	Canadian_Aboriginal = "Cans",
	Carian = "Cari",
	Caucasian_Albanian = "Aghb",
	Chakma = "Cakm",
	Cham = "Cham",
	Cherokee = "Cher",
	Chorasmian = "Chrs",
	Coptic = "Copt",
	Cuneiform = "Xsux",
	Cypriot = "Cprt",
	Cypro_Minoan = "Cpmn",
	Cyrillic = "Cyrl",
	Deseret = "Dsrt",
	Devanagari = "Deva",
	Dives_Akuru = "Diak",
	Dogra = "Dogr",
	Duployan = "Dupl",
	Egyptian_Hieroglyphs = "Egyp",
	Elbasan = "Elba",
	Elymaic = "Elym",
	Ethiopic = "Ethi",
	Georgian = "Geor",
	Glagolitic = "Glag",
	Gothic = "Goth",
	Grantha = "Gran",
	Greek = "Grek",
	Gujarati = "Gujr",
	Gunjala_Gondi = "Gong",
	Gurmukhi = "Guru",
	Han = "Hani",
	Hangul = "Hang",
	Hanifi_Rohingya = "Rohg",
	Hanunoo = "Hano",
	Hatran = "Hatr",
	Hebrew = "Hebr",
	Hiragana = "Hira",
	Imperial_Aramaic = "Armi",
	Inscriptional_Pahlavi = "Phli",
	Inscriptional_Parthian = "Prti",
	Japanese = "Jpan",
	Javanese = "Java",
	Kaithi = "Kthi",
	Kannada = "Knda",
	Katakana = "Kana",
	Kawi = "Kawi",
	Kayah_Li = "Kali",
	Kharoshthi = "Khar",
	Khitan_Small_Script = "Kits",
	Khmer = "Khmr",
	Khojki = "Khoj",
	Khudawadi = "Sind",
	Korean = "Kore",
	Latinx = "Latinx",
	Lao = "Laoo",
	Latin = "Latn",
	Lepcha = "Lepc",
	Limbu = "Limb",
	Linear_A = "Lina",
	Linear_B = "Linb",
	Lisu = "Lisu",
	Lycian = "Lyci",
	Lydian = "Lydi",
	Mahajani = "Mahj",
	Makasar = "Maka",
	Malayalam = "Mlym",
	Mandaic = "Mand",
	Manichaean = "Mani",
	Marchen = "Marc",
	Masaram_Gondi = "Gonm",
	Medefaidrin = "Medf",
	Meetei_Mayek = "Mtei",
	Mende_Kikakui = "Mend",
	Meroitic_Cursive = "Merc",
	Meroitic_Hieroglyphs = "Mero",
	Miao = "Plrd",
	Modi = "Modi",
	Mongolian = "Mong",
	Mro = "Mroo",
	Multani = "Mult",
	Myanmar = "Mymr",
	Nabataean = "Nbat",
	Nag_Mundari = "Nagm",
	Nandinagari = "Nand",
	New_Tai_Lue = "Talu",
	Newa = "Newa",
	Nko = "Nkoo",
	Nushu = "Nshu",
	Nyiakeng_Puachue_Hmong = "Hmnp",
	Odia = "Orya",
	Ogham = "Ogam",
	Ol_Chiki = "Olck",
	Old_Hungarian = "Hung",
	Old_Italic = "Ital",
	Old_North_Arabian = "Narb",
	Old_Permic = "Perm",
	Old_Persian = "Xpeo",
	Old_Sogdian = "Sogo",
	Old_South_Arabian = "Sarb",
	Old_Turkic = "Orkh",
	Old_Uyghur = "Ougr",
	Osage = "Osge",
	Osmanya = "Osma",
	Pahawh_Hmong = "Hmng",
	Palmyrene = "Palm",
	Pau_Cin_Hau = "Pauc",
	Phags_Pa = "Phag",
	Phoenician = "Phnx",
	polytonic = "polytonic",
	Psalter_Pahlavi = "Phlp",
	Rejang = "Rjng",
	Runic = "Runr",
	Samaritan = "Samr",
	Saurashtra = "Saur",
	Sharada = "Shrd",
	Shavian = "Shaw",
	Siddham = "Sidd",
	SignWriting = "Sgnw",
	Sinhala = "Sinh",
	Sogdian = "Sogd",
	Sora_Sompeng = "Sora",
	Soyombo = "Soyo",
	Sundanese = "Sund",
	Syloti_Nagri = "Sylo",
	Syriac = "Syrc",
	Tagalog = "Tglg",
	Tagbanwa = "Tagb",
	Tai_Le = "Tale",
	Tai_Tham = "Lana",
	Tai_Viet = "Tavt",
	Takri = "Takr",
	Tamil = "Taml",
	Tangsa = "Tnsa",
	Tangut = "Tang",
	Telugu = "Telu",
	Thaana = "Thaa",
	Thai = "Thai",
	Tibetan = "Tibt",
	Tifinagh = "Tfng",
	Tirhuta = "Tirh",
	Toto = "Toto",
	Ugaritic = "Ugar",
	Vai = "Vaii",
	Vithkuqi = "Vith",
	Wancho = "Wcho",
	Warang_Citi = "Wara",
	Yezidi = "Yezi",
	Yi = "Yiii",
	Zanabazar_Square = "Zanb",
}

-- Make every code resolve to itself as well, so a lookup succeeds whether a
-- data line spells a script by name or by code. The values are snapshotted
-- first because new keys must not be added to a table while pairs() is
-- traversing it.
local known_codes = {}
for _, code in pairs(script_codes) do
	known_codes[#known_codes + 1] = code
end
for _, code in ipairs(known_codes) do
	script_codes[code] = code
end

-- Character-class contents used by process_data to decide whether a codepoint
-- counts as plain "Latn"/"Grek" or only as "Latinx"/"polytonic".
local scripts_module_data = require("Module:scripts/data")
local Grek_chars = scripts_module_data.Grek.characters
local Latn_chars = scripts_module_data.Latn.characters

--- Return the raw content of the wiki page `title`.
-- Raises an error naming the page when the title cannot be created or the
-- page has no content, instead of failing later with an opaque nil index.
-- @param title string full page title
-- @return string page content
local function read_page(title)
	local page = mw.title.new(title)
	-- getContent is a method and must be called.
	local content = page and page:getContent()
	if not content then
		error("Unable to read page content: " .. title)
	end
	return content
end

local scripts_data = read_page("User:Theknightwho/Unicode/Scripts.txt")
local script_extensions_data = read_page("User:Theknightwho/Unicode/ScriptExtensions.txt")

-- Byte-keyed trie: each level is indexed by one UTF-8 byte of a character and
-- the final byte maps to a ", "-separated string of script codes.
local codepoints = {}

--- Report whether `script` is one of the ", "-separated codes in `list`.
-- Compares whole tokens in plain-text mode, so a code can never match as a
-- substring of another code or be interpreted as a Lua pattern.
local function has_script(list, script)
	return (", " .. list .. ", "):find(", " .. script .. ", ", 1, true) ~= nil
end

--- Parse one data file and record the scripts of every codepoint it lists.
-- Each usable line starts with a hex codepoint or a "XXXX..YYYY" range and
-- contains "; <names> #". Every name is looked up in script_codes; tokens
-- with no entry (including the ";" and "#" delimiters kept by the match)
-- are skipped. Results are written into the `codepoints` trie, keyed by the
-- UTF-8 bytes of each character, as a ", "-separated string of codes.
-- @param data string full text of the data file
local function process_data(data)
	local umatch = mw.ustring.match
	-- Loop-invariant character classes: build them once, not per codepoint.
	local latn_class = "[" .. Latn_chars .. "]"
	local grek_class = "[" .. Grek_chars .. "]"

	for line in data:gmatch("([^\n]+)%f[%z\n]") do
		local range = line:match("^([%x%.]+)")
		local names = line:match("; [^#]+ #")
		if range and names then
			local bounds = split(range, "%.%.")
			for idx, hex in ipairs(bounds) do
				bounds[idx] = tonumber(hex, 16)
			end
			names = split(names, " ")

			for cp = bounds[1], bounds[2] or bounds[1] do
				local ch = u(cp)
				local bytes = {ch:byte(1, #ch)}

				-- Walk (creating as needed) the trie nodes for all but the
				-- final byte.
				local node = codepoints
				for j = 1, #bytes - 1 do
					local byte = bytes[j]
					node[byte] = node[byte] or {}
					node = node[byte]
				end
				local last = bytes[#bytes]

				-- Per-codepoint work list. Entries appended below are picked
				-- up by this same ipairs loop, which is how the derived
				-- scripts get recorded.
				local queue = copy(names)
				for _, name in ipairs(queue) do
					local script = script_codes[name]
					if script == "Latn" then
						if umatch(ch, latn_class) then
							insert(queue, "Latinx")
						else
							script = "Latinx"
						end
					elseif script == "Grek" then
						if umatch(ch, grek_class) then
							insert(queue, "polytonic")
						else
							script = "polytonic"
						end
					elseif script == "Hira" or script == "Kana" then
						insert(queue, "Japanese")
					elseif script == "Hang" then
						insert(queue, "Korean")
					elseif script == "Hani" then
						insert(queue, "Japanese")
						insert(queue, "Korean")
					end

					local existing = node[last]
					if script and not (existing and has_script(existing, script)) then
						node[last] = (existing and existing .. ", " .. script) or script
					end
				end
			end
		end
	end
end

process_data(scripts_data)
process_data(script_extensions_data)

-- For each script code, the codepoints (single values or inclusive
-- {first, last} ranges) whose script list should have that code moved to the
-- front by the reordering loop that follows this table.
local main = {}

main.Arab = {
	{0x060C},
	{0x061B, 0x061C},
	{0x061F},
	{0x0640},
	{0x064B, 0x0655},
	{0x0660, 0x0669},
	{0x0670},
	{0x06D4},
	{0xFD3E, 0xFD3F},
	{0xFDF2},
	{0xFDFD},
}

main.Beng = {
	{0x09E6, 0x09EF},
}

main.Copt = {
	{0x102E0, 0x102FB},
}

main.Cyrl = {
	{0x0483, 0x0487},
	{0x1DF8},
	{0x2E43},
	{0xA66F},
}

main.Deva = {
	{0x0951, 0x0952},
	{0x0964, 0x096F},
	{0x1CD0, 0x1CF6},
	{0x1CF8, 0x1CF9},
	{0xA830, 0xA839},
	{0xA8F1},
	{0xA8F3},
}

main.Dupl = {
	{0x1BCA0, 0x1BCA3},
}

main.Geor = {
	{0x10FB},
}

main.Gran = {
	{0x11301},
	{0x11303},
	{0x1133B, 0x1133C},
}

main.Gujr = {
	{0x0AE6, 0x0AEF},
}

main.Guru = {
	{0x0A66, 0x0A6F},
}

main.Hani = {
	{0x3001, 0x3003},
	{0x3006},
	{0x3008, 0x3011},
	{0x3013, 0x301F},
	{0x302A, 0x302D},
	{0x3030},
	{0x3037},
	{0x303E, 0x303F},
	{0x3190, 0x319F},
	{0x31C0, 0x31E3},
	{0x3220, 0x3247},
	{0x3280, 0x32B0},
	{0x32C0, 0x32CB},
	{0x32FF},
	{0x3358, 0x3370},
	{0x337B, 0x337F},
	{0x33E0, 0x33FE},
	{0xA700, 0xA707},
	{0xFE45, 0xFE46},
	{0xFF61, 0xFF64},
	{0x1D360, 0x1D371},
	{0x1F250, 0x1F251},
}

main.Hano = {
	{0x1735, 0x1736},
}

main.Java = {
	{0xA9CF},
}

main.Jpan = {
	{0x3031, 0x3035},
	{0x303C, 0x303D},
	{0x3099, 0x309C},
	{0x30A0},
	{0x30FC},
	{0xFF70},
	{0xFF9E, 0xFF9F},
}

main.Kali = {
	{0xA92E},
}

main.Kana = {
	{0x30FB},
	{0xFF65},
}

main.Knda = {
	{0x0CE6, 0x0CEF},
}

main.Latinx = {
	{0x0363, 0x036F},
	{0x20F0},
}

main.Linb = {
	{0x10100, 0x10102},
	{0x10107, 0x10133},
	{0x10137, 0x1013F},
}

main.Mani = {
	{0x10AF2},
}

main.Mong = {
	{0x1802, 0x1803},
	{0x1805},
	{0x202F},
}

main.Mymr = {
	{0x1040, 0x1049},
}

main.Nand = {
	{0x1CFA},
}

main.polytonic = {
	{0x0342},
	{0x0345},
	{0x1DC0, 0x1DC1},
}

main.Syrc = {
	{0x1DFA},
}

main.Taml = {
	{0x0BE6, 0x0BF3},
	{0x11FD0, 0x11FD1},
	{0x11FD3},
}

--- Move `script` to the front of a ", "-separated list of script codes.
-- The relative order of the remaining codes is preserved. The list is
-- returned unchanged (re-joined) when `script` is absent or already first.
-- A table.sort comparator of the form `a == script` is not a strict ordering
-- and can raise "invalid order function for sorting", so the move is done
-- explicitly instead.
-- @param list string ", "-separated script codes
-- @param script string code to promote
-- @return string reordered list
local function promote_script(list, script)
	local names = split(list, ", ")
	for idx, name in ipairs(names) do
		if name == script then
			if idx > 1 then
				table.remove(names, idx)
				insert(names, 1, name)
			end
			break
		end
	end
	return concat(names, ", ")
end

-- For every codepoint listed in `main`, put the designated script first in
-- that codepoint's entry of the `codepoints` trie.
for script, ranges in pairs(main) do
	for _, range in ipairs(ranges) do
		for cp = range[1], range[2] or range[1] do
			local ch = u(cp)
			local bytes = {ch:byte(1, #ch)}

			-- Descend to the node holding the final byte; stop descending if
			-- the path was never created by process_data.
			local node = codepoints
			for j = 1, #bytes - 1 do
				node = type(node) == "table" and node[bytes[j]] or nil
			end

			local last = bytes[#bytes]
			local entry = type(node) == "table" and node[last] or nil
			-- Codepoints with no recorded scripts have nothing to reorder.
			if type(entry) == "string" then
				node[last] = promote_script(entry, script)
			end
		end
	end
end

--- Collapse one level of the byte trie into a table keyed by byte ranges.
-- Child tables are compressed first (over bytes 128..191). Keys a..b are
-- then grouped by deep-equal value, each group tracking the inclusive byte
-- ranges it covers.
-- If the first group is the single range a..b, its value is returned
-- directly (this may be nil when nothing is stored at this level).
-- Otherwise a new table is returned whose keys are strings of decimal byte
-- escapes: "\DDD" for one byte, "\DDD\DDD" for two adjacent bytes and
-- "\DDD-\DDD" for a longer range, concatenated per group. Groups whose
-- value is nil produce no entry.
-- @param t table trie level, modified in place (children are replaced)
-- @param a number first byte to consider
-- @param b number last byte to consider
-- @return the shared value, or a table keyed by byte-range strings
local function compress(t, a, b)
	-- Each group is {value = v, bounds = {lo1, hi1, lo2, hi2, ...}}. Keeping
	-- the value in a named field means `bounds` is always a proper sequence,
	-- even when the value is nil, so `#bounds` is well defined.
	local groups = {}

	for i = a, b do
		local value = t[i]
		if type(value) == "table" then
			value = compress(value, 128, 191)
			t[i] = value
		end

		local matched = false
		-- Search newest-first so a run of equal values extends the most
		-- recently touched group.
		for j = #groups, 1, -1 do
			local group = groups[j]
			if deep_equals(value, group.value) then
				local bounds = group.bounds
				if bounds[#bounds] == i - 1 then
					-- Contiguous with the group's last range: extend it.
					bounds[#bounds] = i
				else
					-- Start a new single-byte range in this group.
					insert(bounds, i)
					insert(bounds, i)
				end
				matched = true
				break
			end
		end
		if not matched then
			insert(groups, {value = value, bounds = {i, i}})
		end
	end

	local first = groups[1]
	if first.bounds[1] == a and first.bounds[2] == b then
		return first.value
	end

	local compressed = {}
	for _, group in ipairs(groups) do
		local bounds = group.bounds
		local key = {}
		for k = 1, #bounds, 2 do
			local lo, hi = bounds[k], bounds[k + 1]
			if lo == hi then
				insert(key, ("\\%03d"):format(lo))
			elseif lo == hi - 1 then
				insert(key, ("\\%03d\\%03d"):format(lo, hi))
			else
				insert(key, ("\\%03d-\\%03d"):format(lo, hi))
			end
		end
		compressed[concat(key)] = group.value
	end
	return compressed
end

codepoints = compress(codepoints, 1, 244)

-- Registry of distinct (by deep equality) tables seen so far.
local tables = {}

--- Return the registered table deep-equal to `tbl`, registering `tbl`
-- itself when no equal table has been seen yet.
local function canonical(tbl)
	for _, seen in ipairs(tables) do
		if deep_equals(tbl, seen) then
			return seen
		end
	end
	insert(tables, tbl)
	return tbl
end

--- Make deep-equal subtables of `t` share a single table object.
-- Works bottom-up: each table-valued field is processed recursively and
-- then replaced by the first registered table that is deep-equal to it.
-- `t` itself is registered so that later duplicates of it can reuse it.
-- Only existing keys are reassigned during the pairs() traversal.
-- @param t table modified in place
local function handle_duplicates(t)
	for k, v in pairs(t) do
		if type(v) == "table" then
			handle_duplicates(v)
			t[k] = canonical(v)
		end
	end
	canonical(t)
end

handle_duplicates(codepoints)

return codepoints