// User:Erutuon/scripts/scriptRecognition.js

/*	Two functions: a codepoint-to-Wiktionary-script-code function like char_to_script in Module:Unicode data, and a string-to-script-code function. Language-agnostic. Originally from User:Erutuon/scripts/watchlistScriptTagging.js and User:Erutuon/scripts/scriptTitles.js. Requires ECMAScript 2016 (ES7) because it uses Array.prototype.includes. */

/* jshint esversion: 6 */ /* globals mw */

(function scriptRecognitionIIFE { 'use strict';

// The following data is from Module:Unicode data/scripts and ultimately
// based on Module:scripts/data.
//
// scriptRanges[g] lists the [first, last, scriptCode] ranges (both bounds
// inclusive) for the code points whose value >> 12 equals g, i.e. one entry
// per block of 0x1000 code points. Inside a group the ranges must be sorted
// in ascending order and must not overlap, because they are looked up with a
// short-circuiting linear search or with a binary search.
const scriptRanges = [
	[ // 0x0000-0x0FFF
		[0x41, 0x5A, 'Latn'], [0x61, 0x7A, 'Latn'], [0xC0, 0xD6, 'Latn'], [0xD8, 0xF6, 'Latn'],
		[0xF8, 0x24F, 'Latn'], [0x370, 0x3E1, 'Grek'], [0x3E2, 0x3EF, 'Copt'], [0x3F0, 0x3FF, 'Grek'],
		[0x400, 0x45F, 'Cyrl'], [0x464, 0x469, 'Cyrs'], [0x46A, 0x46D, 'Cyrl'], [0x46F, 0x471, 'Cyrs'],
		[0x472, 0x475, 'Cyrl'], [0x476, 0x489, 'Cyrs'], [0x48A, 0x527, 'Cyrl'], [0x531, 0x58F, 'Armn'],
		[0x590, 0x5FF, 'Hebr'], [0x600, 0x6FF, 'Arab'], [0x700, 0x74F, 'Syrc'], [0x750, 0x77F, 'Arab'],
		[0x780, 0x7B1, 'Thaa'], [0x7C0, 0x7FA, 'Nkoo'], [0x800, 0x83E, 'Samr'], [0x840, 0x85E, 'Mand'],
		[0x860, 0x86A, 'Syrc'], [0x8A0, 0x8FF, 'Arab'], [0x900, 0x97F, 'Deva'], [0x981, 0x9FA, 'Beng'],
		[0xA01, 0xA75, 'Guru'], [0xA81, 0xAF1, 'Gujr'], [0xB01, 0xB77, 'Orya'], [0xB82, 0xBFA, 'Taml'],
		[0xC01, 0xC7F, 'Telu'], [0xC82, 0xCF2, 'Knda'], [0xD02, 0xD7F, 'Mlym'], [0xD82, 0xDF4, 'Sinh'],
		[0xE01, 0xE5B, 'Thai'], [0xE81, 0xEDF, 'Laoo'], [0xF00, 0xFDA, 'Tibt'],
	],
	[ // 0x1000-0x1FFF
		[0x1000, 0x109F, 'Mymr'],
		[0x10A0, 0x10CD, 'Geok'], // Asomtavruli
		[0x10D0, 0x10FC, 'Geor'], // Mkhedruli
		[0x1100, 0x11FF, 'Hang'], [0x1200, 0x1399, 'Ethi'], [0x13A0, 0x13F4, 'Cher'], [0x1400, 0x167F, 'Cans'],
		[0x1680, 0x169C, 'Ogam'], [0x16A0, 0x16F0, 'Runr'], [0x1700, 0x1714, 'Tglg'], [0x1720, 0x1734, 'Hano'],
		[0x1740, 0x1753, 'Buhd'], [0x1760, 0x1773, 'Tagb'], [0x1780, 0x17F9, 'Khmr'], [0x1800, 0x18AA, 'Mong'],
		[0x1900, 0x194F, 'Limb'], [0x1950, 0x1974, 'Tale'], [0x1980, 0x19DF, 'Talu'], [0x19E0, 0x19FF, 'Khmr'],
		[0x1A00, 0x1A1F, 'Bugi'], [0x1A20, 0x1AAD, 'Lana'], [0x1B00, 0x1B7C, 'Bali'], [0x1B80, 0x1BBF, 'Sund'],
		[0x1BC0, 0x1BFF, 'Batk'], [0x1C00, 0x1C4F, 'Lepc'], [0x1C50, 0x1C7F, 'Olck'], [0x1E00, 0x1EFF, 'Latn'],
		[0x1F00, 0x1FFE, 'polytonic'],
	],
	[ // 0x2000-0x2FFF
		[0x2200, 0x22FF, 'Zmth'], [0x2300, 0x23F3, 'Zsym'], [0x2500, 0x27BF, 'Zsym'], [0x27C0, 0x27EF, 'Zmth'],
		[0x2800, 0x28FF, 'Brai'], [0x2980, 0x29FF, 'Zmth'], [0x2A00, 0x2AFF, 'Zmth'], [0x2C00, 0x2C5E, 'Glag'],
		[0x2C60, 0x2C7F, 'Latinx'], [0x2C80, 0x2CFF, 'Copt'],
		[0x2D00, 0x2D2D, 'Geok'], // Nuskhuri
		[0x2D30, 0x2D7F, 'Tfng'], [0x2D80, 0x2DDE, 'Ethi'], [0x2E80, 0x2FDF, 'Hani'],
	],
	[ // 0x3000-0x3FFF
		[0x3000, 0x303F, 'Hani'], [0x3041, 0x309F, 'Hira'], [0x30A0, 0x30FF, 'Kana'], [0x3105, 0x312D, 'Bopo'],
		[0x3131, 0x318E, 'Hang'], [0x31A0, 0x31BA, 'Bopo'], [0x31C0, 0x31E3, 'Hani'], [0x31F0, 0x31FF, 'Kana'],
		[0x3300, 0x3357, 'Kana'], [0x337B, 0x337F, 'Hani'], [0x3400, 0x3FFF, 'Hani'],
	],
	[ // 0x4000-0x4FFF
		[0x4000, 0x4DB5, 'Hani'], [0x4E00, 0x4FFF, 'Hani'],
	],
	[ [0x5000, 0x5FFF, 'Hani'] ],
	[ [0x6000, 0x6FFF, 'Hani'] ],
	[ [0x7000, 0x7FFF, 'Hani'] ],
	[ [0x8000, 0x8FFF, 'Hani'] ],
	[ [0x9000, 0x9FFF, 'Hani'] ],
	[ // 0xA000-0xAFFF
		[0xA000, 0xA4C6, 'Yiii'], [0xA4D0, 0xA4FF, 'Lisu'], [0xA500, 0xA62B, 'Vaii'], [0xA640, 0xA67F, 'Cyrs'],
		[0xA680, 0xA697, 'Cyrl'], [0xA6A0, 0xA6F7, 'Bamu'], [0xA720, 0xA7FF, 'Latinx'], [0xA800, 0xA82B, 'Sylo'],
		[0xA840, 0xA877, 'Phag'], [0xA880, 0xA8D9, 'Saur'], [0xA8E0, 0xA8FB, 'Deva'], [0xA900, 0xA92F, 'Kali'],
		[0xA930, 0xA95F, 'Rjng'], [0xA980, 0xA9DF, 'Java'], [0xA9E0, 0xA9FE, 'Mymr'], [0xAA00, 0xAA5F, 'Cham'],
		[0xAA60, 0xAA7F, 'Mymr'], [0xAA80, 0xAADF, 'Tavt'], [0xAAE0, 0xAAFF, 'Mtei'], [0xAB01, 0xAB2E, 'Ethi'],
		[0xAB30, 0xAB65, 'Latinx'], [0xAB70, 0xABBF, 'Cher'], [0xABC0, 0xABFF, 'Mtei'], [0xAC00, 0xAFFF, 'Hang'],
	],
	[ [0xB000, 0xBFFF, 'Hang'] ],
	[ [0xC000, 0xCFFF, 'Hang'] ],
	[ [0xD000, 0xD7A3, 'Hang'] ],
	[
		// no data for 0xE000-0xEFFF
	],
	[ // 0xF000-0xFFFF
		[0xFB13, 0xFB17, 'Armn'], [0xFB1D, 0xFB4F, 'Hebr'], [0xFB50, 0xFDFD, 'Arab'], [0xFE70, 0xFEFC, 'Arab'],
	],
	[ // 0x10000-0x10FFF
		[0x10000, 0x100FA, 'Linb'], [0x10280, 0x1029C, 'Lyci'], [0x102A0, 0x102D0, 'Cari'], [0x102E1, 0x102FB, 'Copt'],
		[0x10300, 0x10323, 'Ital'], [0x10330, 0x1034A, 'Goth'], [0x10350, 0x1037A, 'Perm'], [0x10380, 0x1039F, 'Ugar'],
		[0x103A0, 0x103D5, 'Xpeo'], [0x10400, 0x1044F, 'Dsrt'], [0x10450, 0x1047F, 'Shaw'], [0x10480, 0x104A9, 'Osma'],
		[0x104B0, 0x104FB, 'Osge'], [0x10500, 0x10527, 'Elba'], [0x10530, 0x10563, 'Aghb'], [0x10600, 0x10767, 'Lina'],
		[0x10800, 0x1083F, 'Cprt'], [0x10840, 0x1085F, 'Armi'], [0x10860, 0x1087F, 'Palm'], [0x10880, 0x108AF, 'Nbat'],
		[0x108E0, 0x108FF, 'Hatr'], [0x10900, 0x1091F, 'Phnx'], [0x10920, 0x1093F, 'Lydi'], [0x10980, 0x1099F, 'Mero'],
		[0x109A0, 0x109BF, 'Merc'], [0x10A00, 0x10A58, 'Khar'], [0x10A60, 0x10A7F, 'Sarb'], [0x10A80, 0x10A9F, 'Narb'],
		[0x10AC0, 0x10AF6, 'Mani'], [0x10B00, 0x10B3F, 'Avst'], [0x10B40, 0x10B5F, 'Prti'], [0x10B60, 0x10B7F, 'Phli'],
		[0x10B80, 0x10BAF, 'Phlp'], [0x10C00, 0x10C48, 'Orkh'], [0x10C80, 0x10CB2, 'Hung'], [0x10E60, 0x10E7E, 'Ruminumerals'],
	],
	[ // 0x11000-0x11FFF
		[0x11000, 0x1106F, 'Brah'], [0x11080, 0x110C1, 'Kthi'], [0x110D0, 0x110F9, 'Sora'], [0x11100, 0x11143, 'Cakm'],
		[0x11150, 0x11176, 'Mahj'], [0x11180, 0x111D9, 'Shrd'], [0x11200, 0x1123D, 'Khoj'], [0x11280, 0x112A9, 'Mult'],
		[0x112B0, 0x112F9, 'Sind'], [0x11301, 0x11374, 'Gran'], [0x11400, 0x1145D, 'Newa'], [0x11480, 0x114D9, 'Tirh'],
		[0x11580, 0x115DD, 'Sidd'], [0x11600, 0x11659, 'Modi'], [0x11680, 0x116C9, 'Takr'], [0x11700, 0x1173F, 'Ahom'],
		[0x118A0, 0x118FF, 'Wara'], [0x11A00, 0x11A47, 'Zanb'], [0x11A50, 0x11AA2, 'Soyo'], [0x11AC0, 0x11AF8, 'Pauc'],
		[0x11C00, 0x11C6C, 'Bhks'], [0x11C70, 0x11CB6, 'Marc'], [0x11D00, 0x11D59, 'Gonm'],
	],
	[ // 0x12000-0x12FFF
		[0x12000, 0x1236E, 'Xsux'], [0x12400, 0x12473, 'Xsux'],
	],
	[ [0x13000, 0x1342E, 'Egyp'] ],
	[ [0x14400, 0x14646, 'Hluw'] ],
	[
		// no data for 0x15000-0x15FFF
	],
	[ // 0x16000-0x16FFF
		[0x16800, 0x16A38, 'Bamu'], [0x16A40, 0x16A6F, 'Mroo'], [0x16AD0, 0x16AF5, 'Bass'], [0x16B00, 0x16B8F, 'Hmng'],
		[0x16F00, 0x16F9F, 'Plrd'],
	],
	[ [0x17000, 0x17FFF, 'Tang'] ],
	[ // 0x18000-0x18FFF
		[0x18000, 0x187EC, 'Tang'], [0x18800, 0x18AF2, 'Tang'],
	],
	[
		// no data for 0x19000-0x19FFF
	],
	[
		// no data for 0x1A000-0x1AFFF
	],
	[ // 0x1B000-0x1BFFF
		[0x1B002, 0x1B11E, 'Hira'], // no unique code for hentaigana on Wiktionary
		[0x1B170, 0x1B2FB, 'Nshu'], [0x1BC00, 0x1BC9F, 'Dupl'],
	],
	[
		// no data for 0x1C000-0x1CFFF
	],
	[ // 0x1D000-0x1DFFF
		[0x1D100, 0x1D1DD, 'musical'], [0x1D400, 0x1D7FF, 'Zmth'], [0x1D800, 0x1DAAF, 'Sgnw'],
	],
	[ // 0x1E000-0x1EFFF
		[0x1E000, 0x1E02A, 'Glag'], [0x1E800, 0x1E8D6, 'Mend'], [0x1E900, 0x1E95F, 'Adlm'],
	],
	[ [0x1F300, 0x1F6C5, 'Zsym'] ],
	[ [0x20000, 0x20FFF, 'Hani'] ],
	[ [0x21000, 0x21FFF, 'Hani'] ],
	[ [0x22000, 0x22FFF, 'Hani'] ],
	[ [0x23000, 0x23FFF, 'Hani'] ],
	[ [0x24000, 0x24FFF, 'Hani'] ],
	[ [0x25000, 0x25FFF, 'Hani'] ],
	[ [0x26000, 0x26FFF, 'Hani'] ],
	[ [0x27000, 0x27FFF, 'Hani'] ],
	[ [0x28000, 0x28FFF, 'Hani'] ],
	[ [0x29000, 0x29FFF, 'Hani'] ],
	[ [0x2A000, 0x2AFFF, 'Hani'] ],
	[ [0x2B000, 0x2BFFF, 'Hani'] ],
	[ [0x2C000, 0x2CFFF, 'Hani'] ],
	[ [0x2D000, 0x2DFFF, 'Hani'] ],
	[ [0x2E000, 0x2EBE0, 'Hani'] ],
];

// Single code points mapped directly to a script code. codePointToScript
// consults this table before it tries any group or range lookup.
const charToScript = {
	// Cyrillic
	0x460: 'Cyrs',
	0x461: 'Cyrs',
	0x462: 'Cyrl',
	0x463: 'Cyrl',

	// Mathematical and other symbols
	0x2135: 'Zmth',
	0x2190: 'Zsym',
	0x21FF: 'Zsym',

	// Han characters in the 0xFA00 range
	0xFA0E: 'Hani',
	0xFA0F: 'Hani',
	0xFA11: 'Hani',
	0xFA13: 'Hani',
	0xFA14: 'Hani',
	0xFA1F: 'Hani',
	0xFA21: 'Hani',
	0xFA23: 'Hani',
	0xFA24: 'Hani',
	0xFA27: 'Hani',
	0xFA28: 'Hani',
	0xFA29: 'Hani',

	// Supplementary planes
	0x1056F: 'Aghb',
	0x16FE0: 'Tang',
	0x1B000: 'Kana',
	0x1B001: 'Hira'
};

// Groups of 4096 (0x1000) codepoints.
// Each entry is [first group, last group, script code] with inclusive bounds,
// where a code point's group number is its value >> 12. Entries are sorted in
// ascending order so they can be searched like ordinary code point ranges.
const groupToScript = [
	[ 4, 9, 'Hani' ],
	[ 11, 13, 'Hang' ],
	[ 32, 46, 'Hani' ]
];

// Used to decide which script "wins" when text contains characters from two
// script categories: the script named by a key is overruled by the script
// named by its value.
// For example, text containing both Grek and polytonic characters should be
// tagged as polytonic.
const scriptOverrulings = {
	'Grek': 'polytonic',
	'Cyrl': 'Cyrs',
	'Latinx': 'Latn',
};

/*	Handles scripts that contain two or more basic scripts.

	The first item is the code of the compound script; the other items are the
	component scripts, or in the case of Hani, the other scripts that can be
	used alongside it.

	Hani is treated as a compound script for convenience, because it sometimes
	uses Latn characters.
*/
const compoundScripts = [
	[ "Hani", "Latn" ],
	[ "Jpan", "Hani", "Hira", "Kana", "Latn" ],
	[ "Kore", "Hang", "Hani", "Latn" ]
];

// Debug logger. It forwards to console.log only when window.scriptRecognition.log
// is truthy at the time this script is evaluated; otherwise it is a no-op.
const log = window.scriptRecognition && window.scriptRecognition.log
	? console.log.bind(console)
	: () => {};

// Scans `ranges`, an array of [lower, higher, result] triples with inclusive
// bounds sorted in ascending order, and returns the `result` of the first
// range that contains `codePoint`. Returns null when no range contains it.
function linearSearch(codePoint, ranges) {
	for (const [lower, higher, result] of ranges) {
		// If ranges are greater than codepoint, no match will be found.
		// Short-circuit the loop.
		if (codePoint < lower)
			return null;
		if (codePoint <= higher)
			return result;
	}
	// The value lies above every range. Report that as null, exactly like the
	// short-circuit above, so callers that compare against null are not handed
	// undefined.
	return null;
}

// Sort comparator for [lower, higher, ...] ranges: orders by lower bound and
// falls back to the higher bound when the lower bounds are equal.
function compareRanges(range1, range2) {
	const lower1 = range1[0];
	const lower2 = range2[0];
	if (lower1 !== lower2)
		return lower1 - lower2;
	return range1[1] - range2[1];
}

// Binary search over `ranges`, an ascending array of [lower, higher, result]
// triples. Returns the `result` of the range containing `codePoint`, or null
// if there is none or if `ranges` is falsy.
//
// Every range that is found is remembered in binarySearch.cache, which is kept
// sorted with compareRanges and is tried first (via linearSearch) on later
// calls. The cache is a single list shared by all calls, whatever `ranges`
// they were given.
function binarySearch(codePoint, ranges) {
	if (!ranges)
		return null;

	if (!binarySearch.cache) {
		binarySearch.cache = [];
	} else {
		const cachedScript = linearSearch(codePoint, binarySearch.cache);
		if (cachedScript)
			return cachedScript;
	}

	let low = 0;
	let high = ranges.length;

	while (low <= high) {
		const mid = (low + high) >> 1;
		const candidate = ranges[mid];
		// `high` starts one past the last index, so `mid` can run off the end
		// of the array; that means the value is above every range.
		if (!candidate)
			break;

		if (codePoint < candidate[0]) {
			high = mid - 1;
		} else if (codePoint <= candidate[1]) {
			binarySearch.cache.push(candidate);
			binarySearch.cache.sort(compareRanges);
			return candidate[2];
		} else {
			low = mid + 1;
		}
	}

	return null;
}

// Returns a string (a Wiktionary script code) or null.
// Lookup order: the exact code point in charToScript, then the code point's
// 0x1000-wide group in groupToScript, then that group's ranges in scriptRanges.
function codePointToScript(codePoint) {
	let script = charToScript[codePoint];
	if (script)
		return script;

	const group = codePoint >> 12;
	// groupToScript has the same [lower, higher, result] shape as a range
	// list, so the range search can be reused on group numbers.
	script = linearSearch(group, groupToScript);
	if (script)
		return script;

	const ranges = scriptRanges[group];
	if (ranges === undefined)
		return null;

	// Lists with more than 5 ranges go through the caching binary search;
	// shorter ones are simply scanned.
	const search = ranges.length > 5 ? binarySearch : linearSearch;
	// Turn any "no match" value into null so that the documented contract
	// holds regardless of which search ran.
	return search(codePoint, ranges) || null;
}

// Returns the distinct script codes found in `string`, in order of first
// appearance. The string is walked code point by code point, and characters
// for which codePointToScript returns null are skipped.
function getScripts(string) {
	const found = new Set();
	for (const character of string) {
		const script = codePointToScript(character.codePointAt(0));
		if (script === null)
			continue;
		// A Set ignores repeated values and iterates in insertion order.
		found.add(script);
	}
	return Array.from(found);
}

// Chooses one script out of an array of two or more scripts, or finds an
// appropriate compound script.
// The scripts are folded pairwise from left to right. For each pair, a script
// that overrules the other (see scriptOverrulings) wins; failing that, the
// first compound script listing both is used; failing that, the result is
// undefined. An empty array yields undefined and a one-element array yields
// that element. `string` is only used in the log message.
function resolveScriptConflicts(scripts, string) {
	// Seed the fold explicitly: reduce() without an initial value throws a
	// TypeError when the array is empty.
	const [firstScript, ...otherScripts] = scripts;

	return otherScripts.reduce((winningScript, script) => {
		if (winningScript === script || scriptOverrulings[script] === winningScript)
			return winningScript;

		if (scriptOverrulings[winningScript] === script) {
			log(`${script} won out over ${winningScript}`);
			return script;
		}

		const compoundScript = compoundScripts.find(
			componentScripts => componentScripts.includes(winningScript)
				&& componentScripts.includes(script));
		if (compoundScript) {
			log(`${script} and ${winningScript} were replaced with ${compoundScript[0]}`);
			return compoundScript[0];
		}

		log(`No script chosen out of ${scripts.join(", ")} in this string: ${string}.`);
		return undefined;
	}, firstScript);
}

// Returns the single script code chosen for `text`, or undefined when
// getScripts finds no script in it. When one or more scripts are found, the
// choice is delegated to resolveScriptConflicts.
function getScript(text) {
	const candidates = getScripts(text);
	if (candidates.length === 0)
		return undefined;
	return resolveScriptConflicts(candidates, text);
}

// Returns true if at least one character of `text` (taken code point by code
// point) maps to the script code `scriptToFind`, and false otherwise.
function containsScript(text, scriptToFind) {
	return [...text].some(
		character => codePointToScript(character.codePointAt(0)) === scriptToFind);
}

// Publish the four public functions as properties of window so that other
// scripts can call them.
Object.assign(window, {
	codePointToScript,
	getScripts,
	getScript,
	containsScript,
});

});