User:Tbot/code/script


#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Tbot/code/script

""" Given a word and language code, return a script template for the en.wikt

"""


# Table of scripts: each entry is (lowest character code point, highest code point + 1, ISO script code).

Scs = [
    # Ranges are half-open: low <= code point < high.  script() scans the
    # list in order and stops at the first range that contains the code point.
    (0x0370, 0x0400, 'Grek'),
    (0x0400, 0x0530, 'Cyrl'),
    (0x0530, 0x0590, 'Armn'),
    (0x0590, 0x0600, 'Hebr'),
    (0x0600, 0x0700, 'Arab'),
    (0x0700, 0x0750, 'Syrc'),
    (0x0750, 0x0780, 'Arab'),
    (0x0900, 0x0980, 'Deva'),
    (0x0980, 0x0A00, 'Beng'),
    (0x0A00, 0x0A80, 'Guru'),
    (0x0A80, 0x0B00, 'Gujr'),
    (0x0B00, 0x0B80, 'Orya'),
    (0x0B80, 0x0C00, 'Taml'),
    (0x0C00, 0x0C80, 'Telu'),
    (0x0C80, 0x0D00, 'Knda'),
    (0x0D00, 0x0D80, 'Mlym'),
    (0x0D80, 0x0E00, 'Sinh'),
    (0x0E00, 0x0E80, 'Thai'),
    (0x0E80, 0x0F00, 'Laoo'),
    (0x0F00, 0x1000, 'Tibt'),
    (0x1000, 0x10A0, 'Mymr'),
    (0x10A0, 0x1100, 'Geor'),
    (0x1100, 0x1200, 'Hang'),    # jamo
    (0x1200, 0x13A0, 'Ethi'),
    (0x13A0, 0x1400, 'Cher'),
    (0x1400, 0x1680, 'Cans'),
    (0x3040, 0x3100, 'Jpan'),
    (0x3400, 0xA000, 'Hani'),    # Han Ext A and Unified
    (0xAC00, 0xD800, 'Hang'),
    (0x20000, 0x2A6D7, 'Hant'),  # Han Ext B, mostly archaic so assume traditional
]


# table of combinations for specific languages that have particular templates
# keys are '<language code>-<script code>'; values are the template to use instead

Lsp = {
    'fa-Arab': 'fa-Arab',
    'ur-Arab': 'ur-Arab',
    'pa-Arab': 'pa-Arab',
    'ku-Arab': 'ku-Arab',
    'grc-Grek': 'polytonic',
    'ja-Hani': 'Jpan',
    'ja-Hant': 'Jpan',
}
# need some more ...

# all recognized script templates, including redirects, which we do not canonicalize
# NOTE(review): 'Sryc' looks like a transposition of 'Syrc' (which is added from
# Scs below); kept unchanged in case it is a real redirect -- confirm.
Scripts = set([
    'ARchar', 'KUchar', 'FAchar', 'THchar', 'URchar',
    'Arab', 'fa-Arab', 'ur-Arab', 'pa-Arab', 'ku-Arab',
    'polytonic', 'Hebr', 'Beng', 'Hant', 'Hani', 'Jpan', 'Grek',
    'Cyrl', 'Deva', 'Sryc', 'Hang', 'RUchar', 'JAchar', 'Hayeren',
])
for low, high, scode in Scs:
    Scripts.add(scode)  # make sure we have all of those

def script(word, lc, report=False):
    """Return the script template name for `word` in language `lc`.

    Only the first character of `word` is examined (or the first surrogate
    pair, when the string is stored as UTF-16 code units).  Its code point
    is looked up in the `Scs` range table; if the resulting
    '<lc>-<script>' combination has an entry in `Lsp`, that
    language-specific template is returned instead.

    word   -- the text to classify
    lc     -- language code, used only for the `Lsp` override lookup
    report -- if true, print a diagnostic when no range matches a code
              point above 0x0370

    Returns the template name, or '' when `word` is empty, starts with an
    incomplete or malformed surrogate pair, or matches no known range.
    """
    if not word:
        return ''

    a = ord(word[0:1])
    if 0xd800 <= a < 0xdc00:
        # High surrogate: combine it with the following low surrogate to
        # recover the real code point ("UTF-16" crap).
        if len(word) < 2:
            return ''
        b = ord(word[1:2])
        if not 0xdc00 <= b < 0xe000:
            # Not a low surrogate; the arithmetic below would yield a bogus
            # code point that could falsely match a range.
            return ''
        a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000

    sc = ''
    for low, high, scode in Scs:
        if low <= a < high:
            sc = scode
            break

    if sc:
        # Language-specific override, e.g. a dedicated template per language.
        key = lc + '-' + sc
        if key in Lsp:
            sc = Lsp[key]

    if report and not sc and a > 0x0370:
        # Single-argument call form works as both a Python 2 print statement
        # and a Python 3 print function.
        print("no match for script for char code %x" % a)

    return sc

def scriptp(sc):
    """Return True if `sc` is a member of the `Scripts` set, else False."""
    return sc in Scripts