#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/code/xhan

""" This bot checks Han character entries in en-wikt.xml and writes report pages for each row.

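Command line arguments (both optional):
  -probonly   only update the problems summary page, skip the per-row reports
  -recheck    re-read entries that show problems from the current DB and check them again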

Generates (replaces) User:Robert Ullmann/Han/(hexcode) for each row

Generates a problems summary (to-do list) """

import pickle
import re
import sys

import wikipedia
import xmlreader
import xmldate
from getwikitext import getwikitext

def safe(s):
    """Return the pickled form of *s* with its first byte and last five bytes removed.

    Used by the caller to print report text on the console.
    NOTE(review): presumably relies on the Python 2 default (text) pickle
    protocol, where this slice leaves an ASCII-escaped rendering of the
    string -- confirm before running under another protocol/version.
    """
    pickled = pickle.dumps(s)
    return pickled[1:-5]

def main():
    """Check Han character entries in the XML dump and write the report pages.

    Reads "en-wikt.xml", runs a set of format checks on every entry whose
    title is a single Han character (BMP range or Extension B surrogate
    pair), then writes:

    * 'User:Robert Ullmann/Han/Problems' -- one bullet per character with
      at least one reported problem;
    * 'User:Robert Ullmann/Han/<hex>' -- one table page per block of 256
      code points that contains at least one entry (skipped with -probonly).

    Command line flags read from sys.argv:
      -probonly   only update the problems page
      -recheck    when an entry shows problems, re-read it once from the
                  current DB and check again before reporting

    NOTE(review): this body was reconstructed from a copy of the source
    whose indentation, empty call parentheses and a few string literals had
    been lost.  Places where the reconstruction is a judgement call are
    marked NOTE(review) below.  print/!= are written in the form that is
    valid under both Python 2 and Python 3.
    """

    probonly = False
    recheck = False
    for arg in sys.argv[1:]:
        if arg.startswith('-probonly'):
            probonly = True
            print("only updating problems")
        elif arg.startswith('-recheck'):
            recheck = True
            print("rechecking entries from current DB")
        else:
            print("unknown command line argument %s" % arg)

    # report dictionary: code point -> table row text
    enwikt = {0: "blank"}

    # problems, index is character code number, value is char + text of problem
    problems = {}

    # header levels: L2 language header -> code used in the definitions column
    Lang = {'Translingual': 'Han char', 'Cantonese': 'yue', 'Japanese': 'ja',
            'Korean': 'ko', 'Mandarin': 'cmn', 'Min Nan': 'nan', 'Hakka': 'hak',
            'Gan': 'gan', 'Jinyu': 'cjy', 'Min Bei': 'mnp', 'Min Dong': 'cdo',
            'Min Zhong': 'czo', 'Wu': 'wuu', 'Xiang': 'hsn', 'Vietnamese': 'vi',
            'Chinese': 'zh', 'Old Chinese': 'och', 'Middle Chinese': '???',
            'Zhuang': 'za', 'Old Korean': 'oko'}

    # accepted level 3 headers
    L3 = set(['Hanzi', 'Kanji', 'Hanja', 'Han character', 'Pronunciation',
              'Proper noun', 'Pronoun', 'Noun', 'Verb', 'Adjective', 'Number',
              'Counter', 'Particle', 'Prefix', 'Suffix', 'Affix', 'Adverb',
              'Etymology', 'Etymology 1', 'Etymology 2', 'Etymology 3',
              'Etymology 4', 'Related terms', 'Derived terms', 'Usage notes',
              'External links', 'See also', 'Alternative spellings',
              'Alternative forms', 'Preposition', 'Adnominal', 'References',
              'Interjection', 'Measure word', 'Conjunction'])

    # accepted level 4 headers
    L4 = set(['Compounds', 'References', 'Readings', 'Derived terms',
              'Related terms', 'Antonyms', 'Usage notes', 'Synonyms',
              'See also', 'Descendants'])

    # template list, these are the templates that are used in one specific
    # language and section and should always appear in that section.
    # used to build Tdict and Require
    Tlist = [('Han char', 'Translingual', 'Han character'),
             ('Han ref', 'Translingual', 'Han character'),
             ('cmn-hanzi', 'Mandarin', 'Hanzi'),
             ('nan-hanzi', 'Min Nan', 'Hanzi'),
             ('yue-hanzi', 'Cantonese', 'Hanzi'),
             ('ja-kanji', 'Japanese', 'Kanji'),
             ('ko-hanja', 'Korean', 'Hanja'),
             ('vi-hantu', 'Vietnamese', 'Han character')]

    # dictionary of templates, built from above
    Tdict = {}
    for t, l, s in Tlist:
        Tdict[t] = (l, s)

    # checklist requirements. for each of the first in the tuple, the second
    # must exist, or the entry is in error
    Require = [('entry', 'Translingual'),
               ('Translingual', 'Translingual Han character section'),
               ('Mandarin', 'Mandarin Hanzi section'),
               ('Min Nan', 'Min Nan Hanzi section'),
               ('Cantonese', 'Cantonese Hanzi section'),
               ('Japanese', 'Japanese Kanji section'),
               ('Korean', 'Korean Hanja section'),
               ('Vietnamese', 'Vietnamese Han character section')]
    # add templates to requirements
    for t, l, s in Tlist:
        Require.append((l + ' ' + s + ' section',
                        t + ' template in ' + l + ' ' + s + ' section'))

    # regex precomp
    rehanchar = re.compile(r'\{\{Han char.*?\}\}')
    reradno = re.compile(r'\|rn=(\d+)[|}]')
    rerad = re.compile(r'\|rad=(.)[|}]')
    reas = re.compile(r'\|as=(\d\d)[|}]')
    rehanref = re.compile(r'\{\{Han ref.*?\}\}')
    reuh = re.compile(r'\|uh=(\w+)[|}]')
    reud = re.compile(r'\|ud=(\d+)[|}]')
    # header, will treat L1 as a special case
    reheader = re.compile(r'(={2,6})\s*(.+?)={2,6}(.*)')
    retemplate = re.compile(r'\{\{([-a-zA-Z ]+)[\}\|]')
    # templater allows for '* ' before Han ref ...
    retemplater = re.compile(r'\*? ?\{\{(Han ref)[\}\|]')

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('Han character report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    print("reading XML dump from %s" % xmldate.enXMLdate)

    entries = 0
    hanchars = 0
    kprobs = 0

    for entry in dump.parse():
        text = entry.text
        title = entry.title

        entries += 1
        if entries % 5000 == 0:
            print("%d entries, %d characters" % (entries, hanchars))

        # figure out if it is a Han character entry:

        ishanchar = False
        if len(title) == 1:
            a = ord(title[0:1])
            if a >= 0x3400 and a < 0xA000:
                ishanchar = True
            # Exclude the gap between Extension A and the unified block
            # (I Ching characters).  The lower bound is 0x4DB5, the same
            # "last row of Ext A" value used when the row reports are
            # written below; the previous 0x4BD5 also excluded the
            # Extension A code points 4BD6..4DB5.
            if a > 0x4DB5 and a < 0x4E00:
                ishanchar = False

        # Extension B, in UTF-16 (although XMLreader/Python Lib don't say so):
        if len(title) == 2:
            a = ord(title[0:1])
            b = ord(title[1:2])
            if a >= 0xd800 and a < 0xdc00:
                # combine the surrogate pair into one code point
                a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000
                if a >= 0x20000 and a <= 0x2A6D6:
                    ishanchar = True

        if not ishanchar:
            continue
        hanchars += 1

        # do this twice if needed, first with XML, then with current entry
        # if recheck

        rc = True
        reread = recheck

        while rc:
            rc = False

            ucs = '%X' % a
            # NOTE(review): the two literals around title were missing from
            # the mangled source; a wikilink to the entry is the assumed
            # original -- confirm.
            han = '[[' + title + ']]'

            # initialize

            Checklist = set(['entry'])

            simple = ''
            defn = ''
            ex = ''
            wlinkfound = False
            deffound = 0
            inlevel2 = 0
            currlang = ''
            current3 = ''
            langfound = 0
            l3found = 0
            extra = 0
            detail = True
            MR = ''
            Yale = ''

            # first find Han char and Han ref templates, check a few things

            mo = rehanchar.search(text)
            if mo:
                hanct = mo.group(0)
                if not reradno.search(hanct):
                    ex += 'Radical number missing '
                if not rerad.search(hanct):
                    ex += 'Radical missing '
                if not reas.search(hanct):
                    ex += 'Additional strokes parameter missing or incorrect '
            # else: ex += 'Han char template missing '

            mo = rehanref.search(text)
            if mo:
                hanref = mo.group(0)
                mo = reud.search(hanref)
                if mo:
                    ud = int(mo.group(1), 10)
                    if ud != a:
                        ex += 'Unicode decimal value incorrect '
                mo = reuh.search(hanref)
                if mo:
                    # \w+ can capture non-hex text; report it instead of
                    # letting ValueError abort the whole run
                    try:
                        uh = int(mo.group(1), 16)
                    except ValueError:
                        uh = None
                    if uh != a:
                        ex += 'Unicode hex value incorrect '
                else:
                    ex += 'Unicode hex value missing '
            # else: ex += 'Han ref template missing '

            # now parse text line-by-line ...

            for line in text.splitlines():
                if line[0:1] == '#':
                    deffound = 1
                    if line.find('[[') > 0:
                        wlinkfound = True
                    # text after '{{defn|' up to the next '|' or '}' is the
                    # language; a line without {{defn is kept as the simple
                    # definition (first one only)
                    lang = line.partition('{{defn|')[2]
                    if lang != '':
                        lang = lang.split('|')[0]
                        lang = lang.split('}')[0]
                        if lang in Lang:
                            defn += ', ' + Lang[lang]
                        else:
                            defn += ', ' + lang
                    elif simple == '':
                        simple = line[1:140]

                # look for indicators of un-revised format
                if detail:
                    #if line.find('total strokes index') > 0: ex += "NanshuBot header not formatted "
                    if line.find('Penkyamp') > 0:
                        ex += "Chinese hanzi not formatted "
                    if line.find('McCune-Reischauer') > 0:
                        ex += "Korean not formatted "
                    if line.find('Morohashi') > 0:
                        ex += "References not formatted "

                if line[0:1] == '=' and line[1:2] != '=':
                    ex += "Level one header "
                    continue

                mo = reheader.match(line)
                if mo:
                    header = mo.group(2).strip()
                    level = len(mo.group(1))
                    if mo.group(3):
                        ex += "Stuff after %s header " % header
                else:
                    level = 0

                # check headers by level

                if level == 4:
                    if header not in L4 and header not in L3:
                        if detail:
                            ex += "L4 header: %s " % header
                    # multiple etymologies:
                    if header not in L4 and header in L3:
                        l3found = 1
                        current3 = header
                        Checklist.add(currlang + ' ' + current3 + ' section')

                if level == 3:
                    if header in L3:
                        l3found = 1
                        current3 = header
                        Checklist.add(currlang + ' ' + current3 + ' section')
                    else:
                        if detail:
                            ex += "L3 header: %s " % header
                        current3 = ''

                # if level is two, close L2 section
                if level == 2:
                    current3 = ''
                    if inlevel2 == 1:
                        # NOTE(review): '----' restored; the mangled source
                        # read "Missing to end %s section " -- confirm.
                        if detail:
                            ex += "Missing ---- to end %s section " % currlang

                # check, pick up new language
                if level == 2:
                    inlevel2 = 1
                    if header in Lang:
                        newlang = header
                    else:
                        newlang = ''
                    if newlang != '':
                        if newlang != 'Translingual':
                            langfound = 1
                        if currlang != '':
                            # check current lang for order
                            if newlang == 'Translingual':
                                ex += '%s before Translingual ' % currlang
                            elif currlang != 'Translingual':
                                if currlang == newlang:
                                    ex += 'two sections for %s ' % currlang
                                if currlang > newlang:
                                    ex += '%s out of order ' % currlang
                    else:
                        ex += "L2 header: %s " % header
                        detail = False
                    # in order, or not, current is language if valid
                    currlang = newlang
                    l3found = 0
                    current3 = ''
                    deffound = 0
                    Checklist.add(currlang)

                # templates
                mo = retemplate.match(line)
                if not mo:
                    mo = retemplater.match(line)   # "* {{Han ref..." case
                if mo:
                    t = mo.group(1).strip()
                    if t in Tdict:
                        l, s = Tdict[t]
                        if currlang != l:
                            ex += "Template %s not in %s section " % (t, l)
                        elif current3 != s:
                            ex += "Template %s not in %s section " % (t, s)
                        # (if error, harmless to add to checklist)
                        Checklist.add(t + ' template in ' + l + ' ' + s + ' section')

                # don't require Korean Hanja section on kwukyel notes,
                # should refer to that in Han defn:
                if line.startswith('#') and 'kwukyel' in line:
                    Checklist.add("Korean Hanja section")
                    Checklist.add("ko-hanja template in Korean Hanja section")

                # random things, cruft:
                if detail:
                    if inlevel2 and line[0:5] == "* '''":
                        ex += "Cruft: %s " % line[2:]
                    if "{{substub}}" in line:
                        ex += "substub template "

                # Korean, new format:
                if line.find('ko-hanja') > 0:
                    if line.find('|mr=') > 0:
                        MR = re.sub(r'.*\|mr=(.*?)[|}].*', r'\1', line)
                    if line.find('|y=') > 0:
                        Yale = re.sub(r'.*\|y=(.*?)[|}].*', r'\1', line)

                # line across, exit level 2
                # NOTE(review): the '----' literal (and the '----' inside the
                # two "Extraneous" messages) was missing from the mangled
                # source and is restored from the comment above; the nesting
                # of 'detail = False' and 'extra = 1' is also reconstructed.
                if line[0:4] == '----':
                    if inlevel2 == 0:
                        if extra == 0:
                            ex += 'Extraneous ---- '
                        detail = False
                    inlevel2 = 0
                    if not l3found and detail:
                        ex += "No L3 header in %s section " % currlang
                    if not deffound and detail:
                        ex += "No definition line for %s " % currlang
                elif inlevel2 == 0:
                    # only other text allowed is templates or blank lines
                    if len(line) > 1:
                        if line[0:2] != '{{':
                            if extra == 0:
                                if detail:
                                    ex += "Extraneous text not in L2 section "
                                extra = 1

                # enough already!
                if detail and len(ex) > 200:
                    detail = False
                    ex += '...more... '

            # end for line

            # end of entry
            if detail:
                # close last section, should be in level 2, exit
                if inlevel2 == 0:
                    ex += 'Extraneous ---- at end '
                else:
                    if l3found == 0:
                        ex += "No L3 header in %s section " % currlang
                    if deffound == 0:
                        ex += "No definition line for %s " % currlang

            # even if no detail, report bad Korean Yale
            if Yale:
                yf = Yale

                if MR.find(u'y\u014f') >= 0 and Yale.find('ey') >= 0:
                    yf = re.sub('ey', 'ye', yf)
                    yf = re.sub('yye', 'yey', yf)
                elif MR.find(u'he') >= 0 and Yale.find('ye') >= 0:
                    yf = re.sub('ye', 'ey', yf)

                if MR.find(u'ya') >= 0 and Yale.find('ay') >= 0:
                    yf = re.sub('ay', 'ya', yf)
                elif MR.find(u'ae') >= 0 and Yale.find('ya') >= 0:
                    yf = re.sub('ya', 'ay', yf)

                if MR.find(u"ch'e") >= 0 and Yale.find('chye') >= 0:
                    yf = re.sub('chye', 'chey', yf)
                if MR.find(u'ke') >= 0 and Yale.find('kye') >= 0:
                    yf = re.sub('kye', 'key', yf)
                if MR.find(u'se') >= 0 and Yale.find('sye') >= 0:
                    yf = re.sub('sye', 'sey', yf)
                if MR.find(u're') >= 0 and Yale.find('lye') >= 0:
                    yf = re.sub('lye', 'ley', yf)
                if MR.find(u'ne') >= 0 and Yale.find('nye') >= 0:
                    yf = re.sub('nye', 'ney', yf)
                if MR.find(u'pe') >= 0 and Yale.find('pye') >= 0:
                    yf = re.sub('pye', 'pey', yf)
                if MR == 'e' and Yale == 'ye':
                    yf = 'ey'

                if yf != Yale:
                    ex += "Korean Yale %s should be %s " % (Yale, yf)
                    kprobs += 1

            # run checklist (regardless of detail for now)
            for r, i in Require:
                if r in Checklist and i not in Checklist:
                    ex += i + ' missing '

            # if there was a problem, reread from current DB?

            if ex and reread:
                page = wikipedia.Page(site, title)
                print("Re-reading character %X" % a)
                try:
                    text = getwikitext(site, page)
                    rc = True
                    reread = False
                    continue  # go back to top once more
                except wikipedia.NoPage:
                    print("can't read current page?")
                except wikipedia.IsRedirectPage:
                    print("redirect page?")

            # add to problems
            # NOTE(review): as written this turns EVERY space in ex into
            # ', ' (and [0:-2] drops the trailing one).  The messages may
            # originally have been separated by a distinct token (e.g. two
            # spaces) that the mangled copy collapsed -- confirm against the
            # original source before changing.
            if ex:
                problems[a] = han + ' ' + re.sub(' ', ', ', ex)[0:-2]

            # more details, not reported in problem punchlist
            if detail:
                if simple and not wlinkfound:
                    ex += "No wikilink in any definition found "
                if langfound == 0:
                    ex += "No language section found "

            # fixups: drop the leading ', ' of the language list
            if defn[0:1] == ',':
                defn = defn[2:]

            # store report line
            enwikt[a] = ('|-\n| ' + ucs + ' || ' + han + ' || ' + simple +
                         ' || ' + defn + ' || ' + ex + '\n')

            print("Character %X %s" % (a, safe(ex)))

    print("%d Korean Yale problems" % kprobs)
    print("%d total problems" % len(problems))
    print("%d entries, %d characters, writing reports" % (entries, hanchars))

    # write report pages

    report = '\nProblems as of ' + xmldate.enXMLdate
    report += ', keep in mind while fixing entries that the check, rather than the entry, may be wrong.\n\n'
    report += ''.join(['* %X ' % c + problems[c] + '\n' for c in sorted(problems)])
    report += '\n%d problems\n\n' % len(problems)

    # report page
    reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Han/Problems')
    try:
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print("No present report for %s" % reportpage.aslink())
        # without this the comparison below raised NameError on a first run
        oldreport = ''

    # file the report
    if report.strip(' \n') != oldreport.strip(' \n'):
        reportpage.put(report)

    if probonly:
        return  # we are done

    for si in range(0x3400, 0x2B000, 256):
        validentry = False

        # save some time
        if si > 0xA000 and si < 0x20000:
            continue

        # blank and re-initialize
        report = '{{User:Robert Ullmann/Han/header|from=' + "%X" % si + '|to=' + "%X" % (si + 255)
        report += '|date=' + xmldate.enXMLdate + '}}\n'

        for i in range(si, si + 256):
            if i in enwikt:
                validentry = True
                report += enwikt[i]

            # (rows for entries that were not found are not generated)

            # last rows of Han, Ext A, Ext B
            if i == 0x9FA5:
                break
            if i == 0x4DB5:
                break
            if i == 0x2A6D6:
                break

        report += '|}\n'

        if not validentry:
            continue

        # report page
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Han/%X' % si)
        try:
            oldreport = reportpage.get()
        except wikipedia.NoPage:
            print("No present report for %s" % reportpage.aslink())
            oldreport = ''

        if report.strip(' \n') == oldreport.strip(' \n'):
            print("No change to report for %s" % reportpage.aslink())
            continue

        wikipedia.showDiff(oldreport, report)

        # file the report
        reportpage.put(report)

# Script entry point: run the bot, and always call wikipedia.stopme() on the
# way out, whether main() returns normally or raises.
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()