#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Robert Ullmann/code/hanform

""" This bot formats Han entries

No command line arguments.

"""

# stdlib
import sys
import re
import pickle

# pywikipedia framework and local helpers
import wikipedia
import xmlreader
import xmldate
from mwapi import getwikitext, getedit

def safe(s):
    """Return a console-safe, escaped rendering of *s*.

    Pickles the value and slices off the leading opcode byte and the
    trailing framing bytes, leaving an escaped representation that can
    be printed to any terminal without encoding errors.
    """
    pickled = pickle.dumps(s)
    return pickled[1:-5]

def main():
    """Reformat Han character entries listed on the Problems page.

    Reads [[User:Robert Ullmann/Han/Problems]], and for each listed
    single-character (or UTF-16 surrogate-pair) Han entry: applies a
    table of regex cleanups, checks/corrects the uh= hex value in
    {{Han ref}}, adds radical/stroke sort keys where possible, and
    writes the page back with an edit summary describing the changes.

    No arguments; returns None. Network errors during save are retried.
    """

    # regex table (dict, name = tuple of compiled object and replacement)
    Regex = {}

    # examples:
    # Regex['subst:PAGENAME'] = (re.compile(r'\{\{PAGENAME}}'), '')
    # Regex['template -cattag +context'] = (re.compile(r'\{\{cattag\|'), '{{context|')

    Regex['add leading 0 to additional strokes'] = \
        (re.compile(r'^\{\{Han char(.*)\|as=(\d)([\|\}])', re.M),
         r'{{Han char\1|as=0\2\3')

    Regex['format Vietnamese with vi-hantu template, add defn|Vietnamese'] = \
        (re.compile(r"^==Vietnamese==\n+===Han character===\n+"
                    r"..? +\((\[\[.*\]\])\)\n+(\[\[|)", re.M),
         r'==Vietnamese==\n\n===Han character===\n{{vi-hantu|\1}}\n\n'
         r'# {{defn|Vietnamese}}\n\n\2')

    Regex['format Vietnamese with vi-hantu template'] = \
        (re.compile(r"^==Vietnamese==\n+===Han character===\n+"
                    r"..? +\(?(\[\[.*\]\])\)?\n+#", re.M),
         r'==Vietnamese==\n\n===Han character===\n{{vi-hantu|\1}}\n\n#')

    Regex['add definition line for Korean'] = \
        (re.compile(r'^==Korean==\n+===Hanja===\n+\{\{ko-hanja(.*)\}\}\n+(\[\[|)',
                    re.M),
         r'==Korean==\n\n===Hanja===\n{{ko-hanja\1}}\n\n# {{defn|Korean}}\n\n\2')

    # (format matches entries by annoying IP-anon, still need serious work)
    Regex['add definition line for Middle Chinese'] = \
        (re.compile(r"^==Middle Chinese==\n+===Han character===\n+('''.*)\n+(\[\[|)",
                    re.M),
         r'==Middle Chinese==\n\n===Han character===\n\1\n\n'
         r'# {{defn|Middle Chinese}}\n\n\2')

    # NOTE(review): the replacement operands were lost in transcription
    # (source read just "r", an undefined name); these rules delete the bad
    # categories outright, so '' is the presumed fix -- TODO confirm
    Regex['remove bad "Viet ..." cats'] = \
        (re.compile(r'\[\[[Cc]ategory:Viet .+\]\]\n?'), '')
    Regex['remove bad "Viet" cat'] = \
        (re.compile(r'\[\[[Cc]ategory:Viet\]\]\n?'), '')

    Regex['header Compounds to L4'] = \
        (re.compile(r'^===Compounds===$', re.M), r'====Compounds====')

    # other regex precomp, not sure what is needed yet
    rehanchar = re.compile(r'\{\{Han char\|.*?\}\}')
    reradno = re.compile(r'\|rn=(\d+)[|}]')
    rerad = re.compile(r'\|rad=(.)[|}]')
    reas = re.compile(r'\|as=(\d\d?)[|}]')
    rehanref = re.compile(r'\{\{Han ref\|.*?\}\}')
    reuh = re.compile(r'\|uh=([A-Fa-f0-9]+)[|}]')

    revihantu = re.compile(r'\{\{vi-hantu.*?\}\}')

    # make sure we are logged in
    # (the originals were bare attribute references -- no-ops -- they must
    #  be called)
    site = wikipedia.getSite()
    site.forceLogin()

    # get problems list
    page = wikipedia.Page(site, "User:Robert Ullmann/Han/Problems")
    rwp = getwikitext(page)

    # one list item per entry: "* <hex> [[title]]"
    reent = re.compile(r'\* [0-9A-F]+ \[\[(.*?)\]\]')

    entries = 0
    fixed = 0

    for title in reent.findall(rwp):

        try:
            page = wikipedia.Page(site, title)
            text = getwikitext(page)
        except wikipedia.IsRedirectPage:
            print("redirect page? bad!")
            text = ''
        except Exception as e:
            print("exception? %s" % repr(e))
            text = ''
        if not text:
            continue
        origtext = text

        entries += 1
        if entries % 100 == 0:
            print("%d entries, %d tagged/replaced" % (entries, fixed))

        # is this a Han character entry? (code point in the CJK ranges)
        ishanchar = False
        a = 0  # title ordinal; also used later for the uh= check
        if len(title) == 1:
            a = ord(title[0:1])
            # print("one character entry, code is %x" % a)
            if a >= 0x3400 and a < 0xA000:
                ishanchar = True
            if a >= 0x20000 and a <= 0x2A6D6:
                ishanchar = True

        # Extension B, in UTF-16, narrow build (although XMLreader/Python
        # Lib don't say so): fold the surrogate pair into one code point
        if len(title) == 2:
            a = ord(title[0:1])
            b = ord(title[1:2])
            if a >= 0xd800 and a < 0xdc00:
                a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000
            if a >= 0x20000 and a <= 0x2A6D6:
                ishanchar = True

        # extraneous links on the page?:
        if not ishanchar:
            continue

        print("entry %s" % safe(title))

        # initialize per-entry state
        radno = 0     # radical number from {{Han char}} rn=
        rad = ''      # radical character from rad=
        ast = ''      # additional strokes from as= (kept as string)
        acts = set()  # original read "acts = set" -- never built a set

        # parse text ... find the Han char template, some params if we can
        mo = rehanchar.search(text)
        if mo:
            hanct = mo.group(0)
            mo = reradno.search(hanct)
            if mo:
                radno = int(mo.group(1))
            mo = rerad.search(hanct)
            if mo:
                rad = mo.group(1)
            mo = reas.search(hanct)
            if mo:
                ast = mo.group(1)
        else:
            hanct = '(missing)'

        # fix as, regex will fix entry
        if len(ast) == 1:
            ast = '0' + ast

        # add entire translingual section if needed
        if hanct == '(missing)' and '==Translingual==' not in text:
            # NOTE(review): the section wikitext was lost in transcription
            # (source read "text = '\n\n\n' % a + text"); minimal stub used
            # here -- TODO recover the exact template text from page history
            text = '==Translingual==\n\n===Han character===\n\n' + text
            acts.add('added Translingual section')

        # check/correct Unicode Hex in Han ref
        mo = rehanref.search(text)
        if mo:
            hanref = mo.group(0)
            mo = reuh.search(hanref)
            if mo:
                uht = mo.group(1)
                uh = int(uht, 16)
                # note that we know the characters are in hex range, but:
                # force upper case if needed (original compared against the
                # bound method uht.upper, which is always unequal -- call it)
                if uht != uht.upper():
                    uh = 0  # replace with UC
            else:
                uht = '(nil)'
                uh = 0
            # now check, should be the same as title ordinal
            if uh != a:
                if 'uh=|' in hanref or 'uh=}' in hanref:
                    hanrefnew = hanref.replace('|uh=', '|uh=%X' % a)
                elif 'uh=' in hanref:
                    hanrefnew = hanref.replace('|uh=' + uht, '|uh=%X' % a)
                else:
                    hanrefnew = hanref[:-2] + '|uh=%X}}' % a
                text = text.replace(hanref, hanrefnew)
                acts.add('added/corrected Unicode hex value -%s +%X' % (uht, a))

        # compute sort key: radical + zero-padded additional strokes
        if rad and ast:
            skey = rad + ast
        else:
            skey = ''
        # that gives us sort= for {defn} and rs= for others

        # now do regex, see if we have a substitution
        for rx in Regex:
            newtext = Regex[rx][0].sub(Regex[rx][1], text)
            if newtext != text:
                acts.add(rx)
                text = newtext

        # add sort keys if possible:
        # (skip Japanese because key may be different, can do later)
        if skey:
            for lang in ['Han', 'Mandarin', 'Korean', 'Vietnamese',
                         'Cantonese', 'Min nan', 'Middle Chinese']:
                # NOTE(review): the string literals here were lost in
                # transcription ("if  + lang +  in text"); presumed to add
                # sort= to bare {{defn|lang}} per the comment above --
                # TODO confirm against page history
                if '{{defn|' + lang + '}}' in text:
                    text = text.replace(
                        '{{defn|' + lang + '}}',
                        '{{defn|' + lang + '|sort=' + skey + '}}')
                    acts.add('add sort keys')

        # add sort key to template(s), vi-hantu for now
        if skey and 'vi-hantu' in text:
            mo = revihantu.search(text)
            if mo and 'rs=' not in mo.group(0):
                newv = mo.group(0)[:-2] + '|rs=' + skey + '}}'
                text = text.replace(mo.group(0), newv)
                acts.add('add sort keys')

        # changes?
        if not acts:
            continue
        act = ', '.join(acts)

        # some change, write it
        fixed += 1
        print("replacing in %s: %s" % (safe(title), safe(act)))
        act = "Han format: " + act

        # try to fix the entry, retrying on (transient) exceptions;
        # bail out of this entry if it changed under us
        saved = False
        while not saved:
            try:
                currtext = getedit(page)
                if currtext.strip('\n ') != origtext.strip('\n '):
                    print("page changed during edit?")
                    break
                page.put(text, comment=act)
                saved = True
            except KeyboardInterrupt:
                print("keyboard interrupt")
                return
            except Exception as e:
                print("exception %s, trying again" % safe(e))

        # limit number of fixes for testing
        # if fixed > 3: break

    print("%d entries, %d fixed" % (entries, fixed))

    # done

if __name__ == "__main__":
    try:
        # original read "main" / "wikipedia.stopme": bare references that
        # never ran the bot or released the throttle -- they must be called
        main()
    finally:
        wikipedia.stopme()