User:Robert Ullmann/Pronunciation exceptions/code


#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Robert Ullmann/Pronunciation_exceptions/code

""" This bot looks for and executes replacements, customized for each run

This version looks for unmatched wikisyntax and parens

No command line arguments.

"""

import wikipedia import xmlreader import sys import re import pickle import xmldate import socket from mwapi import getwikitext

def safe(s): return pickle.dumps(s)[1:-5]


 * 1) work cache, record time last looked at entry
 * 2) each record is key: lc:word, pickled with safe, value is integer time

import shelve cache = shelve.open("pronex") from time import time


 * 1) we want to identify trouble cases, line by line
 * 2) they are applied after checking all the AF regex fixes

Flags = set([ '', "{|", "//", "/", "[[Rhymes:", "[[rhymes:", "hymes:--", "hymes|-",             "US", "(US)", "UK", "(UK)",              "[[RP", "WEAE",              "* [[", "* ''", "* (" ])

# compiled patterns used by trouble() below
reenpr = re.compile(r'\{\{enPR\|(.*?)}}')
reipa = re.compile(r'\{\{IPA\|(.*?)}}')
resampa = re.compile(r'\{\{SAMPA\|(.*?)}}')
rerfp = re.compile(r'\{\{rfp\|(.*?)}}')
# not quite correct, but will do for now
# NOTE(review): these two patterns were stripped to '' in transcription
# (angle-bracket text lost); reconstructed as the obvious <ref>/<!-- -->
# strippers -- confirm against the original page
rederef = re.compile(r'<ref.*?</ref>')
redecom = re.compile(r'<!--.*?-->')
rehttp = re.compile(r'\[http:.*?\]')
# match "stackable" format characters at start of lines, so we can have one space exactly
restack = re.compile(r"^([:#\*]+)\s*")
# match entire line is "blank" IPA, SAMPA, etc:
reblank = re.compile(r"^\* ?\[\[(IPA|SAMPA|AHD)\]\]:? *//$")


 * 1) exact copies of AF regex it will fix (manually copied)

AFcount = 0 Prex = {}

def preset():
    """Populate Prex with the line-by-line pronunciation-section fixes.

    These mirror AutoFormat's regexes (manually copied); trouble() applies
    them to see whether AF would already fix a line.  Patterns use ^ and $
    with re.M as needed for prescreen.

    NOTE(review): many replacement strings below are visibly truncated --
    the wikitext template markup they insert ({{enPR|...}} etc.) was lost
    when this page was transcribed.  Recover them from the original before
    trusting any substitution result.  The '(?P<s>' group names were
    reconstructed from the surviving '(?P=s)' back-references.
    """
    # Pronunciate: like Regex, but applied line by line only in
    # pronunciation sections
    Prex['template enPR/IPA/SAMPA'] = \
           (re.compile(r'^\*? ?([^ \{\|\}/]+), /([^\{\|\}/]+)/, /([^\|\}/]+)/$', re.M),
            r'*, , ')
    Prex['template enPR/IPA/SAMPA (RP, UK, US)'] = \
           (re.compile(r"^\*? ?\((RP|UK|US)\):? *"
                       r'([^ \{\|\}/]+), /([^\{\|\}/]+)/, /([^\|\}/]+)/$', re.M),
            r'* , , ')
    Prex['template enPR/IPA/SAMPA with {a}'] = \
           (re.compile(r"^\*? ?(\{\{a\|[^\}]+\}\}):? *"
                       r'([^ \{\|\}/]+), /([^\{\|\}/]+)/, /([^\|\}/]+)/$', re.M),
            r'* \1, , ')

    Prex['+rhymes template'] = \
           (re.compile("'*Rhymes:'* *\[\[[Rr]hymes:English:-(?P<s>.+?)\|-(?P=s)\]\]"),
            r)   # NOTE(review): bare name 'r' -- replacement string lost (NameError if run)
    # w/O "Rhymes:":
    Prex['+rhymes template w/Rhymes: in link'] = \
           (re.compile("^([\*:]+) *\[\[[Rr]hymes:English:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
            r'\1 ')
    Prex['+rhymes template (Finnish)'] = \
           (re.compile("'*Rhymes:'* *\[\[[Rr]hymes:Finnish:-(?P<s>.+?)\|-(?P=s)\]\]"),
            r)   # NOTE(review): replacement string lost
    Prex['+rhymes template w/Rhymes: in link (Finnish)'] = \
           (re.compile("^([\*:]+) *\[\[[Rr]hymes:Finnish:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
            r'\1 ')
    Prex['+rhymes template w/Rhymes: in link (French)'] = \
           (re.compile("^([\*:]+) *\[\[[Rr]hymes:French:-(?P<s>.+?)\|Rhymes: -(?P=s)\]\]", re.M),
            r'\1 ')
    Prex['+rhymes template (Icelandic)'] = \
           (re.compile("'*Rhymes:'* *\[\[[Rr]hymes:Icelandic:-(?P<s>.+?)\|-(?P=s)\]\]"),
            r'')
    Prex['template -Rhymes +rhymes'] = (re.compile(r'\{\{Rhymes([\|\}])'), r'{{rhymes\1')
    # multiple rhymes (assume language matches! ;-)
    Prex['add additional rhyme to template'] = \
           (re.compile(r'(\{\{rhymes\|[^\}]+)\}\} *(,|or|) *\[\[[Rr]hymes:[A-Za-z -]+:-(?P<s>.+?)\| ?-(?P=s)\]\]'),
            r'\1|\3}}')

    Prex["rm /'s from enPR template"] = \
           (re.compile(r'\{\{enPR\|/([^ /\[\]\{\}]+?)/\}\}'), r'')

    # RP, UK, and US in a wide variety of cases
    Prex['(RP) to '] = (re.compile(r"^\*? ?[\(\[\{']+RP[\]\)\}:']+", re.M), r'* ')
    Prex['(UK) to '] = (re.compile(r"^\*? ?[\(\[\{']+UK[\]\)\}:']+", re.M), r'* ')
    Prex['(US) to '] = (re.compile(r"^\*? ?[\(\[\{']+US[\]\)\}:']+", re.M), r'* ')
    Prex['(italbrac RP) to '] = (re.compile(r"^\*? ?\{\{italbrac\|\[*RP\]*\}\}:?", re.M), r'* ')
    Prex['(italbrac UK) to '] = (re.compile(r"^\*? ?\{\{italbrac\|\[*UK\]*\}\}:?", re.M), r'* ')
    Prex['(italbrac US) to '] = (re.compile(r"^\*? ?\{\{italbrac\|\[*US\]*\}\}:?", re.M), r'* ')
    Prex['IPA: WEAE to IPA:'] = \
           (re.compile(r"^\*? ?IPA: [\(\[\{']+WEAE[\]\)\}:']+", re.M), r'* IPA:')
    Prex['(GenAm) to '] = (re.compile(r"^\*? ?\[\[w:G[^\|]+\|GenAm\]\]", re.M), r'* ')
    Prex['(Canada) to '] = (re.compile(r"^\*? ?[\(\[\{']+Canada[\]\)\}:']+", re.M), r'* ')
    Prex['(Australia) to '] = \
           (re.compile(r"^\*? ?[\(\[\{']+Australia[\]\)\}:']+", re.M), r'* ')
    Prex['(Aus) to '] = (re.compile(r"^\*? ?[\(\[\{']+Aus[\]\)\}:']+", re.M), r'* ')
    Prex['(GenAm|US) to '] = \
           (re.compile('^' + re.escape("* (US)"), re.M), r'* ')
    Prex['(RecPr|UK) to '] = \
           (re.compile('^' + re.escape("* (UK)"), re.M), r'* ')

    # untemplated SAMPA and IPA, several combinations, also for "AHD",
    # allow an {{a}} template in front
    Prex['template IPA'] = \
           (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
                       r"\[*(w:IPA\||)IPA\]*:? *([/\[][^\{\|\}/\]]+?[/\]])$", re.M),
            r'* \1')
    Prex['template IPA -IPAchar'] = \
           (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
                       r"\[*(w:IPA\||)IPA\]*:? *\{\{IPAchar\|([/\[][^\{\|\}/\]]+?[/\]])\}\}$", re.M),
            r'* \1')
    Prex['template SAMPA'] = \
           (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
                       r"\[*(w:SAMPA\||)SAMPA\]*:? *([/\[])(|)([^\|\}/]+?)(|)([/\]])$", re.M),
            r'* \1')
    Prex['template enPR (was AHD)'] = \
           (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)\[*(w:AHD\||)AHD\]*:? *([^ \{\|\}/]+?)$", re.M),
            r'* \1')
    Prex['template X-SAMPA'] = \
           (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)"
                       r"\[*(w:X-SAMPA\||)X-SAMPA\]*:? *([/\[])(|)([^\{\|\}/]+?)(|)([/\]])$", re.M),
            r'* \1')

    Prex['or/comma to multiple parameters in IPA template'] = \
           (re.compile(r"\{\{IPA\|([^\}]+/)(, ?| or | or )(/[^\}]+)\}\}"), r'')
    Prex['or/comma to multiple parameters in enPR template'] = \
           (re.compile(r"\{\{enPR\|([^\}]+/)(, ?| or | or )(/[^\}]+)\}\}"), r'')
    Prex['or/comma to multiple parameters in SAMPA template'] = \
           (re.compile(r"\{\{SAMPA\|([^\}]+/)(, ?| or | or )(/[^\}]+)\}\}"), r'')

    # accent templates, try to cover the A-cai/Min Nan cases and others, up to 4
    Prex['+accent template 1'] = (re.compile(r"^\* \("
           r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
           r"\):?", re.M), r'* ')
    Prex['+accent template 2'] = (re.compile(r"^\* \("
           r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
           r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
           r"\):?", re.M), r'* ')
    Prex['+accent template 3'] = (re.compile(r"^\* \("
           r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
           r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
           r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
           r"\):?", re.M), r'* ')
    Prex['+accent template 4'] = (re.compile(r"^\* \("
           r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
           r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
           r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
           r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"
           r"\):?", re.M), r'* ')

    # hyphenation ...
    Prex['+hyphenation template'] = \
           (re.compile(r"'*Hyphenation:?'*:? *([^ \{\}]+)$", re.M), r'')
    Prex['middot to | in hyphenation template'] = \
           (re.compile(r'(\{\{hyphenation\|.+?)' + u'\u00B7' + '(.+?\}\})'), r'\1|\2')
    Prex['hyphpt to | in hyphenation template'] = \
           (re.compile(r'(\{\{hyphenation\|.+?)' + u'\u2027' + '(.+?\}\})'), r'\1|\2')
    Prex['bullet to | in hyphenation template'] = \
           (re.compile(r'(\{\{hyphenation\|.+?)' + u'\u2022' + '(.+?\}\})'), r'\1|\2')
    Prex['middot (HTML) to | in hyphenation template'] = \
           (re.compile(r'(\{\{hyphenation\|.+?)&middot;(.+?\}\})'), r'\1|\2')

    # sorting enPR, IPA, (X-)SAMPA:
    Prex['enPR before SAMPA'] = \
           (re.compile(r'\{\{(X-|)SAMPA\|([^\}]*)\}\}, \{\{enPR\|([^\}]*)\}\}'),
            r', ')
    Prex['IPA before SAMPA'] = \
           (re.compile(r'\{\{(X-|)SAMPA\|([^\}]*)\}\}, \{\{IPA\|([^\}]*)\}\}'),
            r', ')
    Prex['enPR before IPA'] = \
           (re.compile(r'\{\{IPA\|([^\}]*)\}\}, \{\{enPR\|([^\}]*)\}\}'),
            r', ')

def trouble(s): global AFcount

s2 = s = restack.sub(r'\1 ', s)   # skip AF fix(es), do what it will do:

for rx in Prex: s2 = Prex[rx][0].sub(Prex[rx][1], s2) if s2 != s:       if AFcount < 500: print "AF will fix:" print "  %s" % safe(s) print "to %s" % safe(s2) AFcount += 1 return False # as AF will do something

# IPA, SAMPA, enPR are in AF.StarTemp: if s.startswith(('{{IPA|', '{{SAMPA|', '{{enPR|')): AFcount += 1 return False

# "blank" non-templates, in general Regex in AF: if reblank.match(s): AFcount += 1 return False

if "Manuel de Codage" in s: return False

# remove rfp and contents, ref tags, comments, http links s = rerfp.sub(' ', s)   if s == '{{rfap}}': return False s = rederef.sub(' ', s)   s = redecom.sub(' ', s)    if s.startswith(''): return False s = rehttp.sub(' ', s)

# non-templates (skip "[Aa]udio-IPA" for now) if "IPA" in s and "{{IPA" not in s and "udio-IPA" not in s: return "IPA not template" if "enPR" in s and "{{enPR" not in s: return "enPR not template" if "SAMPA" in s and "{{SAMPA" not in s and "{{X-SAMPA" not in s: return "SAMPA not template" if "AHD" in s: return "AHD found"

# check sequence e = s.find("{{enPR|") i = s.find("{{IPA|") m = s.find("{{SAMPA|") if e > 0 and i > 0 and i < e: return "IPA before enPR" if i > 0 and m > 0 and m < i: return "SAMPA before IPA" if e > 0 and m > 0 and m < e: return "SAMPA before enPR"

# a must be at start, and only follow wikisyntax a = s.find("{{a|") if a > 0 and s[0:a].strip(':* '): return "{a} template not at beginning"

# now check templates for c in reenpr.findall(s): if not c.strip: return 'empty enPR template' for p in c.split('|'): p = p.strip if " or " in p: return '"or" should be multiple template parameters' if " or " in p: return '"or" should be multiple template parameters' # next is fixed by AF at present # if p.startswith('/') and p.endswith('/'): return "slashes in enPR template"

for c in reipa.findall(s): if not c.strip: return 'empty IPA template' for p in c.split('|'): p = p.strip if p.startswith('lang='): continue if " or " in p: return '"or" should be multiple template parameters' if " or " in p: return '"or" should be multiple template parameters' if p.startswith('/'): if not p.endswith('/'): return "mismatched /'s in IPA template" elif p.startswith('['): if not p.endswith(']'): return "mismatched [ ]'s in IPA template" else: return "no / / or [ ]'s in IPA template"

for c in resampa.findall(s): if not c.strip: return 'empty SAMPA template' for p in c.split('|'): p = p.strip if p.startswith('lang='): continue if p[:1].isdigit and p[1:2] == '=': p = p[2:] if " or " in p: return '"or" should be multiple template parameters' if " or " in p: return '"or" should be multiple template parameters' if p.startswith('/'): if not p.endswith('/'): return "mismatched /'s in SAMPA template" elif p.startswith('['): if not p.endswith(']'): return "mismatched [ ]'s in SAMPA template" else: return "no / / or [ ]'s in SAMPA template"

# some simple cases that are just flagged for flag in Flags: if flag in s: return "flag " + flag + ""

# couple of other randoms if s.endswith('/'): return "line ends with /" # if s.startswith('[['): return "line starts with <no" + "wiki>[[</no" + "wiki>"   # if s.startswith('('): return "line starts with ("    # if s.startswith('{{'): return "line starts with <no" + "wiki>{{</no" + "wiki>"    # if s.startswith("") and not s.endswith(""): return "line starts with <no" + "wiki>''</no" + "wiki>"    # next rule is fixed by AF in most cases (one of these at start), fix this rule sometime    # if ('{{enPR|' in s or '{{IPA|' in s or '{{SAMPA|' in s) and not s.startswith('*'):    #   return 'line does not start with *'

return False


 * 1) (sporked from Tbot/script, no need to keep up to date):


 * 1) table of scripts, each is lowest character code point, highest code + 1, ISO script

Scs = [ (0x0080, 0x0250, 'Latin'), (0x0250, 0x02B0, 'IPA'), (0x0370, 0x0400, 'Greek'), (0x0400, 0x0530, 'Cyrillic'), (0x0530, 0x0590, 'Armenian'), (0x0590, 0x0600, 'Hebrew'), (0x0600, 0x0700, 'Arabic'), (0x0700, 0x0750, 'Syriac'), (0x0750, 0x0780, 'Arabic Ext'), (0x0900, 0x0980, 'Devanagari'), (0x0980, 0x0A00, 'Bengali'), (0x0C00, 0x0C80, 'Telugu'), (0x0D00, 0x0D80, 'Malayalam'), (0x1A00, 0x1100, 'Georgian'),

(0x1E00, 0x1F00, 'Latin Ext'), (0x1F00, 0x2000, 'Greek Ext'),

(0x3040, 0x30A0, 'Hiragana'), (0x30A0, 0x3100, 'Katakana'), (0x3400, 0xA000, 'Han'),    # Han Ext A and Unified (0xAC00, 0xD800, 'Hangeul'),

(0x20000, 0x2A6D7, 'Han Ext B') ] # Han Ext B

def tkey(word):

# generate a TOC key for a given word

# simple case first, also handles '' if word[:1] <= 'z': return word[:1]

a = ord(word[0:1]) if a >= 0xd800 and a < 0xdc00: if len(word) < 2: return word # ouch! b = ord(word[1:2]) # "UTF-16" crap: a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000

sc = '' for low, high, scode in Scs: if a >= low and a < high: sc = scode break

if not sc: print "no match for script for char code %x" % a       return word[:1]

return sc

def main: global AFcount

socket.setdefaulttimeout(40)

# list of entry names to ignore Stops = set

reports = { } preset

# make sure we are logged in   site = wikipedia.getSite site.forceLogin

# read Stops page = wikipedia.Page(site, "User:Robert Ullmann/Pronunciation exceptions/stops") text = page.get for s in re.findall(r'\* \[\[(.*?)\]\]', text): Stops.add(s)

print 'found %d stops' % len(Stops)

# get XML dump dump = xmlreader.XmlDump("en-wikt.xml")

entries = 0 probs = 0 fixed = 0 reps = 0 replimit = 1000 cis = 0 lasttab = 0

rem = """ remainder, one per link, not checked against current, one reason for exception

"""

# testing test = False tmod = 20 if test: replimit /= tmod print "in test mode"

for entry in dump.parse: text = entry.text title = entry.title if title.find(':') >= 0: continue # if title.find('/') >= 0: continue if not title: continue # ?

entries += 1 if entries % 10000 == 0: print "%d entries, %d problems" % (entries, probs)

# if test and title[0:1] != 'c': continue if test and entries % tmod != 0: continue

# skim a lot of the db for now # if entries % tmod != 0: continue

if title in Stops: continue

# screen entries: tag = False

inPron = False for line in text.splitlines: if '=Pronunciation=' in line: inPron = True continue if line.startswith('='): inPron = False if not inPron: continue a = trouble(line) if a:               if line.startswith('{|') and entries < 300000: lasttab = entries tag = True break

# now see if it is something that should be reported:

if tag:

ckey = safe(title) # must be string for bsd dbm if ckey in cache: last = cache[ckey] if last > time - (70 * 24 * 3600): print "%s in 70 day cache, not checked" % safe(title) continue

probs += 1

# ... pick up current version from en.wikt

if reps < replimit:

print '%s is possible problem, getting current entry' % safe(title)

try: page = wikipedia.Page(site, title) # text = page.get text = getwikitext(page) except wikipedia.NoPage: print "Can't get %s from en.wikt!" % safe(page.aslink) text = '' except wikipedia.IsRedirectPage: print safe(title), 'is now a redirect page' text = '(redirect page)' # will be treated as fixed and added to cache except KeyboardInterrupt: raise KeyboardInterrupt except Exception, e:                   print "unknown exception, maybe timeout" continue # do this again next time

else: print '%s is possible problem' % safe(title) rem += '* ' + title + " " + a + "\n"

if not text: continue

# check each line for trouble

act = '' inPron = False for line in text.splitlines: if '=Pronunciation=' in line: inPron = True continue if line.startswith('='): inPron = False if not inPron: continue a = trouble(line) if a and a not in act: act += ', ' + a

# if fixed, add to cache so we don't keep re-checking

if not act: print "%s has been fixed" % safe(title) cache[ckey] = time # entry has been fixed for now cis += 1 if cis % 20 == 0: cache.sync continue

else: continue

# don't write any change to entry, report:

if act: act = "    " + act.strip(', ') + "" if reps < replimit: xp = wikipedia.Page(site, title) url = xp.urlname repline = \ "* %s (edit) %s" % (title, url, act) # go isolate the lines s = 0 se = 0 ts = '' inPron = False for line in text.splitlines: if line.startswith('='): s += 1 if '=Pronunciation=' in line: inPron = True continue if line.startswith('='): inPron = False if not inPron: continue if trouble(line): if not se: se = s                  ts += ', ' + trouble(line) repline += '\n*: <tt><no' + 'wiki>' + line + '</no' + 'wiki></tt>' print reps, safe(title), safe(line)

if reps < replimit: repline = repline.replace('SECTXX', "%d"%se) reports[title] = repline reps += 1

if test and reps > replimit: break

continue # no corrections here!

print "%d entries, %d problems" % (entries, probs) cache.close print "last table at entry %d" % lasttab

if not test: page = wikipedia.Page(site, "User:Robert Ullmann/Pronunciation exceptions") else: page = wikipedia.Page(site, "User:Robert Ullmann/Pronunciation exceptions/test") try: oldrep = page.get except wikipedia.NoPage: pass

ss = ', '.join(sorted(Stops)) fs = '' for flag in sorted(Flags): fs += ", <tt><no" + "wiki>" + flag + "</no" + "wiki></tt>" fs = fs.lstrip(", ")

report = """

occurances of pronunciation section exceptions


 * from XML dump as of %s, checked against live wiki {{subst:CURRENTDAY}} {{subst:CURRENTMONTHNAME}} {{subst:CURRENTYEAR}}
 * see talk page for rules in effect
 * checks may not be perfect at this point
 * entries are not listed if AutoFormat would fix something, though perhaps not entirely
 * total AF will fix: %d
 * some entries are listed as "stops" and not shown
 * stops in effect: %s
 * from User:Robert Ullmann/Pronunciation exceptions/stops
 * specific strings flagged: %s
 * "blank" IPA, SAMPA, etc (i.e. "* SAMPA: //") are not reported
 * %d total problems, limit of %d shown, remainder listed in User:Robert Ullmann/Pronunciation exceptions/remains

Please do section edit and remove completed entries, the automation will then recheck them. If you do most of a section but not quite all, feel free to just blank the section, any leftovers will get picked up again.

""" % (xmldate.enXMLdate, AFcount, ss, fs, reps, replimit)

if test: report += "this is a test run, you want to look at User:Robert Ullmann/Pronunciation exceptions\n"

prev = '' s = 0 i = 1 for t in sorted(reports): if tkey(t) != prev: report += '\n==' + tkey(t) + '==\n\n' prev = tkey(t) s = 0 i = 1 s += 1 if s > 9: i += 1 report += '\n==' + tkey(t) + ' (%d)==\n\n' % i           s = 0 report += reports[t] + '\n'

wikipedia.setAction("regenerate, add more") page.put(report)

if not test: wikipedia.setAction("updating remainder") page = wikipedia.Page(site, "User:Robert Ullmann/Pronunciation exceptions/remains") page.put(rem)

# done

if __name__ == "__main__": try: main finally: wikipedia.stopme