User:Buttermilch/updated KassadBot

This is the updated KassadBot code I have been running on this account (against the latest git snapshot of pywikibot). It may need further updates.

from __future__ import print_function
#!/usr/bin/python2
# -*- coding: utf-8  -*-

""" This bot looks for entries tagged for autoformatting, does a number of tasks

No command line arguments.

"""

import pywikibot from itertools import islice import sys import re import pickle import time import socket import json

def safe(s):
    """Return a trimmed slice of the pickled form of *s*, used in log output.

    *s* is pickled with the default protocol; the first byte and the last
    five bytes of the result are dropped.

    NOTE(review): presumably this relies on the Python 2 text pickle
    protocol to obtain an ASCII-escaped rendering of a unicode title that
    is safe to print -- confirm before running under another protocol.
    """
    pickled = pickle.dumps(s)
    return pickled[1:-5]

def lkey(l):
    """Return the sort key for a language name (or stray L2 header).

    Surrounding ``[`` / ``]`` characters are stripped first.  The key is the
    name prefixed with a one-character bucket:

    * ``'0'`` -- Translingual, and the bad "... alphabet" headers
    * ``'1'`` -- English
    * ``'2'`` -- every other language
    * ``'3'`` -- "See also" (bad L2 header)
    * ``'4'`` -- "References" (bad L2 header)

    An empty name returns the empty string, which sorts before everything.
    """
    n = l.strip('[]')
    if not n:
        return n

    if n == 'Translingual':
        return '0' + n
    if n == 'English':
        return '1' + n

    # bad L2 headers (compared case-insensitively)
    lowered = n.lower()
    if lowered in ('cyrillic alphabet', 'arabic alphabet'):
        return '0' + n
    if lowered == 'see also':
        return '3' + n
    if lowered == 'references':
        return '4' + n

    # handle names like !Kung and 'Auhelawa: move non-alpha to the end of key
    if not n[0].isalpha():
        n = n[1:] + n[0]

    return '2' + n


 * 1) this needs to be done here, else we won't be able to use the variables in subroutines which is bad

site = pywikibot.getSite("en", "wiktionary") site.forceLogin(sysop = False)
 * 1) make sure we are logged in
 * 1) site.forceLogin(sysop = True)


 * 1) get our config pages, throw exceptions: we have to stop if we can't read these

print("read headers") page = pywikibot.Page(site, "User:AutoFormat/Headers") headtab = page.get print("read contexts") page = pywikibot.Page(site, "User:AutoFormat/Contexts") ctxtab = page.get

# Lcodes: language code -> first (canonical) language name
# Ltocode: language name -> code, or None when the name is claimed by
# more than one code
Lcodes = { }
Ltocode = { }

# NOTE(review): the wikitext passed to expand_text() is empty in this copy;
# it looks like the template/module call that produces the JSON language
# table was lost when the page was extracted.  Restore it before running:
# json.loads() cannot parse an empty expansion.
i = 0
languages = json.loads(site.expand_text(""))
for (code, names) in languages.items():
    Lcodes[code] = names[0]
    for name in names:
        if name in Ltocode:
            print('warning: non-unique language name ' + name)
            Ltocode[name] = None
        else:
            Ltocode[name] = code
    # NOTE(review): counted once per code to match the message below; the
    # original indentation of this line is not recoverable -- confirm.
    i += 1
print("found %d language codes" % i)

# Module-level tables shared with main().
Scripts = { }

PSK = { }
from random import random
from math import log as ln
AH = set()      # filled by main() with every header name found in Level
Regex = { }     # name -> (compiled pattern, replacement), applied to whole entries
Prex = {}       # same shape as Regex; populated by main() with pronunciation rules
# newpages = set()


 * 1) work cache, record time last looked at entry
 * 2) each record is key: lc:word, pickled with safe, value is integer time

import shelve cache = shelve.open("af-cache")

def now():
    """Return the current wall-clock time as whole seconds (an int).

    Callers add second counts to this value to schedule work ("every 10-15
    minutes", "most of a day"), so it has to be wall-clock time.  The
    previous ``time.clock`` measures CPU time on Unix and no longer exists
    in Python 3.8+, hence ``time.time()``.
    """
    return int(time.time())

def prescreen():
    """Yield titles of random main-namespace pages, forever.

    Each time the random-page generator of the module-level ``site`` is
    exhausted a new one is requested, so this generator never terminates.
    """
    while True:  # indef repeat
        print('(%d, reading random pages)' % now())
        for page in site.randompages(namespaces=[0]):
            yield page.title()

# share timer with main
naptime = 0

def rcpages(site):
    """Generate the pages the bot should look at, indefinitely.

    generator which yields recentchanges, but not unpatrolled changes
    also entries in category
    in between, yields pages that satisfy the prescreen in random order

    For every title produced by ``prescreen()`` this yields, in order:

    1. the random page itself;
    2. when the category timer has expired, up to 500 members of
       "Category:Requests for autoformat";
    3. up to 10 recent-changes titles whose hold time has passed.

    Recent changes are re-read when their timer has expired; main-namespace
    titles not seen before are parked in ``hold`` with a release time.

    NOTE(review): the ``site`` argument is immediately replaced by a fresh
    en.wiktionary site object, exactly as in the original.
    """
    global naptime

    site = pywikibot.getSite("en", "wiktionary")
    cat = pywikibot.Category(site, "Category:Requests for autoformat")

    seen = set()

    # both timers start already expired so the first pass reads everything
    nextcat = now() - 1
    nextrc = now() - 1

    hold = { }  # title -> time at which it may be released

    for title in prescreen():

        seen.add(title)
        print('(%d, from random pages)' % now())
        page = pywikibot.Page(site, title)
        yield page

        nf = 0
        nd = 0

        # get our category, every 10-15 minutes or so
        if now() > nextcat:
            #cat.catlist(purge = True)
            #attn Kassad: raised priority of autoformat category - it seems too stuffed up to me. previous was 7
            for page in cat.articles():
                nf += 1
                if nf > 500: break   # just munch the cat, not too hungry ;-)
                # if len(hold) > 100 and nf > 1: break   # try to keep up, cat can wait? needed?
                print('(%d)' % now())
                seen.add(page.title())
                if page.title() in hold: del hold[page.title()]
                yield page
            nextcat = now() + 740

        # recent changes
        #reducing duration
        if now() > nextrc:
            print('(%d, reading recent changes)' % now())
            for rc in islice(site.recentchanges(), 5000):
                if rc['ns'] != 0: continue
                # other stray stuff in NS:0
                title = rc['title']
                # NOTE(review): ht is reset for every change, so the
                # "scatter" increment below never accumulates and every new
                # title is held for 50 seconds.  Statement order is kept
                # from the original; confirm whether ht was meant to be
                # initialised once before the loop.
                ht = 50
                if title not in seen:
                    seen.add(title)
                    hold[title] = now() + ht
                    # scatter out into future ... (numbers fairly arbitrary, but work well)
                    ht += 6
                    if ht > 21 * 3600: ht //= 7  # ? if more than most of a day
                    nf += 1
                    print("found: [%s] hold until %d" % (safe(title), hold[title]))
            nextrc = now() + 600

        pastime = now()
        # sorted() makes a copy, so deleting from hold inside the loop is safe
        for title in sorted(hold):
            # 10 on a pass is enough
            if nd > 9: break
            if hold[title] > pastime: continue
            print('(%d, rc held to %d)' % (now(), hold[title]))
            del hold[title]
            nd += 1
            page = pywikibot.Page(site, title)
            yield page

        print('(%d, %d held)' % (now(), len(hold)))

        continue


 * 1) now have some serious recursion fun!
 * 2) fuzzy returns string match score
 * 3) r is min required, calls may have neg r, may return value < r

def fuzzy(a, b, r):

if not a or len(a) < r: return 0 if not b or len(b) < r: return 0

if a == b: return len(a) if a[0] == b[0]: return 1 + fuzzy(a[1:], b[1:], r-1) if a[-1] == b[-1]: return 1 + fuzzy(a[:-1], b[:-1], r-1)

# try with each char forward p = a.find(b[0]) if p >= 0: sca = 1 + fuzzy(a[p+1:], b[1:], r-1) else: sca = 0

p = b.find(a[0]) if p >= 0: scb = 1 + fuzzy(b[p+1:], a[1:], r-1) else: scb = 0

# no match either/or way, skip this char, one or both if not sca and not scb: sk = fuzzy(a[1:], b[1:], r)	elif not sca: sk = fuzzy(a, b[1:], r)	elif not scb: sk = fuzzy(a[1:], b, r)	else: sk = 0

return max(sk, sca, scb)

# NOTE(review): this definition is damaged in this copy and cannot run as
# written.  After `pos = header.lower` the text jumps straight into code
# that reads a, b, i, j, an and bn, none of which are assigned anywhere in
# the visible text, and the parameters title and lang are never used.
# Presumably the rest of infline() and the start of a separate string-diff
# helper were lost when the page was extracted -- recover both from the
# original bot source rather than patching this fragment.
def infline(title, lang, header):

pos = header.lower if pos.startswith(' {{') and bn[:-j].endswith('|'): an = a[i-3:] bn = b[i-3:] j -= 3 # return '-' + a[i-3:][:11] + ' +' + b[i-3:][:7] # gaa ...

if j: return '-' + an[:-j] + ' +' + bn[:-j] else: return '-' + an + ' +' + bn

# okay, try that! not so pretty is it?


 * 1) sort language sections:

retransline = re.compile(r'\* \[*([^\]:\{\}]+?)\]*:')    # match an already canonicalized line retransreq = re.compile(r'\* \{\{trreq\|([^\}]+?)\}\}')  # trans req template retranstbc = re.compile(r'\* \{\{ttbc\|([^\}]+?)\}\}')   # trans to be checked, allow here? redetemp = re.compile(r'\{\{\w*\|') redechar = re.compile(r'[\{\}\|\[\]]') redecomm = re.compile(r'')

def nlen(s):

# simplest form: # return 1 + len(s)/135 # +1 for each length of line that will probably wrap (WAG) # this routine can be twaeked more if needed

# better: s2 = redetemp.sub(, s)	s2 = redechar.sub(, s2) s2 = redecomm.sub('', s2)

# dbg: # if len(s2) >= 85: print "long line (%d): %s" % (1+len(s2)/85, safe(s2))

return 1 + len(s2)/85

# reduce text to "safe" for wiki as a template parameter:
rewsafe = re.compile(r'[\{\}\[\]\|\<\>]+')

# match a see-only case:
reseeonly = re.compile(r"\{\{trans-top\|(.+?)\}\}\n+[ :']*[Ss]ee[ ':]*(\[\[.+?\]\])(.*)$", re.S)

def transort(tmo):
    """Return a sorted, column-balanced rewrite of one translation section.

    *tmo* is a regex match object whose ``group(0)`` is the whole section,
    from its ``{{trans-top`` line onwards.  The return value is one of:

    * the section with its language lines sorted by ``lkey`` and a
      ``{{trans-mid}}`` placed once about half of the estimated display
      lines have been emitted;
    * a ``{{trans-see|...}}`` template, when the section holds no language
      lines and consists only of a "see [[target]]" note;
    * the unchanged section prefixed with ``{{rfc-tsort|<problem>}}`` when
      a line could not be parsed.
    """
    ts = { }    # language ('' for the trans-top header) -> its line(s)
    tsk = { }   # language -> estimated display lines, from nlen()

    # take apart by language, treat header as "language" nil

    prob = ''   # first problem found, empty if none
    prev = ''   # language of the most recent language line
    k = 0       # total estimated display lines
    for tline in tmo.group(0).splitlines():
        if tline.startswith('{{trans-top'):
            if '' in ts:
                prob = "trans-top found inside section, missing trans-bottom?"
                break
            ts[''] = tline
            tsk[''] = 0
            continue
        if tline == '{{trans-mid}}': continue
        if tline == '{{trans-bottom}}': continue
        if not tline: continue
        mo = retransline.match(tline)
        if not mo: mo = retransreq.match(tline)
        if not mo: mo = retranstbc.match(tline)
        if mo:
            lang = mo.group(1)
            if lang in ts:
                prob = "duplicate language: " + lang
                break
            if lang.startswith('{{'):
                prob = "unexpected template: " + lang
                break
            # a language code is filed under its language name
            if lang in Lcodes:
                lang = Lcodes[lang]
            ts[lang] = tline
            nl = nlen(tline)
            tsk[lang] = nl
            k += nl
            prev = lang
            continue
        if tline.startswith('* '):
            prob = "unparsed language line: " + tline
            break
        # [tbd: treat ** as a sub language, eg key is "Chinese | Mandarin"]
        if tline.startswith('*:') or tline.startswith('**'):
            # allow both here; the line travels with the preceding language
            ts[prev] += '\n' + tline
            nl = nlen(tline)
            tsk[prev] += nl
            k += nl
            continue
        if tline.startswith(': ') and not prev:
            # e.g. : see reference
            ts[prev] += '\n' + tline
            tsk[prev] += 1
            k += 1
            continue
        if tline.startswith('<!--') and not prev:
            ts[prev] += '\n' + tline
            # no addition to counts
            continue
        prob = "unknown line format: " + tline
        break

    # blank section or nothing worth sorting, do nothing? um, format it default
    # if not k: return tmo.group(0)

    # pick up see-only case before looking at prob:
    if not prev:
        # no languages found
        mo = reseeonly.match(tmo.group(0))
        if mo:
            print("matched see in trans section")
            gloss = mo.group(1).strip()  # leaves ''s as an issue
            target = mo.group(2).strip()
            if '#' not in target and '|' not in target: target = target.strip('[]')
            rest = mo.group(3)
            # check remainder
            rest = rest.replace("{{trans-mid}}", '')
            rest = rest.replace("{{trans-bottom}}", '')
            if not rest.strip(" '\n"):
                if gloss == target: return "{{trans-see|" + target + "}}"
                else: return "{{trans-see|" + gloss + "|" + target + "}}"
            else: pass  # something else, leave alone or tag problem ...

    if prob:
        print("in trans section,", safe(prob))
        prob = rewsafe.sub(' ', prob)  # wiki-safe ;-)
        return "{{rfc-tsort|" + prob + "}}\n" + tmo.group(0)   # rfc tag + unchanged

    # re-assemble, balance columns

    m = 0   # display lines emitted so far
    parts = [ ]
    for lang in sorted(ts, key=lkey):
        parts.append(ts[lang] + '\n')
        m += tsk[lang]
        # floor division keeps the Python 2 integer semantics of the
        # original "/" so the split point does not move under Python 3
        if k and m >= (k + 1) // 2:
            parts.append('{{trans-mid}}\n')
            k = 0   # only one trans-mid
    tsnew = ''.join(parts)
    # if not m: tsnew += '{{trans-mid}}\n'
    if '{{trans-mid}}' not in tsnew: tsnew += '{{trans-mid}}\n'  # better test? should be the same as not m
    tsnew += '{{trans-bottom}}\n'

    return tsnew

def prokey(s):
    """Return the one-character sort key for a line of an entry's prolog.

    Keys in ascending order: ``'0'`` was-wotd, ``'1'`` sister-project
    templates, ``'2'`` zh-/ja- templates, ``'3'`` images/files, ``'5'``
    anything unrecognised (reported on stdout), ``'6'`` selfref, ``'7'``
    also/xsee/xalso, ``'9'`` blank lines.
    """
    # is (sorted) stable? as of Python 2.3, yes ;-)

    # simple prolog sort, LHS after RHS, unknown in the middle
    if s.startswith('{{was wotd'): return '0'  # moved in monobook
    if s.startswith(('{{wiki', '{{commons', '{{inter')): return '1'  # sister templates
    if s.startswith(('{{zh-', '{{ja-')): return '2'  # Chinese / Japanese floatright
    #attn Kassad: we now call them files. remove?
    if s.startswith(('[[Image', '[[image', '[[File', '[[file')): return '3'  # images

    # LHS:
    if s.startswith('{{selfref'): return '6'
    if s.startswith(('{{also', '{{xsee', '{{xalso')): return '7'

    # blank lines usually are at end, will be removed
    if not s: return '9'

    print("prolog sort: no key for %s" % safe(s))
    return '5'

def main: global naptime

socket.setdefaulttimeout(30)

# regex precomp, force headers to canonical: # first allows singleton = rehead1 = re.compile(r'(={2,6})(.+?)={2,6}(.*)$') rehead2 = re.compile(r'(={1,6})([^=<]+?)={1,6}(.*)$') rehead3 = re.compile(r'(={1,6})([^=<]+?)=+(.*)$') rehead4 = re.compile(r'(=+)([^=<]+)(.*)$') realleq = re.compile(r'=+$')

# L2 headers reL2head = re.compile(r'==?\s*([^=]+)={1,6}(.*)') # lang= on bad headers, so allow singleton ='s:	reheader = re.compile(r'(={3,6})\s*(.+?)={2,6}(.*)') reiwiki = re.compile(r'\[\([^\+?)\]\]\s*:(.*)') retrans2 = re.compile(r'\* \[\[([^\]]+?)\]\]\s*:(.*)') retrans3 = re.compile(r'\* ([^:]+?):(.*)') # the below should hopefully fix a bug that happened with Serbo-Croatian retrans4 = re.compile(r'\* ([\w\-]+)(.*)') # missing : retag = re.compile(r'\{\{rfc-auto(\|.*?|)}}') regender = re.compile(r"([mfcn])") reglossfix = re.compile(r'(.+)\(\d+\)$') retopgloss = re.compile(r'\{\{top(\|.*?|)}}$') recontext = re.compile(r"^# *\((.+?)\):? ?(.*)$", re.M)	recontext2 = re.compile(r"^# *\((.+?)\):? ?(.*)$", re.M) recontext3 = re.compile(r"^# *\{\{italbrac\|([^}]+?)}}:? ?(.*)$", re.M)	repronn = re.compile(r'Pronunciation \d+')

# be careful to match and remove newline in these unless they happen to be at the very end: rerfclevel = re.compile(r"^\{\{rfc-level\|.*\+.*\}\}\n?", re.M)	rerfcxphrase = re.compile(r"^\{\{rfc-xphrase\|.*\}\}\n?", re.M)	rerfcheader = re.compile(r"^\{\{rfc-header\|.*\}\}\n?", re.M)	rerfcsubst = re.compile(r"^\{\{rfc-subst\}\}\n?", re.M)	rerfcpronn = re.compile(r"^\{\{rfc-pron-n\|.*\}\}\n?", re.M)

# italbracs not on context/defn lines, template italbrac->i replacement separate # limited forms ... nowilink with pipes, no templates, look for : in mo.g3 # look for gloss, etc, * lines to start ... reibcomma = re.compile(r"^(\*\s*)\(([^\)^'^\|^\{]+):?\)(:?)")	reibcomma2 = re.compile(r"^(\*\s*)\(([^\)^'^\|^\{]+):?\)(:?)")

# match "stackable" format characters at start of lines, so we can have one space exactly restack = re.compile(r"^([:#\*]+)\s*")

# regex table (dict, name = tuple of compiled object and replacement) Regex['subst:PAGENAME'] = (re.compile(r'\{\{PAGENAME}}'), '{{subst:PAGENAME}}') Regex['template -cattag +context'] = (re.compile(r'\{\{cattag\|'), '{{context|') Regex['template -Unicode +unicode'] = (re.compile(r'\{\{Unicode\|'), '{{unicode|') Regex['template -Wikipedia +wikipedia'] = (re.compile(r'\{\{Wikipedia([\|\}])'), r'{{wikipedia\1') Regex['template -WP +wikipedia'] = (re.compile(r'\{\{WP([\|\}])'), r'{{wikipedia\1') Regex['template -Acronym +acronym'] = (re.compile(r'\{\{Acronym([\|\}])'), r'{{acronym\1') Regex['template -Initialism +initialism'] = (re.compile(r'\{\{Initialism([\|\}])'), r'{{initialism\1') Regex['template -Abbreviation +abbreviation'] = (re.compile(r'\{\{Abbreviation([\|\}])'), r'{{abbreviation\1') Regex['template -AHD +enPR'] = (re.compile(r'\{\{AHD([\|\}])'), r'{{enPR\1')

# translations #Regex['template -trans-bot +trans-bottom'] = (re.compile(r'\{\{trans-bot\}\}'), '{{trans-bottom}}') #Regex['template -trans-middle +trans-mid'] = (re.compile(r'\{\{trans-middle\}\}'), '{{trans-mid}}')

Regex['elided Translations to be checked header'] = (re.compile( r'^={3,6}Translations to be checked={3,6}\n*\{\{checktrans', re.M), '{{checktrans') Regex['elided Translations to be checked header and comment'] = (re.compile( r'^={3,6}Translations to be checked={3,6}\n*<!--\s*Remove this section.*\n*\{\{checktrans', re.M),		 '{{checktrans') Regex['checktrans and trans-top to checktrans-top'] = (re.compile( r'^\{\{checktrans\}\}\n*\{\{trans-top\|\w*lations to be \w*\}\}', re.M), '{{checktrans-top}}')

Regex['checktrans/top/mid/bottom to checktrans-top etc'] = (re.compile( r'^\{\{checktrans\}\}\n*\{\{top\}\}(.*?)^\{\{mid\}\}(.*?)^\{\{bottom\}\}', re.M|re.S),		 r'{{checktrans-top}}\1{{checktrans-mid}}\2{{checktrans-bottom}}')

Regex['template -ttbc-top +checktrans-top'] = (re.compile(r'\{\{ttbc-top\}\}'), '{{checktrans-top}}') Regex['template -ttbc-mid +checktrans-mid'] = (re.compile(r'\{\{ttbc-mid\}\}'), '{{checktrans-mid}}') Regex['template -ttbc-bottom +checktrans-bottom'] = (re.compile(r'\{\{ttbc-bottom\}\}'), 		 '{{checktrans-bottom}}')

Regex['template -trad +t'] = (re.compile(r'\{\{trad\|'), '{{t|') Regex['template -trad- +t-'] = (re.compile(r'\{\{trad-\|'), '{{t|')

Regex['un-indent {{also}} template'] = (re.compile(r'^:\{\{also\|', re.M), '{{also|')

# given name, preferred syntax

Regex['xx: to lang=xx in given name template'] = (		  re.compile(r'(\{\{given name[^\}]*?\|)\|?([-a-z]{2,10}):\}\}'), r'\1lang=\2}}') Regex['from language to from=language in given name template'] = (		  re.compile(r'(\{\{given name[^\}]*?\|)from ([-a-zA-Z ]+)\|?([\}\|])'), r'\1from=\2\3')

# table format lines, row divs to one "-" Regex['table |--* to |-'] = (re.compile(r'^\|--+', re.M), r'|-')

# stuff left from preload templates # careful this first one starts with 3 {'s, check previous character? not for now Regex['remove template subst detritus'] = (re.compile('\{\{\{[0-9a-z]+\|(.*?)\}\}\}'), r'\1') Regex['remove template subst detritus #if etc'] = (re.compile('\{\{#\w+:\|\|?\}\}'), r'') # temp for esbot leftovers: Regex['remove esbot:catline'] = (re.compile('\{\{esbot:catline.*\{\{ending\}{5,5}'), r'')

# see templates Regex['template -see +also'] = (re.compile(r'\{\{see\|'), r'{{also|') Regex['template -See +also'] = (re.compile(r'\{\{See\|'), r'{{also|') Regex['template -see also +also'] = (re.compile(r'\{\{see also\|'), r'{{also|') # fix Japanese sees, allow a line for kanjitab after header (do not use re.S)	Regex['Japanese see/also in section to ja-see-also'] = \ (re.compile(r'^(==Japanese==\n*.*\n*){\{(see|also)\|', re.M), \							 r'\1{{ja-see-also|')

Regex['add language in front of {{t}}'] = (re.compile(r'^\*? *\{\{t(\+|-|)\|([a-z-]+)\|', re.M), \				 r'* {{\2}}: {{t\1|\2|') # (a few more general Regex below)

StarTemp = set([ 'Han ref', 'ja-readings', 'ethnologue', 'websters-online', 'pedialite',					 'Hanja ref', 'Linguist List', 'IPA', 'SAMPA', 'enPR', 'ISO 639', 'R:1913' ]) restartemp = re.compile(r'\{\{(.+?)[\|\}]')

# trans lines gender templates regex, ordered list: Trex = [ ] # first replace ' cases with templates, look for leading space: Trex.append((re.compile(r" ([mfcn])"), r' {{\1}}')) Trex.append((re.compile(r" (pl|plural)"), ' {{p}}')) Trex.append((re.compile(r" (sg|sing|singular)"), ' {{s}}')) Trex.append((re.compile(r" m( and| or|,|/|) ?f"), ' {{m|f}}')) # now look for combinations: Trex.append((re.compile(r"\{\{([mfcn])}},? \{\{([fcn])}},? \{\{([cnps])}}"), r'{{\1|\2|\3}}')) Trex.append((re.compile(r"\{\{([mfcn])}},? \{\{([fcnps])}}"), r'{{\1|\2}}')) # hmmm... Trex.append((re.compile(r"\{\{t([\+\-]?)\|([^\|]*?)\|([^\|]*?)\|mf}}"), r'{{t\1|\2|\3|m|f}}'))

# match trans sections retransect = re.compile(r"^\{\{trans-top.*?^\{\{trans-bottom\}\}\n", re.M|re.S)

# Pronunciate # like Regex, but applied line by line only in pronunciation sections # use ^ and $ as needed with re.M for prescreen Prex['template enPR/IPA/SAMPA'] = \ (re.compile(r'^\*? ?([^ \{\|\}/]+), /([^\{\|\}/]+)/, /([^\|\}/]+)/$', re.M),					 r'* {{enPR|\1}}, {{IPA|/\2/}}, {{SAMPA|/\3/}}') Prex['template enPR/IPA/SAMPA (RP, UK, US)'] = \ (re.compile(r"^\*? ?\((RP|UK|US)\):? *" r'([^ \{\|\}/]+), /([^\{\|\}/]+)/, /([^\|\}/]+)/$', re.M),					 r'* {{a|\1}} {{enPR|\2}}, {{IPA|/\3/}}, {{SAMPA|/\4/}}') Prex['template enPR/IPA/SAMPA with {a}'] = \ (re.compile(r"^\*? ?(\{\{a\|[^\}]+\}\}):? *" r'([^ \{\|\}/]+), /([^\{\|\}/]+)/, /([^\|\}/]+)/$', re.M),					 r'* \1 {{enPR|\2}}, {{IPA|/\3/}}, {{SAMPA|/\4/}}')

Prex['+rhymes template'] = (re.compile("'*Rhymes:'* *\[\[[Rr]hymes:English:-(?P .+?)\|-(?P=s)\]\]"),								r'{{rhymes|\1}}')	# w/O "Rhymes:":	Prex['+rhymes template w/Rhymes: in link'] = \			(re.compile("^([\*:]+) *\[\[[Rr]hymes:English:-(?P .+?)\|Rhymes: -(?P=s)\]\]", re.M),							r'\1 {{rhymes|\2}}')	Prex['+rhymes template (Finnish)'] = (re.compile("'*Rhymes:'* *\[\[[Rr]hymes:Finnish:-(?P .+?)\|-(?P=s)\]\]"),								r'{{rhymes|\1|lang=fi}}')	Prex['+rhymes template w/Rhymes: in link (Finnish)'] = \			(re.compile("^([\*:]+) *\[\[[Rr]hymes:Finnish:-(?P .+?)\|Rhymes: -(?P=s)\]\]", re.M),								r'\1 {{rhymes|\2|lang=fi}}')	Prex['+rhymes template w/Rhymes: in link (French)'] = \			(re.compile("^([\*:]+) *\[\[[Rr]hymes:French:-(?P .+?)\|Rhymes: -(?P=s)\]\]", re.M),								r'\1 {{rhymes|\2|lang=fr}}')	Prex['+rhymes template (Icelandic)'] = \			(re.compile("'*Rhymes:'* *\[\[[Rr]hymes:Icelandic:-(?P .+?)\|-(?P=s)\]\]"),								r'{{rhymes|\1|lang=is}}')	Prex['template -Rhymes +rhymes'] = (re.compile(r'\{\{Rhymes([\|\}])'), r'{{rhymes\1')	# multiple rhymes (assume language matches! ;-)	Prex['add additional rhyme to template'] = \			(re.compile(r'(\{\{rhymes\|[^\}]+)\}\} *(,|or|) *\[\[[Rr]hymes:[A-Za-z -]+:-(?P .+?)\| ?-(?P=s)\]\]'),								r'\1|\3}}')

Prex["rm /'s from enPR template"] = (re.compile(r'\{\{enPR\|/([^ /\[\]\{\}]+?)/\}\}'), r'{{enPR|\1}}')

# RP, UK, and US in a wide variety of cases Prex['(RP) to {{a|RP}}'] = (re.compile(r"^\*? ?[\(\[\{']+RP[\]\)\}:']+", re.M), r'* {{a|RP}}') Prex['(UK) to {{a|UK}}'] = (re.compile(r"^\*? ?[\(\[\{']+UK[\]\)\}:']+", re.M), r'* {{a|UK}}') Prex['(US) to {{a|US}}'] = (re.compile(r"^\*? ?[\(\[\{']+US[\]\)\}:']+", re.M), r'* {{a|US}}') Prex['(italbrac RP) to {{a|RP}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*RP\]*\}\}:?", re.M), r'* {{a|RP}}') Prex['(italbrac UK) to {{a|UK}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*UK\]*\}\}:?", re.M), r'* {{a|UK}}') Prex['(italbrac US) to {{a|US}}'] = (re.compile(r"^\*? ?\{\{italbrac\|\[*US\]*\}\}:?", re.M), r'* {{a|US}}') Prex['IPA: WEAE to {{a|WEAE}} IPA:'] = \ (re.compile(r"^\*? ?IPA: [\(\[\{']+WEAE[\]\)\}:']+", re.M), r'* {{a|WEAE}} IPA:') Prex['(GenAm) to {{a|GenAm}}'] = (re.compile(r"^\*? ?\[\[w:G[^\|]+\|GenAm\]\]", re.M), r'* {{a|GenAM}}') Prex['(Canada) to {{a|Canada}}'] = (re.compile(r"^\*? ?[\(\[\{']+Canada[\]\)\}:']+", re.M), r'* {{a|Canada}}') Prex['(Australia) to {{a|Australia}}'] = \ (re.compile(r"^\*? ?[\(\[\{']+Australia[\]\)\}:']+", re.M), r'* {{a|Australia}}') Prex['(Aus) to {{a|Aus}}'] = (re.compile(r"^\*? ?[\(\[\{']+Aus[\]\)\}:']+", re.M), r'* {{a|Aus}}') Prex['(GenAm|US) to {{a|GenAm}}'] = \ (re.compile('^' + re.escape("* (US)"), re.M),			r'* {{a|GenAm}}') Prex['(RecPr|UK) to {{a|RP}}'] = \ (re.compile('^' + re.escape("* (UK)"), re.M),			r'* {{a|RP}}')

# untemplated SAMPA and IPA, several combinations, also for "AHD", allow an {{a}} template in front Prex['template IPA'] = \ (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)" r"\[*(w:IPA\||)IPA\]*:? *([/\[][^\{\|\}/\]]+?[/\]])$", re.M),			 r'* \1{{IPA|\3}}') Prex['template IPA -IPAchar'] = \ (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)" r"\[*(w:IPA\||)IPA\]*:? *\{\{IPAchar\|([/\[][^\{\|\}/\]]+?[/\]])\}\}$", re.M),			 r'* \1{{IPA|\3}}') Prex['template SAMPA'] = \ (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)" r"\[*(w:SAMPA\||)SAMPA\]*:? *([/\[])(|)([^\|\}/]+?)(|)([/\]])$", re.M),			 r'* \1{{SAMPA|\3\5\7}}') Prex['template enPR (was AHD)'] = \ (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)\[*(w:AHD\||)AHD\]*:? *([^ \{\|\}/]+?)$", re.M),			 r'* \1{{enPR|\3}}') Prex['template X-SAMPA'] = \ (re.compile(r"^\*? ?(\{\{a\|.+?\}\} *|)" r"\[*(w:X-SAMPA\||)X-SAMPA\]*:? *([/\[])(|)([^\{\|\}/]+?)(|)([/\]])$", re.M),			 r'* \1{{X-SAMPA|\3\5\7}}')

Prex['or/comma to multiple parameters in IPA template'] = \ (re.compile(r"\{\{IPA\|([^\}]+/)(, ?| or | or )(/[^\}]+)\}\}"), r'{{IPA|\1|\3}}') Prex['or/comma to multiple parameters in enPR template'] = \ (re.compile(r"\{\{enPR\|([^\}]+/)(, ?| or | or )(/[^\}]+)\}\}"), r'{{enPR|\1|\3}}') Prex['or/comma to multiple parameters in SAMPA template'] = \ (re.compile(r"\{\{SAMPA\|([^\}]+/)(, ?| or | or )(/[^\}]+)\}\}"), r'{{SAMPA|\1|\3}}')

# accent templates, try to cover the A-cai/Min Nan cases and others, up to 4

Prex['+accent template 1'] = (re.compile(r"^\* \("			r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"			r"\):?", re.M), r'* {{a|\2}}') Prex['+accent template 2'] = (re.compile(r"^\* \("			r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"			r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"			r"\):?", re.M), r'* {{a|\2|\4}}') Prex['+accent template 3'] = (re.compile(r"^\* \("			r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"			r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"			r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"			r"\):?", re.M), r'* {{a|\2|\4|\6}}') Prex['+accent template 4'] = (re.compile(r"^\* \("			r"\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"			r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"			r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"			r", *\[*(w?:?[A-Za-z -]+\||)([A-Za-z -]+)\]*"			r"\):?", re.M), r'* {{a|\2|\4|\6|\8}}')

# hyphenation ... Prex['+hyphenation template'] = (re.compile(r"'*Hyphenation:?'*:? *([^ \{\}]+)$"), r'{{hyphenation|\1}}') Prex['middot to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)' + '\u00B7' + '(.+?\}\})'),			r'\1|\2') Prex['hyphpt to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)' + '\u2027' + '(.+?\}\})'),			r'\1|\2') Prex['middot (HTML) to | in hyphenation template'] = (re.compile(r'(\{\{hyphenation\|.+?)&middot;(.+?\}\})'),			r'\1|\2')

# "blank" IPA/SAMPA/AHD, include new-line, so put these in general regex Regex['replaced IPA // with {{rfp}}'] = (re.compile(r'^\* \[\[IPA\]\]:? *//\n', re.M), '{{rfp}}\n') Regex['removed SAMPA //'] = (re.compile(r'^\* \[\[SAMPA\]\]:? *//\n', re.M), '') Regex['removed AHD //'] = (re.compile(r'^\* \[\[AHD\]\]:? *//\n', re.M), '')

# IPA template fix to add lang=, capture all but }} without = reIPAlang = re.compile(r'(\{\{IPA\|[^}=]+)\}\}')

# combine to single lines, lines are canonical repronsing3 = re.compile(r"^\* \{\{enPR\|(.*?)\}\}\n\* \{\{IPA\|(.*?)\}\}\n\* \{\{SAMPA\|(.*?)\}\}", re.M)	repronsing3a = re.compile(r"^\* \{\{IPA\|(.*?)\}\}\n\* \{\{SAMPA\|(.*?)\}\}\n\* \{\{enPR\|(.*?)\}\}", re.M)	repronsing2 = re.compile(r"^\* \{\{IPA\|(.*?)\}\}\n\* \{\{SAMPA\|(.*?)\}\}", re.M)

Level = { } L43 = { } POS = { } EOS = [ 'See also', 'References', 'External links', 'Anagrams', 'Dictionary notes', 'Trivia', 'Statistics'] TOS = [ 'Pronunciation', 'Alternative spellings', 'Alternative forms', 'Production' ] HAN = ['Han character', 'Kanji', 'Hanzi', 'Hanza'] HT = ( '{{abbreviation', '{{initialism', '{{acronym', '{{numeral' ) NS = { } Hfix = { } reheadtab = re.compile(r'\| (.*?)\|\|\s*([1-5/]*)\s*\|\|(.*?)\|\|(.*?)\|\|(.*)') i = 0 for line in headtab.splitlines: mo = reheadtab.match(line) if mo: header = mo.group(1).strip if mo.group(2).strip == '4/3': L43[header] = True Level[header] = 4 print("header %s is 4/3" % header) else: Level[header] = int(mo.group(2)) if mo.group(3).strip == 'NS': ns = NS[header] = True else: ns = False if mo.group(4).strip == 'POS': POS[header] = True

for variant in mo.group(5).split(','): variant = variant.lower.strip if not variant: continue Hfix[variant] = header """				if not ns:					if variant.endswith('s'): Hfix[variant[-1]] = header					else: Hfix[variant + 's'] = header				"""

Hfix[header.lower] = header if not ns: if header.endswith('s'): Hfix[header.lower[-1]] = header else: Hfix[header.lower + 's'] = header i += 1

print("found %d headers" % i)

# lots of possible ety sects, 1 to 24 for i in range(1, 25): Hfix['etymology %d'%i] = 'Etymology %d'%i Level['Etymology %d'%i] = 3

Contexts = { } rectxtab = re.compile(r"\|\s*(.*?)\s*\|\|(.*)") i = 0 for line in ctxtab.splitlines: mo = rectxtab.match(line) if mo: m1 = mo.group(1).strip m2 = mo.group(2).strip if not m1 or not m2: continue # only use first, table at top over-rides auto, templates over-ride redirects if m1 not in Contexts: Contexts[m1] = m2			i += 1

print("found %d context templates" % i)

# turn on/off for now contextp = True

entries = 0 fixed = 0

# (specific stats)

# Set up set of all headers that are valid (at L3 or higher)

for header in Level: AH.add(header)

# Sigh. True means prohibited from changing 4/3 levels Connel = True

for page in rcpages(site):

naptime += 3 days = (time.time - 1199145600) / 86400 # days since 1 Jan 08 if random < days/370: Connel = False   # some of the time, as they need to be checked else: Connel = True

title = page.title

print("page %s" % safe(title))

if title.lower == 'main page': print("skip Main page ...") continue

entries += 1

try: text = page.get origtext = text except pywikibot.NoPage: print("Can't get %s from en.wikt" % safe(title)) text = '' continue except pywikibot.IsRedirectPage: print("Redirect page %s" % safe(title)) text = '' continue except pywikibot.LockedPage: print("Locked/protected page %s" % safe(title)) text = '' continue

acts = set

mo = retag.search(text) if mo: if mo.group(1).strip(' |'): acts.add('rm tag:' + mo.group(1).strip(' |')) else: acts.add('rm tag') text = retag.sub('', text)

# rfc level trickery newtext = rerfclevel.sub('', text) if newtext != text: print('took out rfc-level') acts.add('rm rfc-level tag') text = newtext

# same for xphrase newtext = rerfcxphrase.sub('', text) if newtext != text: print('took out rfc-xphrase') acts.add('rm rfc-xphrase tag') text = newtext

# same for header newtext = rerfcheader.sub('', text) if newtext != text: print('took out rfc-header') acts.add('rm rfc-header tag') text = newtext

# same for subst newtext = rerfcsubst.sub('', text) if newtext != text: print('took out rfc-subst') acts.add('rm rfc-subst tag') text = newtext

# same for pron-n newtext = rerfcpronn.sub('', text) if newtext != text: print('took out rfc-pron-n') acts.add('rm rfc-pron-n tag') text = newtext

if '{{rfc' in text: rfc = True #elif '{{rfc|' in text: rfc = True #elif '{{rfc-' in text: rfc = True else: rfc = False rfcact = ''

# overall regex, using table

for rx in Regex: newtext = Regex[rx][0].sub(Regex[rx][1], text) if newtext != text: acts.add(rx) text = newtext

# report multiple blank lines (force save), will be taken out by parsing if '\n\n\n\n' in text: # 3 or more, not just 2 acts.add("remove multiple blank lines")

# categories found in the entry or implied by context and perhaps inflection templates catseen = set

# now parse. take the entry apart into languages (ha!)

curr = '*prolog' last = '' Lsect = { '*prolog':[ ], '*iwiki':[ ] } Lcats = { } waslinked = [ ] divs = 0 header = ''

for line in text.splitlines:

# canonical headers first. some later code is redundant, but so what? it does "rest" if line and line.startswith('='): mo = rehead1.match(line) if not mo: mo = rehead2.match(line) if not mo: mo = rehead3.match(line) if not mo: mo = rehead4.match(line) # must match 4 or else what?! (all eq = is the answer to this question!) if not mo: mo = realleq.match(line) if mo: acts.add("remove line of only ='s") else: acts.add('remove bogus = line') continue oline = line level = len(mo.group(1)) if not mo.group(2).strip: acts.add('removed nil header') # !!! line = '' else: line = '='*level + mo.group(2).strip + '='*level + mo.group(3) if line != oline: acts.add('format headers')

# L2 headers mo = reL2head.match(line) if mo: header = mo.group(1).strip hf = reunlink.sub(r'\1', header) if hf != header: if '|' in hf: hf = hf.split('|')[1] if hf not in Level: acts.add('unlink language header ' + hf) header = hf

# validate language [needs to be fixed for case before first lang section!] if header.capitalize in Level: """					if not rfc:						text = '{{rfc-level|' + header + ' as level 2 header}}\n' + text						rfcact = 'add rfc-level tag for L1/2 header ' + header						rfc = True					else:						print "(no edit, bad L2 header and rfc)"						rfcact = 'bad L1/2 header ' + header					""" # try fixing, move to min level for this header: level = Level[header.capitalize] acts.add('L1/2 header ' + header + ' to L' + str(level)) # header + anything else, will get moved later Lsect[curr].append('='*level + header + '='*level + mo.group(2)) continue # with current language section

# subst code template if header.startswith('{{'): if header[2:-2] in Lcodes: hf = Lcodes[header[2:-2]] acts.add('L2 header -' + header + ' +' + hf) header = hf

# check sort order if header and last and lkey(header) < lkey(last): acts.add(last + '/' + header + ' sorted into order') last = header

if header not in Lsect: Lsect[header] = [ ] Lcats[header] = [ ]

else: acts.add('merged ' + header + ' sections') curr = header if mo.group(2).strip: acts.add('stuff after L2 header moved') Lsect[curr].append(mo.group(2).strip) continue

# look for iwiki mo = reiwiki.match(line) if mo and mo.group(1) == title: Lsect['*iwiki'].append(line) continue

# wiki format + one space line = restack.sub(r'\1 ', line)

# trailing spaces if len(line) > 2 and line.startswith('=') and line.endswith(' '): acts.add('rm spaces after header') line = line.rstrip

# take out dividers if line.startswith(''): if line == '': divs += 1 continue

# other lines Lsect[curr].append(line)

# any language sections? if len(Lsect) == 2: # no, tag if not tagged if ( 'nolanguage/box' not in text and '{{wikify' not in text and				 '{{delete' not in text and '{{only in' not in text ): text = '{{subst:nolanguage}}\n' + text rfcact = 'tagged nolanguage' rfc = True else: print("(no edit, tagged nolanguage, wikify or delete)") continue # next entry

# each section

for lang in Lsect: if lang.startswith('*'): continue if lang in Ltocode: lcode = Ltocode[lang] else: lcode = ''

# find Etymologies first

etys = [ ] etycount = 0 fh = True for i, line in enumerate(Lsect[lang]): # look for ety headers, and Pronunciation first at L4				mo = reheader.match(line) if mo: level = len(mo.group(1)) header = mo.group(2).strip # rest = mo.group(3)

# special case pronunciation, occurs with some frequency

if fh and level != 3 and fuzzy(header.lower, 'pronunciation', 11) >= 11 and len(header) < 15: acts.add('Pronunciation changed to level 3') Lsect[lang][i] = '===' + header + '===' # and leave fh set: continue

# just do fuzzy! if fuzzy(header.lower, 'etymology', 7) >= 7 and len(header) < 20: if level != 3: if fh: # first header, okay to fix! acts.add('Etymology changed to level 3') # and leave fh set: etycount += 1 etys.append(i) continue elif not rfc: Lsect[lang][i] = line + '{{rfc-level|Etymology not at level 3|lang=%s}}'%lcode acts.add('+{{rfc-level|Etymology not at level 3}}') rfc = True continue else: print("(ety not at L3 and already rfc)") continue etycount += 1 etys.append(i) fh = False

# then fix/rewrite the ety headers, use sub to handle rest, report any changes (spacing an issue): if etycount: for i in range(etycount): line = Lsect[lang][etys[i]] # print 'ety check replace ' + line if etycount > 1: newline = reheader.sub(r'===Etymology %d===\3' % (i+1), line) else: newline = reheader.sub(r'===Etymology===\3', line) if newline.strip('= ') != line.strip('= '): acts.add('header -' + line.strip('= ') + ' +' + newline.strip('= ')) Lsect[lang][etys[i]] = newline

# sigh, think that's it? Sweet, if true...

# general format newlines = [ ]

inPos = inTrans = inPro = inext = defnext = False npos = 0 ety = nety = 0 levelact = '' rfctag = '' header = ''

for line in Lsect[lang]:

# minor spacing on stackable wiktext ... # already done line = restack.sub(r'\1 ', line)

# move cats, may be something else on the line too, or multicats ... # first we need a cat-present predicate catp = False for cat in recat.findall(line): ocat = cat catp = True catname = cat[11:-2].split('|')[0] catname = re.sub('_', ' ', catname).strip cf = cat.find('|') if cf > 0: cat = '[[Category:' + catname + cat[cf:]					else: cat = ''					# we have a canonical cat! is it a novel cat?					if cat in catseen:						 acts.add('rm dup cat [[:' + cat[2:])						 continue					catseen.add(cat)					# rm bad cats from substs left around, see how this works					if '{{{' in cat:						 acts.add('rm bad cat [[:' + cat[2:])						 continue					if cat != ocat: acts.add('canonical cats')					# see if it belongs in a different sect					catmove = False

if ':' in catname: catcode = catname.split(':')[0] if catcode in Lcodes: catlang = Lcodes[catcode] if catlang != lang and catlang in Lcats: acts.add('category ' + catname + ' moved to ' + catlang + ' section') Lcats[catlang].append(cat) catmove = True elif not catname.lstrip(' 01').startswith(lang) and not catname.endswith('derivations') and not catname.endswith('fiction') and not catname.endswith('mythology'): for other in Lcats: if other == lang: continue if catname.lstrip(' 01').startswith(other+' '): acts.add('category ' + catname + ' moved to ' + other + ' section') Lcats[other].append(cat) catmove = True break

# not moved if not catmove: Lcats[lang].append(cat) if catp: line = recat.sub('', line).strip if not line: continue

# headers mo = reheader.match(line) if mo: # hit header with no infl/defn line in previous section? if inext: acts.add('added inflection line for %s/%s' % (lang, header)) newlines.append(infline(title, lcode, header)) newlines.append('') inext = False defnext = True if defnext and header not in HAN: newlines.append('# {{defn|lang=%s}}' % Ltocode[lang]) acts.add('no definition line for %s/%s added {defn}' % (lang, header))

level = len(mo.group(1)) header = mo.group(2).strip rest = mo.group(3)

# unlink header hf = reunlink.sub(r'\1', header) if hf != header: if hf.find('|') > 0: hf = hf.split('|')[1] acts.add('header -' + header + ' +' + hf) header = hf

# fix header if header.lower in Hfix: hf = Hfix[header.lower] if hf != header: acts.add('header -' + header + ' +' + hf) header = hf

# try a fuzzy! if header.lower not in Hfix and not header.startswith('{{'): high = 0 replac = '' hf = header.strip('[]{}').lower for val in sorted(Hfix): # first character must match if hf[0] != val[0]: continue rawsc = fuzzy(hf, val, len(val) - 4) print(safe('fuzzy "%s" "%s" score %d' % (hf, val, rawsc))) if rawsc > high and rawsc > max(max(len(hf), len(val)) - 3, 5): high = rawsc replac = val print(safe('fuzzy for %s: %s score %d' % (hf, replac, high))) if high: hf = Hfix[replac] acts.add('header -' + header + ' +' + hf) header = hf

# tag Transitive and Intransitive verb, and Reflexive if header.lower in ('transitive verb', 'intransitive verb', 'reflexive verb') and not rfc: rfctag = '{{rfc-trverb|' + header + '}}' rfc = True # print "trans/intrans header: %s" % safe(header)

# tag X phrase if header.endswith(' phrase') and not rfc and not header.lower in ('prepositional phrase'): rfctag = '{{rfc-xphrase|' + header + '}}' rfc = True # print "X phrase header: %s" % safe(header)

# tag Pronunciation N headers, preventing the level errors later if repronn.match(header) and not rfc: # not sure if we need the header in the template, but follows the pattern (with a |) rfctag = '{{rfc-pron-n|' + header + '}}' rfc = True

# rfc unrecognized, ignore templates for now, use NS later if header.lower not in Hfix and not rfc and not header.startswith('{{'): rfctag = '{{rfc-header|' + header + '}}' rfc = True # print "unknown header: %s" % safe(header)

# min level, set and comp for nested ety if level == 3 and header.startswith("Etymology") and etycount > 1: ety = 1 nety += 1 npos = 0 push = False else: if ety: # if we are in the last ety sect, and see end of section things at L3: if level < 4 and nety == etycount and header in EOS: inPos = ety = 0 # and ... independent of connel flag, because we always push ;-)							if level < 4 and nety == etycount and header in L43: inPos = ety = 0

# push POS (or level 3?) sections down in ety, push flag because of Connel fix # may be a good idea anyway ... yes, but if we rfc, stop if ety and not rfc: if (header in POS and header not in HAN or header in TOS) and level == 3: level = 4 acts.add('header in ety sect ' + header + ' to L' + str(level)) if header == 'Pronunciation': rfctag = '{{rfc-level|check placement of Pronunciation}}' push = True elif header in POS and header not in HAN or header in TOS: # at correct level! (or too deep already) push = False elif push and header in Level and (level == 4 or level < Level[header] + ety): level += 1 acts.add('header in ety sect ' + header + ' to L' + str(level)) elif level < 4: push = False

# code to shift header levels (general case in POS), disabled per Connel, 18.4.7 if inPos and header in L43: if npos < 2 and level < 4 + ety: if not Connel: level = 4 + ety acts.add('header ' + header + ' to L' + str(level)) else: levelact = ' (AutoFormat would have corrected level of ' + header +')' elif inPos and header in Level: if level < Level[header] + ety: if not Connel: level = Level[header] + ety acts.add('header ' + header + ' to L' + str(level)) else: levelact = ' (AutoFormat would have corrected level of ' + header +')'

# now tag remaining problems if any, various cases # should all contain "+" for the re-visit trick ... if not rfc: if level == 4 + ety and not inPos and header in POS and header not in NS: rfctag = '{{rfc-level|' + header + ' at L4+ not in L3 Ety section' + levelact + '}}' elif level == 4 + ety and not inPos and header in Level and header not in NS: rfctag = '{{rfc-level|' + header + ' at L4+ not in L3 POS section' + levelact + '}}' elif level == 3 + ety and header.startswith('Translation'): rfctag = '{{rfc-level|' + header + ' at L3+' + levelact + '}}' elif level == 5 + ety and not inTrans and header.startswith('Translations to'): rfctag = '{{rfc-level|' + header + ' at L5+, not in Translations' + levelact + '}}'

# blank line newlines.append('')

# header + anything else that wasn't blank newlines.append('='*level + header + '='*level) if rest.strip: if not rest.startswith('{{rfc-'): acts.add('moved stuff after ' + header + ' header') newlines.append(rest.strip) # Usage notes can be anywhere (see ELE) if 'rfc-level|Usage notes' in rfctag: rfctag = '' # suppress the "AF would have" now, just don't tag: if "AutoFormat would have" in rfctag: rfctag = '' if rfctag: if lcode: rfctag = rfctag[:-2] + '|lang=%s}}'%lcode acts.add('+' + rfctag) if 'check placement' not in rfctag: rfc = True newlines.append(rfctag) rfctag = ''

# set flags: inext = defnext = False if level < 4 + ety and (header in POS or header.startswith(HT)): inext = inPos = True npos += 1 elif level < 4 + ety: inPos = False inTrans = (header == 'Translations') tt = False inPro = (header == 'Pronunciation') continue

# look for inflection line if inext: if line.startswith('{{') and not line.startswith('{{wikipedia') or line.startswith("'''") or  \ fuzzy(line, title, len(title) - 1) > len(title) - 1: if line == title: acts.add('replace unformatted headword') continue inext = False defnext = True if line and line.startswith('#'): acts.add('added inflection line for %s/%s' % (lang, header)) newlines.append(infline(title, lcode, header)) defnext = True inext = False # and also do next case for defnext # elide blanks above inflection line if not line: continue

# look for definition lines if defnext and line.startswith('#'): newlines.append('') defnext = False # # used where it shouldn't be				if line.startswith('#') and header not in POS: if header in TOS or header in EOS or (header in Level and Level[header] == 4): line = '*' + line[1:] acts.add("-# +* in %s section" % header)

# serious stuff ...

if line.startswith('# '):

# look for context tag if lang in Ltocode: ctxn = 1 mo = recontext.match(line) if not mo: ctxn = 2 mo = recontext2.match(line) if not mo: ctxn = 3 mo = recontext3.match(line) if mo: print("match context tag %s" % safe(mo.group(1))) tname = cpar(mo.group(1), Contexts) if mo and tname: if lang != 'English': tname += '|lang=' + Ltocode[lang] if contextp and ctxn == 1: acts.add("-(" + mo.group(1) + ") +{{" + tname + "}}") line = recontext.sub(r'# {{' + tname + r'}} \2', line) elif contextp and ctxn == 2: acts.add("-(" + mo.group(1) + ") +{{" + tname + "}}") line = recontext2.sub(r'# {{' + tname + r'}} \2', line) elif contextp and ctxn == 3: acts.add("-{{italbrac|" + mo.group(1) + "}} +{{" + tname + "}}") line = recontext3.sub(r'# {{' + tname + r'}} \2', line) else: print("would have replaced %s with %s" % (safe(mo.group(1)), safe(tname))) # elide cats that correspond for catname in tname.split('|'): if catname == 'context' or catname.startswith('lang='): continue catname = catname[0].upper + catname[1:] # code is prefix ... if lang != 'English': catname = Ltocode[lang] + ':' + catname if contextp: catseen.add('') # catseen.add('') print("added catseen %s" % safe(catname))

# wikilinking? """				# (remember to correct for spacing)				elif not line.startswith('#') and not inTrans and "''" in line:					# look for italbrac cases not on defn lines					newl = reibcomma.sub(ibsub, line)					newl = reibcomma2.sub(ibsub, newl)					if newl != line:						# acts.add('-' + line + ' +' + newl)						# acts.add('template i')						# in pronunciation, use a, anywhere else, we want i-c if at start of * line						if inPro:							newl = re.sub(r'\{\{(i|i-c)\|', '{{a|', newl)						else:							newl = re.sub(r'\{\{i\|', '{{i-c|', newl)						acts.add(sdif(line, newl))						line = newl					# think that will work?				""" # translations lines

# stopgap check: (should be improved, tsort knows haow to handle this) if '{{ttbc|' in line: inTrans = False

if inTrans: # special indent rule, we know there is a previous line if line.startswith(': ') and newlines[-1:][0].startswith('*'): acts.add('-: +*: in trans') line = '*' + line # similar rule for :*, we leave ** alone (is correct for grouped language) # may have intended **, but this is better than leaving it :* if line.startswith(':* ') and newlines[-1:][0].startswith('*'): acts.add('-:* +*: in trans') line = '*:' + line[2:] was = False mo = retrans1.match(line) if not mo: mo = retrans2.match(line) if mo: was = True if not mo: mo = retrans3.match(line) if not mo: mo = retrans4.match(line) if mo: # missing ':' tlang = mo.group(1).strip acts.add("added : after %s in translations" % tlang) if mo: tlang = mo.group(1).strip if was and tlang.find('|') > 0: tlang = tlang.split('|')[1] trest = mo.group(2).strip

if tlang.startswith('{{') and tlang[2:-2] in Lcodes: acts.add('subst %s in trans' % tlang) tlang = Lcodes[tlang[2:-2]] was = False

if was: acts.add('trans unlink ' + tlang)

# conform gender specification templates # tr = regender.sub(r'{{\1}}', trest) tr = trest for rx in Trex: tr = rx[0].sub(rx[1], tr) if tr != trest: #acts.add('gender -' + trest + ' +' + tr) acts.add('gender ' + sdif(trest, tr)) trest = tr

if trest: line = '* ' + tlang + ': ' + trest else: line = '* ' + tlang + ':'

# convert templates # has to be a non-blank previous line, we are in trans section

if line == '{{rfc-trans}}': inTrans = False if line == '{{checktrans}}': inTrans = False if line == '{{checktrans-top}}': inTrans = False if line == '{{ttbc-top}}': inTrans = False

mo = retopgloss.match(line) if mo: if mo.group(1): gloss = mo.group(1)[1:] else: prev = newlines[-1:][0] while not prev: newlines = newlines[:-1] prev = newlines[-1:][0] if prev.startswith(';'): gloss = prev[1:] elif prev.startswith("") and prev.endswith(""): gloss = prev[3:-3] else: gloss = '' if gloss: newlines = newlines[:-1] if gloss: gloss = reglossfix.sub(r'\1', gloss).strip prev = line line = '{{trans-top|' + gloss + '}}' # <- else: line = '{{trans-top}}' acts.add('-' + prev + ' +' + line) tt = True if tt and line == '{{mid}}': line = '{{trans-mid}}' if tt and line == '{{bottom}}': newlines.append('{{trans-bottom}}') # add blank line line = '' tt = False

# end of trans

# templates that should have * outside them mo = restartemp.match(line) if mo and mo.group(1) in StarTemp: line = '* ' + line acts.add('* before ' + mo.group(1))

# pronunciation specific if inPro: refire = True while refire: refire = False for rx in Prex: if "enPR" in rx and lcode != "en": continue line, k = Prex[rx][0].subn(Prex[rx][1], line) if k:								acts.add(rx) refire = True # fire ruleset again

if 'IPA' in line and lcode and lcode != 'en' and '|lang=' not in line: line, k = reIPAlang.subn(r'\1|lang=' + lcode + '}}', line) if k: acts.add('added lang=' + lcode + ' to IPA')

if line == '{{rfp}}' and lcode and lcode != 'en': line = '{{rfp|lang=' + lcode + '}}' acts.add('added lang=' + lcode + ' to rfp')

# move {{also}} to prolog, we are in a language section if line.startswith("{{also|"): Lsect['*prolog'].append(line) acts.add("moved {{also}} to prolog") continue

# all else newlines.append(line)

# at end with no infl / defn line in previous section? if inext: acts.add('added inflection line for %s/%s' % (lang, header)) newlines.append(infline(title, lcode, header)) newlines.append('') inext = False defnext = True if defnext and (header not in HAN or npos == 1): newlines.append('# {{defn|lang=%s}}' % Ltocode[lang]) acts.add('no definition line for %s/%s added {defn}' % (lang, header))

# done with sect Lsect[lang] = newlines

# reassemble ...

newtext = '' prior = False

# sort prolog, and add to newtext if len(Lsect) > 2: pcopy = sorted(Lsect['*prolog'], key=prokey) # shallow copy, sorted if pcopy != Lsect['*prolog']: acts.add('sorted prolog') else: pcopy = Lsect['*prolog'] # no language sections, leave "prolog" alone for line in pcopy: # no blank lines if line: newtext += line + '\n' if line.startswith('=') and not rfc: newtext += '{{rfc-level|header line in prolog, before first L2 header}}\n' acts.add('tagged header before first L2 header') del Lsect['*prolog'] blank = True # not really, this is to suppress blank before 1st L2 header

for lang in sorted(Lsect, key=lkey): if lang == '*iwiki': continue if prior: if not blank: newtext += '\n' newtext += '\n\n' divs -= 1 prior = True if lang not in waslinked: newtext += '==' + lang + '==\n' else: newtext += '==' + lang + '==\n' blank = False for line in Lsect[lang]: # no dup blank lines if line or not blank: newtext += line + '\n' if line: blank = False else: blank = True if Lcats[lang]: if not blank: newtext += '\n' # (note lkey is a different function, but does strip brackets, so works ...) for cat in sorted(Lcats[lang], key=lkey): newtext += cat + '\n' blank = False del Lsect[lang]

# residual tag(s): if ('{{{' in newtext and '}}}' in newtext) or '{{#' in newtext: acts.add('+{{rfc-subst}} syntax tag') newtext += '{{rfc-subst}}\n\n' # force newline even if at end blank = True

# add the iwikis if not blank: newtext += '\n' for line in Lsect['*iwiki']: # no blank lines if line: newtext += line + '\n'

if divs != 0: acts.add("fixed 's")

# rfc-level, etc trickery for rfname in ('level', 'xphrase', 'header', 'subst', 'pron-n'): if 'rm rfc-' + rfname + ' tag' in acts: for ac in sorted(acts): if ac.startswith('+{{rfc-' + rfname): acts.remove('rm rfc-' + rfname + ' tag') acts.remove(ac) print('elided -' + rfname + ' +' + rfname) break

# sort translations if any, if not tagged already: if "{{trans-top" in newtext and "{{rfc-tsort" not in newtext: new2 = retransect.sub(transort, newtext) if new2 != newtext: if "{{trans-see" in new2 and "{{trans-see" not in newtext: acts.add("+trans-see template") if "{{rfc-tsort" not in new2: acts.add("sorted/rebalanced translations") else: acts.add("tagged translations table problem") newtext = new2

# do some combining of pron lines, now that we've done the rulesets: newtext, k = repronsing3.subn(r"* {{enPR|\1}}, {{IPA|\2}}, {{SAMPA|\3}}", newtext) if k: acts.add("combined enPR, IPA, SAMPA on one line") # variant order newtext, k = repronsing3a.subn(r"* {{enPR|\3}}, {{IPA|\1}}, {{SAMPA|\2}}", newtext) if k: acts.add("combined enPR, IPA, SAMPA on one line") newtext, k = repronsing2.subn(r"* {{IPA|\1}}, {{SAMPA|\2}}", newtext) if k: acts.add("combined IPA and SAMPA on one line")

# if page isn't "countable", see if we can add a link in a form-of template #looks like this isn't quite supported anymore. commented out. #if '[[Wiktionary:Page count}}'			#acts.add("+{{count page}} for statistics")

# do minor spacing 1% of the time that there is nothing else to do		if not acts and random < 0.01 and newtext.rstrip(' \n') != text.rstrip(' \n'): acts.add('minor spacing')

# if we added a major rfc, just do that, dump the rest of the work!! if rfcact: acts = set acts.add(rfcact) newtext = text

act = ', '.join(sorted(acts))

# some change, write it (even just rm tag) if act:

fixed += 1 naptime /= 2

print("format %s: %s" % (safe(title), safe(act)))

# try to fix the entry try: page.text = newtext page.save(comment=u'User:KassadBot reincarnate: ' + act) except pywikibot.PageNotSaved: print("failed to save page") # other action? except socket.timeout: print("socket timeout, maybe not saving page") except socket.error: print("socket error, maybe not saving page") # put throttle will do: if not saved: time.sleep(30) #retries -= 1

# end loop

print("entries fixed %d" % fixed)

# done

# Script entry point: run the bot only when executed directly, not on import.
if __name__ == "__main__":
    try:
        # Actually invoke the bot; a bare `main` is a no-op name expression.
        main()
    finally:
        # Always call pywikibot's shutdown hook, even if main() raised,
        # so the framework is stopped cleanly.
        pywikibot.stopme()