User:Robert Ullmann/Mismatched wikisyntax/code


#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/Mismatched_wikisyntax/code

""" This bot looks for and executes replacements, customized for each run

This version looks for unmatched wikisyntax and parens

No command line arguments.

"""

import wikipedia import xmlreader import sys import re import pickle import xmldate import socket from mwapi import getwikitext

def safe(s):
    """Return a pickled rendering of *s* with its first character and last five removed.

    The result is used in this file both for console output and as the key
    into the shelve work cache, so the exact bytes matter: changing this
    function would orphan every key already stored in the cache.

    NOTE(review): the fixed [1:-5] slice presumably matches the text layout
    of Python 2's default pickle protocol (a one-character type code in front
    and a short trailer behind) -- confirm before running under another
    protocol or interpreter.
    """
    pickled = pickle.dumps(s)
    return pickled[1:-5]


# work cache, record time last looked at entry
# each record is key: lc:word, pickled with safe, value is integer time

import shelve
# persistent work cache shared by main(); opened as a module-level side effect
cache = shelve.open("mismatch")
from time import time


# we want to match () [] {}, mrd returns remainder

# opening bracket -> the closing bracket that must pair with it
_MRD_PAIRS = {'(': ')', '[': ']', '{': '}'}


def mrd(s):
    """Match brackets in *s* and return whatever could not be consumed.

    Scans *s* left to right.  Every opener "(", "[" or "{" is matched
    recursively against its own closer; correctly nested pairs are consumed.

    Returns:
        '' when every bracket in *s* is matched and properly nested,
        otherwise a non-empty remainder string.  Callers in this file only
        test the result for truthiness.
    """
    # Trim characters that can never be brackets off both ends so there is
    # less to walk.  (The original alphabet literal read "...qrpstv...",
    # i.e. it lacked "u"; that only affected how much text was left in a
    # non-empty remainder, never whether the remainder was empty.)
    s = s.strip(' .,;abcdefghijklmnopqrstuvwxyz')

    # look for an open:
    while s:
        head = s[0]

        # if we find a close, return it: the caller decides whether it pairs
        if head in (')', ']', '}'):
            return s

        if head in _MRD_PAIRS:
            rest = mrd(s[1:])
            if rest[0:1] == _MRD_PAIRS[head]:
                # pair closed; carry on with the text after the closer
                return mrd(rest[1:])
            # no closer, or the wrong one: report from this opener onwards
            return head + rest

        s = s[1:]

    # okay, nil remaining
    return s

# multiline templates
# Tstops holds template names whose call may span several lines; it starts
# empty and is filled by main().  It must be a set *instance*: binding the
# bare type `set` makes both `Tstops.add(x)` and `x in Tstops` raise.
Tstops = set()
# optional list/indent markup, then "{{", then the template name (group 1)
retstop = re.compile(r'[\#\*\s:\|]*\{\{([-a-zA-Z1-9 ]+)')

# some lines that start with parameter names, not | ... real crock this ...
Pars = ('1500s=', '1600s=', '1700s=', '1800s=', '1900s=', '2000s=',
        'passage=')

resampa = re.compile(r'\{\{(X-|)SAMPA.*?\}\}')
reslash = re.compile(r'/[^\{\}/]*\{[^\{\}/]*/')
rebrack = re.compile(r'\][^\{\}\]]*\{[^\{\}\]]*\]')
rercal = re.compile(r'\{\{R:(CAL|LSJ)\|.*?\}\}')
# NOTE(review): this copy of the file had an empty pattern here, which makes
# the "remove comments first" substitution in mismatch() a no-op.  The
# pattern looks to have been an HTML-comment matcher that was swallowed when
# the page was rendered; restored on that assumption -- confirm against the
# live source.
redecom = re.compile(r'<!--.*?-->')

def mismatch(s):
    """Return True when the wikitext line *s* has unbalanced (), [] or {}.

    The line is first screened for shapes that are expected to be unbalanced
    on their own (table markup, the tail of a template call, the first line
    of a template listed in Tstops, quotation parameter lines), then the
    SAMPA and R:CAL / R:LSJ templates are blanked out, and finally mrd()
    decides.

    Args:
        s: one line of entry text.

    Returns:
        True if mrd() leaves a non-empty remainder, False otherwise.
    """
    # remove comments first
    s = redecom.sub('', s)

    # temp: ignore esbot debris (should all be fixed now)
    # if 'esbot:catline' in s: return False

    # NOTE(review): a screening step is missing here.  In the copy this was
    # restored from, its string literals had been eaten by the page renderer;
    # what survives reads
    #     remove " in s:  a = s.split(" ", 1)
    #                     return mismatch(a[0]) or mismatch(a[1])
    # i.e. the line was split once around some marker and each half checked
    # on its own.  The marker cannot be recovered from here (a literal space
    # is clearly wrong), so the step is left out rather than guessed --
    # restore it from the live source.

    # tables, end of templates
    if s.startswith(('{|', '|}', '}}')):
        return False
    if s.lstrip() == '}}':
        return False

    # multiline templates
    mo = retstop.match(s)
    if mo and mo.group(1).strip() in Tstops and '}}' not in s:
        return False

    # just ignore crap:
    # NOTE(review): one more skip condition stood here; only
    #     if ''): return False
    # survives of it, so it is omitted -- restore from the live source.
    if s.startswith(Pars) and s.endswith('}}'):
        return False

    # remove SAMPA template, uses { in some cases
    s = resampa.sub(' ', s)
    # and anything that looks like it, for one {, either / or bracket
    s = reslash.sub(' ', s)
    s = rebrack.sub(' ', s)

    # R:CAL template used in Hebrew entries often contains ), ignore content
    # also R:LSJ
    s = rercal.sub(' ', s)

    return bool(mrd(s))


# (sporked from Tbot/script, no need to keep up to date):


# table of scripts, each is lowest character code point, highest code + 1, ISO script

# Each row is (lowest code point, highest code point + 1, script name);
# tkey() returns the name of the first row whose half-open range contains
# the character.
Scs = [
    (0x0080, 0x0250, 'Latin'),
    (0x0250, 0x02B0, 'IPA'),
    (0x0370, 0x0400, 'Greek'),
    (0x0400, 0x0530, 'Cyrillic'),
    (0x0530, 0x0590, 'Armenian'),
    (0x0590, 0x0600, 'Hebrew'),
    (0x0600, 0x0700, 'Arabic'),
    (0x0700, 0x0750, 'Syriac'),
    (0x0750, 0x0780, 'Arabic Ext'),
    (0x0900, 0x0980, 'Devanagari'),
    (0x0980, 0x0A00, 'Bengali'),
    (0x0C00, 0x0C80, 'Telugu'),
    (0x0D00, 0x0D80, 'Malayalam'),
    # was (0x1A00, 0x1100): low above high, so the row could never match;
    # the Georgian block starts at U+10A0
    (0x10A0, 0x1100, 'Georgian'),

    (0x1E00, 0x1F00, 'Latin Ext'),
    (0x1F00, 0x2000, 'Greek Ext'),

    (0x3040, 0x30A0, 'Hiragana'),
    (0x30A0, 0x3100, 'Katakana'),
    (0x3400, 0xA000, 'Han'),        # Han Ext A and Unified
    (0xAC00, 0xD800, 'Hangeul'),

    (0x20000, 0x2A6D7, 'Han Ext B'),  # Han Ext B
]

def tkey(word):
    """Generate a TOC key for a given word.

    Args:
        word: entry title (may be empty).

    Returns:
        The first character itself when it sorts at or below 'z' (this also
        covers the empty string); otherwise the script name from Scs whose
        range contains the first character.  If no range matches, a message
        is printed and the first character is returned.
    """
    # simple case first, also handles ''
    if word[:1] <= 'z':
        return word[:1]

    a = ord(word[0:1])
    if a >= 0xd800 and a < 0xdc00:
        # leading half of a surrogate pair; the second unit is needed too
        if len(word) < 2:
            return word  # ouch!
        b = ord(word[1:2])
        # "UTF-16" crap: combine both halves into the real code point
        a = (a - 0xd800) * 1024 + (b - 0xdc00) + 0x10000

    for low, high, scode in Scs:
        if low <= a < high:
            return scode

    # Parenthesised so the statement parses the same under Python 2 and 3;
    # the text printed is unchanged.
    print("no match for script for char code %x" % a)
    return word[:1]

def main:
# Driver for one run, as far as the statements below show: read the stop
# lists from two wiki pages, walk the XML dump "en-wikt.xml", flag entries
# in which mismatch() reports a line, re-check flagged entries against the
# current wiki text, and put a report page.  Takes no arguments and returns
# nothing.
#
# NOTE(review): this copy of the function is not valid Python as it stands.
#  - Statements are collapsed onto single lines, several of them behind a
#    leading "#", so they currently read as comments.
#  - Argument-less calls appear to have lost their "()" (def main, set,
#    getSite, forceLogin, dump.parse, text.splitlines, time, cache.sync,
#    cache.close, page.get, xp.urlname, page.aslink); as written they are
#    bare attribute references, not calls.
#  - Some string literals are visibly truncated (flagged where they occur).
# Restore from the live source rather than re-deriving by hand.

socket.setdefaulttimeout(240)

# list of entry names to ignore Stops = set

reports = { }

# make sure we are logged in   site = wikipedia.getSite site.forceLogin

# Stops is filled from every [[...]] link on the "stops" page, Tstops (the
# module-level set read by mismatch) from every {{temp|...}} on the
# "multiline" page.
# read Stops and Tstops page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax/stops") text = getwikitext(page) for s in re.findall(r'\[\[(.*?)\]\]', text): Stops.add(s)

page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax/multiline") text = getwikitext(page) for s in re.findall(r'\{\{temp\|(.*?)\}\}', text): Tstops.add(s)

print 'found %d stops, %d multilines' % (len(Stops), len(Tstops))

# get XML dump dump = xmlreader.XmlDump("en-wikt.xml")

# Counters: entries scanned, problems found, reports produced (reps) against
# the cap replimit, and cache writes since start (cis).  `fixed` is set here
# and not referenced again in this function.
entries = 0 probs = 0 fixed = 0 reps = 0 replimit = 1000 cis = 0

# In test mode only every tmod-th entry is examined and the report cap is
# divided by tmod.
# testing test = False tmod = 20 if test: replimit /= tmod print "in test mode"

# Titles containing ":" and empty titles are skipped.
for entry in dump.parse: text = entry.text title = entry.title if title.find(':') >= 0: continue # if title.find('/') >= 0: continue if not title: continue # ?

entries += 1 if entries % 10000 == 0: print "%d entries, %d problems" % (entries, probs)

# if test and title[0:1] != 'c': continue if test and entries % tmod != 0: continue

if title in Stops: continue

# screen entries: tag = False

for line in text.splitlines: if mismatch(line): tag = True break

# now see if it is something that should be reported:

if tag:

# The cache maps safe(title) to the time stored when the entry was last
# found fixed (see the "has been fixed" branch below); an entry stored less
# than 70 days ago is skipped without counting as a problem.
ckey = safe(title) # must be string for bsd dbm if ckey in cache: last = cache[ckey] if last > time - (70 * 24 * 3600): print "%s in 70 day cache, not checked" % safe(title) continue

probs += 1

# ... pick up current version from en.wikt

if reps < replimit:

print '%s is possible problem, getting current entry' % safe(title)

# NoPage leaves text empty (entry skipped below); a redirect substitutes
# placeholder text that contains no mismatch; KeyboardInterrupt is re-raised;
# any other exception skips the entry without touching the cache.
try: page = wikipedia.Page(site, title) # text = page.get text = getwikitext(page) except wikipedia.NoPage: print "Can't get %s from en.wikt!" % safe(page.aslink) text = '' except wikipedia.IsRedirectPage: print safe(title), 'is now a redirect page' text = '(redirect page)' # will be treated as fixed and added to cache except KeyboardInterrupt: raise KeyboardInterrupt except Exception, e:                   print "unknown exception, maybe timeout" continue # do this again next time

else: print '%s is possible problem' % safe(title)

if not text: continue

# check each line for mismatches

act = '' for line in text.splitlines: if mismatch(line): act = 'mismatched syntax' break

# if fixed, add to cache so we don't keep re-checking

# The cache is synced after every 20th write.
if not act: print "%s has been fixed" % safe(title) cache[ckey] = time # entry has been fixed for now cis += 1 if cis % 20 == 0: cache.sync continue

else: continue

# don't write any change to entry, report:

# NOTE(review): the report-line literal below is damaged in this copy.
# "* %s (edit)" has one placeholder but is formatted with (title, url), and
# the 'SECTXX' marker that is replaced further down does not occur in it.
# The pieces wrapped around `line` ('\n*:  ' and ' ') may be incomplete for
# the same reason -- confirm against the live source.
# s counts lines starting with "="; se keeps the count at the first
# mismatched line and is what replaces SECTXX.
if act: if reps < replimit: xp = wikipedia.Page(site, title) url = xp.urlname repline = \ "* %s (edit)" % (title, url) # go isolate the lines s = 0 se = 0 for line in text.splitlines: if line.startswith('='): s += 1 if mismatch(line): if not se: se = s                  repline += '\n*:  ' + line + ' ' print reps, safe(title), safe(line)

if reps < replimit: repline = repline.replace('SECTXX', "%d"%se) reports[title] = repline reps += 1

if test and reps > replimit: break

continue # no corrections here!

print "%d entries, %d problems" % (entries, probs) cache.close

# The report goes to the main page, or to the /test subpage in test mode.
# oldrep is assigned here and not referenced again in this function.
if not test: page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax") else: page = wikipedia.Page(site, "User:Robert Ullmann/Mismatched wikisyntax/test") try: oldrep = page.get except wikipedia.NoPage: pass

ss = ', '.join(sorted(Stops)) ts = ', '.join(sorted(Tstops))

# Report header template: five placeholders filled from
# (xmldate.enXMLdate, ss, ts, reps, replimit).
# NOTE(review): the "checked against live wiki" date is a fixed literal in
# this copy, and the text carries no wiki heading or link markup --
# presumably lost in rendering; confirm against the live source.
report = """

occurances of mismatched wikisyntax


 * from XML dump as of %s, checked against live wiki 9 July 2010
 * checks that, [], {} match, correctly nested
 * matches may not be perfect at this point
 * of course, some of these are not errors
 * some entries are listed as "stops" and not shown (smiley ;-)
 * stops in effect: %s
 * from User:Robert Ullmann/Mismatched wikisyntax/stops
 * multiline templates ignored at present: %s
 * from User:Robert Ullmann/Mismatched wikisyntax/multiline
 * contents of SAMPA template ignored, as SAMPA uses {, also tries to avoid others by looking for /...{.../, but may miss something or produce spurious errors as a result
 * also X-SAMPA, and brackets as well as slashes
 * %d total problems, limit of %d shown

Please do section edit and remove completed entries, the automation will then recheck them. If you do most of a section but not quite all, feel free to just blank the section, any leftovers will get picked up again.

""" % (xmldate.enXMLdate, ss, ts, reps, replimit)

if test: report += " this is a test run, you want to look at User:Robert Ullmann/Mismatched wikisyntax\n"

# Report lines are emitted in sorted title order under a "==key==" heading
# that changes whenever tkey(title) changes; when the per-heading counter s
# exceeds 9 a numbered continuation heading "==key (i)==" is started.
prev = '' s = 0 i = 1 for t in sorted(reports): if tkey(t) != prev: report += '\n==' + tkey(t) + '==\n\n' prev = tkey(t) s = 0 i = 1 s += 1 if s > 9: i += 1 report += '\n==' + tkey(t) + ' (%d)==\n\n' % i           s = 0 report += reports[t] + '\n'

wikipedia.setAction("regenerate, add more") page.put(report)

# done

# Script entry point.  main and wikipedia.stopme have to be *called*; as
# bare names they are evaluated and discarded, so nothing would run.
if __name__ == "__main__":
    try:
        main()
    finally:
        # always runs, even if main() raised
        wikipedia.stopme()