User:AutoFormat/code/contexts


#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:AutoFormat/code/contexts

""" This code looks for contexts formatted as ( ) on definition lines

Writes AF control file, context templates list, context exceptions list

No command line arguments.

writes report """

import wikipedia import xmlreader import sys import re import pickle import xmldate from mwapi import getwikitext

def safe(s):
    """Return a printable, escaped form of the string *s*.

    The value is pickled with the text protocol (protocol 0) and the pickle
    framing is cut away: the one-character opcode in front and the five
    trailing characters (newline, memo entry ``p0``, newline, stop ``.``)
    that follow the payload of a single pickled string.

    Args:
        s: the string to render; a unicode string comes back
            raw-unicode-escaped.

    Returns:
        The payload part of the protocol-0 pickle of *s*.
    """
    # Protocol 0 is requested explicitly: the slice below only makes sense
    # for the text protocol's layout.
    pickled = pickle.dumps(s, 0)
    return pickled[1:len(pickled) - 5]


# parse spec

# Patterns used by respec() to rewrite template parameters; each one expects
# the parameter value to be terminated by '|'.

# label= whose value contains a [[wikilink]]: text before, link target, text after
relab1 = re.compile(r'label=([^\|]*)\[\[([^\]]+)\]\]([^\|]*)\|')
# label= with a plain value (no square brackets)
relab2 = re.compile(r'label=([^\|\[\]]+)\|')
# topcat= value
retc = re.compile(r'topcat=([^\|]+)\|')
# poscat= value
repc = re.compile(r'poscat=([^\|]+)\|')
# regcat= value
rerc = re.compile(r'regcat=([^\|]+)\|')
# bare cat= value; the preceding non-word character is captured so that
# topcat=/poscat=/regcat=/tcat= are not matched by this pattern
recc = re.compile(r'([^\w])cat=([^\|]+)\|')
# lang= given as {{#if:{{{lang|}}}|{{{lang}}}|<default>}}; captures <default>
relang = re.compile(r'lang=\{\{#if:\{\{\{lang\|}}}\|\{\{\{lang}}}\|([^}]+)}}\|')
# an underscore directly before '|'
respace = re.compile(r'_\|')
# tcat= value
retcat = re.compile(r'tcat=([^\|]+)\|')

def respec(spec):
    """Rewrite the first line of a context-label template as a readable summary.

    The leading template call is cut off, the known parameters (label,
    topcat, poscat, regcat, cat, lang) are rewritten as ``name:value, ``
    items, and leftover separators are stripped.

    Args:
        spec: first line of the template's wikitext.

    Returns:
        The summary string.  It is ``(not a context label)`` when the line
        contains no ``{{context`` call, and it is padded with one space on
        each side when a ``{`` is still present after rewriting.
    """
    # NOTE(review): both literals look altered by the page rendering this
    # text was copied from -- the slice lengths were hard-coded as 21 and 11,
    # which match neither literal as it stands.  The slices are now derived
    # from the literals so they cannot disagree; confirm the intended
    # prefix/suffix against the original script.
    prefix = '{{context {$undefined$}|'
    suffix = ' '

    if spec.startswith(prefix):
        spec = spec[len(prefix):]
    elif '{{context' not in spec:
        spec = "(not a context label)"
    if spec.endswith(suffix):
        spec = spec[:-len(suffix)]

    # every rewrite pattern expects a '|' after the parameter value
    spec += '|'
    spec = relab1.sub(r'label:\1\2\3, ', spec)
    spec = relab2.sub(r'label:\1, ', spec)
    spec = retc.sub(r'topic category:\1, ', spec)
    spec = repc.sub(r'POS category:\1, ', spec)
    spec = rerc.sub(r'regional category:\1, ', spec)
    spec = recc.sub(r'\1category:\2, ', spec)
    spec = relang.sub(r'default language:\1, ', spec)
    spec = respace.sub(r'(space), ', spec)
    spec = spec.strip(' ,|')

    # braces left over mean the line was not fully understood
    if '{' in spec:
        spec = ' ' + spec + ' '

    return spec

def main:
# Entry point: reads the XML dump "en-wikt.xml", collects context-label
# templates, template redirects and the bracketed contexts found on
# definition lines, then writes three wiki pages:
# 'User:AutoFormat/Contexts', 'User:Robert Ullmann/Context labels' and
# 'User:Robert Ullmann/Contexts'.
#
# NOTE(review): this function looks like it was copied from a rendered wiki
# page: statements are run together on single lines, indentation is gone and
# zero-argument call parentheses appear to be missing (e.g. "def main:",
# "dump.parse", ".lower", ".splitlines[0]", ".get", ".aslink").  It is not
# valid Python as it stands.  The notes below flag the places where text is
# missing; restore from the original script before re-indenting.

# make sure we are logged in   site = wikipedia.getSite site.forceLogin wikipedia.setAction('writing report')

# get XML dump dump = xmlreader.XmlDump("en-wikt.xml")

# Progress counters, printed every 10000 entries and once at the end.
entries = 0 words = 0 ctxs = 0

# Accumulators: Contexts = occurrence count per context string; Examples =
# entry titles per context; Templates = lower-cased template name -> template
# name; Redirs = redirecting title (minus its first 9 characters) -> target;
# First / Specs / Date / Cats / Bad = per-template first line, text for the
# "Specification" column, date label, category list and problem notes.
Contexts = { } Examples = { } Templates = { } Redirs = { } First = { } Specs = { } Date = { } Cats = { } Bad = { }

# recontext: a definition line ("# ...") that opens with a parenthesis or
# "{{italbrac|"; group 2 is the bracketed text.  reredir: a
# "#redirect [[template:...]]" line.  recats: names inside [[Category:...]]
# links.  reiwiki: a [[xx:...]] link whose prefix is 2-9 characters of a-z
# or '-'.
recontext = re.compile(r"^# *(\(|\)|\{\{italbrac\|)(.+?)(\(|\)|}})", re.M)   reredir = re.compile(r"#redirect\s*\[\[template:(.*)\]\]", re.I)    recats = re.compile(r"\[\[Category:([^\|\]]*)", re.I)    reiwiki = re.compile(r'\[\[[a-z-]{2,9}:.+\]\]')

for entry in dump.parse: text = entry.text title = entry.title

entries += 1 if entries % 10000 == 0: print "%d entries, %d words, %d contexts" % (entries, words, ctxs)

# Text starting with '#' is handled as a redirect.  reredir.match() only
# succeeds for redirects to a template; those are stored in Redirs under
# title[9:] (9 is the length of 'Template:').
# NOTE(review): the nesting level of the trailing "continue" cannot be
# determined from this collapsed line.
# skip redirects, unless Templates if text and text[0] == '#': mo = reredir.match(text) if mo: Redirs[title[9:]] = mo.group(1) print "redirect: %s to %s" % (safe(title[9:]), safe(mo.group(1))) continue

# Template pages containing '{{context' are recorded as context-label
# templates, except titles containing '/', 'context', 'checklabel' or 'pos-'.
# NOTE(review): xmldate.enXMLdate[:-5] drops the last five characters (the
# inline comment says "sans year"); whether enXMLdate is an attribute or a
# call that lost its parentheses is not visible here.
# look for templates if title.startswith('Template:') and text.find('{{context') >= 0: # but not the templates themselves! if title.find('/') >= 0: continue if title.find('context') >= 0: continue if title.find('checklabel') >= 0: continue if title.find('pos-') >= 0: continue tname = title[9:] Templates[tname.lower] = tname First[u'' + tname] = text.splitlines[0] Date[tname] = xmldate.enXMLdate[:-5] # sans year

print "template: ", safe(tname)

spec = respec(First[tname]) # if spec is "bad" (contains { is a good indication) pick up current, also other stuff

# A template is flagged bad when its spec still contains a brace, its text
# contains an interwiki-style link, or "}}" is followed by a newline and
# "<noinc".
bad = False if '{' in spec or '}' in spec: bad = True if reiwiki.search(text): bad = True if '}}\n<noinc' in text: bad = True

# For flagged templates the current page text is fetched with getwikitext();
# when nothing is obtained the dump text is kept (text = oldtext).
# NOTE(review): the date label '9 July' is hard-coded.
if bad: oldtext = text print '   getting current version' try: page = wikipedia.Page(site, 'Template:' + tname) # text = page.get(sysop = True) # for protected pages text = getwikitext(page) except wikipedia.NoPage: print "Can't get %s from en.wikt" % safe(page.aslink) text = '' Date[tname] = "can't access" except wikipedia.IsRedirectPage, target: print "Page %s is now a redirect" % safe(page.aslink) text = '' Date[tname] = '9 July' First[u'' + tname] = u'redirect to ' + target[0] if not text: text = oldtext else: First[u'' + tname] = text.splitlines[0] Date[tname] = '9 July' spec = respec(First[tname])

# A tcat= parameter is removed from the spec and added to the category list
# as "<Value> context labels".
cats = '' # extract tcat from spec mo = retcat.search(spec + '|') if mo: spec = retcat.sub('', spec + '|').strip('|') cats += mo.group(1).capitalize + ' context labels, '

# explicit cats for cat in recats.findall(text): cats += cat + ', ' Cats[tname] = cats.strip(', ') if cats: print "   cats %s" % safe(Cats[tname])

# NOTE(review): the next line is cut off in the middle of a string literal
# ("if '}}\n" ... "= 0: continue"); source text is missing here.  Nothing
# visible assigns Specs[tname] for templates, although the labels report
# further down reads Specs[tname] for every key of First -- presumably that
# assignment was in the missing part; confirm against the original.
# trouble Bad[tname] = '' # look for iwikis, to report: for iw in reiwiki.findall(text): Bad[tname] += ', bad iwiki: ' + iw + ' ' print "   bad iwiki ", safe(iw) if '}}\n= 0: continue

words += 1

# if entries > 5000: break

# parse text ...

# Each bracketed group found by recontext is split on commas.  Every piece
# is stripped of quotes, square brackets and spaces; a "w:...|text" piece is
# reduced to the part after the first '|'; pieces whose first character is
# outside 'A'..'z' are skipped.
for ctxtup in recontext.findall(text):

ctxstr = ctxtup[1]

for ctx in ctxstr.split(','):

ctx = ctx.strip("'[] ") if not ctx: continue

if ctx[0:2] == 'w:' and ctx.find('|') > 0: ctx = ctx.split('|')[1]

if ctx[0] > 'z' or ctx[0] < 'A': continue

if ctx in Contexts: Contexts[ctx] += 1 else: Contexts[ctx] = 1 print 'context: %s' % safe(ctx)

# Examples[ctx] collects entry titles and is only extended while shorter
# than 70 characters.
# NOTE(review): "Examples[ctx] =  + title + " has lost its string literals;
# the text placed around title is not recoverable from here.
if ctx in Examples: if len(Examples[ctx]) < 70: Examples[ctx] += ' ' + title + '' else: Examples[ctx] =  + title +  ctxs += 1

# end of for context string

# end of for entry

print "%d entries, %d words, %d ctxs" % (entries, words, ctxs)

# and write the AutoFormat control file

try: reportpage = wikipedia.Page(site, 'User:AutoFormat/Contexts') oldreport = reportpage.get except wikipedia.NoPage: print "No present report for %s" % reportpage.aslink oldreport = "(edit above this line)\n\n"

# NOTE(review): str.find('') always returns 0, so nothing of the existing
# page is kept; the marker string that presumably stood here is missing.
report = oldreport[:oldreport.find('')] + '\n'

report += '\nas of ' + xmldate.enXMLdate + ';\n' report += 'context templates, redirects, this section generated by bot, edit above horizontal rule\n' report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n' report += '|-\n| | Context string\n| |Template name\n'

# One table row per template: lower-cased name, then the template name.
for ctx in sorted(Templates): tname = Templates[ctx] report += "|-\n| " + ctx + " ||" + tname + '\n' report += "|}\n\nRedirects:\n\n"

report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n' report += '|-\n| | Context string\n| |Redirect\n'

print "WARNING: still writing redirects separately"

# Only redirects whose lower-cased target is a known template are listed;
# redirects that differ from their target only in case are skipped.
for red in sorted(Redirs): if Redirs[red].lower in Templates: # redirect to a template, so valid # if just a case variant, ignore it, we match anyway if red.lower == Redirs[red].lower: continue report += "|-\n| " + red.lower + " ||" + red + '\n'

report += "|}\n" wikipedia.setAction('writing report')

# file the report reportpage.put(report)

# Redirects to known templates become label entries of their own, with
# "redirect to <target>" as both first line and specification.
# add redirs to Templates, set "first line" to be the redirect for red in Redirs: if Redirs[red].lower in Templates: Templates[red.lower] = Templates[Redirs[red].lower] First[u'' + red] = u'redirect to ' + Redirs[red] Specs[red] = First[red] Cats[red] = '' Date[red] = xmldate.enXMLdate[:-5] Bad[red] = ''

# write context labels report

report = 'Context label templates:\n\n' report += '* categories are those specified explicitly, including with tcat=, not the default cat\n\n' report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n' report += '|-\n| |Template\n| width=10% |as of\n| |Specification\n| width=15% |Template category\n'

# Rows are sorted case-insensitively (key=unicode.lower); each row shows the
# template name, date label, specification plus problem notes, and categories.
for tname in sorted(First, key=unicode.lower): report += '|-\n| ' + tname +' || ' + Date[tname] + ' || ' \ + Specs[tname] + Bad[tname] + ' || ' + Cats[tname] + '\n' print "label %s: %s" % (safe(tname), safe(Specs[tname]))

report += "|}\n" wikipedia.setAction('writing report')

# write the report page

# NOTE(review): oldreport is fetched here but not used afterwards; the page
# is overwritten with the freshly built report.
try: reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Context labels') oldreport = reportpage.get except wikipedia.NoPage: print "No present report for %s" % reportpage.aslink

# file the report reportpage.put(report)

# write contexts report

# Minimum occurrence count for a context without a matching template to be
# listed in the report.
thresh = 3

report = '\nas of ' + xmldate.enXMLdate + '\n' report += 'Contexts given in definition lines without templates: ' report += '\n%d different "contexts" found, report is those occuring at least %d times' % (len(Contexts), thresh) report += ' or that do have matching templates\n' report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n' report += '|-\n| | Context\n| |Template\n| |Occurs\n| | Examples\n'

# A context is matched to a template by its lower-cased form with square
# brackets stripped; contexts with no template and fewer than thresh
# occurrences are left out.
for ctx in sorted(Contexts): tname = ctx.lower.strip('[]') if tname in Templates: tname = Templates[tname] else: tname = '' if not tname and Contexts[ctx] < thresh: continue report += "|-\n| " + ctx + " ||" + tname + ' ||' + str(Contexts[ctx]) + '||' + Examples[ctx] + '\n'

report += "|}\n" wikipedia.setAction('writing report')

# write the report page

# NOTE(review): as above, oldreport is fetched but not used afterwards.
try: reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Contexts') oldreport = reportpage.get except wikipedia.NoPage: print "No present report for %s" % reportpage.aslink

# file the report reportpage.put(report)

# Script entry point.  The call parentheses are restored here: as bare names
# "main" and "wikipedia.stopme" would be evaluated and discarded without
# running anything.
if __name__ == "__main__":
    try:
        main()
    finally:
        # runs whether main() returns normally or raises
        wikipedia.stopme()