User:Robert Ullmann/code/level2


#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/code/level2

""" This code looks for valid and invalid L2 headers (languages) in the en.wikt

No command line arguments.

writes reports """
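
# For reference: in en.wikt entries each language section begins with a level-2
# wikitext header such as "==English=="; deeper headers like "===Noun===" are L3
# or lower and are ignored here.  A header counts as "valid" when it matches the
# language name produced by a language code template (e.g. {{en}}); anything
# else -- say a hypothetical misspelling "==Engish==" -- lands in the invalid
# report.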

import wikipedia
import xmlreader
import sys
import re
import pickle
import xmldate

def safe(s):
    # round-trip through pickle to get a plain byte string that can be printed
    # without raising Unicode errors on odd titles
    ss = pickle.dumps(s)
    l = len(ss)
    return ss[1:l-5]

def main():

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('writing report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    entries = 0
    words = 0
    L2headers = 0

    # valid headers have templates with codes
    Codes2 = { }
    Codes3 = { }
    CodesW = { }
    # all headers have occurrence counts
    Occurs = { }
    # invalid headers have examples, but we collect for all
    Examples = { }

    # things that look like codes, but aren't; including ISO 639-2 B codes (of which one is missing?):
    Stops = [ 'alb', 'arm', 'baq', 'bur', 'chi', 'cze', 'dut', 'fre', 'geo', 'ger', 'gre', 'ice',
              'mac', 'may', 'mao', 'per', 'rum', 'scc', 'scr', 'slo', 'tib', 'wel',
              'zh-tc', 'zh-sc', 'gko', 'rfc', 'rfd', 'rfv', 'top', 'mid', 'pos-n', 'pie' ]
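    # (e.g. 'fre' is the ISO 639-2/B code for French, which is already covered by
    #  the 639-1 code 'fr', so a template named 'fre' is not a language header template)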

    # and fix DAVilla silliness:
    Codes2['Chinese'] = 'zh'

    recmatch = re.compile(r'[a-z-]+$')
    regood = re.compile(r'(' + re.escape('<includeonly>') + \
              r'|)([^\{\}<]+)(' + re.escape('</includeonly>') + r'|)')
    redirect = re.compile(r'#redirect\s*\[\[\s*template:(.*?)\s*\]\]', re.I)
    Reds = { }

    # read the dump: language code templates give the valid headers,
    # main-namespace entries give the headers actually in use
    for entry in dump.parse():
        text = entry.text
        title = entry.title
        entries += 1

        # language code templates
        if title.startswith('Template:'):
            code = title[9:]
            if not recmatch.match(code): continue
            if code in Stops: continue
            if len(code) > 10: continue

            if text[:1] == '#':
                # record redirects, more breakage July 2010:
                mo = redirect.match(text)
                if mo: Reds[code] = mo.group(1)
                continue

            # gratuitously broken July 2010, can no longer positively ID language templates
            # if 'Language templates' not in text: continue

            mo = regood.match(text)
            if not mo:
                # can't report bad templates, as we can no longer tell which are lang temps
                # print "bad code template %s: %s" % (safe(code), safe(text))
                continue
            lang = mo.group(2)

            print "code %s: %s" % (safe(code), safe(lang))
            if len(code) == 2: Codes2[lang] = code
            elif len(code) == 3: Codes3[lang] = code
            else: CodesW[lang] = code
            continue

# now skip non main-name-space

if title.find(':') >= 0: continue else: words += 1

# if entries > 5000: break

        # parse text ...
        for line in text.splitlines():

            # comments on the (presumed) end of lines
            if line.find('<!--') >= 0: line = line.split('<!--')[0]

            if line[0:2] != '==': continue
            if line[2:3] == '=': continue

            L2headers += 1

            header = line.strip()[2:-2].strip(' []')

            # template mess, might as well keep (from L3 code)
            if header[0:2] == '{{':
                header = re.sub(r'(.*?)\|.*?\}(.*)', r'\1|...}\2', header)

            if header not in Occurs: Occurs[header] = 0
            Occurs[header] += 1

            # always collect examples
            if header not in Examples:
                Examples[header] = '[[' + title + ']]'
                continue

            if len(Examples[header]) < 210 or header == 'Slovenian':
                Examples[header] += ' [[' + title + ']]'

        # end of for line

    # end of for entry

print "%d entries, %d words, %d L2 headers" % (entries, words, L2headers)

    # fix up redirects, brokenness from about July 2010:
    # (if some Template:xxx now redirects to a language's 2-letter template,
    #  credit xxx to that language as its 3-letter or wiki code)
    for header in Codes2:
        code = Codes2[header]
        for red in Reds:
            if Reds[red] == code:
                print "found redirect from %s to %s" % (red, code)
                if len(red) == 3: Codes3[header] = red
                else: CodesW[header] = red

    # does this case occur?:
    for header in Codes3:
        code = Codes3[header]
        for red in Reds:
            if Reds[red] == code:
                print "found redirect from %s to %s" % (red, code)
                CodesW[header] = red

    # yes, that was sloppy. But what can I do?

    nlangs = 0

    # report valid headers

    report = '\nas of ' + xmldate.enXMLdate + '\n'

report += """ May include bogus codes/languages as ability to distinguish language templates by wikitext was broken June/July 2010: category is now buried in doc page elsewhere in the dump. """

    # Codes['Ancient Greek'] = 'grc'
    # report += '(Ancient Greek set to grc for this run)\n'

    # fixes 8.7.10:
    Codes3['Seneca'] = 'see'
    Codes3['Old English'] = 'ang'
    Codes3['!Kung'] = 'knw'
    if 'Simplified Chinese' in CodesW: del CodesW['Simplified Chinese']
    if 'Traditional Chinese' in CodesW: del CodesW['Traditional Chinese']

    report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
    report += '|-\n| | ISO 639-1\n| | ISO 639-3\n| | Wiki code\n| |Occurs\n| |Language\n| |Category\n'

    for header in sorted(Occurs):
        if (header not in Codes2) and (header not in Codes3) and (header not in CodesW): continue
        report += "|-\n| "
        if header in Codes2: report += "" + Codes2[header] + " ||"
        else: report += " ||"
        if header in Codes3: report += "" + Codes3[header] + " ||"
        else: report += " ||"
        if header in CodesW: report += "" + CodesW[header] + " ||"
        else: report += " ||"
        report += str(Occurs[header]) + '||' + header + ' || Category:' + header + ' language\n'
        # del Occurs[header]
        nlangs += 1

    report += "|}\n"
    wikipedia.setAction('writing report')

    # write the report page

    try:
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/L2/valid')
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print "No present report for %s" % reportpage.aslink()

    # file the report
    reportpage.put(report)

print "valid languages: %d" % nlangs

    # now remove valid, to report all the rest (keys allows us to delete)
    for header in Occurs.keys():
        if (header not in Codes2) and (header not in Codes3) and (header not in CodesW): continue
        del Occurs[header]

    # report invalid headers

    report = '\nas of ' + xmldate.enXMLdate + '\n'
    report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
    report += '|-\n| | Language\n| |Occurs\n| |Examples\n'

    for header in sorted(Occurs):
        report += "|-\n|  " + header + "  ||" + str(Occurs[header]) + '||' + Examples[header] + '\n'

report += "|}\n" wikipedia.setAction('writing report')

    # write the report page

    try:
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/L2/invalid')
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print "No present report for %s" % reportpage.aslink()

    # file the report
    reportpage.put(report)

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
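
# Typical run, as a sketch (assumptions: the pywikipedia framework is configured
# for en.wiktionary, an en.wikt XML dump has been saved as en-wikt.xml in the
# working directory, and the script itself is saved as level2.py):
#
#   python level2.py
#
# The results go to [[User:Robert Ullmann/L2/valid]] and
# [[User:Robert Ullmann/L2/invalid]].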