#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:AutoFormat/code/langcodes

"""
This code looks for language code templates on the en.wikt: the candidate
templates are listed through the API of the live DB, and their content is
read from the XML dump.

Writes the AF (AutoFormat) control file, User:AutoFormat/Languages.

No command line arguments.

Writes reports.
"""

import pickle
import re
import socket
import sys

import wikipedia
import xmlreader
import xmldate

def safe(s):
    """Return a printable form of *s* taken from its text-protocol pickle.

    The value is pickled and the first byte plus the last five bytes of the
    pickle are sliced away, leaving only the serialized string payload. For
    a unicode string that payload is its raw-unicode-escape encoding, so the
    result can be printed on a console that cannot show the original text.

    Protocol 0 is requested explicitly because the fixed slice offsets only
    fit the text protocol; that is the default on Python 2 (so behaviour
    there is unchanged) but not on Python 3.

    s -- the string to make printable.
    """
    pickled = pickle.dumps(s, 0)
    # Drop the leading opcode byte and the five trailing bytes of framing.
    return pickled[1:len(pickled) - 5]

def skey(s):
    """Return the sort key that puts language codes in preferred order.

    The key is the code prefixed with its two-digit length, so shorter codes
    sort ahead of longer ones and codes of equal length sort alphabetically.

    s -- a language code string.
    """
    # Exception: 'zh' is keyed as a four-character code so that it sorts
    # after '03cmn'.
    if s == 'zh':
        return '04zh'

    return '%02d%s' % (len(s), s)

def main():
    """Rebuild the AutoFormat language table from the language code templates.

    1. Page through the members of Category:Language_templates with the live
       API and keep the titles that look like ``Template:<code>``.
    2. Scan the XML dump ``en-wikt.xml`` for those templates and reduce each
       template's text to a language name.
    3. Build a wikitable of codes per language and save it to
       User:AutoFormat/Languages.

    Progress and anomalies (skipped titles, templates missing from the dump)
    are printed to standard output. Takes no arguments and returns nothing.
    """
    socket.setdefaulttimeout(30)

    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('writing report')

    lang_codes = {}     # language name -> list of codes that map to it
    poscodes = set()    # candidate template titles not yet seen in the dump

    retitle = re.compile(r'title="([^"]*)"')
    recmatch = re.compile(r'Template:[a-z-]{2,10}$')
    reccont = re.compile(r'cmcontinue="([^"]*)"')
    found = 0

    # get category from live wikt (too much variation)
    ccont = '!'     # continuation value sent with the first request
    while ccont:
        print("getting cat from %s" % ccont)

        cats = site.getUrl("/w/api.php?action=query&list=categorymembers"
                           "&cmtitle=Category:Language_templates&cmlimit=1000"
                           "&cmcontinue=" + ccont + "&format=xml")

        for title in retitle.findall(cats):
            if not recmatch.match(title):
                print("skipped %s" % repr(title))
                continue
            poscodes.add(title)
            found += 1

        # Keep paging for as long as the reply carries a continuation value.
        mo = reccont.search(cats)
        ccont = mo.group(1) if mo else ''

    print("possible templates found %d" % found)

    # now get content from XML scan
    relink = re.compile(r"\{\{\{l\|[\[\]']*\}\}\}")
    reincl = re.compile(r'<noinclude.*$', re.S)
    # NOTE(review): the published listing shows this pattern as
    # r'^.* (.*) .*$', with bare spaces where the tags are. That looks like
    # the wiki renderer swallowing the <includeonly> tags, so they are
    # restored here -- confirm against the original script.
    reonly = re.compile(r'^.*<includeonly>(.*)</includeonly>.*$', re.S)

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    for entry in dump.parse():
        title = entry.title
        if title not in poscodes:
            continue
        poscodes.remove(title)

        # Drop the nine-character 'Template:' prefix to get the bare code.
        code = title[9:]

        # Whatever is left after removing the link parameters and the
        # noinclude tail is taken as the language name.
        text = entry.text
        text = relink.sub('', text)
        text = reincl.sub('', text)
        text = reonly.sub(r'\1', text)
        lang = text

        # special case(s)
        if code == 'see':
            lang = 'Seneca'

        # bugs, fixed in next XML, 4.5.10
        if code == 'oun':
            lang = '!O!ung'
        if code == 'bdf':
            lang = 'Biage'

        # got one!
        print("%s %s" % (safe(code), safe(lang)))
        lang_codes.setdefault(lang, []).append(code)

    # not found?
    if poscodes:
        print("not found: %s" % repr(poscodes))

    # report for AF control file:
    rows = ['\n{| class="prettytable"\n',
            '|-\n| | codes\n| |Language\n']
    for lang in sorted(lang_codes):
        codes = u','.join(sorted(lang_codes[lang], key=skey))
        rows.append("|-\n| " + codes + '||' + lang + '\n')
    rows.append("|}\n")
    report = ''.join(rows)

    wikipedia.setAction('writing AutoFormat language table')

    # write the AutoFormat table page
    reportpage = wikipedia.Page(site, 'User:AutoFormat/Languages')
    try:
        # The fetched text itself is not used below; a missing page is only
        # reported, and the table is written either way.
        reportpage.get(sysop=True)
    except wikipedia.NoPage:
        print("No present report for %s" % reportpage.aslink())

    # file the report
    reportpage.put(report)

if __name__ == "__main__":
    # Run the job, and always call wikipedia.stopme() afterwards, even when
    # main() raises.
    try:
        main()
    finally:
        wikipedia.stopme()