#!/usr/bin/python
# -*- coding: utf-8 -*-
# wikipath en wiktionary User:Robert Ullmann/code/tlang
""" This code looks for languages in translations sections.

Some comments and such refer to headers; this code is derived from the
header-analysis code.

No command line arguments.

Writes report pages. """

import pickle
import re
import sys

import wikipedia
import xmldate
import xmlreader

def safe(s):

ss = pickle.dumps(s) l = len(ss) return ss[1:l-5]

def main:

# make sure we are logged in   site = wikipedia.getSite site.forceLogin wikipedia.setAction('writing report')

# get XML dump dump = xmlreader.XmlDump("en-wikt.xml")

entries = 0 words = 0 Trans = 0

# several different cases ... retrans1 = re.compile(r'\*\s*\[\[w:.+\|([^\]]+?)\]\]\s*:(.*)') retrans2 = re.compile(r'\*\s*\[\[([^\]]+?)\]\]\s*:(.*)') retrans3 = re.compile(r'\*\s*(.+?):(.*)') reunlink = re.compile(r'\[\[(.*?)\]\]')

# valid headers have templates with codes Codes2 = {} Codes3 = {} CodesW = {} # all headers have occurance counts Occurs = {} # invalid headers have examples, but we collect for all Examples = {}

# things that look like codes, but aren't; including ISO 639-2 B codes:

Stops = [ 'alb', 'arm', 'baq', 'bur', 'chi', 'cze', 'dut', 'fre', 'geo', 'ger', 'gre', 'ice', 'mac', 'may', 'mao', 'per', 'rum', 'scc', 'scr', 'slo', 'tib', 'wel', 'zh-tc', 'zh-sc', 'zh-yue', 'gko', 'rfc', 'rfd', 'rfv', 'top', 'mid', 'pos-n', 'pie' ]

# and fix DAVilla silliness: Codes2['Chinese'] = 'zh'

recmatch = re.compile(r'[a-z-]+$') regood = re.compile(r'(' + re.escape('') + \ r'|)([^\{\}<]+)(' + re.escape('') + r'|)<noinclude')

# try a particular language report, Romanian to start replang = "" Alltrans = set

for entry in dump.parse: text = entry.text title = entry.title

entries += 1 if entries % 10000 == 0: print "%d entries, %d words, %d translations" % (entries, words, Trans)

# look for code templates

if title.startswith('Template:'): code = title[9:] if code in Stops: continue if not recmatch.match(code): continue if 'Language templates' not in text: continue

mo = regood.match(text) if not mo: print "bad code template %s: %s" % (safe(code), safe(text)) continue lang = mo.group(2)

print "code %s: %s" % (safe(code), safe(lang)) if len(code) == 2: Codes2[lang] = code elif len(code) == 3: Codes3[lang] = code else: CodesW[lang] = code continue

# now skip non main-name-space

if title.find(':') >= 0: continue else: words += 1

# if entries > 5000: break

# parse text ...

intrans = False

for line in text.splitlines:

# comments on the (presumed) end of lines if line.find('= 0: line = line.split('= 0: intrans = True continue

if not intrans: continue

mo = retrans1.match(line) if not mo: mo = retrans2.match(line) if not mo: mo = retrans3.match(line) if not mo: continue

# do some cleanup

lang = mo.group(1) lang = reunlink.sub(r'\1', lang) lang = lang.strip(" []'") if not lang: continue if lang.startswith( ('{', '*', ':', '?', '#') ): continue

Trans += 1

if lang not in Occurs: Occurs[lang] = 0 Occurs[lang] += 1

# accumulate report lines for a language if lang == replang: Alltrans.add('* ' + title + ': ' + mo.group(2).strip) print safe(lang + ': ' + title)

# always collect examples if lang not in Examples: Examples[lang] =  + title +  continue

if len(Examples[lang]) < 210: Examples[lang] += ' ' + title + ''

# end of for line

# end of for entry

print "%d entries, %d words, %d translations" % (entries, words, Trans)

nlangs = 0

# report languages

report = 'Languages used in translations sections as of ' + xmldate.enXMLdate + '\n'

#Codes['Ancient Greek'] = 'grc' #report += '(Ancient Greek set to grc for this run)\n'

report += """ See User:Robert Ullmann/Trans languages/uncoded for a list of entries without codes (subset of this list).

{| class="prettytable sortable" """   repinv = 'List of languages without code templates as of ' + xmldate.enXMLdate + """, to be sorted.
 * -\n| |Codes\n| |Language\n| |Occurs\n| |Examples

{| class="prettytable sortable" """
 * -\n| |Codes\n| |Language\n| |Occurs\n| |Examples||Notes

for lang in sorted(Occurs): # if (header not in Codes2) and (header not in Codes3) and (header not in CodesW): continue codes = '' if lang in Codes2: codes += ', ' + Codes2[lang] if lang in Codes3: codes += ', ' + Codes3[lang] if lang in CodesW: codes += ', ' + CodesW[lang] codes = codes.strip(', ') if codes: report += "|-\n| %s || %s || %d ||\n" % (codes, lang, Occurs[lang]) else: report += "|-\n| || %s || %d || %s\n" % (lang, Occurs[lang], Examples[lang]) repinv += "|-\n| || %s || %d || %s ||\n" % (lang, Occurs[lang], Examples[lang]) # del Occurs[header] nlangs += 1 report += "|}\n" repinv += "|}\n" wikipedia.setAction('writing report')

# write the report pages

try: reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Trans languages') oldreport = reportpage.get except wikipedia.NoPage: print "No present report for %s" % reportpage.aslink

# file the report reportpage.put(report)

try: reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Trans languages/uncoded') oldreport = reportpage.get except wikipedia.NoPage: print "No present report for %s" % reportpage.aslink

# file the report reportpage.put(repinv)

print "Languages: %d" % nlangs

# report for specific language

if not replang: return # we are done

report = replang + ' entries in translations sections as of ' + xmldate.enXMLdate + '\n\n\n'

report += u'\n'.join(sorted(Alltrans)) + '\n'

wikipedia.setAction('writing report')

# write the report page

try: reportpage = wikipedia.Page(site, 'User:Robert Ullmann/Trans languages/' + replang) oldreport = reportpage.get except wikipedia.NoPage: print "No present report for %s" % reportpage.aslink

# file the report reportpage.put(report)

if __name__ == "__main__":
    # always stop the bot framework cleanly, even if main() raises
    try:
        main()
    finally:
        wikipedia.stopme()