User:Robert Ullmann/code/level3


#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/code/level3

""" This code looks for valis and invalid L3 headers in the en.witk

No command line arguments.

writes reports """

import pickle
import re
import sys

import wikipedia
import xmlreader
import xmldate

def safe(s):
    """Return the protocol-0 pickle of *s* with its framing sliced off.

    The value is pickled with the text protocol (0); the first byte and
    the last five bytes of the result are then dropped, leaving only the
    escaped payload in between.

    The protocol is pinned to 0 explicitly: the fixed offsets below only
    make sense for the text protocol (which is what ``pickle.dumps`` used
    by default on Python 2), so a different default must not change the
    result.

    NOTE(review): the 1 / 5 offsets presumably correspond to the leading
    type opcode and the trailing memo + STOP bytes of a pickled string;
    for other object types the slice may not be meaningful -- confirm
    against callers (none are visible in this file).
    """
    pickled = pickle.dumps(s, 0)
    length = len(pickled)
    return pickled[1:length - 5]

def main():
    """Scan the en.wikt XML dump for level-3 headers and file two reports.

    Every line of every main-namespace entry in ``en-wikt.xml`` that starts
    with exactly three ``=`` is counted as an L3 header.  Headers found in
    the table of known headers (``Notes``) are reported, with their counts
    and notes, on 'User:Robert Ullmann/L3/valid'; all other headers are
    reported, with counts and a few example entries, on
    'User:Robert Ullmann/L3/invalid'.

    Takes no arguments and returns nothing; progress is printed to stdout.
    """
    # make sure we are logged in
    site = wikipedia.getSite()
    site.forceLogin()
    wikipedia.setAction('writing report')

    # get XML dump
    dump = xmlreader.XmlDump("en-wikt.xml")

    entries = 0
    words = 0
    L3headers = 0

    # valid headers have notes
    Notes = {}
    # all headers have occurrence counts
    Occurs = {}
    # invalid headers have examples
    Examples = {}

    # initialize some valid headers

    # standard POS, etc:
    for header in ('Noun', 'Verb', 'Adjective', 'Adverb', 'Pronoun',
                   'Proper noun', 'Preposition', 'Conjunction', 'Interjection',
                   'Article', 'Prefix', 'Suffix', 'Affix', 'Infix', 'Counter'):
        Notes[header] = 'standard POS header'

    for header in ('Initialism', 'Abbreviation', 'Letter', 'Symbol', 'Acronym',
                   'Proverb', 'Contraction', 'Idiom', 'Phrase', 'Syllable'):
        Notes[header] = 'standard non-POS header'

    for header in ('Number', 'Numeral', 'Cardinal number', 'Cardinal numeral',
                   'Ordinal number', 'Ordinal numeral'):
        Notes[header] = "see note supra"

    for header in ('Etymology', 'Pronunciation', 'Trivia',
                   'Alternative spellings', 'Alternative forms',
                   'Anagrams', 'Usage notes'):
        Notes[header] = "standard L3 header"

    for num in range(0, 25):
        Notes['Etymology ' + str(num)] = 'standard L3 header'

    for header in ('Related terms', 'Derived terms', 'Descendants', 'See also',
                   'References', 'External links', 'Quotations'):
        Notes[header] = "standard L4/L3 header"

    for header in ('Declension', 'Conjugation', 'Inflection', 'Antonyms',
                   'Synonyms', 'Translations'):
        Notes[header] = "header should be at L4"

    for header in ('Han character', 'Kanji', 'Hanzi', 'Hanja'):
        Notes[header] = "valid in single Han character entries only, not checked"

    Notes['Pinyin'] = "valid only for single syllable entries, not checked"

    # NOTE(review): this loop originally listed four header names (most
    # likely '{{...}}' template headers, given the note text).  The names
    # were lost when this file was extracted from the wiki page and cannot
    # be recovered from here -- restore them from the original script.
    for header in ():
        Notes[header] = "L3 POS header templates"

    for entry in dump.parse():
        text = entry.text
        title = entry.title

        entries += 1
        if entries % 10000 == 0:
            print("%d entries, %d words, %d L3 headers"
                  % (entries, words, L3headers))

        # skip non main-name-space
        if title.find(':') >= 0 or title.find('/') >= 0:
            continue
        else:
            words += 1

        # if entries > 5000: break

        if title.startswith('Glossary of'):
            continue

        # parse text ...
        for line in text.splitlines():

            # comments on the (presumed) end of lines
            if line.find('<!--') >= 0:
                line = line.split('<!--')[0]

            # only lines opening with exactly three '=' are L3 headers
            if line[0:3] != '===':
                continue
            if line[3:4] == '=':
                continue

            L3headers += 1

            # drop the '===' on both ends, then surrounding blanks
            header = line.strip()[3:-3].strip()

            # template mess: reduce '{{name|args}}' to '{{name}}'
            if header[0:2] == '{{':
                header = re.sub(r'(.*?)\|.*?\}(.*)', r'\1}\2', header)

            if header not in Occurs:
                Occurs[header] = 0
            Occurs[header] += 1

            if header in Notes:
                continue

            # NOTE(review): the '[[' / ']]' link brackets below are
            # reconstructed; wiki rendering had stripped them from the
            # extracted text -- confirm against the original script.
            if header not in Examples:
                Examples[header] = '[[' + title + ']]'
                continue

            # keep the examples cell short
            if len(Examples[header]) < 210:
                Examples[header] += ' [[' + title + ']]'

        # end of for line

    # end of for entry

    print("%d entries, %d words, %d L3 headers" % (entries, words, L3headers))

    # report valid headers

    # NOTE(review): xmldate.enXMLdate is used as a plain attribute here, as
    # in the extracted text; empty call parentheses were stripped elsewhere
    # in this file, so confirm it is not actually a function.
    report = '\nas of ' + xmldate.enXMLdate + '\n'
    report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
    report += '|-\n| | Header\n| |Occurs\n| |Notes\n'

    # NOTE(review): the doubled blanks around the header cell suggest that
    # some markup was stripped from these literals during extraction; they
    # are reproduced here exactly as found.
    # sorted() returns a new list, so deleting from Occurs while looping is
    # safe; what remains in Occurs afterwards is exactly the invalid headers.
    for header in sorted(Occurs):
        if header not in Notes:
            continue
        report += "|-\n|  " + header + "  ||" + str(Occurs[header]) + '||' + Notes[header] + '\n'
        del Occurs[header]

    report += "|}\n"
    wikipedia.setAction('writing report')

    # write the report page

    try:
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/L3/valid')
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print("No present report for %s" % reportpage.aslink())

    # file the report
    reportpage.put(report)

    # report invalid headers

    i = k = 0
    report = '\nas of ' + xmldate.enXMLdate + '\n'
    report += '\n{| border="1" cellpadding="4" cellspacing="0" style="border-collapse: collapse;"\n'
    report += '|-\n| | Header\n| |Occurs\n| |Examples\n'

    for header in sorted(Occurs):
        report += "|-\n|  " + header + "  ||" + str(Occurs[header]) + '||' + Examples[header] + '\n'
        i += 1
        k += Occurs[header]

    report += "|}\n\n"
    report += "* Number of distinct invalid headers: %d\n" % i
    report += "* Total number of invalid headers: %d\n" % k
    wikipedia.setAction('writing report')

    # write the report page

    try:
        reportpage = wikipedia.Page(site, 'User:Robert Ullmann/L3/invalid')
        oldreport = reportpage.get()
    except wikipedia.NoPage:
        print("No present report for %s" % reportpage.aslink())

    # file the report
    reportpage.put(report)

# Script entry point: run the report, and always call wikipedia.stopme()
# afterwards, even if main() raises.
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()