# User:Robert Ullmann/Prologue/code


#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Robert Ullmann/Prologue/code

""" Generates prologue (section 0) examples

"""

# NOTE: the original source had these fused onto one line by the wiki
# rendering; one import per statement restores valid Python.
import wikipedia
import sys
import re
import socket
import urllib
from iwiktmwapi import getwikitext, getedit, putedit, readapi

def srep(s):
    """Coerce *s* to unicode and return its repr with the quoting
    prefix and the trailing quote stripped (safe for console output)."""
    text = u'' + s
    return repr(text)[2:-1]

def lkey(l):
    """Language sort key: English sorts first ('1...'), other languages
    in the middle ('2...'), Translingual last ('3...').

    *l* may be a wikilink like '[[French]]'; surrounding brackets are
    stripped first. Returns '' for an empty name.
    """
    n = l.strip('[]')
    if not n: return n

    if n == 'Translingual': return '3' + n   # at end for now

    if n == 'English': return '1' + n

    # handle names like !Kung and 'Auhelawa: move (one) non-alpha to the end of key
    # (bug fix: the original wrote "n[0:1].isalpha" without calling it; the
    # bound method is always truthy, so the move never happened)
    if not n[0:1].isalpha(): n = n[1:] + n[0:1]

    return '2' + n

# wikilink target, h2 (language) header, any header level
relink = re.compile(r'\[\[(.+?)\]\]')
reh2 = re.compile(r'==([^=]+)==')
rehead = re.compile(r'=+([^=]+)=+')
# match language "tags" on defn lines, e.g. "(Croatian) rest of def"
retag = re.compile(r"[(']+([^)]+)[)']+ (.*)")
# capture the body of the expandtemplates API reply
# NOTE(review): the wiki rendering of this page appears to have stripped
# XML tags out of this pattern (the API wraps its reply in
# <expandtemplates ...>...</expandtemplates>); pattern kept as found —
# confirm against the original source.
rexpand = re.compile(r'(.*) ', re.S)


# find a context span in expanded text, at start of definition:
# [ this relies on context working exactly one way ... might be improved]
# NOTE(review): the wiki rendering of this page appears to have stripped
# HTML/span tags out of the adjacent string literals below (they
# concatenate into one pattern); the runs of spaces are probably where
# markup used to be — confirm against the original source.
respan = re.compile(r' '               '\(   ' '(.*?)'              '   \)  '               '(.*)')


# example:
#  (   nautical   )  A strong tackle used to hoist an anchor to the cathead of a ship.

# collapse comma-plus-spacing inside a matched context span
# (used as recomma.sub(',', ...) before decat)
# NOTE(review): rendering may have stripped markup from this pattern too —
# confirm against the original source.
recomma = re.compile(r',  ')

def expand(text, title = ''):

site = wikipedia.getSite("en", "wiktionary")

# call expand templates:

# parameters (and do a post op) par = urllib.urlencode([ ('text', text.encode("UTF-8")),                            ('title', title.encode("UTF-8")) ])

rawt = readapi(site, "action=expandtemplates&format=xml", mode = "POST", par = par)

mo = rexpand.search(rawt) if not mo: print "   can't expand templates?" print repr(rawt) return ''

return wikipedia.unescape(mo.group(1))

# a category link, e.g. [[Category:English nouns]]
recat = re.compile(r'\[\[\s*[Cc]ategory\s*:.*?\]\]')
# two adjacent cat-tags with only whitespace between them
# NOTE(review): reconstructed — the wiki rendering stripped the HTML
# comment markers out of this pattern and out of the literals in decat
# below; '<!--cat-->' matches the comments' description ("tag is an HTML
# comment") but confirm against the original source.
recattag = re.compile(r'<!--cat-->\s*<!--cat-->', re.S)

def decat(t):
    """remove categories from text. not simple as the general parser is complex

    a cat at the end of the line should be removed without removing the
    line break, but line breaks and even blank lines in between cats
    should be removed and blank lines after (or before?) cats should be
    removed if multiple, but we don't handle that case. """

    # replace all cats with uniform tag
    # tag is an HTML comment so if it did by chance occur in the wikitext
    # it would be gone anyway
    tot = recat.sub('<!--cat-->', t)

    # now replace any spans around whitespace with singlets
    # (loop because sub() is non-overlapping: three-in-a-row needs two passes)
    k = 1
    while k:
        tot, k = recattag.subn('<!--cat-->', tot)

    # and remove tags
    tot = tot.replace('<!--cat-->', '')

    return tot

def main:

socket.setdefaulttimeout(70)

# read list of the pages we should set up as examples

site = wikipedia.getSite("en", "wiktionary") site.forceLogin

page = wikipedia.Page(site, "User:Robert Ullmann/Prologue/feedme") feed = getwikitext(page)

# test: # feed = 'bog cat prolog mama' # feed =  + sys.argv[1] + 

for title in relink.findall(feed):

print "%s:" % srep(title)

try: page = wikipedia.Page(site, title) text = getwikitext(page) except Exception, e:           print "    exception getting page", repr(e) text = '' continue

# now find language sections, POS, defs # lang is language, pos is last header (which may very well not be a POS) lang = '' pos = ''

# defs is dict of lang to list of (POS, def) tuples defs = { }

for line in text.splitlines:

mo = reh2.match(line) if mo: lang = mo.group(1) pos = '' continue

if not lang: continue

mo = rehead.match(line) if mo: pos = mo.group(1) continue

if line[:2] != '# ': continue

# skip {defn} and {defn-form} if '{{defn' in line: continue

# def line, add into list if lang not in defs: defs[lang] = [ ] defs[lang].append( ( pos.lower, line[2:]) )

# (that takes care of extracting the basic info)

# print repr(defs)

# now we have to reprocess the "Serbo-Croatian" drek: # following is an approximation, doing it "correctly" is not possible # as the forced merger discards information and the format is not tractable

if "Serbo-Croatian" in defs:

if "Croatian" in defs or "Serbian" in defs or "Bosnian" in defs \ or "Montenegrin" in defs: pass # use standard language entries

else: dlist = defs["Serbo-Croatian"] for lang in [ "Serbian", "Croatian", "Bosnian", "Montegrin" ]: defs[lang] = [ ]

for pos, defn in dlist: # look for tags mo = retag.match(defn) if mo and ("Croatian" in mo.group(1) or "Serbian" in mo.group(1) \                        or "Bosnian" in mo.group(1) or "Montenegrin" in mo.group(1)): # add remainder of def to each language tagged: for lang in [ "Serbian", "Croatian", "Bosnian", "Montegrin" ]: if lang in mo.group(1): defs[lang].append( (pos, mo.group(2)) ) else: # use default on correct script if ord(title[0]) >= 0x0400 and ord(title[0]) < 0x0530: defs["Serbian"].append( (pos, defn) ) else: defs["Croatian"].append( (pos, defn) )

# now drop blanks for lang in [ "Serbian", "Croatian", "Bosnian", "Montegrin" ]: if not defs[lang]: del defs[lang]

del defs["Serbo-Croatian"] # done with crap

# consolidate defs ... # keeping order is the trick # use four lists, generate in parallel

langs = [ ] poss = [ ] defns = [ ] ctxs = [ ]

for lang in sorted(defs, key=lkey): dlist = defs[lang] for pos, defn in dlist:

if pos == "han character": pos = "Han character" # fix, should be cap

# do a number of things to clean up defn

# [remove defdate, ref tags, etc, etc)]

ctx = '' if defn.startswith('{{'): # try finding a context ... exp = expand(defn, title) print "(expand def)" mo = respan.match(exp) if mo: ctx = decat(recomma.sub(',', mo.group(1))).replace('&#32;', ' ') defn = mo.group(2).lstrip print "matched context" # [at some point might expand the whole entry first? or not bother for examples]

# seen already? [need some fuzziness in match!] i = 0 while i < len(defns): if pos == poss[i] and defn == defns[i] and ctx == ctxs[i]: break i += 1 if i >= len(defns): langs.append(lang) poss.append(pos) defns.append(defn) ctxs.append(ctx) else: langs[i] += ', ' + lang # (re)generate prologue: # this is easier because we are working from NS:0 entries which don't have # the prologue in them, and we don't need to reprocess defs as much for examples # harder as we need to expand and kill cats

newtext = '{{tocright}}\n'

# first copy the existing stuff (also template, whatever) for line in text.splitlines: if line[:2] == '==': break newtext += line + '\n'

# generate def lines # also need to handle several and many languages, and so on       # section link languages?

for i in range(0, len(defns)): ln = langs[i] if ln == 'English': ln = '' else: ln += ', ' ctx = ctxs[i] if ctx: ctx = ', ' + ctx newtext += "# (" + ln + poss[i] + ctx + ") " + defns[i] + '\n' print "   # " + '(' + ln + poss[i] + ctx + ') ' + repr(defns[i])

# append the rest of the entry: (all after first header)

text = '\n' + text newtext += text[text.find('\n=='):]

# almost there ...

newtext = expand(newtext, title) # kill cats

newtext = decat(newtext)

# and write new page

try: xpage = wikipedia.Page(site, "User:Robert Ullmann/Prologue/examples/" + title) # otext = getedit(xpage) otext = xpage.get # putedit(xpage, newtext, comment = "write example") xpage.put(newtext, comment = "write example") except wikipedia.NoPage: xpage.put(newtext, comment = "write example") # write initial version w/framework pass except Exception, e:           print "    exception getting/writing example page", repr(e) pass

# finished with page loop

# done

if __name__ == "__main__":
    # bug fix: the original referenced main and wikipedia.stopme without
    # calling them, so the bot never ran and never released its throttle
    try:
        main()
    finally:
        wikipedia.stopme()