#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Interwicket/code/mwapi

""" functions to use MW API to replace wikipedia.py browser-client functions

getwikitext(page)              -- get the text of the page, like framework get
getedit(page)                  -- get the text of the page and an edit token;
                                  page can then be saved with wikipedia.Page.put
putedit(page, text, comment)   -- save the page, will never create or recreate,
                                  edit only! only if getedit was used, not framework get

readapi(site, request)         -- read from or post to api, with compression and maxlag built in

optional parameter plock to use as lock around anything printed

this version uses persistent HTTP connections

"""

# Imports (Python 2 era: StringIO, httplib, unicode are py2-only).
# NOTE(review): the wiki extraction fused these statements onto single lines
# and stripped the '()' from plockd's Lock call; plockd is used as a context
# manager ("with plock:") everywhere below, so it must be a Lock instance.
import wikipedia
import re
import time
from threading import currentThread, Lock

plockd = Lock()  # default plock: print-lock shared by callers that pass none

import urllib, httplib
from StringIO import StringIO
from gzip import GzipFile


# connection pool
# implemented as a queue, so we can share between threads
# no particular limit, effectively limited by number of threads in program

import Queue

# Shared connection pool.  NOTE(review): the extraction fused the import and
# assignment and stripped '()'; pool must be a Queue *instance* -- the code
# below calls pool.get_nowait() and pool.put(conn) on it.
pool = Queue.Queue()


# since we aren't using the framework 'throttle', do something better
# this is a "tick-tock" timer, shared on all threads
# clicked down each success, up on each network failure of any type

# Shared "tick-tock" inter-request delay, in seconds, adjusted by readapi.
# NOTE(review): restored from a single fused, paren-stripped line; the
# original presumably read "Lock()" and "def getticktock():".
ticktocklock = Lock()
ticktock = 1.0

def getticktock():
    """Return the current shared tick-tock delay (seconds)."""
    global ticktock
    return ticktock

# NOTE(review): everything from here through "return text" is the remains of
# the readapi(site, request, ...) definition described in the module
# docstring.  The wiki extraction that produced this file destroyed it: the
# def line and the `relagged` regex (which contained a '<') were fused and
# truncated on the next line, no-argument call parens were stripped
# (pool.get_nowait(), conn.getresponse(), resp.read(), conn.close()), and
# indented bodies were collapsed onto single lines.  Comments below are
# hedged intent notes only; recover the original from the wiki page history
# before attempting to edit this function.
relagged = re.compile(r'= 10.0: with plock: print "(mwapi readapi: tick tock is %.1f)" % ticktock time.sleep(ticktock) ticktock -= 1.0  # undo first increment in loop

# retry loop: ticktock stepped up per attempt, deliberately without the lock
while not done: ticktock += 1.0  # done w/o lock, race condition is rare, not a serious problem, ignored!

# presumably: try to reuse a pooled connection, else fall through to open one
# get a connection from pool try: conn = pool.get_nowait except Queue.Empty: conn = None

try: if not conn: # with plock: print "(opening connection to %s)" % site.hostname conn = httplib.HTTPConnection(site.hostname)

# request headers: site cookies, gzip accepted, Interwicket user-agent;
# POSTs additionally send a form-urlencoded content type
# either get or post headers = { 'Cookie': site.cookies(sysop = sysop) or '', 'Accept-Encoding': 'gzip', 'User-Agent': 'Interwicket/1.0' } if mode == "POST": headers['Content-Type'] = 'application/x-www-form-urlencoded' conn.request(mode, url + maxlag, par, headers) resp = conn.getresponse

# read the (possibly gzipped) body, decode as UTF-8 ignoring errors; the
# except clause works around flaky-network failure modes (see comment text)
text = resp.read if 'gzip' in resp.getheader('Content-Encoding', ''): text = GzipFile(fileobj=StringIO(text)).read text = unicode(text, 'UTF-8', errors = 'ignore') done = True except Exception, e:           # work around net problem 5.6.10, ignore first 3 # this is to deal with the atrocious behavior of Iconnect Kenya # which is capable of forcing requests through their proxy and then # killing 80+% with blank status ('BadStatusLine'), and connections closed

# known transient network errors: drop the connection, back off (nap grows
# by 50% up to 300s), mostly undo this attempt's ticktock increment, retry
repre = repr(e) if nap < 15 and ('10060' in repre or '10054' in repre or 'BadStatusLine' in repre \                               or 'timeout' in repre or 'gaierror' in repre): conn.close conn = None time.sleep(nap) nap = min(nap + nap/2, 300) ticktock -= 0.95  # undo most of increment for this failure continue # quietly

# any other exception: report it under plock, drop the connection, back off
with plock: print "(%s: exception reading API: %s)" % (currentThread.name, repr(e)) text = '' conn.close conn = None time.sleep(nap) nap = min(nap + nap/2, 300) continue

# maxlag/replag handling -- TODO confirm: the leading '<'-bearing test was
# eaten by the extraction; appears to sleep min(replag, 70) and retry,
# returning the still-good connection to the pool first
if ' 600: maxlag = "" if maxlag and maxl > 60: with plock: print "(mwapi readapi: next with %s)" % maxlag # sleep replag if not more than 70 time.sleep(min(replag, 70)) done = False pool.put(conn) # should still be good conn = None continue

# if we still have the connection without failure, return it to pool if conn: pool.put(conn)

return text

def ts(t):
    """Parse a MediaWiki API UTC timestamp, e.g. '2009-01-02T03:04:05Z',
    into a time.struct_time."""
    fmt = '%Y-%m-%dT%H:%M:%SZ'
    return time.strptime(t, fmt)

# Regexes that pull individual XML attributes out of raw API responses.
# NOTE(review): restored from four statements fused onto one line (a syntax
# error) by the wiki extraction; the patterns themselves are unchanged.
retok = re.compile(r' edittoken="(.*?)"')            # edit token for saving
restartime = re.compile(r' starttimestamp="(.*?)"')  # when we started the edit
retimestamp = re.compile(r' timestamp="(.*?)"')      # base (last-edit) timestamp
rerevid = re.compile(r' revid="(.*?)"')              # revision id

# NOTE(review): get the wikitext of `page` (optionally a specific revision)
# via the API, caching it on page._contents / page._revisionid.  The body
# below is corrupted extraction residue: statements were collapsed onto
# single lines and '<'-bearing string fragments were eaten -- e.g. the
# rawt.find(" 0: line was originally something like rawt.find("<rev...") with
# an i1 >= 0 test.  Comments are hedged intent notes only.
def getwikitext(page, revid = None, plock = plockd):

site = page._site

# cache check: return page._contents when present and (if revid was asked
# for) it matches the cached page._revisionid
if hasattr(page, '_contents'): if revid: if hasattr(page, '_revisionid') and revid == page._revisionid: return page._contents else: return page._contents # else we need to get page

# fetch revisions|info with content|ids; rvstartid pins a specific revision
done = False nap = 5 while not done: # if revid, get a specific revision if revid: rs = "&rvstartid=%s&rvlimit=1" % revid else: rs = '' # throw various exceptions to caller rawt = readapi(site, "action=query&prop=revisions|info&rvprop=content|ids&format=xml"                    + rs + "&titles=" + page.urlname, plock = plock)

# locate the text between the <rev ...> tag and </rev -- TODO confirm: the
# opening-tag search string was eaten by the extraction; raises NoPage for
# missing/invalid titles, else sleeps (nap grows by 50%, max 300s) and retries
i1 = rawt.find(" 0: i1a = rawt[i1:].find('>') if i1a > 0: i1 += i1a + 1 else: i1 = -1 # something bad ... i2 = rawt.find("</rev") if i1 < 0 or i2 < i1: # deleted/does not exist? bad title, no API return if 'missing=""' in rawt: raise wikipedia.NoPage if 'invalid=""' in rawt: raise wikipedia.NoPage if ' ' in rawt: raise wikipedia.NoPage # else with plock: print "(mwapi: no text found, sleeping %d seconds)" % nap # print repr(rawt) time.sleep(nap) nap = min(nap + nap/2, 300) else: done = True

# slice out the revision text and un-escape XML entities
text = rawt[i1:i2] text = wikipedia.unescape(text)

# record the revision id reported by the API ('' when absent)
mo = rerevid.search(rawt) if mo: # print "mwapi (debug): revision id from getwikitext", mo.group(1) revid = mo.group(1) else: revid = ''

# for us   page._revisionid = revid

# did we get redirect? if 'redirect=""' in rawt[:i1]: raise wikipedia.IsRedirectPage # and do not set _contents

# tell wikipedia put etc that we have the contents (else it does *another* get!) page._contents = text

return text

# NOTE(review): fetch an edit token plus info for `page`, stash the token on
# the site and the timestamps on the page (_editTime, _basetimestamp,
# _startTime), then return the page text via getwikitext.  The body below is
# corrupted extraction residue: statements collapsed onto single lines and
# no-argument call parens stripped (e.g. time.gmtime, page.aslink).
def getedit(page, sysop = False, plock = plockd):

site = page._site

# request info|revisions with intoken=edit; a locked wiki (or blocked user)
# is reported by the API message tested below and raised as UserBlocked
done = False nap = 5 notk = 0 while not done: # throw various exceptions to caller rawt = readapi(site, "action=query&prop=info|revisions&intoken=edit&format=xml" +                    "&titles=" + page.urlname, sysop = sysop, plock = plock) # wiki locked; or possibly user blocked? we don't have enough information # this is message from locked wiki if ">Action 'edit' is not allowed for the current user " in rawt: raise wikipedia.UserBlocked

# stash the edit token on the site; give up (ServerError) after 20 misses,
# otherwise back off (nap grows by 50%, max 300s) and retry
mo = retok.search(rawt) if mo: # token is stored in the site (!) silly, I thought it was an *edit* token site.putToken(mo.group(1), sysop = sysop) done = True else: notk += 1 if notk > 20: raise wikipedia.ServerError  # give up eventually! with plock: print repr(rawt) # probably temporary? print "mwapi: no token received trying to edit %s" % repr(page.aslink) print "mwapi: sleeping %d seconds" % nap time.sleep(nap) nap = min(nap + nap/2, 300)

# last-edit timestamp: reformatted for the framework (_editTime) and kept
# raw for putedit's basetimestamp (_basetimestamp); falls back to "now"
mo = retimestamp.search(rawt) if mo: # print "mwapi (debug): timestamp", mo.group(1) page._editTime = time.strftime('%Y%m%d%H%M%S', ts(mo.group(1))) # and without reformatting, for our putedit: page._basetimestamp = mo.group(1) else: page._editTime = time.strftime('%Y%m%d%H%M%S', time.gmtime)

# start-of-edit timestamp, same treatment
mo = restartime.search(rawt) if mo: # print "mwapi (debug): starttimestamp", mo.group(1) page._startTime = time.strftime('%Y%m%d%H%M%S', ts(mo.group(1))) else: page._startTime = time.strftime('%Y%m%d%H%M%S', time.gmtime)

# note the current revid and drop any stale cached contents
mo = rerevid.search(rawt) if mo: revid = mo.group(1) else: revid = '' if hasattr(page, "_contents"): del page._contents # !

# print "mwapi (debug): start %s, edit %s, revid %s, token %s" % (page._startTime, page._editTime,   #      revid, site.getToken)

return getwikitext(page, revid = revid, plock = plock)

# NOTE(review): save `text` to `page` via action=edit (bot, minor, nocreate --
# never creates or recreates; requires a prior getedit for the token and
# basetimestamp).  Body below is corrupted extraction residue: statements
# collapsed onto single lines and call parens stripped (page.title,
# page.aslink presumably were calls).
def putedit(page, text, comment = '', sysop = False, plock = plockd):

site = page._site

done = False nap = 5

while not done:

token = site.getToken(sysop = sysop)

# throw various exceptions to caller

# parameters in order, token last, to make sure text is complete (!) par = urllib.urlencode([ ('text', text.encode("UTF-8")),                                ('title', page.title.encode("UTF-8")),                                 ('summary', comment.encode("UTF-8")),                                 ('basetimestamp', page._basetimestamp),                                 ('token', token) ])

rawt = readapi(site, "action=edit&format=xml&bot=1&minor=1&nocreate=1",                    mode = "POST", par = par, sysop = sysop, plock = plock)

if 'result="Success"' in rawt: done = True break

# API error codes: missingtitle/pagedeleted drop the cached contents and
# raise NoPage; protectedpage raises LockedPage
# various errors [?] if 'code="missingtitle"' in rawt: if hasattr(page, "_contents"): del page._contents raise wikipedia.NoPage if 'code="pagedeleted"' in rawt: if hasattr(page, "_contents"): del page._contents raise wikipedia.NoPage if 'code="protectedpage"' in rawt: raise wikipedia.LockedPage

# anything else: report and retry with growing nap, giving up at 300s
with plock: print "(mwapi putedit error: %s, page %s)" % (repr(rawt[:300]), repr(page.aslink))

time.sleep(nap) nap = min(nap + nap/2, 300) if nap == 300: break # can't go on forever [?]
# NOTE(review): manual self-test script (live network, edits real wiki pages
# -- not an automated test).  Exercises getwikitext on present/missing pages,
# getedit/putedit round trips, and a nocreate save that should fail.  The
# body below is corrupted extraction residue: statements collapsed onto
# single lines; the triple-quoted section is a deliberately disabled test.
if __name__ == "__main__":

print "mwapi tests"

site = wikipedia.getSite('en', 'wiktionary')

print "present page foo" page = wikipedia.Page(site, 'foo')

t = getwikitext(page) print repr(t)

print "missing page" page = wikipedia.Page(site, 'foo xxx2')

try: t = getwikitext(page) print repr(t) except Exception, e:       print "exception", repr(e)

"""   print "redirect page"    page = wikipedia.Page(site, 'html')

try: t = getwikitext(page) print repr(t) except Exception, e:       print "exception", repr(e)

print "recent changes"

try: rct = readapi(site,                    "action=query&list=recentchanges&format=xml&rcprop=title|user" +                     "&rctype=new&rcnamespace=0&rclimit=10",                     sysop = True) print repr(rct) except Exception, e:       print "exception", repr(e) """

site = wikipedia.getSite('sw', 'wiktionary')

print "present page cat on sw.wikt" page = wikipedia.Page(site, 'cat')

t = getwikitext(page) print repr(t)

site = wikipedia.getSite('en', 'wiktionary')

print "try updating page on en.wikt"

page = wikipedia.Page(site, 'User:Robert Ullmann/t1')

text = getedit(page) text += "\n\nand some more text" putedit(page, text, "add some more")

print "anna two ..."

text = getedit(page) text += "\n\nand 2 text" putedit(page, text, "add 2")

print "edit missing page" page = wikipedia.Page(site, 'foo xxx2')

try: t = getedit(page) print repr(t) except Exception, e:       print "exception", repr(e)

# force a stale basetimestamp, then try saving the missing page: the
# nocreate=1 edit should fail (NoPage expected)
page._basetimestamp = '0'

print "... saving"

try: t = putedit(page, "foo") print repr(t) except Exception, e:       print "exception", repr(e) print "done"