User:Interwicket/code/allpages

Note: to copy this code, edit this page and then copy from the edit window; otherwise the HTML entities will not come through as entities.


#!/usr/bin/python
# -*- coding: utf-8  -*-


# modded RLU for iwikt, use MW API, remove a lot of cruft, add maxlag and some more reliability

import re, codecs, sys
import urllib
import time

import wikipedia


# redirects may be None, True, or False (all different ;-) None is all pages, True is
# just redirects, False is just non-redirects.

reapt = re.compile('title ?="(.*?)"')
relagged = re.compile(r'<error.*"maxlag".* (\d+) seconds')
reapfrom = re.compile(r' apfrom="(.*?)"')
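# (illustrative XML fragments these patterns are after, not verbatim API output:
#  a page:          <p pageid="1234" ns="0" title="foo" />
#  a maxlag error:  <error code="maxlag" info="Waiting for a database server: 4 seconds lagged" />
#  a continuation:  <query-continue><allpages apfrom="next title" /></query-continue>
#  the exact attribute order may differ; the regexes only need the parts shown)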

def allpages(site = wikipedia.getSite(), start = '!', namespace = '0', redirects = None):
    while True:
        # encode Non-ASCII characters in hexadecimal format (e.g. %F6)
        start = start.encode(site.encoding())
        start = urllib.quote(start)

        path = "/w/api.php?action=query&list=allpages&apfrom=" + start + \
               "&aplimit=480&format=xml&maxlag=2&namespace=" + namespace

        # redirects may be None, False, or True
        if redirects == None: pass
        elif redirects == True: path += "&apfilterredir=redirects"
        elif redirects == False: path += "&apfilterredir=nonredirects"
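        # (illustrative: with the defaults and redirects = True, the finished path is
        #  /w/api.php?action=query&list=allpages&apfrom=%21&aplimit=480&format=xml&maxlag=2&namespace=0&apfilterredir=redirects)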

        print '(getting pages in %s from %s)' % (site.lang, start)

        # add retry logic, Robert Ullmann 25 Sept 07
        done = False
        nap = 5
        while not done:
            atext = site.getUrl(path)

            mo = relagged.search(atext)
            if mo:
                print "(server lagged %s seconds)" % mo.group(1)
                time.sleep(20)
                continue

            # a complete reply ends with the closing </api> tag
            if '</api>' in atext: done = True
            else:
                print "allpages: incomplete reply, sleeping %d seconds" % nap
                time.sleep(nap)
                nap = min(nap + nap/2, 300)
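            # (note: maxlag=2 in the path asks the server to refuse the request
            #  whenever replication lag exceeds 2 seconds, so the bot backs off
            #  under load; the incomplete-reply naps grow as 5, 7, 10, 15, 22, ...
            #  seconds, capped at 300)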

        for title in reapt.findall(atext):
            # &quot; is an HTML entity in this field! Robert Ullmann, 20 January 2008
            # &amp; too! Robert Ullmann, 8 May 2008
            title = title.replace('&quot;', '"')
            title = title.replace('&amp;', '&')
            # others, but not sure we need this at all? Page fixes things.

            # suppress other namespace-like things here, or Page will gen the "wrong" title
            if ':' in title: continue

            yield wikipedia.Page(site, title)

        # find continuation:
        mo = reapfrom.search(atext)
        if mo:
            start = mo.group(1)
            start = start.replace('&quot;', '"')
            start = start.replace('&amp;', '&')
            continue
        else:
            break # we are done, will raise StopIteration


# define a class so we can instantiate the iter method:

class allpagegen:
    def __init__(self, start = '!', namespace = '0', site = wikipedia.getSite(), redirects = None):
        self.start = start
        self.site = site
        self.redir = redirects
        self.namespace = namespace

    def __iter__(self):
        for page in allpages(site = self.site, start = self.start,
                             namespace = self.namespace, redirects = self.redir):
            yield page
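# (note: a bare generator can only be consumed once; wrapping it in a class whose
#  __iter__ re-invokes allpages() lets the same allpagegen object be iterated
#  afresh, each pass starting a new API scan from self.start)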


# simple unit test:

if __name__ == "__main__":

    print "testing allpages, 1000 redirects from Ka in en.wikt, print every 20:"

    s = wikipedia.getSite('en', 'wiktionary')
    kagen = allpagegen(site = s, redirects = True, start = 'Ka')

    i = 0
    for page in kagen:
        i += 1
        if i%20 == 0: print repr(page.title())
        if i > 1000: break
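# a further usage sketch, kept as a comment (the 'fr' site here is a hypothetical
# illustration, not part of the original test):
#
#     s = wikipedia.getSite('fr', 'wiktionary')
#     for page in allpagegen(site = s, redirects = False):
#         ...  # non-redirect pages in namespace 0, from '!' onward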