User:Interwicket/code/iwiktll

This is "stabilized" and I don't use it; see ../mbwa. If mbwa is given exactly one langlinks file to work with, it will do what this code will do, albeit in a different order. Provided here only because it was here; I may delete this page presently. Robert Ullmann 17:01, 10 February 2009 (UTC)


#!/usr/bin/python
# -*- coding: utf-8  -*-

""" This bot updates iwiki links between wiktionaries

2.2.9: read langlinks.sql.gz dump, compare to union-index, re-evaluate conflicts

Run with options:

-langlinks:(filename)   it is not necessary to specify the
                        "-langlinks.sql.gz" ending; overridden by -date
                        if that is used
-home:(code)
-redir                  add links to redirects on this wikt, important to
                        get right, as it will otherwise remove desired
                        links to redirects (not so yet)
-date:(date)            reads file
                        "langlinks/(home)wiktionary-(date)-langlinks.sql.gz"

"""

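# a typical run might look like this (hypothetical script name and dump
# date, not from the original page):
#
#   python iwiktll.py -home:en -date:20090210 -redir
#
# which, per the -date option above, reads
# langlinks/enwiktionary-20090210-langlinks.sql.gz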
import wikipedia
import xmlreader
import sys
import socket
import re
import pickle
import pagegenerators
import time
from random import choice
from mwapi import getwikitext, getedit
from reciprocal import addrci
from config import usernames

# borrow global:

def safe(s): return pickle.dumps(s)[1:-5]
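# safe() renders a title printably for console output (a reading of the
# code, not a note from the original page): it keeps the body of the
# protocol-0 pickle of s, dropping the leading opcode and the trailing
# "\np0\n." framing, so e.g. safe(u'abc') == 'abc' while characters above
# U+00FF come out escaped as \uXXXX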

import shelve

from iwiktrc import hunt, Exists, Lcode, site, naps
# use hunt routine in iwiktrc, should be able to maintain the same?
# things used by hunt, use same copies!

def now(): return int(time.clock())


# read language links file, sort internally (not in the order we'd like ;-)
# compare to union index, yield titles that do not match, with language codes to hunt

import gzip

def llfile(home = '', filename = '', redirs = False):

    if not filename: return

    # dict of links, to sort out from file
    # entries are pageid, code, link title
    # pageid is mostly useless to us, link title is pagename presumably
    # so dict of sets

    links = { }
    retuple = re.compile(r"\((\d*?),'(.*?)','(.*?)'\)")
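    # the dump is a series of extended INSERT statements; a fragment looks
    # roughly like this (illustrative values, not from the original page):
    #   INSERT INTO `langlinks` VALUES (12345,'fr','chat'),(12345,'de','chat'), ...
    # retuple picks out each (pageid, code, title) tuple, and the loop below
    # reduces them to e.g. links['chat'] = set(['fr', 'de'])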

    print "reading file", filename

    f = gzip.open(filename, 'rb')

    leftover = ''
    while True:
        content = f.read(4096)
        if not content: break

        content = leftover + content
        # find a break not in UTF-8
        i = content.rfind("');") # at end, must check first
        if i < 0: i = content.rfind("'),") # usual case
        if i < 0:
            leftover = content
            continue # at end or need to read some more
        leftover = content[i+3:]
        content = content[:i+2]
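        # e.g. if a 4096-byte chunk ends mid-tuple, say "...(99,'fr','ch",
        # everything after the last complete "')," is carried over in
        # leftover and prepended to the next read, so neither a tuple nor a
        # multi-byte UTF-8 sequence is ever split at a chunk boundary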

        content = unicode(content, 'utf-8', 'ignore')

        for tuple in retuple.findall(content):
            # print repr(tuple)

            pid, lc, title = tuple
            if ':' in title: continue
            if not title: continue
            title = title.replace(r"\'", "'") # SQL escape, we've matched ' only before, or )
            if title not in links: links[title] = set()
            links[title].add(lc)

    f.close()

    print "read links for %d titles" % len(links)

    # now we have all the links, compare to union index

    Uix = shelve.open("union-index")
    # Uix = {} # testing w/o union index
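    # each union-index entry is keyed by repr(title) and, judging from the
    # unpacking below, holds a (title, links, redirect-links) triple where
    # ul and ur are collections of language codes; the index is presumably
    # built elsewhere (the note at the top of this page mentions ../mbwa)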

    for title in sorted(links):

        if repr(title) in Uix:
            t, ul, ur = Uix[repr(title)]
        else: ul = ur = ''

        # print repr(title), "LL:", repr(links[title]), "UNION:", repr(ul), "UREDIR:", repr(ur)

        if redirs: ul += ur

        # compare links to ul, should match
        # first add home to ll, then it should be identical
        ll = links[title]
        ll.add(home)

        # if not redirs, but some present, is okay (at this point):
        if not redirs and ur:
            for lc in ur: ll.discard(lc) # (also no point in trying to read them in hunt ;-)

        if sorted(ll) != sorted(ul):

            print "   in LL, not in UNION:", [x for x in ll if x not in ul]
            print "   in UNION, not in LL:", [x for x in ul if x not in ll]

            lcs = set(ul)
            lcs.discard(home)

            yield title, lcs, ur

        else: print "(%s matches)" % repr(title)

    Uix.close()

def main():

    socket.setdefaulttimeout(40)

    home = 'en'
    langlinks = ''
    addredirs = False
    fdate = ''

    for arg in sys.argv[1:]:
        if arg.startswith('-langlinks:'):
            langlinks = arg[11:]
            if not langlinks.endswith("-langlinks.sql.gz") and '.' not in langlinks:
                langlinks += "-langlinks.sql.gz"
            print "reading langlinks file %s" % langlinks
        elif arg.startswith('-date:'):
            fdate = arg[6:]
        elif arg.startswith('-home:'):
            home = arg[6:]
            print "home wikt is %s" % home
        elif arg.startswith('-redir'):
            addredirs = True
            print "add links to redirects"
        else: print "unknown command line argument %s" % arg

    if fdate:
        langlinks = "langlinks/" + home + "wiktionary-" + fdate + "-langlinks.sql.gz"
        print "reading langlinks file %s" % langlinks

    mysite = wikipedia.getSite(home, 'wiktionary')
    # make sure we are logged in
    mysite.forceLogin()
    meta = wikipedia.getSite(code = "meta", fam = "meta")

    # get active wikt list
    # minus crap. Tokipona? what are they thinking? Klingon? ;-)
    Lstops = ['tokipona', 'tlh']

    page = wikipedia.Page(meta, "List of Wiktionaries/Table")
    existtab = page.get()

    """ entry looks like:

    | [[w:Vietnamese|Vietnamese]]
    | Tiếng Việt
    | vi.wiktionary.org

    """

    # reextab = re.compile(r'^\[\[:([a-z-]+):')
    # reextab = re.compile(r'\| ([a-z-]+)\.wiktionary\.org')
    reextab = re.compile(r'^\| \[\[w:.*\|(.*)\]\]\n'
                         r'^\| .*\n'
                         r'^\| ([a-z-]+)\.wiktionary\.org', re.M)
    for mo in reextab.finditer(existtab):
        if mo.group(2) in Lstops: continue
        Exists.add(mo.group(2))
        Lcode[mo.group(1)] = mo.group(2)
        # see if we have a login in user config, else pretend we do
        # has to be done before any call, or login status gets confused!
        if mo.group(2) not in usernames['wiktionary']:
            usernames['wiktionary'][mo.group(2)] = "Interwicket"
    print "found %d active wikts" % len(Exists)
    if len(Exists) < 150: return
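    # e.g. the sample entry in the docstring above gives mo.group(1) ==
    # 'Vietnamese' and mo.group(2) == 'vi', so 'vi' is added to Exists and
    # Lcode['Vietnamese'] = 'vi'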

    for lc in Exists:
        site[lc] = wikipedia.getSite(lc, "wiktionary")
        naps[lc] = 0 # nil, might be referenced by hunt

    # naps ... ;-)
    naptime = 0
    maxnap = 70
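    # pacing sketch: naptime is bumped by one for each problem entry and
    # halved on each fix, capped at maxnap; the sleep itself is commented
    # out further down ("pace [not used in the same way, reconsider]")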

    # now look for iwikis needed

    entries = 0
    probs = 0
    fixed = 0

    for title, lcs, urs in llfile(home = home, filename = langlinks, redirs = addredirs):

        if ':' in title: continue # redundant, but eh?

        if title.lower() == 'main page': continue

        print "%s:%s" % (home, safe(title))

        # structure of code here is leftover from source (-)
        tag = True

        # now see if it is something that should be tagged/replaced:

        if tag:

            probs += 1
            naptime += 1

            # ... pick up current version from en.wikt

            # print '%s is possible update, getting current entry' % safe(title)

            try:
                page = wikipedia.Page(mysite, title)
                # text = page.get()
                text = getwikitext(page)
                oldtext = text
            except wikipedia.NoPage:
                print "   ... %s not in %s.wikt" % (safe(page.title()), safe(home))
                text = ''
            except wikipedia.IsRedirectPage:
                print "   ... redirect page"
                text = ''
            except KeyError:
                # annoying local error, from crappy framework code
                print "KeyError"
                time.sleep(200)
                continue

            if not text: continue

            act = ''

            linksites = wikipedia.getLanguageLinks(text)
            ls = [s.lang for s in linksites]

            # list of iwikis in entry should match lcs, if not, we need to update
            if sorted(ls) == sorted(lcs):
                print "   ... is okay"
                continue

            # if not always adding redirs to this wikt, but some present, is ok
            if not addredirs:
                ok = True
                # need to remove something
                for s in ls:
                    if s not in lcs and s not in urs: ok = False
                # need to add something
                for s in lcs:
                    if s not in ls: ok = False
                if ok:
                    print "   ... is okay (may have redirects)"
                    continue

            # go hunt down some iwikis, add reciprocals when needed
            # always include en, pass all other lcs

            iwikis, missing = hunt(title, text, 'en', lcs = lcs, home = home, addredirs = addredirs)
            if iwikis: act = "iwiki +" + ", ".join(iwikis)
            else: print "   ... no new iwikis found"

            # remove
            rms = [ ]
            for s in ls:
                if s in missing: rms.append(s)
            if home in ls: rms.append(home) # pre-existing self-link (!)
            if rms:
                if act: act += " -"
                else: act = "iwiki -"
                act += ", ".join(sorted(rms))
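            # so act ends up like "iwiki +de, fr -it" (hypothetical codes):
            # additions found by hunt, removals for missing targets and
            # pre-existing self-links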

            if not act: continue

            # add links, [don't remove unwanted redirects yet]
            for lc in iwikis:
                fpage = wikipedia.Page(site[lc], title)
                linksites[site[lc]] = fpage
            for lc in rms: del linksites[site[lc]]

            try:
                newtext = wikipedia.replaceLanguageLinks(text, linksites, site = mysite)
            except ValueError:
                # throws this trying to "add to self", just effing continue
                print "   ... replace error in", repr(page.aslink())
                continue

            newtext = newtext.replace('\r\n', '\n') # wikipedia brain-damage
            if newtext.rstrip(' \n') == text.rstrip(' \n'): continue # didn't change anything
            # wikipedia.showDiff(text, newtext)

        else: continue

        # some change, write it
        if act:

            fixed += 1
            naptime /= 2

            print "   ... updating %s: %s" % (safe(title), safe(act).strip("'"))

            # try to fix the entry
            try:
                utext = getedit(page)
                # utext = page.get()
                if utext != oldtext:
                    print "page changed during attempted update"
                    continue
                wikipedia.setAction(act)
                page.put(newtext)
                # no cache update [and "links" not set up]
                # iwadd(title, links.keys())
            except wikipedia.EditConflict:
                print "Edit conflict?"
                continue
            except wikipedia.PageNotSaved:
                print "failed to save page"
                # other action?
                continue
            except wikipedia.NoPage:
                print "Can't get %s from en.wikt?" % safe(page.aslink())
                continue
            except wikipedia.IsRedirectPage:
                print "Redirect page now?"
                continue
            except socket.timeout:
                print "socket timeout, maybe not saving page"
                continue
            except socket.error:
                print "socket error, maybe not saving page"
                continue
            except KeyError:
                # annoying local error, from crappy framework code
                print "KeyError"
                time.sleep(200)
                continue
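            # the utext != oldtext check above is a cheap race guard: the
            # page is re-fetched via getedit and skipped if it changed since
            # the first read, on top of the framework's own EditConflict
            # handling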

        # limit number of fixes for testing
        # if fixed > 7: break

        # pace [not used in the same way, reconsider]
        if naptime > maxnap: naptime = maxnap
        """
        if naptime > 4:
            print "sleeping %d seconds" % naptime
            time.sleep(naptime)
        """
        continue

    print "%d entries, %d possible, %d updated" % (entries, probs, fixed)

    # done

if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()