User:Interwicket/code/reciprocal


#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Interwicket/code/reciprocal

""" This bot updates iwiki links between wiktionaries

26.1.9: try adding reciprocals; can then use this in full run?

This process checks for the "Interwicket" user on the FL wikt, tries to log in, create user page, check user status, and create a reciprocal link to match an en.wikt link just added (or about to be added)

"""

import pickle
import re
import socket
import sys
from time import time, strftime, gmtime, sleep

import wikipedia
from mwapi import getwikitext, getedit, readapi, putedit
from iwlinks import getiwlinks, replaceiwlinks

# borrow global:
from config import usernames

try:
    import Queue
except ImportError:  # the module is named "queue" on Python 3
    import queue as Queue
import threading

# Queue of pending reciprocal-link tasks, consumed by the replinks() worker
# threads. It is unbounded; replink() applies a soft limit by sleeping in
# proportion to the current queue size before adding a task.
toreplink = Queue.Queue() # was 35, soft limit
repinit = False  # set to True by replink() once the worker threads are started
rthread = None   # the most recently started worker thread

# print lock, acquired around all print statements ("with plock:"), caller can
# use to avoid munging lines together
plock = threading.Lock()

def srep(s):
    """Return repr() of s (coerced to unicode) without its first two and last characters.

    On Python 2 the dropped characters are the leading u' and the closing
    quote, leaving an escaped, ASCII-only rendering of the text.
    """
    as_unicode = u'' + s
    quoted = repr(as_unicode)
    return quoted[2:-1]

def safe(s):
    """Return s rendered by srep(); a second name for the same escaping."""
    escaped = srep(s)
    return escaped

class ufo:
    """Plain attribute holder: each keyword argument becomes an instance attribute."""

    def __init__(self, **k):
        for a in k:
            setattr(self, a, k[a])

class FLwikt:
    """State and configuration kept for one foreign-language (FL) wiktionary.

    code is the language code. If wikipedia.getSite() raises for it, or the
    code is listed as obsolete in the family, the instance is marked
    lockedwikt; for a code that raises, site is None and no family
    configuration is read.
    """

    def __init__(self, code):
        self.lc = code
        self.lastcheck = None   # time of last status check, None if never
        self.status = None
        self.userpage = False   # True once the user page is known to be written
        self.mainpage = ''
        # for test mode:
        self.edits = 0
        self.limit = 2.0
        self.newikt = True
        self.lockedwikt = False
        self.deletecode = False

        self.tbd = -1 # meaning "not known", 0 is valid

        # other options; set before the site lookup so that an instance for
        # an invalid code still carries every attribute
        self.redirs = None    # may be None, False, or True, can be tested either way
        self.attop = False
        self.oneline = False
        self.sortorder = ''

        # specific pairings not to link
        self.nolink = [ ]

        try:
            # getting site will throw exceptions for unknown code(s)
            self.site = wikipedia.getSite(code, "wiktionary")
            if code in self.site.family.obsolete: self.lockedwikt = True
        except Exception:
            print("(code %s is not valid)" % code) # avoid plock I think
            self.site = None # should not be referenced?
            self.lockedwikt = True # or doesn't exist at all
            return # rest of this is invalid

        # see if we have a login in user config, else invent it
        if code not in usernames['wiktionary']:
            usernames['wiktionary'][code] = "Interwicket"

        # now decode what is in the family, so we can list it out
        # (we don't do anything with it!)
        lang = self.site.language()
        if lang in self.site.family.interwiki_attop: self.attop = True
        if lang in self.site.family.interwiki_on_one_line: self.oneline = True
        pf = self.site.interwiki_putfirst()
        if pf:
            if pf == self.site.family.alphabetic:
                self.sortorder = 'alpha by language name'
            elif pf == self.site.family.alphabetic_revised:
                self.sortorder = 'alpha by language name (revised)'
            elif pf == self.site.family.fyinterwiki:
                self.sortorder = 'code in special fy order'
            elif pf == self.site.family.dodde_order:
                self.sortorder = 'Dodde order'
            else:
                self.sortorder = '%s first' % (','.join(pf))

        # put these right here for now: (-)
        if code == 'pl': self.nolink.append('ru')
        # if code in ['en', 'sv', 'sw']: self.redirs = True

# so we can just reference the dictionary (-)
class FLdict(dict):
    """dict of FLwikt objects by language code; a missing code is created on first lookup."""

    def __init__(self): pass

    def __missing__(self, code):
        self[code] = FLwikt(code)
        return self[code]

flws = FLdict() # FLwikt by code

# Note: It is very important that flw's are NOT created for things that aren't iwiki codes!
# This takes some care on the part of calling code.
# (Membership tests such as "code in flws" do not create an entry; indexing does.)

# edit count attribute in the userinfo API reply
redits = re.compile(r'editcount="(\d+)"')

# noflag hack: cached text of the noflags / redirs configuration pages and
# the time each was last read (refreshed at most once an hour by getflstatus)
noflagtext = None
noflaglast = 0
redirtext = None
redirlast = 0

# Main page title inside the allmessages API reply.
# NOTE(review): the pattern arrived here as r']*>(.*?) ', which looks like a
# tag-stripped remnant; it is restored on the assumption that the reply wraps
# the text in a <message ...>...</message> element -- confirm against the API.
remainpage = re.compile(r'<message[^>]*>(.*?)</message>')

# safety; getflstatus was not written to be re-entrant, probably is okay, but is simple to prevent
# there is a lot of lock contention here when a process like mbwa starts
gfslock = threading.Lock()

def getflstatus(flw, nowrite = False):
    """Return the bot account's status on one wikt, refreshing it when stale.

    flw     -- the FLwikt to check
    nowrite -- when true, a first-time result of 'bot' or 'noflag' is returned
               without updating the status table (e.g. used by mbwa in
               initialization)

    Returns flw.status: 'blocked', 'bot', 'noflag', 'test', 'missing' or
    'exception'. A locked wikt is reported as 'blocked' at once, and a
    result less than four hours old is returned unchanged. Login and API
    failures are caught and reported as 'exception' (on every such path,
    including the login failures).

    Side effects: may update flw.status, edits, redirs, mainpage, limit,
    lastcheck and newikt, refreshes the cached noflags/redirs page text,
    and may call updstatus(flw).
    """
    global noflagtext, noflaglast, redirtext, redirlast

    # before taking lock, can we just tell caller the status for this one?
    if flw.lockedwikt:
        flw.status = 'blocked'
        with plock: print('(wikt %s is locked)' % flw.lc)
        return flw.status

    # four hours for now
    if flw.lastcheck and flw.lastcheck > time() - (4 * 3600): return flw.status

    with gfslock:

        was = flw.status

        # if not a good status, start with test; in particular change exception to test
        if flw.status not in ['bot', 'noflag']: flw.status = "test"

        # check logged in (or not)
        # we need try/except here, take keyboard interrupt and make it status = 'exception'
        # anything else thrown will get handled
        try:
            # take print lock around this, will stall other threads,
            # we may need to respond to login prompt, and it will print messages
            with plock: flw.site.forceLogin()
        except KeyboardInterrupt:
            with plock: print("Keyboard interrupt, skipping this wikt")
            flw.status = 'exception'
            return flw.status
        except Exception as e:
            flw.status = 'exception'
            with plock: print("exception trying to login on %s: %s" % (flw.lc, str(e)))
            return flw.status

        try:
            ustat = readapi(flw.site,
                            "action=query&meta=userinfo"
                            "&uiprop=blockinfo|rights|editcount&format=xml")
        except Exception as e:
            with plock:
                print("exception trying to read user status from %s.wikt: %s"
                      % (flw.lc, str(e)))
            flw.status = "exception"
            return flw.status

        # edit count?
        mo = redits.search(ustat)
        if mo: flw.edits = int(mo.group(1))

        # we can be bot, or blocked, or not known:
        if "bot" in ustat: flw.status = "bot"
        if "blockedby=" in ustat: flw.status = "blocked" # over-rides "bot", as it can be both
        if "missing=" in ustat: flw.status = "missing" # ? can get here now?

        # noflag hack
        if flw.status == 'test':
            if not noflagtext or noflaglast < time() - 3600: # just pick up once an hour
                try:
                    nfp = wikipedia.Page(flws['en'].site, "User:Interwicket/noflags")
                    with plock: print('(reading noflags list)')
                    noflagtext = getwikitext(nfp, plock = plock)
                    noflaglast = time()
                except Exception as e:
                    # use previous file text
                    with plock: print("some exception getting noflag %s" % str(e))
            if noflagtext:
                # NOTE(review): this literal may have lost markup around the
                # code (e.g. bold quotes); as written it is a prefix match on
                # "* <code>" -- confirm against the noflags page format.
                if "* " + flw.lc + "" in noflagtext: flw.status = 'noflag'

        # dyn pickup of redir configuration:
        if not redirtext or redirlast < time() - 3600: # just pick up once an hour
            try:
                rdp = wikipedia.Page(flws['en'].site, "User:Interwicket/redirs")
                with plock: print('(reading redirs list)')
                redirtext = getwikitext(rdp, plock = plock)
                redirlast = time()
            except Exception as e:
                # use previous file text
                with plock: print("some exception getting redirs list %s" % str(e))
        if redirtext:
            # NOTE(review): same possible markup loss as the noflags test above.
            if "* " + flw.lc + "" in redirtext: flw.redirs = True
            else: flw.redirs = None # we don't use the "False" state at present

        # find main page title from WM "message":
        try:
            mtext = readapi(flw.site,
                            "action=query&meta=allmessages&ammessages=mainpage&format=xml")
            mo = remainpage.search(mtext)
            flw.mainpage = mo.group(1)
        except Exception:
            flw.mainpage = '(exception)'

        if flw.status == 'test':
            if flw.lastcheck:
                # allow one more per 4800 seconds (80 minutes; an earlier
                # comment here said 90)
                flw.limit += (time() - flw.lastcheck) / 4800.0
                flw.limit = min(flw.limit, flw.edits + 3.0) # don't accumulate too much quota
            else:
                flw.limit = flw.edits # initial state on most runs, allows one

        if flw.status != was:
            with plock: print("(status on %s.wikt is %s)" % (flw.lc, flw.status))
        if flw.status != 'exception': flw.lastcheck = time()

        # if nowrite, we are done for now (e.g. used by mbwa in initialization)
        if nowrite and flw.status in [ 'bot', 'noflag' ] and was is None: return flw.status

        if flw.newikt and flw.status not in ['missing', 'exception']:
            flw.newikt = False # set up complete

    # (gfslock released)
    if flw.status != was or flw.newikt: updstatus(flw)

    return flw.status


# add or update user page on the FL wikt:

# Body text that adduserpage() writes to the page "User:Interwicket" on an
# FL wikt.
# NOTE(review): this literal reads like rendered text; any wiki markup it
# originally carried (links, emphasis, list bullets) may have been lost --
# confirm against a live user page before relying on it.
userpage = """Wiktionary interwiki 'bot

User "Interwicket" is the 'bot that adds interwiki (inter-language) links to entries. It is designed for the Wiktionaries. It is not the "wikipedia bot", it is much more efficient. It operates only in the main namespace (NS:0).

Here, user "Interwicket" will add links to all of the other wiktionaries when needed.


 * If user "Interwicket" is blocked here, it will not edit (of course)
 * If user "Interwicket" is given a bot flag here, it will add iwikis whenever needed

Otherwise it will operate in a test mode, doing only a very few edits, that can then be checked (by me, and by anyone else). Most of the possible updates will not be done because of this limit.


 * Discussion page for Interwicket is en:User talk:Interwicket.
 * Code is at en:User:Interwicket/code.
 * Status, number of edits, etc for each wikt at en:User:Interwicket/FL status.
 * My talk page is en:User talk:Robert Ullmann.

Finally, my sincere apologies for writing this message only in English!

"""

# Extra text that adduserpage() appends to the user page when the wikt's
# status is "noflag".
# NOTE(review): the same possible loss of wiki markup applies here.
noflag = """

The bot has been configured to run here without a bot flag, but at full rate, not in test mode. This is done for some small or inactive wiktionaries. If you are a user or admin here and would like to see it flagged, please note on en:User talk:Interwicket and I will resolve it.

It is sometimes hard to find the bot flag request page on various wikts; if you have one and I have not added a request, please write me a note on en:User talk:Interwicket with a link!

I strongly suggest that this wiktionary subscribe to one or both of the automatic approval policy or global bot policy. Please see Bot policy.

Feel free to ask me any questions. en:User talk:Robert Ullmann """

def adduserpage(flw):
    """Write (or rewrite) the bot's user page on the wikt described by flw.

    Nothing is done for 'en' or when flw.userpage is already set. Templates
    found in the existing page are kept at the top and one-per-line
    [[...:...]] links (cats, iwikis) at the end, around the standard text.

    Failures are printed rather than raised. A UserBlocked error sets
    flw.status to 'blocked' and updates the status table. On success
    flw.userpage is set, and a 'missing' status becomes 'test'.
    """

    if flw.lc == 'en': flw.userpage = True
    if flw.userpage: return

    page = wikipedia.Page(flw.site, "User:Interwicket")
    try:
        op = getedit(page, plock = plock)
    except wikipedia.NoPage:
        op = ''
    except wikipedia.UserBlocked:
        flw.status = 'blocked'
        updstatus(flw)
        with plock: print("apparently blocked on %s / wikt may be locked" % flw.lc)
        return
    except Exception as e:
        # safe() as in the write-failure message below, so a non-ASCII page
        # name cannot break the print
        with plock:
            print("exception trying to read %s: %s" % (safe(page.aslink()), str(e)))
        return

    wikipedia.setAction("writing user page")

    # if templates added at top of (whereever) the page (bot template, or placeholder)
    # and cats, iwikis at end (if one per line, etc), contain ':'
    templates = re.findall(r'\{\{.*?}}', op)
    trailers = re.findall(r'^\[\[.*?:.*?]]$', op, re.M)
    utext = (u'\n'.join(templates)
             + '\n\n' + userpage
             + '\n\n' + u'\n'.join(trailers)).strip('\n ')

    if flw.status == "noflag": utext += noflag

    try:
        page.put(utext)
        flw.userpage = True
    except Exception as e:
        with plock:
            print("exception trying to write %s: %s" % (safe(page.aslink()), str(e)))
        return

    if flw.status == "missing":
        flw.status = "test" # trying to re-read status won't work for a while!


 * 1) add a log entry, so we don't lose these in testing
 * 2) temporary, although might be expanded and kept

loglines = [ ]

loglock = threading.Lock

def addlog(link, action): global loglines

with loglock:

# save up 20 to do in one edit:

loglines.append('* ' + strftime("%d %B %H:%M", gmtime) + ' [[:' + link[2:] + action[5:])

if len(loglines) < 20: return loglines.reverse

try: page = wikipedia.Page(flws['en'].site, "User:Interwicket/FL log") text = getedit(page, plock = plock)

k = 0 newt = '' for line in text.splitlines: newt += line + '\n' if line == '': for l2 in loglines: newt += l2 + '\n' k = 1 continue if k:               k += 1 if k > 180: break

putedit(page, newt, comment = "log entry " + link, plock = plock) loglines = [ ] except wikipedia.NoPage: pass except Exception, e:       with plock: print "exception writing log entry", str(e)


 * 1) update status table
 * 2) re-entrant, but might edit-conflict with itself or elide edits (has been noted)

updlock = threading.Lock

def updstatus(flw):

if flw.lockedwikt: return # no point in listing with updlock:

try: page = wikipedia.Page(flws['en'].site, "User:Interwicket/FL status") text = getedit(page, plock = plock)

notes = '' if flw.redirs == True: notes += 'link to redirects, ' if flw.redirs == False: notes += 'no links to redirects, ' if flw.attop: notes += 'iwikis at top, ' if flw.oneline: notes += 'on one line, ' if flw.nolink: notes += 'no links to %s added, ' % (",".join(flw.nolink)) if flw.sortorder: notes += 'sort %s, ' % flw.sortorder notes = notes.rstrip(", ")

# day number used to provide an invisible sort key in date column daynumber = "%04d" % (time/86400 - 14700) # days since about 1 April 2010 today = ' ' + daynumber + ' ' + \ strftime(" %d %B", gmtime).replace(' 0', ' ')

if flw.tbd >= 0: tbdtext = "%d" % flw.tbd else: tbdtext = ''

lines = [] for line in text.splitlines: # keep the old lines we want: if not line.startswith("| "): continue if "" + flw.lc + "" in line:

parts = line.split('||') # (first will have the leading |) if len(parts) < 7: continue # (bad line? will replace it) uf = False if parts[2].strip != flw.status: uf = True if not parts[4].strip.startswith(today): uf = True if tbdtext: if parts[5].strip != tbdtext: uf = True else: tbdtext = parts[5].strip # keep what was there if parts[6].strip != notes: uf = True

# if not worth updating, we are done if not uf: return # else elide this line, to be regenerated continue

lines.append(line)

lines.append(           "| %s ||  || %s || %d || %s  || %s || %s || %s"            % (flw.lc, flw.lc, flw.status, flw.edits, today, tbdtext, notes, flw.mainpage) )

text = """

"""

putedit(page, text, comment = "update status for " + flw.lc, plock = plock) except wikipedia.NoPage: pass except Exception, e:       with plock: print "exception writing status table", str(e)

# main event:

def addrci(page, mysite, links = { }, redirs = { }, skips = [ ], remove = False):
    """Queue an update of the iwiki links in page.

    page to add to
    mysite is the local site (to be always added)
    links is a dict of pages for all other links
    redirs is a dict of pages of other links that are redirects (i.e. subset of links)

    will add missing links not in redirs, will add if in redirs and allowed on FL.wikt
    will remove links that are not in links (if not "None")

    does not add or remove anything in skips

    only removes anything if remove; with incomplete list call with remove False

    Returns None. Nothing is queued when the bot is blocked on the wikt, its
    status there is not one of test/bot/noflag, or the test-mode edit limit
    has been reached. The default dicts/list are shared but never mutated
    (links is copied before the local link is added).
    """
    flw = flws[page.site().lang]
    if getflstatus(flw) == "blocked": return # no kidding ...

    # if not blocked, try writing/overwriting user page, could do on "missing" but
    # we want to update it on new runs
    # useful access confirmation anyway
    if not flw.userpage: adduserpage(flw)

    # valid status?
    if flw.status not in ["test", "bot", "noflag"]: return

    # test limit per run
    if flw.status == "test" and flw.edits > flw.limit:
        with plock: print("(edit limit reached for %s)" % flw.lc)
        return

    mypage = wikipedia.Page(mysite, page.title())
    links = links.copy() # shallow copy
    links[mysite.lang] = mypage

    # now drop the request into a layer of threading magic:
    replink(page = page, links = links, redirs = redirs, skips = skips, remove = remove)
    return

def replink(page = None, links = { }, redirs = { }, skips = [ ], remove = False, end = False):
    """Queue one reciprocal-link task for the worker threads.

    The first call (unless it is an end request) starts four worker threads
    running replinks(). The task is then queued after sleeping one second
    per task already waiting (soft queue limit).

    call replink(end = True) to finish up and exit: one end marker is queued
    per worker thread (plus a spare)

    this can be called from outside addrci (and I expect it to be)

    [yes, the empty dicts and lists are created once on load, but we aren't
    going to mutate them]
    """
    global repinit, rthread
    workers = 4

    if not repinit:
        if end: return # no need to start
        for i in range(1, workers + 1):
            rthread = threading.Thread(target=replinks)
            rthread.name = 'add replinks %d' % i
            rthread.start()
        repinit = True

    rtask = ufo(page = page, links = links, redirs = redirs, skips = skips,
                remove = remove, end = end)

    if not rtask.end: sleep(toreplink.qsize()) # soft q limit
    toreplink.put(rtask)
    if rtask.end:
        # make sure we have one per thread, extras do not matter
        for i in range(1, workers + 1): toreplink.put(rtask)

def replinks():
    """Worker thread body: run queued tasks through reptask() until an end marker arrives."""

    with plock: print("(rep link thread started)")

    while True:
        rtask = toreplink.get()
        if rtask.end: break
        reptask(page = rtask.page, links = rtask.links, redirs = rtask.redirs,
                skips = rtask.skips, remove = rtask.remove)

    with plock: print("(rep link thread ended)")

# reduces [[:code:title]] links to just the code (used when printing the action)
rewpr = re.compile(r'\[\[:([a-z-]+):.*?\]\]')

# remove count page
recountpage = re.compile(r'\{\{count page\|[^\|\}]+\}\}\n?')

# held while sleeping reptick seconds before each edit, which paces the
# worker threads to a maximum overall edit rate
ticktock = threading.Lock()

reptick = 10.0 # default

def setreptick(rt):
    """Set reptick, the number of seconds reptask() sleeps before each edit."""
    global reptick
    reptick = rt

def reptask(page = None, links = { }, redirs = { }, skips = [ ], remove = False):
    """Bring the iwiki links in page into line with links, and save the page.

    page   -- page to edit; nothing is done when it is None
    links  -- dict by language code of the links that should be present
    redirs -- subset of links that are redirects; only added when the
              wikt's redirs option is set
    skips  -- codes that are neither added nor removed
    remove -- when true, links whose code is not in links are removed

    Existing links that point at a different title, or at this wikt itself,
    are always dropped. The wikt's main page is never edited. Read and write
    failures are retried with a growing pause (starting at 5 seconds) until
    the pause reaches 300 seconds. Returns None.
    """

    # now we have emerged from the thread magic, continue as before (:-)
    if not page: return # (?)

    flw = flws[page.site().lang]

    # we may already have page text, so use page given to us

    # some retry logic:
    done = False
    nap = 5
    while not done and nap < 300:

        try:
            text = getwikitext(page, plock = plock)
        except wikipedia.NoPage:
            with plock: print("   ... no page %s now" % safe(page.aslink()))
            break
        except wikipedia.IsRedirectPage:
            with plock: print("   ... page %s is a redirect?" % safe(page.aslink()))
            break
        except Exception as e:
            with plock:
                print("   ... some exception reading %s %s" % (safe(page.aslink()), repr(e)))
            # print "(sleeping %d seconds)" % nap
            sleep(nap)
            nap += nap // 2
            continue

        oldlinks = getiwlinks(text, flws)

        # print "debug, oldlinks are:", repr(oldlinks)

        # small optimization:
        if not links and not oldlinks: break # no links, none in entry

        # block edits to "main page" (!)
        if page.title() == flw.mainpage:
            with plock: print("   ... not updating %s, wikt main page"  % safe(page.aslink()))
            break

        # add/remove links; act becomes the edit summary
        # NOTE(review): the '%s:%s, ' entries below may originally have been
        # wiki links ([[:code:title]]), which rewpr is written to match --
        # confirm; they are kept as found.
        act = "iwiki"

        # bad links, we seem to find a few, not infrequently (page moves, people adding links)
        act += ' -'
        title = page.title()
        for code in list(oldlinks): # copy of the keys, entries are deleted in the loop
            if oldlinks[code] != title or code == flw.lc:
                if len(act) < 70: act += '%s:%s, ' % (code, oldlinks[code])
                else: act += code + ', '
                del oldlinks[code] # will add valid link in next step if present
        act = act.rstrip(', -')

        act += " +"
        for code in sorted(links):
            # but not target page:
            if code == flw.lc: continue
            if code in flw.nolink: continue # e.g. pl->ru
            if (code not in oldlinks and (flw.redirs or code not in redirs)
                    and code not in skips):
                if len(act) < 70: act += '%s:%s, ' % (code, title)
                else: act += code + ', '
                oldlinks[code] = title
        act = act.rstrip(', +')

        if remove:
            act += ' -'
            for code in sorted(oldlinks):
                if code not in links and code not in skips:
                    act += code + ', '
                    del oldlinks[code]
            act = act.rstrip(', -')

        # with plock: print "(debug: rtask %s action %s)" % (safe(page.aslink()), safe(act))

        if act == "iwiki": break # nothing was done

        newtext = replaceiwlinks(text, oldlinks, flw, flws)

        # special case for en.wikt, remove count page if we've added an iwiki:
        # leave odd variants to AF as before
        if flw.lc == 'en' and "+" in act and '{{count page|' in newtext:
            newtext, k = recountpage.subn('', newtext)
            if k: act += ", -{{count page}}"

        # pace to max rate, take lock and sleep
        with ticktock: sleep(reptick)

        try:
            if text != getedit(page, plock = plock):
                with plock:
                    print("page changed during edit? %s"
                          % srep(page.aslink(forceInterwiki = True)))
                continue # try this again
            # page.put(newtext, comment = act)
            putedit(page, newtext, comment = act, plock = plock)
            done = True
            flw.edits += 1
            if flw.status == "test" or (" -" in act and "-{" not in act):
                addlog(page.aslink(), act)
            with plock:
                print("   ... %s %s" % (srep(page.aslink(forceInterwiki = True)),
                                        srep(rewpr.sub(r'\1', act[6:]))))
        except Exception as e:
            # e.g. not another box reset, do report on 3rd failure
            if nap > 9 or '10054' not in repr(e):
                with plock:
                    print("   ... some exception trying to update %s %s"
                          % (safe(page.aslink()), str(e)))
            # print "(sleeping %d seconds)" % nap
            sleep(nap)
            nap += nap // 2
            continue

    return

if __name__ == "__main__":
    # Manual test run: checks status on mg.wikt, queues one link update
    # there, then tells the worker threads to finish.

    # init all the flws, getiwlinks relies on this
    for code in flws['en'].site.family.langs:
        flws[code]

    # production calls from mbwa init all of them

    # test

    # flws['en'].site.forceLogin()

    with plock: print("test FL get status")
    # valid = getflstatus(flws['sw'])
    # valid = getflstatus(flws["en"])

    valid = getflstatus(flws["mg"])

    # NOTE(review): the message says "add en" but mg's own site is passed as
    # mysite (the disabled tests below pass flws['en'].site) -- confirm.
    with plock: print("test add en to chat on mg")
    page = wikipedia.Page(flws['mg'].site, "chat")
    addrci(page, flws['mg'].site)

    """
    # other tests:

    valid = getflstatus(flws["fr"])
    valid = getflstatus(flws["pl"])

    # test add userpage

    with plock: print "test add user page"
    adduserpage(flws["sw"])

    # test add

    # flws['sw'].tbd = 17

    with plock: print "test add en to cat on sw"
    page = wikipedia.Page(flws['sw'].site, "cat")
    addrci(page, flws['en'].site)

    with plock: print "test add en to Mwanzo (main page) on sw"
    page = wikipedia.Page(flws['sw'].site, "Mwanzo")
    addrci(page, flws['en'].site)

    with plock: print "test add en to cat on pl"
    page = wikipedia.Page(flws['pl'].site, "cat")
    addrci(page, flws['en'].site)

    with plock: print "test add en to cat on vi"
    page = wikipedia.Page(flws['vi'].site, "cat")
    addrci(page, flws['en'].site)

    with plock: print "test add en to cat on sw, links fr, vi"
    page = wikipedia.Page(flws['sw'].site, "cat")
    links = { 'fr':wikipedia.Page(flws['fr'].site, "cat"),
              'vi':wikipedia.Page(flws['vi'].site, "cat") }
    redirs = { }
    addrci(page, flws['en'].site, links = links, redirs = redirs)

    # should not change any entry

    # now fix foo

    with plock: print "test fix foo on en"
    page = wikipedia.Page(flws['en'].site, "foo")
    addrci(page, flws['en'].site)

    # "locked" wikt:
    with plock: print "test add en to father on as"
    page = wikipedia.Page(flws['as'].site, "father")
    addrci(page, flws['en'].site)

    # rm bad link
    with plock: print "test add en to septendecim on ko"
    page = wikipedia.Page(flws['ko'].site, "septendecim")
    addrci(page, flws['en'].site)
    """

    replink(end = True)