User:Tbot/code/createflw


#!/usr/bin/python
# -*- coding: utf-8  -*-
# wikipath en wiktionary User:Tbot/code/createflw

""" Create a simple foreign word entry in the en.wikt

Append a section if not already present

"""

import wikipedia import catlib import sys import re import pickle import socket from time import time, sleep import shelve from mwapi import getwikitext, getedit

from __main__ import cache, logpage, plock

def safe(s):
    """Return a printable rendering of s taken from its pickle.

    The value is pickled with the default protocol and the first byte and
    the last five bytes of the result are sliced off.
    NOTE(review): this presumably relies on the layout of the Python 2
    text pickle (one opcode byte in front, memo/stop trailer behind) so
    that only the escaped payload remains -- confirm before changing the
    pickle protocol or the interpreter version.
    """
    pickled = pickle.dumps(s)
    return pickled[1:-5]

def log(s):
    """Print s to stdout in escaped form while holding the print lock.

    s is passed through safe() and any leading/trailing single or double
    quote characters are stripped from the result before printing.
    plock (imported from __main__) is held for the duration of the print.
    """
    with plock:
        print(safe(s).strip("'" + '"'))


 * 1) entries we've already seen that exist [not looking at sections yet]:

Exists = set


 * 1) some regex

reimage = re.compile(r'\[\[image:(.*?)[\|\]]', re.I) reaudio = re.compile(r'\[\[media:(.*?)[\|\]]', re.I) rethumb = re.compile(r'\[\[(.*?)\|[^\]]*thumb[\|\]]') rejpg = re.compile(r'[\|\{\[=]([^\|\{\[=]*?\.(jpg|png))[\|\]}]') reogg = re.compile(r'[\|\{\[=]([^\|\{\[=]*?\.ogg)[\|\}]')
 * 1) these only catch default namespace names ...
 * 1) catch image by the |thumb| parameter ;-)
 * 1) image by .jpg or .png:
 * 1) and perhaps an ogg file in a template, as in en.wikt?

reIPA = re.compile(r'IPA.*?([/\[][^\{\}\|\[\]]+?[/\]])') reIPAt = re.compile(r'\{\{IPA\|([^\{\}\|]+?)[\}\|]')
 * 1) IPA string

regloss1 = re.compile(r"^\(.*?\)") regloss2 = re.compile(r"^\(.*?\)") regloss3 = re.compile(r"\(\d+\)$")
 * 1) fix glosses, context at start, (1) sense number at end should be removed, each should be subbed with space

rejump = re.compile(r'\{\{jump\|[^}]*}}')


 * 1) need only do once on load

site = wikipedia.getSite("en", "wiktionary") csite = wikipedia.getSite("commons", "commons")

# trans table prefixes, other than "*" at the start of the line
# Keyed by language code; each value is a regex fragment that is anchored
# at the start of a line when looking for the translation line.
Tlist = {
    'ru': r'\s*\|en=',
    'uk': r'\s*\|en=',
    'nl': r':?\*',
    'sq': r' \{\{en}}',
    'ga': r'\{\{aistr\|en',  # careful here, next char is | which must match \W
    'lt': r'\{\{env1}}',
    'yi': r'\|EN=',
    'tr': r':?\*\{\{en}}:',
    'mn': r':\*\{\{en}}:',
    # "is" is a keyword ;-) so it could not be a dict() keyword argument
    'is': r'\|en=',
}

# Wikipedia-link template text to look for in the FL page,
# by lc here, various languages
# NOTE(review): every value except hu is an empty string as received; the
# template text looks like it was stripped in extraction. An empty string
# is "in" every page text, so restore the real values before relying on
# this table -- confirm against the live bot source.
Wlist = dict(
    cs="",
    de="",
    fr="",
    ga=u"",
    hu="{{wp1}",  # one arg?
    la="",
    lt="",
    nl="",
    pt=u"",
    sl="",
    vi="",
)

# Per-language pronunciation template names, written as regex fragments
# because they are spliced into a compiled pattern.
Plist = {
    "de": "Lautschrift",
    "es": "[Pp]ronunciaci.n",
    "fr": "pron",
}
# pronunciation templates for IPA (modded for regex, use . for diacritics etc):

# File names that must never be picked up as an entry's image.
Istops = set((
    'LuisdeCamoes4.jpg',
    'Os Lusadas.jpg',
    'Wikipedia.png',
))
# images that show up in page structure for various reasons, e.g. first two on pt.wikt

# number of cache writes so far; the cache is synced every 20th write
cis = 0

def createFLentry(flw, lang, lc, pos, title, gloss, mod):
    """Create, or append a section to, the en.wikt entry for a foreign word.

    flw   -- the foreign-language word; title of the page to write
    lang  -- language name, used as the section header
    lc    -- language code; selects the foreign-language (FL) wiktionary
    pos   -- part-of-speech header text, e.g. "Noun"
    title -- the English word whose translation names flw
    gloss -- gloss text for the definition line; cleaned up before use
    mod   -- attribute dict; keys alt, tra, g, g2, g3, scr are read

    Returns True when the FL wikt page exists or the local page/section is
    present or was written (per the inline comments the caller then converts
    the translation to t+).  Returns False when the word is still in the
    110 day cache, the FL page is missing or could not be fetched, or the
    save failed.

    NOTE(review): this function was recovered from a rendering that stripped
    {{template}} and [[link]] wikitext from several string literals.  Every
    literal marked NOTE(review) below is a reconstruction and must be
    confirmed against the live bot source before this writes to a wiki.
    """
    global cis

    # for now, don't add to the same page (would cause edit conflict anyway?)
    if flw == title:
        # log("skipping addition to same title for now")
        # doesn't matter because not called with title == flw and
        # return value used (see tbot.py)
        return True

    # check cache
    # records last time we tried this word, don't try again for 110 days
    # may need to disable sometimes for debugging!
    ckey = lc + ':' + flw
    if ckey in cache:
        last = cache[ckey]
        if last > time() - (110 * 24 * 3600):
            # log("%s:%s in 110 day cache, not checked" % (lc, flw))
            return False
    cache[ckey] = time()  # assume we will complete check now ...
    cis += 1
    if cis % 20 == 0:
        cache.sync()

    log("createFL %s: %s[%s] %s, %s (%s)" % (flw, lang, lc, pos, title, gloss))

    # get the FL.wikt page

    # fix codes WMF hasn't yet (or has, but we still don't have set correctly :-)
    zlc = lc
    if lc == 'nb':
        zlc = 'no'
    if lc == 'cmn':
        zlc = 'zh'
    if lc == 'nan':
        zlc = 'zh-min-nan'
    # (no yue wikt as yet, hopefully will be created as yue, not zh-yue as in pedia)

    try:
        flsite = wikipedia.getSite(zlc, "wiktionary")
        flpage = wikipedia.Page(flsite, flw)
        # fltext = flpage.get()
        fltext = getwikitext(flpage)
        if fltext:
            print("FL page exists ...")
    except wikipedia.NoPage:
        with plock:
            print("page not in FL wikt")
        return False
    except wikipedia.IsRedirectPage:
        with plock:
            print("FL wikt entry is a redirect")
        return True  # can change to t+
    except KeyboardInterrupt:
        raise
    except Exception:
        with plock:
            print("some exception getting page from FL wikt")
        return False
    if not fltext:
        with plock:
            print("page not in FL wikt")
        return False

    # see if English word in FL page, presumably as a translation
    if title not in fltext:
        print("FL wikt page does not contain title")
        # logpage.add("%s entry %s:%s exists, title not in entry" % (title, lc, flw))
        return True  # we want to insert t+ template, even though not adding entry

    # nl.wikt uses ":*", will be other variations,
    # ru.wikt uses |en= ... etc etc:
    if lc in Tlist:
        tpre = Tlist[lc]
    else:
        tpre = r'\*'
    retrans = re.compile(r'^' + tpre + r'.*\W' + re.escape(title) + r'(\W|$)', re.M)

    # look for a line that may be a trans line, with title surrounded by
    # non-word characters
    mo = retrans.search(fltext)
    if mo:
        # truncate fltext at that line, so we don't get extra stuff from
        # following sections
        fltextall = fltext
        fltext = fltext[0:fltext.find(mo.group(0))]   # must be there, but -1 won't hurt
    else:
        print("title not in translation line?")
        logpage.add("%s entry %s:%s exists, pattern not matched" % (title, lc, flw))
        return True  # we want to insert t+ template, even though not adding entry

    # a short entry may be just the English translation, not very good
    # (80 is arbitrary)
    # if len(fltext) < 80:
    #   print "FL wikt page is too short"
    #   return True # we want to insert t+ template, even though not adding entry

    # now reconfirm local existence and section absent, get text
    seealso = ''
    addc = 'created %s entry ' % lang
    try:
        log("getting local page %s" % flw)
        page = wikipedia.Page(site, flw)
        text = getedit(page)
        # check language section ...
        if re.search('^==\s*\[*' + re.escape(lang) + '\]*\s*==', text, re.M):
            log("page %s and section %s already exists" % (flw, lang))
            return True  # there is a page and section there now, so convert to t+

        # crappy special case until rationality w/r/t Norwegian and Nynorsk returns ...
        if lang == "Norwegian" and '==Norwegian ' in text:
            log("page %s and some Norwegian section already exists" % flw)
            return True  # there is a page and section there now, so convert to t+

        # another temporary special case for the Serbo-Croatian sections ...
        if lang in ['Croatian', 'Bosnian', 'Serbian'] and '==Serbo-' in text:
            log("page %s and some Serbo- section already exists" % flw)
            return True  # there is a page and section there now, so convert to t+

        addc = 'added %s section ' % lang
    except wikipedia.NoPage:
        # usual case when entry is new
        text = ''
    except wikipedia.IsRedirectPage:
        # overwrite a redirect if present
        text = ''
        addc = 'replaced redirect with %s entry ' % lang
        seealso = page.getRedirectTarget()
        # limit to case redirects, simple case for now (so we don't "fix" Hebrew)
        if flw.lower() != seealso.lower():
            log("page %s is a redirect to %s, not replaced" % (flw, seealso))
            return True

    # see if we can "borrow" image or audio
    image = ''
    mo = reimage.search(fltext)
    if not mo and '|thumb|' in fltext:
        mo = rethumb.search(fltext)
    if not mo:
        mo = rejpg.search(fltext)
    if mo:
        img = mo.group(1)
        if ':' in img:
            img = img.split(':')[1]
        if img and img not in Istops:
            log("found image: %s" % img)
            ipage = wikipedia.Page(csite, "Image:" + img)
            try:
                # fetched only to learn whether the file exists on commons
                getwikitext(ipage)
                # NOTE(review): literal arrived as '\n' with two arguments
                # (TypeError); image-link wikitext reconstructed -- confirm.
                image = '[[Image:%s|thumb|%s]]\n' % (img, flw)
                with plock:
                    print("found on commons")
            except wikipedia.NoPage:
                with plock:
                    print("not found on commons")
            except Exception:
                # best effort: an image is optional, carry on without it
                with plock:
                    print("other exception looking for commons image")

    audio = ''
    mo = reaudio.search(fltext)
    if not mo:
        mo = reogg.search(fltext)
    if mo:
        aud = mo.group(1)
        if ':' in aud:
            aud = aud.split(':')[1]
        # only accept files whose name starts with the language code
        if aud[0:2].lower() != lc:
            log("audio file name %s does not match language %s" % (aud, lc))
            aud = ''
        if aud:
            log("found audio: %s" % aud)
            apage = wikipedia.Page(csite, "Image:" + aud)
            try:
                # fetched only to learn whether the file exists on commons
                getwikitext(apage)
                # NOTE(review): literal arrived as '* \n' with two arguments
                # (TypeError); audio template reconstructed -- confirm.
                audio = '* {{audio|%s|Audio (%s)}}\n' % (aud, flw)
                with plock:
                    print("found on commons")
            except wikipedia.NoPage:
                with plock:
                    print("not found on commons")
            except Exception:
                # best effort: audio is optional, carry on without it
                with plock:
                    print("other exception looking for commons audio")

    ipa = ''
    ipas = set()  # so repeats don't bother us
    for i in reIPA.findall(fltext):
        ipas.add(i)
    for i in reIPAt.findall(fltext):
        ipas.add(i)
    if lc in Plist:
        rp = re.compile(r'\{\{' + Plist[lc] + '\|(.*?)\}\}')
        for i in rp.findall(fltext):
            ipas.add(i)
    if len(ipas) == 1:
        i = ipas.pop().strip()
        # normalise the delimiters, keeping /.../ vs [...] as found
        if i.startswith('/'):
            i = '/' + i.strip('[] /') + '/'
        elif i.startswith('['):
            i = '[' + i.strip('[] /') + ']'
        elif i:
            i = '/' + i.strip('[] /') + '/'
        if i == '//' or i == '[]':
            i = ''
        if i == '/.../' or i == '[...]':
            i = ''
        if i:
            # NOTE(review): literal arrived as "* \n" with two arguments
            # (TypeError); IPA template reconstructed -- confirm.
            ipa = "* {{IPA|%s|lang=%s}}\n" % (i, lc)
            log("found IPA %s" % i)
    elif len(ipas) > 1:
        with plock:
            print("more than one IPA?")

    if audio or ipa:
        pron = '\n===Pronunciation===\n' + ipa + audio
    else:
        pron = ''

    # 'pedia link? look at all original text; often follow trans table
    wplink = ''
    # an empty table value would be "in" every page, so require a real one
    wptmpl = Wlist.get(lc)
    # NOTE(review): the two generic template literals and the wplink text
    # arrived blank / without a placeholder; reconstructed -- confirm.
    if ("{{wikipedia}}" in fltextall or "{{Wikipedia}}" in fltextall or
            (wptmpl and wptmpl in fltextall) or
            (wptmpl and wptmpl[:-2] + '|' + flw + '}}' in fltextall)):
        wplink = '{{wikipedia|lang=%s}}\n' % lc
        print("added wikipedia link")

    # set up additional infl params from attribute dict:
    aip = ''
    for key, param in (('alt', 'head'), ('tra', 'tr'), ('g', 'g'),
                       ('g2', 'g2'), ('g3', 'g3'), ('scr', 'sc')):
        if key in mod and mod[key]:
            aip += '|' + param + '=' + mod[key]

    gwas = gloss
    gloss = gloss.strip()
    gloss = regloss1.sub(' ', gloss)
    gloss = regloss2.sub(' ', gloss)
    gloss = regloss3.sub(' ', gloss)
    gloss = rejump.sub(' ', gloss)
    gloss = gloss.strip()
    if not gloss:
        log("nothing left to gloss ...")
        return True  # as FL wikt page does exist
    # decap gloss (some people insist on capitalizing it, which is wrong)
    # and fix, this is almost always right:
    if gloss.startswith('Of '):
        gloss = 'of ' + gloss[3:]
    if gloss.startswith('Country '):
        gloss = 'country ' + gloss[8:]
    if gloss.startswith('Person '):
        gloss = 'person ' + gloss[7:]
    gl = gloss.lower()
    if "translation" in gl:
        log("word 'translation' in gloss, skipped")
        return True   # FL wikt page exists
    if gl[1:] != gloss[1:]:
        gl = gloss      # caps in string after first, so probably okay
    if gloss.startswith(title):
        gl = gloss      # Proper noun, e.g. "French language"
    if gl != gwas:
        log("gloss changed %s -> %s" % (gwas, gl))

    # change Proper noun to Noun if lower case; usually the right answer
    if pos == "Proper noun" and flw[0:1].islower():
        log("changed Proper noun to Noun")
        pos = "Noun"

    # add to or create entry text:
    if text:
        text += '\n\n\n\n'
    # NOTE(review): the template arrived with 7 placeholders for 13
    # arguments (TypeError); the POS header, {{infl}}, the [[link]] and the
    # {{tbot entry}} line are reconstructed from the argument order and the
    # "{{tbot" replace below -- confirm.
    text += """==%s==
%s%s%s
===%s===
{{infl|%s|%s%s}}

# [[%s]] (%s)

{{tbot entry|%s|%s|%s}}
""" % (lang, wplink, image, pron, pos, lc, pos.lower(), aip, title, gl, lang, title, lc)

    # other special things (no reason not to ;-)
    if lc == 'fr' and pos == 'Verb':
        text = text.replace("{{tbot", "{{rfinfl|type=conjugation|lang=fr}}\n{{tbot")
    # [ others as desired ]

    # add interwiki, let AutoFormat and Interwicket sort things as needed
    # NOTE(review): literal arrived as '%s:%s'; link brackets assumed to
    # have been stripped in extraction -- confirm.
    iw = '[[%s:%s]]' % (lc, flw)
    if iw not in text:
        text += iw + '\n'

    # if overwriting redirect, add see
    if seealso:
        text = '{{also|' + seealso + '}}\n' + text

    try:
        with plock:
            page.put(text,
                     comment = addc + "from translation at %s and %s:%s" %
                               (title, lc, flw),
                     minorEdit = False)
    except wikipedia.PageNotSaved:
        with plock:
            print("failed to save page")
        return False
    except socket.timeout:
        with plock:
            print("socket timeout, maybe not saving page")
        return False
    except socket.error:
        with plock:
            print("socket error, maybe not saving page")
        return False
    except Exception as e:
        with plock:
            print("some exception saving page " + repr(e))
        return False

    # Exists.add(flw)
    return True