// User:Hippietrail/ajaxtranslinks.js

// TODO translation-table markup (e.g. { {top}}) used with non-translation sections is parsed anyway
// TODO don't treat whitespace as tokens. instead include a prevWhite field for each token
// TODO is the sublang system complete?
// TODO finish the refactoring
// TODO doesn't handle non-linked multiword terms like "Sri Lanka"

// TODO == Current bugs (flat dom parser) ==
// TODO can we cope with a missing colon after the language name?
// TODO sense numbers after terms: Finnish: jargon (1, 2)
// TODO trad. and simpl. in some Chinese entries
// TODO (pf.) and (impf.) in some Russian entries
// TODO comma between term and its gender: rose Catalan

// TODO == Old bugs (manual parser) ==
// TODO Cantonese and Mandarin as sublangs can match each other
// TODO handle wikified sublanguages
// TODO handle translations that have both a main entry and subentries (corn German)
// TODO selflinked language names cause breakage

// TODO == Can we handle these cases? ==
// TODO single terms wikilinked in separate parts: Hungarian: tönköly búza
// TODO non-linked reflexive particles: sich treffen, pridružiti se
// TODO what to do about "comments" before or after the line or one term? (foo) (foo) (foo) foo
// TODO subentries which are not sublanguages: Chinese and Japanese entries at "Mongolian"
//       this seems to require non complicated lookahead

//////////////////////////////////////////////////////////////////
//
// functions for parsing the "other" page in raw wikitext form
//
//////////////////////////////////////////////////////////////////

// Callback run once the raw wikitext of the page for a translated term has
// been fetched. Scans the wikitext for a level-2 heading naming the language
// (or one of its synonyms / sublanguage variants) and records the outcome on
// the translation entry in the DOM.
//
//   li      - list item (or dd) of the translation entry; its title is
//             appended to when there is no anchor to decorate
//   anchor  - the term's link element, or a falsy value for unlinked terms
//   term    - page title of the translated term
//   lang    - language name used in the translation table
//   sublang - sublanguage name, or a falsy value
//   page    - raw wikitext of the term's page
//
// Outcomes: entry found (class '', href gains a section fragment), page
// exists without that language (class 'new partlynew'), or page is a
// redirect (class 'redirect').
function parse_other_raw(li, anchor, term, lang, sublang, page) {
  // 0: page has no entry for the language, 1: entry found, -1: redirect
  var state = 0;
  var foundlang = null;
  var lines = page.split("\n");
  var langdesc = sublang ? lang + ' (' + sublang + ')' : lang;
  var langpats;
  var r;
  var i;

  if (anchor) {
    anchor.className = 'dunno';
    anchor.title = 'Looking...';
  }

  // so we can detect synonyms
  langpats = build_lang_patterns(lang, sublang);

  for (i = 0; i < lines.length; i++) {
    // redirect?
    if (/#\s*redirect/i.test(lines[i])) {
      state = -1;
      break;
    }

    // ==Lang== or one of its synonyms or variants?
    r = lines[i].match(langpats[0]);
    if (r != null)
      foundlang = r[1];

    // ==[[Lang]]== (wikilinked heading) or one of its synonyms or variants?
    r = lines[i].match(langpats[1]);
    if (r != null)
      foundlang = r[1];

    if (foundlang != null) {
      state = 1;
      break;
    }
  }

  langpats = null;

  // set the class and title of the anchor (or the title of the list item)

  // page exists and has an entry for this language
  if (state == 1) {
    if (anchor) {
      anchor.className = '';
      anchor.title = term + ' (exists in ' + foundlang + ')';
      if (anchor.href.indexOf('#') == -1) {
        // heading ids use underscores in place of spaces, so "Old English"
        // must be addressed as #Old_English
        // TODO unicode in lang names breaks: Guaraní -> #Guaran%C3%AD but #Guaran.C3.AD
        anchor.href += '#' + foundlang.replace(/ /g, '_');
      }

    // plain terms have no anchor so output results some other way
    } else {
      li.title += '; res: ' + term + ' exists in ' + foundlang;
    }

  // page exists but has no entry for this language
  } else if (state == 0) {
    if (anchor) {
      anchor.className = 'new partlynew';
      anchor.title = term + ' exists (but not in ' + langdesc + ')';
    } else {
      li.title += '; res: ' + term + ' exists but not in ' + langdesc;
    }

  // page exists but is a redirect
  } else if (state == -1) {
    if (anchor) {
      anchor.className = 'redirect';
      anchor.title = term + ' exists but is a redirect';
    } else {
      li.title += '; res: ' + term + ' exists but is a redirect';
    }
  }
}

// the language name used in the translation table might not be a
// synonym or variant of the name used in the foreign terms' own page
//
// Builds the two heading patterns used to find a language section in raw
// wikitext. Returns [rx1, rx2] where rx1 matches "==Name==" and rx2 matches
// "==[[Name]]=="; in both, capture group 1 is the name that matched.
// Every candidate name is regex-escaped so names containing characters such
// as parentheses are matched literally.
function build_lang_patterns(lang, sublang) {
  // known synonyms and spelling variants, keyed by the name used in the table
  var synonyms = {
    // Chinese
    'Chinese': ['Mandarin', 'Cantonese'],
    'Mandarin': ['Chinese'],
    'Cantonese': ['Chinese'],
    // Other synonyms and spelling variants
    'Anglo-Saxon': ['Old English'],
    'Azerbaijani': ['Azeri'],
    'Azeri': ['Azerbaijani'],
    'Burmese': ['Myanmar'],
    'Faeroese': ['Faroese'],
    'Faroese': ['Faeroese'],
    'Farsi': ['Persian'],
    'Guaraní': ['Guarani'],
    'Guarani': ['Guaraní'],
    'Malay': ['Malaysian'],
    'Malaysian': ['Malay'],
    'Maori': ['Māori'],
    'Māori': ['Maori'],
    'Myanmar': ['Burmese'],
    'Old English': ['Anglo-Saxon'],
    'Persian': ['Farsi'],
    'Romani': ['Romany'],
    'Romansch': ['Romansh'],
    'Romansh': ['Romansch'],
    'Romany': ['Romani'],
    'Scots Gaelic': ['Scottish Gaelic'],
    'Scottish Gaelic': ['Scots Gaelic'],
    'Sinhala': ['Sinhalese'],
    'Sinhalese': ['Sinhala'],
    'Slovak': ['Slovakian'],
    'Slovakian': ['Slovak'],
    'Slovene': ['Slovenian'],
    'Slovenian': ['Slovene'],
    'Tupinambá': ['Tupinamba'],
    'Tupinamba': ['Tupinambá'],
    'Uighur': ['Uyghur'],
    'Uyghur': ['Uighur']
  };
  var names = [lang];
  var escaped = [];
  var langs;
  var i;

  // escape regex metacharacters so a name is matched literally
  function escape_regexp(s) {
    return String(s).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  // Ancient Greek: "Ancient", "Ancient Greek", "Greek (Ancient)", "Greek, Ancient"
  if (sublang)
    names.push(sublang, sublang + ' ' + lang, lang + ' (' + sublang + ')', lang + ', ' + sublang);

  if (Object.prototype.hasOwnProperty.call(synonyms, lang))
    names = names.concat(synonyms[lang]);
  // CJKV/Han characters
  else if (/(CJKV?|Chinese) [cC]haracters/.test(lang))
    names.push('Translingual');

  for (i = 0; i < names.length; i++)
    escaped.push(escape_regexp(names[i]));
  langs = escaped.join('|');

  var rx1 = new RegExp('^==\\s*(' + langs + ')\\s*==');
  var rx2 = new RegExp('^==\\s*\\[\\[(' + langs + ')]]\\s*==');
  return [rx1, rx2];
}

// use ajax to load the raw pages of each translated term
//
// Requests index.php?title=<term>&action=raw and hands the response text to
// parse_other_raw together with the entry's element, anchor and language.
// Does nothing when term is falsy. Failures are ignored (empty callback).
function lookup_langs(li, anchor, term, lang, sublang) {
  function on200(req) {
    parse_other_raw(li, anchor, term, lang, sublang, req.responseText);
  }

  if (term)
    // the title is percent-encoded so characters such as & ? # + in a term
    // cannot corrupt the query string
    ajax(mw.config.get('wgScript') + '?title=' + encodeURIComponent(term.replace(/ /g, '_')) + '&action=raw',
      on200, function () {});
}

// Collects every <li> found inside a table whose class list contains
// "translations" under #bodyContent. Returns an array (possibly empty).
// TODO misses items not wrapped in { {top}} etc
// TODO gets some related terms etc which also use { {top}} etc
function get_trans_listitems() {
  var lis = [];

  // find the translations section
  // { {top}} and { {trans-top}} both result in a table of class "translations"
  var bc = document.getElementById('bodyContent');
  if (bc == null)
    return lis;

  var tables = bc.getElementsByTagName('table');
  if (tables == null)
    return lis;

  for (var t = 0; t < tables.length; t++) {
    if (!tables[t].className.match(/\btranslations\b/))
      continue;

    var somelis = tables[t].getElementsByTagName('li');
    for (var l = 0; l < somelis.length; l++)
      lis.push(somelis[l]);
  }

  return lis;
}

// Parses the given translation list items, but only when the flat DOM
// tokenizer (window.domtokenizer) is available. The older manual parser
// (parse_translistitems_noflatdom) is no longer used as a fallback.
function parse_translistitems(lis) {
  if (!window.domtokenizer) {
    return;
  }

  parse_translistitems_haveflatdom(lis);
}

// for each item create parser, prefetch, parse, destroy parser
//
// Recursive-descent parser over the token stream produced by the external
// domtokenizer for each translation list item. As used here, a token has
// t ('s' start tag, 'e' end tag, 't' text), n (the DOM node, for tags),
// x (the text, for text tokens) and the flags isWhite / isEOL; the tokenizer
// exposes tok (current), nexttok (lookahead), gettok() (advance, returning
// the new current token) and ungettok().
//
// Results are stored on the DOM: each parsed li/dd gets title and wiktLang
// set to the language name, and leaf entries get wiktTerms (array of term
// objects with p = page title, x = display text, a = anchor, plus optional
// tr, g, n and iw). Entries that fail to parse get the class 'parserror';
// tolerated deviations get 'parsewarn'. Parse errors never escape this
// function; any other exception is rethrown.
function parse_translistitems_haveflatdom(lis) {
  // members accessible by any parser function
  // item always points to the top-level li
  // subitem always points to the current level
  //  which may be the li or one of its child dd
  var toker = null;    // dom tokenizer (generator)
  var gItem = null;    // lis[i]:       eg *Spanish:
  var gSubItem = null; // lis[i] or dd: eg *Serbian: *: Cyrillic
  var gDepth = 0;

  // main loop
  for (var i = 0; i < lis.length; i++) {
    gSubItem = gItem = lis[i];

    toker = new domtokenizer(gItem);

    // get first token
    toker.gettok();

    try {
      parsetransentry();
    }
    // google chrome can't handle "if" here
    //catch (e if e == 'WiktParseException') {
    catch (e) {
      if (e == 'WiktParseException') {
        // already reported by pp_error; move on to the next item
        //consolelog('caught parser exception: ' + e);
      } else {
        throw e;
      }
    }

    toker = null;
  }

  // SUB FUNCTIONS

  // log the current and lookahead tokens with a severity level and context
  function pp_unexpected(level, msg) {
    var txt = level + ': unexpected token';
    if (msg) txt += ' at ' + msg;
    txt += ':';

    consolelog(txt);
    consolelog(toker.tok);
    consolelog(toker.nexttok);
  }

  // consume a text token equal to v, or raise a parse error
  function pp_expect_text(v) {
    if (toker.tok && toker.tok.t == 't' && toker.tok.x == v)
      toker.gettok();
    else
      pp_error('text "' + v + '"');
  }

  // consume the start tag of element v, or raise a parse error
  function pp_expect_start(v) {
    if (toker.tok && toker.tok.t == 's' && toker.tok.n.nodeName == v)
      toker.gettok();
    else
      pp_error('<' + v + '>');
  }

  // consume the end tag of element v, or raise a parse error
  function pp_expect_end(v) {
    if (toker.tok && toker.tok.t == 'e' && toker.tok.n.nodeName == v)
      toker.gettok();
    else
      pp_error('</' + v + '>');
  }

  // mark the current entry as questionable and log, but keep parsing
  function pp_warn(msg) {
    addclass(gSubItem, 'parsewarn');
    pp_unexpected('warning', msg);
  }

  // mark the current entry as broken, log, and abandon this list item
  function pp_error(msg) {
    addclass(gSubItem, 'parserror');
    pp_unexpected('error', msg);
    throw 'WiktParseException';
  }

  // skip whitespace that should not be there, with a warning
  // TODO accepts any whitespace including nbsp due to mw french punc feature
  function pp_tolerate_space() {
    if (toker.tok.t == 't' && toker.tok.isWhite) {
      toker.gettok();
      pp_warn('tolerate space');
    }
  }

  // skip whitespace that should be there, warning when it is absent
  function pp_tolerate_missing_space() {
    if (toker.tok.t == 't' && toker.tok.isWhite)
      toker.gettok();
    else
      pp_warn('tolerate missing space');
  }

  //////////////////////////////////////////////////////////

  // returns tree
  // this is the only place where gSubItem is changed!
  function pp_sublang() {
    var lat = null;
    var dd = toker.tok.n;

    pp_expect_start('DD');
    gSubItem = dd;
    gDepth++;
    lat = pp_lang_and_terms();
    gDepth--;
    pp_expect_end('DD');
    pp_expect_text('\n');

    return lat;
  }

  // returns array of sublang trees
  function pp_sublangs() {
    var aot = [];

    pp_expect_start('DL');
    pp_expect_text('\n');

    while (toker.tok.t == 's' && toker.tok.n.nodeName == 'DD')
      aot.push(pp_sublang());

    pp_expect_end('DL');
    pp_expect_text('\n');

    return aot;
  }

  // returns tree
  // TODO tolerate comma or missing colon between langname and dl
  function pp_lang_and_terms() {
    var lang = null;
    var sublangs = null;
    var terms = null;

    lang = pp_lang();
    pp_tolerate_space();
    pp_expect_text(':');
    pp_tolerate_missing_space(); // space, or \n if followed by a DL

    // set the language (and title) for each branch and leaf
    gSubItem.title = lang;
    gSubItem.wiktLang = lang;

    // if we're a branch node
    if (toker.tok.t == 's' && toker.tok.n.nodeName == 'DL'
        && toker.nexttok.t == 't' && toker.nexttok.isEOL == true) {
      // then process the leaves
      sublangs = pp_sublangs();

    // else we're a leaf node
    } else {
      // set the term array only for leaves
      // only set this for leaf nodes
      // watch out because gSubItem will be set the same for the last leaf
      //  node and afterward its parent branch node
      //  which would result in nulling the last sublang's terms
      gSubItem.wiktTerms = terms = pp_terms();
    }

    // return a tree
    //  branch nodes always have null "terms"
    //  leaf nodes always have null "sublangs"
    return { lang: lang, sublangs: sublangs, terms: terms };
  }

  // parse the LI then call function to parse its lang and terms
  // trreq and ttbc are handled here as they are not expected to
  // occur in sublanguages
  function parsetransentry() {
    var lang = null;

    pp_expect_start('LI');

    if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className == 'trreq')
      lang = trreq();
    else {
      var tree = pp_lang_and_terms();

      consolelog(' lang: ' + tree.lang);
      if (tree.sublangs)
        for (var s = 0; s < tree.sublangs.length; s++)
          consolelog(' ' + tree.sublangs[s].lang);
    }

    pp_expect_end('LI');

    return;

    // translation request: span.trreq holding the language name, a colon
    // and an italic "please add..." note
    function trreq() {
      var lang = null;

      toker.gettok(); // start tag of span.trreq
      lang = parseunlinkedlangname();
      // TODO this should be done in the function above
      toker.gettok();
      pp_expect_text(':');
      pp_tolerate_missing_space();
      pp_expect_start('I');
      while (toker.tok.t == 't')
        toker.gettok(); // please add...
      pp_expect_end('I');
      pp_expect_end('SPAN');

      return lang;
    }
  }

  // language name: plain text, a link, or a span.ttbc wrapper
  // returns null when the current token starts none of those
  // TODO handle language names with a linked part and unlinked part: Sorbian (lower)
  function pp_lang() {
    var lang = null;

    if (toker.tok.t == 't') {
      lang = parseunlinkedlangname();
      // TODO this should be done in the function above
      toker.gettok();
    } else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A')
      lang = linked_or_ttbc_lang('A');
    else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className == 'ttbc')
      lang = linked_or_ttbc_lang('SPAN');

    return lang;
  }

  // reads a language name of up to three words, normalising "English (Old)"
  // and "English, Old" to "Old English". On return the current token is the
  // last one belonging to the name; the caller consumes it.
  // TODO are there 3-part language names which use comma or parentheses?
  // TODO this function doesn't consume each token as soon as it is recognized
  function parseunlinkedlangname() {
    // first word of lang name
    // (declared locally: this was previously an undeclared assignment)
    var lang = toker.tok.x;

    // following words?
    if (toker.nexttok.t == 't') {
      // Old English; English (Old)
      if (toker.nexttok.x == ' ') {
        toker.gettok(); // eat first word
        if (toker.nexttok.t == 't') {
          // English (Old)
          if (toker.nexttok.x == '(') {
            toker.gettok(); // eat space
            lang = toker.nexttok.x + ' ' + lang; // get second word
            toker.gettok(); // eat (
            toker.gettok(); // eat second word
          // Old English; Torres Strait Creole
          } else {
            lang = lang + ' ' + toker.gettok().x; // eat space, get second word
            if (toker.nexttok.t == 't' && toker.nexttok.x == ' ') {
              toker.gettok(); // eat second word
              // Torres Strait Creole
              if (toker.nexttok.t == 't')
                lang = lang + ' ' + toker.gettok().x; // eat space, get third word
            }
          }
        }

      // English, Old or Greek, instead of Greek:
      } else if (toker.nexttok.x == ',') {
        toker.gettok(); // eat first word
        if (toker.nexttok.t == 't' && toker.nexttok.x == ' ') {
          toker.gettok(); // eat comma
          lang = toker.gettok().x + ' ' + lang; // eat space, get second word
        }
        else
          toker.ungettok();
      }
    }

    return lang;
  }

  // language name wrapped in an element: tag is 'A' or 'SPAN' (span.ttbc)
  function linked_or_ttbc_lang(tag) {
    toker.gettok(); // start tag of the link or of span.ttbc
    var lang = parseunlinkedlangname();
    // TODO this should be done in the function above
    toker.gettok();
    pp_expect_end(tag);
    return lang;
  }

  // parse a list of terms separated by commas or semicolons
  // does not handle sublanguage lists
  // returns array of terms only
  function pp_terms() {
    var terms = [];
    var term = null;

    term = parseterm();
    if (term)
      terms.push(term);

    parseterms_rest();

    return terms;

    // SUB FUNCTIONS

    // term interwiki? ((gender translit?) | (translit gender?))?
    function parseterm() {
      var term = null;

      if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A')
        term = parseterm_link();
      else if (is_script_span(toker.tok))
        term = parseterm_script();
      else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'STRONG' && toker.tok.n.className == 'selflink')
        term = parseterm_selflink();
      else if (toker.tok.t == 't') {
        if (toker.tok.x == '(' || toker.tok.x == '[') {
          // no term text, only a bracketed transliteration
          term = {};
          parse_translit_gender(term);
        } else
          term = parseterm_plain();
      } else
        pp_unexpected('error', 'term');

      // if we parsed a term now parse its attributes such as gender and transliteration
      if (term) {
        parse_optional_interwiki(term);
        parse_optional_gender_translit(term);

        logterm(term);
      }

      return term;

      // log a one-line summary of the parsed term
      function logterm(term) {
        var outputstr = '';
        if (typeof term.p != 'undefined') {
          outputstr += ' ' + term.p;

          if (typeof term.x != 'undefined' && term.x != term.p)
            outputstr += '|' + term.x;
        }

        if (term.tr)
          outputstr += ' (' + term.tr + ')';
        if (term.g)
          outputstr += ' ' + term.g + '.';
        if (term.n)
          outputstr += ' ' + term.n + '.';
        if (term.iw.code || term.iw.sign) {
          outputstr += ' ';
          if (term.iw.code)
            outputstr += term.iw.code;
          if (term.iw.sign)
            outputstr += term.iw.sign;
        }

        consolelog(outputstr);
      }

      // linked term, optionally with a script span or font element inside the link
      function parseterm_link() {
        var a = toker.tok.n;
        var term = null;

        toker.gettok(); // start tag of the link
        if (is_script_span(toker.tok))
          term = parseterm_link_script();
        else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'FONT')
          term = parseterm_link_font();
        else
          term = parseterm_inner();
        pp_expect_end('A');

        term.a = a;
        term.p = title_from_anchor(a);

        return term;

        function parseterm_link_script() {
          var term = null;

          toker.gettok(); // start tag of the script span
          term = parseterm_inner();
          pp_expect_end('SPAN');

          return term;
        }

        function parseterm_link_font() {
          var term = null;

          toker.gettok(); // start tag of the font element
          term = parseterm_inner();
          pp_expect_end('FONT');

          return term;
        }
      }

      // term wrapped in a script span, optionally linked inside the span
      function parseterm_script() {
        var term = null;

        toker.gettok(); // start tag of the script span
        if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A')
          term = parseterm_script_link();
        else
          term = parseterm_inner();
        pp_expect_end('SPAN');

        return term;

        function parseterm_script_link() {
          var a = toker.tok.n;
          var term = null;

          toker.gettok(); // start tag of the link
          term = parseterm_inner();
          pp_expect_end('A');

          term.a = a;
          term.p = title_from_anchor(a);

          return term;
        }
      }

      // term that links to the page being viewed (strong.selflink)
      function parseterm_selflink() {
        var term = null;

        toker.gettok(); // start tag of strong.selflink
        term = parseterm_inner();
        pp_expect_end('STRONG');

        return term;
      }

      // get one piece of text
      function parseterm_plain() {
        var term = {};
        term.x = term.p = toker.tok.x;
        toker.gettok(); // term itself

        return term;
      }
    }

    // remaining terms after the first; appends to the enclosing terms array
    // TODO this doesn't really have to be recursive does it?
    function parseterms_rest() {
      var term = null;
      var dorest = false;

      // comma or semicolon possibly preceded by a space
      if (toker.tok.t == 't') {
        if (toker.tok.x == ',' || toker.tok.x == ';' || toker.tok.x == '/')
          dorest = true;
        else if (toker.tok.x == ' ') {
          if (toker.nexttok.t == 't'
              && (toker.nexttok.x == ',' || toker.nexttok.x == ';' || toker.nexttok.x == '/')) {
            toker.gettok(); // eat whitespace
            dorest = true;
          }
        }

        if (dorest) {
          if (toker.tok.x == '/')
            pp_warn('tolerate / in place of, or ;');
          toker.gettok(); // eat , or ;

          pp_tolerate_missing_space();

          // next term
          term = parseterm();
          if (term)
            terms.push(term);

          // recur
          parseterms_rest();
        }
      }
    }
  }

  // interwiki?
  // always sets term.iw, to an object that may carry code and sign
  function parse_optional_interwiki(term) {
    var iw = {};

    // current (cc) style
    if (toker.tok.t == 't' && toker.tok.isWhite) {
      if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SUP') {
        toker.gettok(); // space

        // class to tell us if it's a red or blue link?
        if (toker.tok.n.className == 'tpos')
          iw.sign = '+';
        else if (toker.tok.n.className == 'tneg')
          iw.sign = '-';
        else
          iw.sign = '';

        toker.gettok(); // start tag of the sup

        // template:t style
        if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A') {
          toker.gettok(); // start tag of the link

          // blue cross-wikt link or new-style class'd sup
          if (toker.tok.t == 't' && toker.tok.x == '(') {
            toker.gettok(); // (
            iw.code = toker.tok.x;
            //iw.sign = '+';
            toker.gettok(); // language code
            pp_expect_text(')');

          // old-style red cross-wikt link which wrapped sup with span
          } else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className == 'new') {
            toker.gettok(); // start tag of span.new
            pp_expect_text('(');
            iw.code = toker.tok.x;
            //iw.sign = '-';
            toker.gettok(); // language code
            pp_expect_text(')');
            pp_expect_end('SPAN');
          }

          pp_expect_end('A');
        }

        // template:he-translation style
        else if (toker.tok.t == 't' && toker.tok.x == '(') {
          toker.gettok(); // (
          pp_expect_start('A');
          iw.code = toker.tok.x;
          iw.sign = '';
          toker.gettok(); // he
          pp_expect_end('A');
          pp_expect_text(')');
        }

        toker.gettok(); // presumably the end tag of the sup (not checked)
      }

      // old ^ style
      else if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'A' && toker.nexttok.n.className == 'extiw') {
        toker.gettok(); // space

        toker.gettok(); // start tag of the link
        iw.code = '^';
        iw.sign = '';
        toker.gettok(); // <^>
        pp_expect_end('A');
      }

      // ^ style used on swan Greek
      // AMBIG looks like transliteration
      // TODO since this comes between the transliteration and the gender
      // TODO we should accept all of (transliteration, gender, interwiki) in any order
      // TODO this would entail left factoring (^) and transliteration
      /*
      else if (toker.nexttok.t == 't' && toker.nexttok.x == '(') {
        toker.gettok(); // space
        if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'A') {

          toker.gettok(); // (
          toker.gettok(); // start tag of the link
          iw.code = '^';
          iw.sign = '';
          toker.gettok(); // <^>
          toker.gettok(); // end tag of the link
          toker.gettok(); // )
        }
        else
          toker.ungettok();
      }
      */
    }

    term.iw = iw;
  }

  // after a space: either (translit gender?) or (gender translit?)
  // the return value is always null; results are stored on term
  function parse_optional_gender_translit(term) {
    var gt = null;

    if (toker.tok.t == 't' && toker.tok.isWhite) {
      // transliteration?
      if ((toker.nexttok.t == 't' && (toker.nexttok.x == '(' || toker.nexttok.x == '['))
          || (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
            && (toker.nexttok.n.className == 'IPA' || toker.nexttok.n.className == 'Unicode'))
          || (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
            && toker.nexttok.n.className == 'ib-brac')
          || (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'I')) {
        toker.gettok(); // space
        parse_translit_gender(term);
      }

      // gender?
      else if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
          && toker.nexttok.n.className.match(/\b(gend|numb)er\b/)) {
        toker.gettok(); // space
        parse_gender_translit(term);
      }
    }

    return gt;
  }

  // translit gender?
  function parse_translit_gender(term) {
    parse_translit(term);

    // is there a gender after the transliteration?
    if (toker.tok.t == 't' && toker.tok.isWhite) {
      if (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
          && toker.nexttok.n.className.match(/\b(gend|numb)er\b/)) {
        toker.gettok(); // space
        parse_gender_num(term);
      }
    }
  }

  // transliterations
  // stores the array of transliteration strings in term.tr
  function parse_translit(term) {
    // closing bracket expected by the list currently being parsed
    var rbrac = null;

    if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN'
        && (toker.tok.n.className == 'IPA' || toker.tok.n.className == 'Unicode')) {
      toker.gettok(); // <span class="IPA|Unicode">
      outer_list(term);
      pp_expect_end('SPAN');
    } else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'I') {
      toker.gettok(); // <i>
      outer_list(term);
      pp_expect_end('I');
    } else if (toker.tok.x == '(' || toker.tok.x == '[')
      outer_list(term);
    else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className == 'ib-brac') {
      // TODO this won't handle more than one transliteration inside { {ib}}
      toker.gettok(); // start tag of span.ib-brac
      pp_expect_text('(');
      pp_expect_end('SPAN');
      pp_expect_start('SPAN'); // class="ib-content"
      // keep the result, as outer_list does (it used to be discarded here)
      term.tr = inner_list(term);
      pp_expect_end('SPAN');   // class="ib-content"
      pp_expect_start('SPAN');
      pp_expect_text(')');
      pp_expect_end('SPAN');   // class="ib-brac"
    } else
      pp_error('transliteration');

    return;

    // => "(", transliteration { "," , transliteration } , ")"
    function outer_list(term) {
      if (toker.tok.x == '(')
        rbrac = ')';
      else if (toker.tok.x == '[')
        rbrac = ']';
      else
        pp_error('transliteration list start bracket');

      toker.gettok(); // ( or [
      term.tr = inner_list(term);
      pp_expect_text(rbrac);
    }

    // => transliteration { ",", transliteration }
    function inner_list(term) {
      var translits = [];
      var translit = null;

      // => transliteration
      translit = parsetranslit();
      if (translit)
        translits.push(translit);

      // => { ",", transliteration }
      parsetranslits_rest();

      return translits;

      /////////////////////////

      // lang="XX" + A + translit || A + translit || translit
      function parsetranslit() {
        var translit = null;

        // japanese may wrap kana transliterations in a font tag
        if (toker.tok.t == 's' && toker.tok.n.nodeName == 'SPAN' && toker.tok.n.className.match(/^[A-Z][A-Z]$/)) {
          translit = lang_anchor();
        } else if (toker.tok.t == 's' && toker.tok.n.nodeName == 'A') {
          translit = anchor();
        } else {
          translit = inner();
        }

        return translit;

        function lang_anchor() {
          var translit = null;

          toker.gettok(); // start tag of the two-capital-letter class span
          translit = anchor();
          pp_expect_end('SPAN');

          return translit;
        }

        function anchor() {
          var translit = null;

          toker.gettok(); // <a>
          translit = inner();
          pp_expect_end('A');

          return translit;
        }

        // concatenate text tokens up to a comma, the closing bracket or a tag
        function inner() {
          var translit = '';
          while (true) {
            translit += toker.tok.x;
            toker.gettok(); // transliteration itself

            // some greek transliterations use [] instead of ()
            if (toker.tok.x == ',' || toker.tok.x == rbrac || toker.tok.t != 't')
              break;
          }

          return translit;
        }
      }

      // remaining transliterations; appends to the enclosing translits array
      function parsetranslits_rest() {
        var translit = null;
        var dorest = false;

        // comma possibly preceded by a space
        if (toker.tok.t == 't') {
          if (toker.tok.x == ',')
            dorest = true;
          else if (toker.tok.x == ' ') {
            if (toker.nexttok.t == 't' && toker.nexttok.x == ',') {
              toker.gettok(); // eat whitespace
              dorest = true;
            }
          }

          if (dorest) {
            toker.gettok(); // eat ,

            pp_tolerate_missing_space();

            // next translit
            translit = parsetranslit();
            if (translit)
              translits.push(translit);

            // recur
            parsetranslits_rest();
          }
        }
      }
    }
  }

  // gender translit?
  function parse_gender_translit(term) {
    parse_gender_num(term);

    // is there a transliteration after the gender?
    // TODO sense numbers after the gender look like transliterations to the parser
    if (toker.tok.t == 't' && toker.tok.isWhite) {
      if ((toker.nexttok.t == 't' && (toker.nexttok.x == '(' || toker.nexttok.x == '['))
          || (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
            && (toker.nexttok.n.className == 'IPA' || toker.nexttok.n.className == 'Unicode'))
          || (toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
            && toker.nexttok.n.className == 'ib-brac')) {
        toker.gettok(); // space
        parse_translit(term);
      }
    }
  }

  // gender(s) and possibly number
  // stores the concatenated gender text in term.g and the number in term.n
  function parse_gender_num(term) {
    var gender = null;

    // TODO we handle { {m}} but not yet a plain italic m
    gender = parse_gender_or_number();

    // plain comma?
    if (toker.tok.t == 's' && toker.tok.n.nodeName == 'I') {
      while (true) {
        pp_expect_start('I');
        pp_expect_text(',');
        pp_expect_end('I');

        toker.gettok(); // space
        gender += parse_gender_or_number();

        if (toker.tok.t != 's' || toker.tok.n.nodeName != 'I')
          break;
      }

      // NOTE(review): after the loop a further wrapped comma is required
      // unconditionally -- presumably a serial-comma span; confirm against
      // the rendered gender templates
      toker.gettok(); // start tag of the wrapping span
      pp_expect_start('I');
      pp_expect_text(',');
      pp_expect_end('I');
      pp_expect_end('SPAN');
    }

    // serial comma?
    if (toker.tok.t == 't' && toker.tok.isWhite
        && toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
        && toker.nexttok.n.className == 'serial-and') {
      toker.gettok(); // space
      toker.gettok(); // start tag of span.serial-and
      pp_expect_start('I');
      pp_expect_text('and');
      pp_expect_end('I');
      pp_expect_end('SPAN');
      toker.gettok(); // space

      gender += parse_gender_or_number();
    }

    // and number?
    if (toker.tok.t == 't' && toker.tok.isWhite
        && toker.nexttok.t == 's' && toker.nexttok.n.nodeName == 'SPAN'
        && toker.nexttok.n.className.match(/\bnumber\b/)) {

      toker.gettok(); // space
      term.n = parse_gender_or_number();
    }

    term.g = gender;

    // one gender or number marker: a span holding an italic abbreviation
    // followed by a span containing "."
    function parse_gender_or_number() {
      var gender = null;

      toker.gettok(); // start tag of the gender/number span
      pp_expect_start('I');
      gender = toker.tok.x;
      toker.gettok(); // gender
      pp_expect_start('SPAN');
      pp_expect_text('.');
      pp_expect_end('SPAN');
      pp_expect_end('I');
      pp_expect_end('SPAN');

      return gender;
    }
  }

  // get all pieces of text
  // concatenates consecutive text tokens into both the page title (p) and
  // the display text (x)
  function parseterm_inner() {
    var px = '';

    while (true) {
      px += toker.tok.x;
      toker.gettok(); // term itself

      if (toker.tok.t != 't')
        break;
    }

    return { p: px, x: px };
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////////// // Helper functions

// log to the browser console if it exists
function consolelog(data) {
  if (typeof window.console != 'undefined') {
    console.log(data);
  }
}

// script spans are mostly inserted by templates. old ones use language codes as two capital letters
// some newer ones use the newer ISO script names but these may be prefixed by 'sc' or not...
// Returns true when tok is the start tag of a SPAN whose class is exactly
// two capital letters, 'scHebr' or 'Deva'; false otherwise.
// TODO add 'polytonic' here?
function is_script_span(tok) {
  if (tok.t != 's' || tok.n.nodeName != 'SPAN')
    return false;

  var cls = tok.n.className;
  return /^[A-Z][A-Z]$/.test(cls) || cls == 'scHebr' || cls == 'Deva';
}

// get an unadorned title from an anchor
// since the nodeValue will contain the title
// including optional characters like Hebrew vowels
// and Latin and Old English macrons
//
// a is read for its search and pathname properties. The title comes from
// the title= query parameter when there is one (red links), otherwise from
// the last segment of the path (blue links). Returns the percent-decoded
// title with underscores turned into spaces.
function title_from_anchor(a) {
  var t;
  // match the parameter itself, so a query string without title= (or with
  // a parameter merely ending in "title") falls through to the path
  var m = a.search ? /[?&]title=([^&]*)/.exec(a.search) : null;

  // red link
  if (m)
    t = m[1];

  // blue link
  else
    t = a.pathname.substring(a.pathname.lastIndexOf('/') + 1);

  return decodeURIComponent(t).replace(/_/g, ' ');
}

// add a CSS class to an element which may or may
// not already have other classes. will not add
// a class that's already there
//
// Classes are compared as whole whitespace-separated tokens, and the new
// class is appended with a separating space.
function addclass(ele, newclass) {
  if (!ele.className) {
    ele.className = newclass;
    return;
  }

  var present = ele.className.split(/\s+/);
  for (var i = 0; i < present.length; i++) {
    if (present[i] == newclass)
      return;
  }

  ele.className += ' ' + newclass;
}

// Entry point. On main-namespace pages, once the document is ready, load the
// two helper scripts (hippajax.js and domtokenizer.js), then parse every
// translation entry on the page and look up each parsed term's own page.
if (mw.config.get('wgNamespaceNumber') === 0) jQuery(document).ready(function () {
  // raw-script URL for one of the helper pages
  function helper_url(title) {
    return mw.util.getUrl(title, { action: 'raw', ctype: 'text/javascript', maxage: 86400, smaxage: 86400 });
  }

  // look up every term recorded on one parsed element (li or dd)
  function lookup_terms(ele, lang, sublang) {
    var terms = ele.wiktTerms;
    if (!terms || terms.length == 0)
      return;

    for (var j = 0; j < terms.length; j++)
      lookup_langs(ele, terms[j].a, terms[j].p, lang, sublang);
  }

  jQuery.when(
    jQuery.getScript(helper_url('User:Hippietrail/hippajax.js')),
    jQuery.getScript(helper_url('User:Hippietrail/domtokenizer.js'))
  ).then(function () {
    // find all the translation entries in the dom
    var lis = get_trans_listitems();
    if (lis == null)
      return;

    // parse the language name from each translation entry
    // and an array of terms for each entry
    parse_translistitems(lis);

    // look up the other language term for each entry
    for (var i = 0; i < lis.length; i++) {
      lookup_terms(lis[i], lis[i].wiktLang, null /*sublang*/);

      // do sublanguages
      var dds = lis[i].getElementsByTagName('dd');
      for (var k = 0; k < dds.length; k++)
        lookup_terms(dds[k], lis[i].wiktLang, dds[k].wiktLang);
    }
  }, function () {
    // without the helper scripts nothing can be parsed or fetched
    consolelog('ajaxtranslinks: failed to load helper scripts');
  });
});