// User:Hippietrail/domtokenizer.js

// needs at least JavaScript 1.7

/**
 * Token stream over a DOM subtree with one token of look-ahead and one level
 * of push-back. Use as a constructor: `new domtokenizer(node)`.
 *
 * Tokens are pulled from `domgenerator(startnode)`, whose `next()` is expected
 * to return the next token object, or `null` once the subtree is exhausted.
 *
 * Public fields:
 *   - `lasttok`  the token returned before `tok` (null if unknown)
 *   - `tok`      the current token (null before the first `gettok()` and
 *                after the end of input)
 *   - `nexttok`  the look-ahead token (null at end of input)
 *
 * @param {Node} startnode  Root of the subtree to tokenize.
 */
function domtokenizer(startnode) {
  // public
  this.lasttok = null;
  this.tok = null;
  this.nexttok = null;

  // private
  this.ungot = false;    // true while a pushed-back token is pending
  this.nextnext = null;  // look-ahead saved by ungettok()
  this.eof = false;      // the underlying generator has returned null

  this.domgen = domgenerator(startnode);

  /**
   * Advances by one token.
   * @returns {?Object} the new current token, or null past the end of input.
   */
  this.gettok = function () {
    this.lasttok = this.tok;
    this.tok = this.nexttok;

    if (this.ungot) {
      // Replay the look-ahead that ungettok() set aside.
      this.nexttok = this.nextnext;
      this.ungot = false;
    } else if (this.eof) {
      // Because we have one token of look-ahead we need to be able to go one
      // token past the end: leave nexttok alone and do not touch the
      // generator again.
    } else {
      this.nexttok = this.domgen.next();
      if (this.nexttok == null) {
        this.eof = true;
      }
    }

    return this.tok;
  };

  /**
   * Pushes the current token back so the next gettok() returns it again.
   * Only one level is kept: `lasttok` is reset to null, so calling this twice
   * without a gettok() in between loses tokens.
   */
  this.ungettok = function () {
    this.ungot = true;
    this.nextnext = this.nexttok;
    this.nexttok = this.tok;
    this.tok = this.lasttok;
    this.lasttok = null;
  };

  // Prime the look-ahead: afterwards `tok` is still null and `nexttok` holds
  // the first token of the stream.
  this.gettok();
}

/**
 * Walks the DOM subtree rooted at `startnode` in document order and hands out
 * one token per `next()` call.
 *
 * Token shapes:
 *   - `{ t: "s", n: node }`  an element is entered
 *   - `{ t: "e", n: node }`  an element is left (emitted for childless
 *                            elements too, right after their "s" token)
 *   - `{ t: "o", n: node }`  any node that is neither element nor text
 *                            (comments etc.)
 *   - `{ t: "t", x: string, isWhite: boolean, isEOL?: true }`
 *                            a piece of a text node, see `texttokenizer`
 *
 * The returned object is not an ES2015 iterator: `next()` returns the token
 * itself, and `null` once the walk is finished (and on every later call).
 *
 * @param {?Node} startnode  Root of the subtree; null yields an empty stream.
 * @returns {{next: function(): ?Object}}
 */
function domgenerator(startnode) {
  const ELEMENT_NODE = 1;
  const TEXT_NODE = 3;

  // MediaWiki converts some spaces to non-breaking spaces near punctuation.
  // This is a feature for the French language and an unexpected surprise for
  // the rest of us, so U+00A0 counts as whitespace here.
  const WHITESPACE = /[\r\n \u00A0]/;

  // Characters used in language names.
  const NAME_CHARS = /[-'!a-záåâāàăçéêíñõöüũúA-Z]/;

  // Ranges are inclusive UTF-16 code units; the supplementary-plane CJK
  // blocks (U+20000-2A6DF, U+2F800-2FA1F) are not covered.
  const CJK_RANGES = [
    [0x2E80, 0x303F],
    [0x31C0, 0x31EF],
    [0x3200, 0x4DBF],
    [0x4E00, 0x9FFF],
    [0xF900, 0xFAFF],
    [0xFE30, 0xFE4F],
  ];

  const isWhitespace = (ch) => WHITESPACE.test(ch);
  const isNameChar = (ch) => NAME_CHARS.test(ch);

  function inCyrillic(ch) {
    const code = ch.charCodeAt(0);
    return code >= 0x0400 && code <= 0x04FF;
  }

  function inHebrew(ch) {
    const code = ch.charCodeAt(0);
    return code >= 0x0590 && code <= 0x05FF;
  }

  function inArabic(ch) {
    const code = ch.charCodeAt(0);
    return code >= 0x0600 && code <= 0x06FF;
  }

  function inCJK(ch) {
    const code = ch.charCodeAt(0);
    return CJK_RANGES.some(([low, high]) => code >= low && code <= high);
  }

  // Character classes whose members are glued together into a single token.
  // Everything else, including the punctuation , : ; ( ), becomes a
  // one-character token.
  const RUN_CLASSES = [isWhitespace, isNameChar, inCyrillic, inHebrew, inArabic, inCJK];

  /**
   * Splits a string into text tokens: maximal runs of one character class,
   * or single characters that belong to no class.
   *
   * `isWhite` is true for whitespace runs. `isEOL` is only present (and true)
   * when a whitespace run is exactly "\r", "\n" or "\r\n".
   */
  function* texttokenizer(text) {
    const input = text == null ? '' : String(text);
    let pos = 0;

    while (pos < input.length) {
      const start = pos;
      const belongs = RUN_CLASSES.find((test) => test(input.charAt(start)));

      pos += 1;
      if (belongs !== undefined) {
        while (pos < input.length && belongs(input.charAt(pos))) {
          pos += 1;
        }
      }

      const run = input.slice(start, pos);
      const token = { t: 't', x: run, isWhite: belongs === isWhitespace };
      if (token.isWhite && (run === '\r' || run === '\n' || run === '\r\n')) {
        token.isEOL = true;
      }
      yield token;
    }
  }

  /** Depth-first walk producing the token stream for the whole subtree. */
  function* walk() {
    let node = startnode;

    while (node != null) {
      // EMIT
      if (node.nodeType === ELEMENT_NODE) {
        yield { t: 's', n: node };
      } else if (node.nodeType === TEXT_NODE) {
        yield* texttokenizer(node.nodeValue);
      } else {
        // other nodes (comments etc.)
        yield { t: 'o', n: node };
      }

      // WALK
      if (node.firstChild) {
        node = node.firstChild;
        continue;
      }

      // Close this node, then move to its sibling, or climb and repeat.
      while (node != null) {
        if (node.nodeType === ELEMENT_NODE) {
          yield { t: 'e', n: node };
        }
        // Stop at the root whatever its node type, so the walk never leaks
        // into the start node's siblings or ancestors.
        if (node === startnode) {
          node = null;
          break;
        }
        if (node.nextSibling) {
          node = node.nextSibling;
          break;
        }
        node = node.parentNode;
      }
    }
  }

  const tokens = walk();

  return {
    next() {
      const step = tokens.next();
      return step.done ? null : step.value;
    },
  };
}