User:Kephir/t.love

This is a list of pages which are suspected to contain incorrectly formatted translations and should probably be run through xte.

There are 14351 items on the list (generated by checking against a known-good pattern), so I needed to split it into subpages:

/0, /1, /2, /3, /4, /5, /6, /7, /8, /9, /10, /11, /12, /13, /14

The script has also been run against some other dumps, giving the following numbers of results:

25815  2012-11-04
23993  2013-08-25
23582  2013-09-07
22278  2013-09-19
21946  2013-10-02
21837  2013-10-17
21590  2013-11-17
21478  2013-12-02
       2013-12-08..15: first Buttermilch run
15595  2013-12-17
14494  2014-04-15
14426  2014-05-22
14398  2014-06-09
14351  2014-07-02
       2014-07-28: xte was updated to process even items containing undefined

The list was generated from the 2013-08-25 dump of Wiktionary by the following script: ') _re_split     = re.compile(r'\s*[;,]\s*') _re_item       = re.compile(r'((\{\{(qualifier|i)\|[^}]*?}}\s*)?\{\{(t[-+ø0]?|t[-+]check|)\|[^}]*?}}(\s*\{\{(qualifier|i)\|[^}]*?}})?|\{\{t-needed\|[a-z\-]+}})$')

def splitdefs(defs):
    """Split the translations part of a line into individual items.

    The string is cut at every match of ``_re_split`` (a comma or semicolon
    with surrounding whitespace), except where the separator sits inside an
    unclosed ``()``, ``{}``, ``[]`` or ``<>`` pair; such fragments are kept
    together as one item.

    Items are slices of the original string, so a separator that occurs
    inside brackets is preserved verbatim in the yielded item.

    Args:
        defs: The text to split. An empty string yields nothing.

    Yields:
        Each item in order of appearance. If the brackets never balance
        again before the end of the string, the unbalanced remainder is
        yielded as a final item so that the caller can still inspect it.
    """
    if not defs:
        return

    # Every separator occurrence, plus a zero-width sentinel at the end of the
    # string so that the final fragment goes through the same code path.
    boundaries = [m.span() for m in _re_split.finditer(defs)]
    boundaries.append((len(defs), len(defs)))

    parens = braces = squares = angles = 0
    item_start = 0      # where the item currently being assembled begins
    fragment_start = 0  # where the text after the previous separator begins
    for sep_start, sep_end in boundaries:
        fragment = defs[fragment_start:sep_start]
        parens += fragment.count('(') - fragment.count(')')
        braces += fragment.count('{') - fragment.count('}')
        squares += fragment.count('[') - fragment.count(']')
        angles += fragment.count('<') - fragment.count('>')
        fragment_start = sep_end
        if not (parens or braces or squares or angles):
            yield defs[item_start:sep_start]
            item_start = sep_end

    # Brackets that never balanced: hand the remainder to the caller instead
    # of silently dropping it.
    if item_start < len(defs):
        yield defs[item_start:]

def scanpage(title, text):
    """Scan one page and report the first line or item that is not recognised.

    Lines are ignored until one matches ``_re_trans_top`` or ``_re_ttbc_top``;
    from there on every line is checked until a line matching
    ``_re_trans_bot`` switches checking off again.

    A checked line matching ``_re_trans_line`` has the remainder after the
    match split with ``splitdefs``, and every resulting item must match
    ``_re_item``. Any other checked line must match one of
    ``_re_ttbc_line``, ``_re_trans_sub``, ``_re_trans_mid``,
    ``_re_trans_req``, ``_re_hiero`` or ``_re_dummy``.

    Args:
        title: Title of the page. Not used by the scan itself.
        text: Full text of the page.

    Returns:
        ``None`` if nothing suspicious was found, otherwise a string
        describing the first offending item or line.
    """
    checking = False
    # splitlines has to be *called*; iterating over the bound method itself
    # raises TypeError.
    for line in text.splitlines():
        if re.match(_re_trans_top, line) or re.match(_re_ttbc_top, line):
            checking = True
            continue
        if not checking:
            continue
        if re.match(_re_trans_bot, line):
            checking = False
            continue

        m = re.match(_re_trans_line, line)
        if m:
            # Everything after the matched prefix is the list of items.
            rest = line[m.end():]
            for item in splitdefs(rest):
                if not re.match(_re_item, item):
                    return 'definition not recognised: %s' % (repr(item))
            continue

        # Other line shapes that are accepted without further inspection.
        if re.match(_re_ttbc_line, line):
            continue
        if re.match(_re_trans_sub, line):
            continue
        if re.match(_re_trans_mid, line):
            continue
        if re.match(_re_trans_req, line):
            continue
        if re.match(_re_hiero, line):
            continue
        if re.match(_re_dummy, line):
            continue

        return 'line not recognised: %s' % (repr(line))

    return None

class handler:
    """SAX content handler that runs ``scanpage`` over every ``<text>`` element.

    Character data of ``<title>`` and ``<text>`` elements is accumulated.
    When a ``<text>`` element ends, its content is passed to ``scanpage``
    together with the collected title, and a line is written to standard
    output if ``scanpage`` returns a reason.
    """

    def __init__(self):
        # Accumulated content of the current <text> element, or None.
        self.buf = None
        # Accumulated content of the current <title> element, or None.
        self.tit = None
        # Name of the most recently opened element that has not seen any
        # closing tag since; None right after any element ends.
        self.current = None

    def processingInstruction(self, target=None, data=None):
        """Ignore processing instructions.

        SAX calls this with ``(target, data)``; the arguments are optional
        only to stay compatible with the previous zero-argument signature.
        """

    def setDocumentLocator(self, locator):
        """Ignore the locator; positions are not reported."""

    def startDocument(self):
        """Nothing to prepare at document start."""

    def startElement(self, name, attrs):
        """Remember the element name and reset the buffer it feeds."""
        self.current = name
        if name == 'text':
            self.buf = ''
        elif name == 'title':
            self.tit = ''

    def characters(self, data):
        """Append character data to the buffer of the current element."""
        if self.current == 'text':
            self.buf += data
        elif self.current == 'title':
            self.tit += data

    def endElement(self, name):
        """Scan a finished ``<text>``; clear state when the ``<page>`` ends."""
        self.current = None
        if name == 'text':
            reason = scanpage(self.tit, self.buf)
            if reason:
                sys.stdout.write('* %s because:  %s \n' % (self.tit, reason))
        elif name == 'page':
            self.buf = None
            self.tit = None

    def endDocument(self):
        """Nothing to flush at document end."""

# Read the bzip2-compressed XML dump from standard input and feed it to the
# SAX parser. The parser needs a handler *instance*: passing the class itself
# makes the first callback fail with a missing-argument TypeError.
with bz2.BZ2File(sys.stdin.buffer) as dump:
    xml.sax.parse(dump, handler())