User:Flubot/tools

Find page sections
This simple tool reads an XML dump obtained through Special:Export and prints every entry that contains more than one language header. It's useful, for example, for checking whether a certain Romanian entry with ş or ţ can be moved to a new name with ș, ț.

cat roverbs.xml | ./lang_headers.py

lang_headers.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys, re

# Patterns for the lines of a Special:Export dump that this tool cares about.
# The opening-tag literals (<page>, <title>, <comment>) were missing from the
# extracted source and are restored here to mirror the closing-tag patterns.
# Raw strings are used so that no backslash escaping of "/" is needed.
page_tag = re.compile(r'<page>')
title_tag = re.compile(r'<title>')
# A level-2 wiki header such as "==Romanian==" at the end of a line.
# NOTE(review): only ASCII letters and spaces are accepted, so headers with
# hyphens or accented letters are not counted -- confirm this is intended.
lang_tag = re.compile(r'==([A-Za-z ]+)==$')
title_content = re.compile(r'<title>([^:]+):(.+)</title>')
title_capture = re.compile(r'<title>(.*)</title>')
comment_tag = re.compile(r'<comment>(.*)</comment>')
page_tag_end = re.compile(r'</page>')
main_string = re.compile(r'main')


def find_multilang_pages(lines):
    """Yield main-namespace pages that carry more than one language header.

    Args:
        lines: an iterable of text lines from a Special:Export XML dump
            (for example an open file or ``sys.stdin``).

    Yields:
        ``(title, languages)`` tuples, where ``languages`` is the list of
        level-2 header names found in the page, in order of appearance.
    """
    # Initialised up front so that a header or </page> seen before any <page>
    # cannot hit an undefined name.
    namespace = ""
    title = ""
    section = []

    for line in lines:
        if page_tag.search(line):
            # A new page starts: forget everything about the previous one.
            namespace = ""
            title = ""
            section = []
        elif title_tag.search(line):
            result = title_content.search(line)
            if result:
                # "Prefix:Name" is split into namespace and title.
                namespace = result.group(1)
                title = result.group(2)
            else:
                result = title_capture.search(line)
                if result:
                    # No colon in the title: treat it as the main namespace.
                    namespace = "main"
                    title = result.group(1)
        elif comment_tag.search(line):
            # Edit summaries are never inspected for language headers.
            continue
        elif lang_tag.search(line):
            section.append(lang_tag.search(line).group(1))
        elif page_tag_end.search(line):
            if len(section) > 1 and main_string.search(namespace):
                yield title, section


def main():
    """Read a dump from standard input and print one line per matching page."""
    for title, languages in find_multilang_pages(sys.stdin):
        # sys.stdout.write behaves the same on Python 2 and Python 3,
        # unlike the print statement.
        sys.stdout.write("   ".join([title] + languages) + "\n")


if __name__ == "__main__":
    main()