User:Flubot/getcat

Tool name: getcat.sh

Original script by Ariel Glenn

Description:

This tool downloads the titles of the entries in a given category and stores them in "cat_tmp/titles.DATE.txt", where DATE is the current date (month name, day, year).

Parameters:
 * the name of the category. For example: ./getcat.sh "French verb forms"

You need:
 * a unix/linux environment and the date, sed, awk, cat, grep commands
 * the curl command

getcat.sh

#!/bin/bash

# Print a short usage message and terminate the script.
# Arguments: none (reads $0 for the script name).
# Outputs:   the usage text, on stderr.
# Exits:     always, with status 1.
usage() {
  {
    echo "Usage: $0 category"
    echo "where category is the name of the category for which to retrieve titles"
    echo
    echo "For example:"
    echo "$0 'French verbs'"
    # Trailing blank line; a "\n" inside a plain bash echo would be printed
    # literally instead.
    echo
  } >&2
  exit 1
}

# ---------------------------------------------------------------------------
# Main script.
#
# Fetches the members of the category named in $1 from the en.wiktionary.org
# API, 500 per request, and writes into ./cat_tmp:
#   titles.DATE.xml       every API response, split to one XML tag per line
#   titles.DATE.xml.temp  the most recent response only
#   titles.DATE.txt       the extracted page titles, one per line
# where DATE is today's date as MonthName-day-year.
# ---------------------------------------------------------------------------

if [ -z "${1:-}" ]; then
  usage
  # Not reached when usage exits on its own; guarantees that an empty
  # category name is never sent to the API.
  exit 1
fi

# Spaces in the category name are replaced by underscores.
category="Category:${1// /_}"
api_url="https://en.wiktionary.org/w/api.php"
tmp="./cat_tmp"
# %d (zero-padded) instead of %e (space-padded): with %e the file names
# contain a space on days 1-9. For days 10-31 both give the same name.
today=$(date +"%B-%d-%Y")
titles="$tmp/titles.$today"
cmcontinue=""

mkdir -p "$tmp" || exit 1
# Discard output left over from an earlier run on the same day.
rm -f -- "$titles".*

count=1
while true; do

  echo "getting category titles $count to $((count + 499))"

  # next 500
  # Every parameter is passed through --data-urlencode so that category
  # names and continuation tokens containing '|', '&' or non-ASCII
  # characters reach the API intact.
  request=(
    --data-urlencode "action=query"
    --data-urlencode "list=categorymembers"
    --data-urlencode "cmtitle=$category"
    --data-urlencode "cmprop=title"
    --data-urlencode "cmlimit=500"
    --data-urlencode "format=xml"
  )
  if [ -n "$cmcontinue" ]; then
    request+=(--data-urlencode "cmcontinue=$cmcontinue")
  fi

  # sed puts each XML tag on its own line so grep can pick out single tags.
  curl --retry 10 -f -G "${request[@]}" "$api_url" \
    | sed -e 's/>/>\n/g;' > "$titles.xml.temp"
  # PIPESTATUS[0] is curl's own exit status; $? would be sed's.
  rc=${PIPESTATUS[0]}
  if [ "$rc" -ne 0 ]; then
    echo "Error $rc from curl, unable to get xml pages, bailing" >&2
    exit 1
  fi

  cat "$titles.xml.temp" >> "$titles.xml"

  # get continue param
  # A cmcontinue="..." attribute in this batch means more results remain;
  # its value is sent back with the next request. No attribute: last batch.
  cmcontinue=$(sed -n 's/.*cmcontinue="\([^"]*\)".*/\1/p' "$titles.xml.temp" \
    | head -n 1)
  if [ -z "$cmcontinue" ]; then
    break
  fi

  # Pause between requests.
  sleep 6
  count=$((count + 500))
done

# NOTE(review): the original extraction command was truncated (only "grep '"
# and the output file name survived). This reconstruction assumes each member
# appears as a <cm ... title="..." /> tag and undoes the standard XML
# entities in the title -- confirm against a real API response.
sed -n 's/.*<cm [^>]*title="\([^"]*\)".*/\1/p' "$titles.xml" \
  | sed -e 's/&quot;/"/g' -e "s/&#039;/'/g" \
        -e 's/&lt;/</g' -e 's/&gt;/>/g' -e 's/&amp;/\&/g' \
  > "$titles.txt"
echo "done!"
exit 0