# User:Hippietrail/wiktgrep.pl


#!/usr/bin/perl


# wiktgrep2 string
# scans a wiktionary xml dump file
# looks for articles containing the string arg
# dumps the <title> field, ignores other fields up to the <text> field
# then dumps all lines of the text field. scan resumes
# all lines are dumped preceded by their line number
# and that line's offset in the dump file
# uses two pre-built index files to speed up the
# process: a list of all article names in order, and a list of offsets
# to the start of each article in the dumpfile

use strict;
use warnings;

use HTML::Entities;
use IO::Uncompress::Bunzip2 qw(bunzip2 $Bunzip2Error);
use Path::Class;   # not standard!

# Main script.
#
# Usage: wiktgrep.pl REGEXP
#
# Scans the article-name index for names matching REGEXP. For every match the
# article's byte offset is looked up in the offset index (one 4-byte native
# unsigned int per article, in the same order as the name index) and the
# article is dumped from the dump file, which may be plain XML or bzip2.

my $regexp = shift;
defined $regexp or die "usage: $0 regexp\n";
my $name_re = qr/$regexp/;    # compile once; a bad pattern fails here, up front

# getconfig() is defined further down the file, so the call needs explicit
# parentheses: a bare "getconfig" at this point is rejected by "use strict".
my $config = getconfig();
defined $config->{'dumppath'} && defined $config->{'date'}
    or die "no dump file found\n";

# 'ext' is undef for an uncompressed dump.
my $ext        = defined $config->{'ext'} ? $config->{'ext'} : '';
my $name_file  = $config->{'date'} . '-all.txt';
my $index_file = $config->{'date'} . '-off.raw';
my $df         = Path::Class::file(
    $config->{'dumppath'},
    'enwiktionary-' . $config->{'date'} . '-pages-articles.xml' . $ext
);

print "using dump file $df\n\n";

open my $name_fh,  '<', $name_file  or die "no name file";
open my $index_fh, '<', $index_file or die "no index file";
open my $dump_fh,  '<', "$df"       or die "no dump file";

binmode(STDOUT, ':utf8');      # TODO experimental for windows console; make optional
binmode($name_fh, ':utf8');    # TODO experimental for windows console
binmode($index_fh);            # packed binary offsets

my $dumph;    # what article lines are read from: file handle or bzip2 object
my $mode;     # 0 for text, 1 for bzip2

if ("$df" =~ /\.bz2\z/) {
    print STDERR "** bzip2 compressed dump **\n";
    $mode = 1;
    # The decompressor must see raw bytes; lines are decoded after reading.
    binmode($dump_fh);
    $dumph = IO::Uncompress::Bunzip2->new($dump_fh)
        or die "IO::Uncompress::Bunzip2 failed: $Bunzip2Error\n";
}
else {
    print STDERR "** uncompressed dump **\n";
    $mode = 0;
    binmode($dump_fh, ':utf8');    # TODO experimental for windows console
    $dumph = $dump_fh;
}

# Read one line of the dump as character data, whichever mode is active.
my $next_line = sub {
    my $line = <$dumph>;
    utf8::decode($line) if $mode == 1 && defined $line;
    return $line;
};

while (my $name = <$name_fh>) {
    next unless $name =~ $name_re;

    # $. still belongs to the name file here because it was the last handle
    # read; it must be captured before anything is read from the dump.
    my $lineno = $. - 1;
    print $name, "\n";

    seek($index_fh, $lineno * 4, 0) or die "can't seek index file";
    my $raw;
    (read($index_fh, $raw, 4) || 0) == 4 or die "can't read index $lineno";
    my $offset = unpack('I', $raw);

    if ($mode == 0) {
        seek($dump_fh, $offset, 0) or die "can't seek dump file";
    }
    else {
        $dumph->seek($offset, 0) or die "can't seek compressed dump file";
    }

    # The first line at the offset is treated as the title line: 11 leading
    # characters are skipped and 20 characters are removed in total.
    my $title = $next_line->();
    defined $title or die "can't read article at offset $offset";
    $title = decode_entities($title);
    $title = substr($title, 11, length($title) - 20);

    # NOTE(review): the body of this loop (and whatever used the title) was
    # lost when the script was extracted; only "last if (/" survived. It is
    # reconstructed from the header comment: dump the title, skip fields up
    # to <text>, then dump the lines of the text field. The header also
    # mentions line numbers and offsets in the output; that is not
    # reproduced here. Confirm against the original script.
    print $title, "\n";
    my $in_text = 0;
    while (defined(my $line = $next_line->())) {
        $in_text = 1 if !$in_text && $line =~ /<text/;
        print $line if $in_text;
        last if $line =~ m{</text>} || $line =~ m{</page>};
    }

    print "\n\n";
}

exit;



# Locate the configuration directory and pick the dump file to use.
#
# Looks for a directory named ".mwconfig" in the current directory, the
# user's home directory and the script's directory, in that order. The file
# "dumppath" inside it is read and its first line is taken as the directory
# holding the dump files. Among files named
# "enwiktionary-YYYYMMDD-pages-articles.xml" (optionally with ".bz2") in that
# directory the newest date wins; for equal dates the uncompressed file is
# preferred.
#
# Returns a hash ref with the keys 'dumppath', 'date' and 'ext'. Any value is
# undef if it could not be determined; 'ext' is also undef for an
# uncompressed dump. Progress and problems are reported on STDOUT.
# Dies if no ".mwconfig" directory is found.
sub getconfig {
    require Cwd;
    require File::HomeDir;    # not standard!
    require FindBin;

    my $configname = '.mwconfig';
    my %result     = ( 'dumppath' => undef, 'date' => undef, 'ext' => undef );

    # FindBin is loaded at run time, so its variable is seen only once at
    # compile time; silence that warning locally.
    my $script_dir = do { no warnings 'once'; $FindBin::Bin };

    my $configpath;
    for my $dir ( Cwd::getcwd(), File::HomeDir->my_home, $script_dir ) {
        next unless defined $dir;
        my $candidate = Path::Class::file($dir, $configname);
        if (-d "$candidate") {
            $configpath = $candidate;
            last;
        }
    }

    die "no config file in current dir, my home dir, or the script's dir"
        unless $configpath;
    print "using config file $configpath\n";

    my $configdumppath = Path::Class::file($configpath, 'dumppath');
    unless (-e "$configdumppath") {
        print "config doesn't contain dump path\n";
        return \%result;
    }

    my $dp;
    unless (open $dp, '<', "$configdumppath") {
        print "can't open configdumppath\n";
        return \%result;
    }
    my $dumppath = <$dp>;
    close $dp;
    chomp $dumppath if defined $dumppath;    # undef when the file is empty
    $result{'dumppath'} = $dumppath;

    unless (defined $dumppath && -d $dumppath) {
        my $shown = defined $dumppath ? $dumppath : '';
        print "specified dump file directory $shown doesn't exist\n";
        return \%result;
    }

    # glob() is run from inside the dump directory, then the original
    # working directory is restored.
    my $origdir = Cwd::getcwd();
    unless (chdir $dumppath) {
        print "can't change to dump file directory $dumppath\n";
        return \%result;
    }

    # find newest .xml and .xml.bz2 dump files
    my @candidates;
    for my $file ( glob('enwiktionary-????????-pages-articles.xml'),
                   glob('enwiktionary-????????-pages-articles.xml.bz2') ) {
        # "????????" also matches non-digits; skip those files instead of
        # reusing the captures left over from the previous successful match.
        next unless $file =~ /\Aenwiktionary-(\d{8})-pages-articles\.xml(\.bz2)?\z/;
        push @candidates, [ $1, defined $2 ? $2 : '' ];
    }

    if (@candidates) {
        # Newest date first; on a tie '' sorts before '.bz2'.
        my ($best) = sort { $b->[0] <=> $a->[0] || $a->[1] cmp $b->[1] }
            @candidates;
        $result{'date'} = $best->[0];
        $result{'ext'}  = length $best->[1] ? $best->[1] : undef;
    }

    chdir $origdir
        or print "can't return to directory $origdir\n";

    return \%result;
}