#!/usr/bin/perl -CIO
use strict;
use warnings;

# Warn that this script is deprecated.  The banner is written to STDERR rather
# than STDOUT because the generated word list is written to STDOUT (and is
# typically redirected to a file), so printing the banner there would both hide
# it from the user and put it in the word list.
print STDERR <<"__END__";
============================================================================
WARNING: This script requires WikiExtractor.py which seems to have bitrotted
and is now both very slow and unreliable.  You probably want to use
the newer scripts/wikipedia-dump-to-freq and scripts/freq-to-voc instead.
We've left this script here as it's referenced by the COPYING files for
some languages as how the vocabulary was originally generated.
============================================================================

__END__

# Minimum number of occurrences of a word for it to be included in the output.
my $MIN_FREQ = 10;

# Regex for a single letter (can be overridden to restrict to an appropriate
# alphabet).  This is kept as a string because it gets interpolated into the
# larger pattern used to pick words out of the input.
my $LETTER_RE = '\w';

# Positional arguments are: [MIN_FREQ [SCRIPT]]
#
# An argument starting with '-' is deliberately left in @ARGV, as is anything
# after SCRIPT, so that leftover arguments can be detected afterwards.
if (@ARGV && $ARGV[0] !~ /^-/) {
    $MIN_FREQ = shift @ARGV;

    # MIN_FREQ is compared numerically against the word counts, so reject
    # anything Perl wouldn't cleanly treat as a number.  This catches the easy
    # mistake of passing just SCRIPT (e.g. "ascii"), which would otherwise be
    # treated as a threshold of 0 with the default alphabet.
    require Scalar::Util;
    unless (Scalar::Util::looks_like_number($MIN_FREQ)) {
        die "$0: MIN_FREQ must be a number, not '$MIN_FREQ' (try --help)\n";
    }

    if (@ARGV) {
        my $script = shift @ARGV;

        # Alphabets we know by name.  Upper-case letters are included in the
        # latin1 and latin2 classes so that words containing them are matched
        # as a whole (and can then be rejected as not lower-case) rather than
        # being split at the upper-case letter.
        my %named_alphabet = (
            ascii  => '[a-z]',
            # U+00D7 (multiplication sign) and U+00F7 (division sign) fall
            # inside the Latin1 letter range but aren't letters, so skip them.
            latin1 => '[a-z\x{c0}-\x{d6}\x{d8}-\x{f6}\x{f8}-\x{ff}]',
            latin2 => '[a-z\x{c1}\x{c2}\x{c4}\x{c7}\x{c9}\x{cb}\x{cd}\x{ce}\x{d3}\x{d4}\x{d6}\x{da}\x{dc}\x{dd}\x{df}\x{e1}\x{e2}\x{e4}\x{e7}\x{e9}\x{eb}\x{ed}\x{ee}\x{f3}-\x{f4}\x{f6}\x{fa}\x{fc}\x{fd}\x{102}-\x{107}\x{10c}-\x{111}\x{118}-\x{11b}\x{139}-\x{13a}\x{13d}\x{13e}\x{141}-\x{144}\x{147}\x{148}\x{150}\x{151}\x{154}\x{155}\x{158}-\x{15b}\x{15e}-\x{165}\x{16e}-\x{171}\x{179}-\x{17e}]',
        );

        if (exists $named_alphabet{$script}) {
            $LETTER_RE = $named_alphabet{$script};
        } elsif ($script =~ /^[A-Za-z]+$/) {
            # A purely alphabetic name is taken to be a Unicode script name.
            # Perl handles script names case-insensitively.
            $LETTER_RE = "\\p{Script: $script}";
        } else {
            # Anything else is a user-supplied regex; group it so that the
            # quantifier applied to it later covers the whole expression.
            $LETTER_RE = "(?:$script)";
        }
    }
}

# Any arguments still in @ARGV here are either options (--help is the only one
# we know about) or surplus/invalid arguments, so show the usage message and
# exit.  For --help the message goes to STDOUT and we exit with status 0;
# otherwise it's an error so the message goes to STDERR (STDOUT may well be
# redirected to the word list file) and we exit with status 1.
if (@ARGV) {
    my $help_requested = ($ARGV[0] eq '--help');
    my $usage_fh = $help_requested ? \*STDOUT : \*STDERR;
    # NB: This is an interpolating here-document, so backslashes and dollar
    # signs which should appear literally in the output must be escaped.
    print {$usage_fh} <<"__END__";
Usage: $0 [MIN_FREQ [SCRIPT]]

MIN_FREQ is the minimum number of occurrences needed for a word to be
included.  The default is 10.

SCRIPT can be:

* A Unicode script name (Perl handles these case-insensitively).  The Perl
  regex used is: \\p{Script: ...} which generally seems suitable.  This
  has been tested for at least: "armenian", "devanagari", "greek", "latin",
  and "tamil".
* "ascii" to match only ASCII letters.
* "latin1" to match only Latin1 (ISO-8859-1) letters.
* "latin2" to match only Latin2 (ISO-8859-2) letters.
* A regular expression to match a letter in the script.

If not specified, the default regex used is: \\w

The script only considers words which match the regex \\b(?:\$SCRIPT)+\\b and
are not changed by the Perl "lc" function.

For example:

Download a dump for language with ISO 639-1 language code XX (mirrors are
available, see https://dumps.wikimedia.org/mirrors.html):

wget https://dumps.wikimedia.org/XXwiki/latest/XXwiki-latest-pages-articles.xml.bz2
WikiExtractor.py XXwiki-latest-pages-articles.xml.bz2
# or: python -m wikiextractor.WikiExtractor XXwiki-latest-pages-articles.xml.bz2
cat text/*/wiki_*|$0 42 ascii > voc.txt

This will extract a list of words that occur at least 42 times, treating
only ASCII letters as word characters.  You should specify an appropriate
alphabet, and choose a MIN_FREQ value which does a good job of selecting
mostly actual words - this threshold will vary with dataset size, and
probably also with the language in question.
__END__
    exit($help_requested ? 0 : 1);
}

# Count how often each lower-case word occurs in the input, then print (one
# per line, in sorted order) those which occur at least $MIN_FREQ times.

# Maps word => number of occurrences seen.
my %freq;
while (my $line = <>) {
    # Blank out anything which looks like a markup tag or a character entity
    # so that tag, attribute and entity names aren't counted as words.
    $line =~ s!<[^>]*>! !g;
    $line =~ s!&[^; ]*;! !g;

    # /o: $LETTER_RE never changes once we get here, so compile only once.
    # /p with ${^MATCH} gives the matched text without the program-wide
    # slowdown that $& causes on Perl versions before 5.20, and (unlike
    # adding a capture group) without renumbering any groups in a
    # user-supplied regex.
    while ($line =~ /\b$LETTER_RE+\b/gop) {
        my $word = ${^MATCH};
        # Only count words which are already entirely lower-case.  Perl lc is
        # Unicode-aware for strings with the UTF8 flag set (which it will be
        # here).
        ++$freq{$word} if lc($word) eq $word;
    }
}

for my $word (sort keys %freq) {
    print "$word\n" if $freq{$word} >= $MIN_FREQ;
    #print "$freq{$word}\t$word\n" if $freq{$word} >= $MIN_FREQ;
}

# Output is buffered, so a write error (e.g. disk full when STDOUT is
# redirected to a file) may only be reported when the handle is closed.
# Check explicitly rather than silently leaving a truncated word list.
close(STDOUT) or die "$0: error writing output: $!\n";
