#!/usr/bin/perl use warnings; use strict; use Getopt::Long; Getopt::Long::Configure ("bundling"); my $with_variants; my $file_names_only; my $dir; my $no_implied; my $variants_str; my $accents; sub usage() { print STDERR "$0 [-v#] [] \n"; exit 1; } GetOptions ("dir|d=s", \$dir, "with-variants|v=i", => \$with_variants, "variants=s", \$variants_str, "file-names-only|f", \$file_names_only, "no-implied", \$no_implied, "accents=s", \$accents) or usage(); $dir = "final" unless defined $dir; die "Cannot specify both --file-names-only and --accents" if $file_names_only && $accents; my %spelling_map = qw(en-us american en-gb british? en-gb-ise british en-gb-ize british_z en-gb-oed british_z en-ca canadian en-au australian); $spelling_map{$_} = $_ foreach (qw(english special american british canadian variant_1 variant_2 variant_3 british_variant_1 british_variant_2 canadian_variant_1 canadian_variant_2 australian_variant_1 australian_variant_2)); die "Cannot specify both --with-variants and --variants" if $with_variants && $variants_str; $variants_str = "non" unless defined $variants_str || defined $with_variants; $variants_str = join(',', 'non', 1..$with_variants) if defined $with_variants; my %use_variant_level; foreach (split ',', $variants_str) { die "Invalid variant num: $_" unless /^(non|[123])$/; $use_variant_level{$_} = 1; } $no_implied = 1 unless $use_variant_level{non}; usage unless @ARGV >= 2; my $SIZE = pop @ARGV; die "Invalid size: $SIZE\n" unless $SIZE =~ /^\d\d$/; my @SPS = map {my $sp = $_; $sp =~ tr/_/-/; $sp = $spelling_map{lc $sp}; die "Must specify en_GB-ise or en_GB-ize\n" if $sp eq 'british?' && $use_variant_level{non}; $sp = 'british' if $sp eq 'british?'; die "Unknown spelling category: $_\n" unless defined $sp; $sp;} @ARGV; my %SPS; $SPS{$_} = 1 foreach (@SPS); $SPS{english} = 1 unless $no_implied; $SPS{special} = 1 unless $no_implied; $SPS{variant_1} = 1 if $SPS{american} && $use_variant_level{1}; $SPS{variant_2} = 1 if $SPS{american} && $use_variant_level{2}; $SPS{british_variant_1} = 1 if ($SPS{british} || $SPS{british_z}) && $use_variant_level{1}; $SPS{british_variant_2} = 1 if ($SPS{british} || $SPS{british_z}) && $use_variant_level{2}; $SPS{canadian_variant_1} = 1 if $SPS{canadian} && $use_variant_level{1}; $SPS{canadian_variant_2} = 1 if $SPS{canadian} && $use_variant_level{2}; $SPS{australian_variant_1} = 1 if $SPS{australian} && $use_variant_level{1}; $SPS{australian_variant_2} = 1 if $SPS{australian} && $use_variant_level{2}; $SPS{variant_3} = 1 if $use_variant_level{3}; unless ($use_variant_level{non}) { delete $SPS{$_} foreach (@SPS); } opendir D, $dir or die "Unable to open dir $dir\n"; my @words; sub deaccent($) { local $_ = $_[0]; # from deaccent-toperl.cc # the line is encoded in iso-8859-1 to match the input encoding: tr/ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ/AAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy/; return $_; } my $add_word; if (!defined($accents) || $accents eq 'keep') { $add_word = sub {push @words, $_[0]}; } elsif ($accents eq 'strip') { $add_word = sub {push @words, deaccent($_[0])}; } elsif ($accents eq 'both') { $add_word = sub {push @words, $_[0]; my $new_word = deaccent($_[0]); push @words, $new_word if $new_word ne $_[0];} } else { die "--accents must be one of: keep, strip, both\n"; } my @files = readdir(D); foreach (sort @files) { my ($sp, $type, $size) = /^(\w+)-([^.]+)\.(\d\d)/ or next; next unless $SPS{$sp}; next unless $size <= $SIZE; if ($file_names_only) { print "$_\n"; } else { open F, "$dir/$_" or die "Unable to open $dir/$_ for reading"; while () { chop; $add_word->($_); } } } my $prev = ''; foreach (sort @words) { next if $_ eq $prev; print "$_\n"; $prev = $_; }