my $dir;
my $no_implied;
my $variants_str;
+my $accents;
sub usage() {
print STDERR "$0 [-v#] [<options>] <spelling categories> <size>\n";
"with-variants|v=i", => \$with_variants,
"variants=s", \$variants_str,
"file-names-only|f", \$file_names_only,
- "no-implied", \$no_implied) or usage();
+ "no-implied", \$no_implied,
+ "accents=s", \$accents) or usage();
$dir = "final" unless defined $dir;
+die "Cannot specify both --file-names-only and --accents" if $file_names_only && $accents;
+
my %spelling_map = qw(en-us american en-gb british? en-gb-ise british
- en-gb-ize british_z en-gb-oed british_z en-ca canadian);
+ en-gb-ize british_z en-gb-oed british_z en-ca canadian en-au australian);
$spelling_map{$_} = $_
foreach (qw(english special american british canadian
- variant_0 variant_1 variant_2
- british_variant_0 british_variant_1
- canadian_variant_0 canadian_variant_1));
+ variant_1 variant_2 variant_3
+ british_variant_1 british_variant_2
+ canadian_variant_1 canadian_variant_2
+ australian_variant_1 australian_variant_2));
die "Cannot specify both --with-variants and --variants" if $with_variants && $variants_str;
$variants_str = "non" unless defined $variants_str || defined $with_variants;
-$variants_str = join(',', 'non', 0..$with_variants) if defined $with_variants;
+$variants_str = join(',', 'non', 1..$with_variants) if defined $with_variants;
my %use_variant_level;
foreach (split ',', $variants_str) {
- die "Invalid variant num: $_" unless /^(non|[012])$/;
+ die "Invalid variant num: $_" unless /^(non|[123])$/;
$use_variant_level{$_} = 1;
}
$SPS{english} = 1 unless $no_implied;
$SPS{special} = 1 unless $no_implied;
-$SPS{variant_0} = 1 if $SPS{american} && $use_variant_level{0};
-$SPS{variant_1} = 1 if $SPS{american} && $use_variant_level{1};
-$SPS{british_variant_0} = 1 if ($SPS{british} || $SPS{british_z}) && $use_variant_level{0};
+$SPS{variant_1} = 1 if $SPS{american} && $use_variant_level{1};
+$SPS{variant_2} = 1 if $SPS{american} && $use_variant_level{2};
$SPS{british_variant_1} = 1 if ($SPS{british} || $SPS{british_z}) && $use_variant_level{1};
-$SPS{canadian_variant_0} = 1 if $SPS{canadian} && $use_variant_level{0};
+$SPS{british_variant_2} = 1 if ($SPS{british} || $SPS{british_z}) && $use_variant_level{2};
$SPS{canadian_variant_1} = 1 if $SPS{canadian} && $use_variant_level{1};
-$SPS{variant_2} = 1 if $use_variant_level{2};
+$SPS{canadian_variant_2} = 1 if $SPS{canadian} && $use_variant_level{2};
+$SPS{australian_variant_1} = 1 if $SPS{australian} && $use_variant_level{1};
+$SPS{australian_variant_2} = 1 if $SPS{australian} && $use_variant_level{2};
+$SPS{variant_3} = 1 if $use_variant_level{3};
unless ($use_variant_level{non}) {
delete $SPS{$_} foreach (@SPS);
opendir D, $dir or die "Unable to open dir $dir\n";
-my @files = readdir(D);
-
my @words;
+sub deaccent($) {
+ local $_ = $_[0];
+ # from deaccent-toperl.cc
+ # the line is encoded in iso-8859-1 to match the input encoding:
+ tr/ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ/AAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy/;
+ return $_;
+}
+my $add_word;
+if (!defined($accents) || $accents eq 'keep') {
+ $add_word = sub {push @words, $_[0]};
+} elsif ($accents eq 'strip') {
+ $add_word = sub {push @words, deaccent($_[0])};
+} elsif ($accents eq 'both') {
+ $add_word = sub {push @words, $_[0];
+ my $new_word = deaccent($_[0]);
+ push @words, $new_word if $new_word ne $_[0];}
+} else {
+ die "--accents must be one of: keep, strip, both\n";
+}
+
+my @files = readdir(D);
+
foreach (sort @files) {
my ($sp, $type, $size) = /^(\w+)-([^.]+)\.(\d\d)/ or next;
next unless $SPS{$sp};
open F, "$dir/$_" or die "Unable to open $dir/$_ for reading";
while (<F>) {
chop;
- push @words, $_;
+ $add_word->($_);
}
}
}
$prev = $_;
}
-
-
-
-
-