#!/usr/bin/perl

#
# Origin: speller/munch-list in deb_pkgs/scowl.git on git.donarmstrong.com
# (blob 65c17e0cdfac1fc02d2d6bdd4a6117cc236fac5c).
#
# Quick and dirty script that uses Aspell to munch and expand a Hunspell
# word list.  For munching, Aspell gives much better results than the
# Hunspell scripts.  For expanding, Hunspell could theoretically be used,
# but it is just as easy to use Aspell.
#
# Note: this script is only intended to work with English.  Other
# languages might work, but the Hunspell affix file needs to be
# compatible with Aspell.
#
# For now the input is expected to be in ASCII or ISO-8859-1.
#
# Usage: munch-list [munch|expand] AFFIX_FILE < INPUT
#

use strict;
use warnings;

use File::Temp;

# Name of the aspell executable (looked up via PATH when it is run).
my $ASPELL = "aspell";

# Print a usage message to STDERR and exit with status 1.
sub usage {
    print STDERR "$0: [munch|expand] AFFIX_FILE < INPUT\n";
    exit 1;
}

# Return the entire contents of the file at $path as a single string.
# Dies if the file cannot be opened.
sub read_file {
    my ($path) = @_;
    open my $in, '<', $path or die "Unable to open $path: $!\n";
    local $/ = undef;    # slurp mode: read the whole file in one go
    my $content = <$in>;
    close $in;
    return $content;
}

# Create (or truncate) the file at $path and write $content to it.
# Dies if the file cannot be created, written, or closed.
sub write_file {
    my ($path, $content) = @_;
    open my $out, '>', $path or die "Unable to create $path: $!\n";
    print {$out} $content or die "Unable to write $path: $!\n";
    # Buffered write errors only surface at close, so check it.
    close $out or die "Unable to write $path: $!\n";
    return;
}

# Run "$ASPELL --local-data-dir $datadir --lang eng $command" and call
# $handle_line->($line) for every line the command prints on its stdout.
# The child inherits this script's STDIN, which is how INPUT reaches aspell.
# Dies if aspell cannot be started or exits with a non-zero status.
sub run_aspell {
    my ($datadir, $command, $handle_line) = @_;
    # List-form pipe open: no shell is involved, arguments are passed as-is.
    open my $pipe, '-|', $ASPELL, '--local-data-dir', "$datadir",
        '--lang', 'eng', $command
        or die "Unable to run $ASPELL: $!\n";
    while (my $line = <$pipe>) {
        $handle_line->($line);
    }
    # close() on a pipe waits for the child and returns false when it
    # failed; $! is 0 in that case and $? holds the wait status.
    unless (close $pipe) {
        die "Error reading from $ASPELL: $!\n" if $!;
        die "$ASPELL $command failed (wait status $?)\n";
    }
    return;
}

usage() unless @ARGV == 2;
my ($action, $affix_fn) = @ARGV;
usage() unless $action eq 'munch' or $action eq 'expand';

my $affix_file = read_file($affix_fn);

# Aspell expects the dictionary to be in ISO-8859-1, so fake it for now.
# The fact that there may be some Hunspell-specific entries in UTF-8
# (such as the ICONV entry) should not be a problem, as Aspell ignores
# them.  The words themselves are already in ISO-8859-1.

$affix_file =~ s/^SET UTF8$/SET ISO8859-1/m;

# Temporary Aspell data directory.  File::Temp removes it when $datadir
# goes out of scope, so the object must stay alive until aspell is done.
my $datadir = File::Temp->newdir();

write_file("$datadir/eng_affix.dat", $affix_file);

# Minimal Aspell language data file that points at the affix file above.
write_file("$datadir/eng.dat",
           "name eng\n"
         . "charset iso8859-1\n"
         . "special ' -*-\n"
         . "affix eng\n");

if ($action eq 'munch') {
    run_aspell($datadir, 'munch-list', sub {
        my ($line) = @_;
        # Drop marker lines from the output.
        # NOTE(review): the "\|" matches a literal pipe, so only lines that
        # start with the exact text "XXX|>>>" are skipped.  If "XXX" or
        # ">>>" (alternation) was intended, the backslash should go;
        # confirm against aspell's actual munch-list output.
        return if $line =~ /^(XXX\|>>>)/;
        print $line;
    });
}
elsif ($action eq 'expand') {
    run_aspell($datadir, 'expand', sub {
        my ($line) = @_;
        # aspell prints several whitespace-separated words per line;
        # emit them one per line.
        print "$_\n" for split ' ', $line;
    });
}