Distribute all SCOWL files as UTF-8

[deb_pkgs/scowl.git] / speller / make-hunspell-dict
diff --git a/speller/make-hunspell-dict b/speller/make-hunspell-dict

index c09108382d409cf639bc46a9ee5cce257f4bf11f..3d02ddaf1c0f83738eea61fb1744eeea4cd00cbf 100755 (executable)
--- a/speller/make-hunspell-dict
+++ b/speller/make-hunspell-dict
@@ -1,7 +1,10 @@
-#!/bin/bash
+#!/usr/bin/env bash
  
  ASPELL=aspell
  HUNSPELL=hunspell
+: "${SCOWL:=..}"  # SCOWL tree root (mk-list, final/, misc/, speller/); env override
+SPELLER="$SCOWL/speller"
+: "${UNIX2DOS:=unix2dos}"  # line-ending converter for the wordlist .txt; env override
  
  set -e
  
@@ -10,60 +13,98 @@ export LC_ALL=C
  export LC_CTYPE=C
  export LC_COLLATE=C
  
-prep() {
-  echo prep 
+SIZE=60  # last mk-list argument used for the standard (non "-large") dictionaries
  
-  cat misc/{offensive.1,offensive.2,profane.1} | sort -u > nosug
+mk-list() { "$SCOWL/mk-list" -d "$SCOWL/final" "$@"; }  # run SCOWL's mk-list on its final/ lists
  
-  cp en.aff eng_affix.dat
+prep() {  # Write ./nosug: sorted, unique offensive/profane words (used by doit).
+  echo prep
  
-  cat << EOF > eng.dat
-name eng
-charset iso8859-1
-special ' -*-
-affix eng
-EOF
+  cat "$SCOWL"/misc/{offensive.1,offensive.2,profane.1} | sort -u > nosug
  }
  
  doit() {
-  echo creating en_$1.dic
+  echo creating $1.dic
  
-  cat en-common.wl en_$1-wo_accents-only.wl | sort -u > en_$1.0
+  eval $2 | sort -u > $1.0
  
-  comm en_$1.0 nosug -12 > en_$1-nosug.1
+  comm -12 $1.0 nosug > $1-nosug.1
  
-  comm en_$1.0 nosug -23 > en_$1.1
+  comm -23 $1.0 nosug > $1.1
  
-  $ASPELL -l ./eng munch-list < en_$1-nosug.1 | grep -v '^\(XXX\|>>>\)' | ./add-no-suggest > en_$1.2
+  $SPELLER/munch-list munch $SPELLER/en.aff < $1-nosug.1 | $SPELLER/add-no-suggest > $1.2
  
-  $ASPELL -l ./eng munch-list < en_$1.1 | grep -v '^\(XXX\|>>>\)' >> en_$1.2
+  $SPELLER/munch-list munch $SPELLER/en.aff < $1.1 >> $1.2
  
-  cat en.dic.supp >> en_$1.2
+  cat $SPELLER/en.dic.supp >> $1.2
  
-  wc -l en_$1.2 | cut -d' ' -f1 > en_$1.dic
-  cat en_$1.2 | sort >> en_$1.dic
+  wc -l < $1.2 | tr -d '[:blank:]' > $1.dic
+  cat $1.2 | sort | iconv -f iso-8859-1 -t utf-8 >> $1.dic
  
-  cp en.aff en_$1.aff
+  cp $SPELLER/en.aff $1.aff
    
-  cat README_en.txt.in ../Copyright > README_en_$1.txt
-  echo >> README_en_$1.txt
-  echo "Build Date: `date`" >> README_en_$1.txt
+  if [ "$SCOWL_VERSION" ]; then
+    fn="$1-$SCOWL_VERSION"
+  else
+    fn="$1"
+  fi
  
-  zip -9 en_$1.zip README_en_$1.txt en_$1.dic en_$1.aff
+  WHAT="$1 Hunspell Dictionary" sh $SPELLER/README_en.txt.sh > README_$1.txt
+  if [ -z "$3" ]; then
+    echo "Wordlist Command: $2" >> README_$1.txt
+  else
+    cat $3 >> README_$1.txt
+  fi
  
-  cp -p en_$1.zip hunspell/
+  rm -f hunspell-$fn.zip
+  zip -9 hunspell-$fn.zip README_$1.txt $1.dic $1.aff
  
-  #echo check
+  if [ -z "$3" ]; then
+    mkdir -p hunspell
+    cp hunspell-$fn.zip hunspell/
  
-  cat en_$1-nosug.1 en_$1.1 | sort -u > en_$1.tocheck
+    #echo check
  
-  #hunspell -l -d ./en_$1 < en_$1.dic.tocheck > misspelled
-}
+    cat $1-nosug.1 $1.1 | sort -u > $1.tocheck
+
+    #hunspell -l -d ./$1 < $1.dic.tocheck > misspelled
  
+    cat $1.tocheck | iconv -f iso-8859-1 -t utf-8 | $UNIX2DOS > $1.txt
+    cat <<EOF > README.txt
+This zip file contains the words found in the corresponding Hunspell
+dictionary.  See the file README_$1.txt.
+EOF
+    zip -9 hunspell/wordlist-$fn.zip README.txt README_$1.txt $1.txt
+  fi
+}
  
  prep
  
-doit US
-doit CA
+# "-all" builds every stock dictionary; "-one" builds a single dictionary
+# from a word list read on stdin.
+if [ "$1" = "-all" ]; then
+
+  doit en_US "mk-list --accents=strip en_US $SIZE"
+  doit en_CA "mk-list --accents=strip en_CA $SIZE"
+  doit en_GB-ize "mk-list --accents=strip en_GB-ize $SIZE"
+  doit en_GB-ise "mk-list --accents=strip en_GB-ise $SIZE"
+  doit en_AU "mk-list --accents=strip en_AU $SIZE"
+
+  doit en_US-large "mk-list -v1 --accents=both en_US 70"
+  doit en_CA-large "mk-list -v1 --accents=both en_CA 70"
+  doit en_GB-large "mk-list -v1 --accents=both en_GB-ize en_GB-ise 70"
+  doit en_AU-large "mk-list -v1 --accents=both en_AU 70"
+
+  # Top-level README for hunspell/, which the doit calls above created.
+  sh "$SPELLER/README_en.txt.sh" > hunspell/README
+
+elif [ "$1" = "-one" ] && [ -n "$2" ] && [ -n "$3" ]; then
+  # "cat" makes doit take the word list from this script's stdin.
+  doit "$2" "cat" "$3"
+
+else
+  echo "usage: $0 -all | -one <dict-name> <parms file>" >&2
+  exit 1
+fi
  
-rm eng*.dat nosug en_US*.? en_CA*.?
+#rm eng*.dat nosug en_US*.? en_CA*.?