#!/bin/sh
# affix compressor utility for Hunspell
# 2008 (c) László Németh, version 0.3
#
# usage: affixcompress sorted_word_list_file [max_affix_rules]
#
# Reads a sorted word list and writes two files next to it:
#   <word_list>.aff  - "FLAG num" header plus the numbered SFX rules
#   <word_list>.dic  - entry count followed by the sorted "root/flags" entries
# Progress messages of the awk stages go to stderr.  Intermediate files are
# kept in a private mktemp directory that is removed when the script exits.

# profiling
#AWK="pgawk --profile"
AWK="gawk"

# Print the help text to stdout.
usage() {
  cat <<'EOF'
affixcompress - compress a huge sorted word list to Hunspell format
Usage:

LC_ALL=C sort word_list >sorted_word_list
affixcompress sorted_word_list [max_affix_rules]

Default value of max_affix_rules = 5000

Note: output may need manually added affix parameters (SET character_encoding,
TRY suggestion_characters etc., see man(4) hunspell)
EOF
}

# Run the whole compression.
# Arguments: $1 - sorted word list, $2 - optional maximum number of affix rules
# Returns:   0 on success (or when only the usage text was printed),
#            1 if the word list is unreadable or no temp directory can be made,
#            2 if max_affix_rules is not a non-negative integer.
#
# $AWK is expanded unquoted on purpose so that a value carrying options (such
# as the profiling variant above) still splits into command and arguments.
# shellcheck disable=SC2086
main() {
  case $# in
  0)
    usage
    return 0
    ;;
  esac

  MAXAFFIX=${2:-5000}
  case $MAXAFFIX in
  '' | *[!0-9]*)
    printf 'affixcompress: max_affix_rules must be a non-negative integer: %s\n' \
      "$MAXAFFIX" >&2
    return 2
    ;;
  esac

  # Check the input before the previous outputs are deleted.
  if [ ! -r "$1" ] || [ -d "$1" ]; then
    printf 'affixcompress: cannot read word list: %s\n' "$1" >&2
    return 1
  fi

  tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/affixcompress.XXXXXX") || return 1
  trap 'rm -rf -- "$tmpdir"' EXIT
  trap 'exit 1' HUP INT TERM

  rm -f -- "$1.aff" "$1.dic"

  # Stage 1: count candidate suffixes.  Output lines are
  #   "<suffix> 0 <count>"          for suffixes appended to a whole word
  #   "<suffix> <char> <count>"     for suffixes replacing a final character
  # (tr turns the awk SUBSEP of the two-part sfy keys into a space).
  $AWK '
{
    # calculate frequent suffixes
    A[$1] = 1
    len = length($1)
    if (len > 2) {
        # word minus its last character -> that last character
        B[substr($1, 1, len - 1)] = substr($1, len, 1)
    }
    for (i = 2; i < len; i++) {
        r = substr($1, 1, i)
        if (i == 2) {
            if (prev != r) {
                # New two-character prefix: forget what was remembered for
                # the previous one.
                # NOTE(review): this also drops the B entry stored just
                # above for the current word - confirm that is intended.
                delete A
                delete B
                print "Deleted roots: ", prev > "/dev/stderr"
                A[$1] = 1
            }
            prev = r
        }
        if (A[r]) {
            # r is itself a word seen earlier: the rest is a plain suffix
            sfx[substr($1, i + 1, len - i + 1)]++
        } else if (B[r] && B[r] != substr($1, i + 1, 1)) {
            # r plus the character B[r] was a word: the rest replaces B[r]
            r2 = substr($1, i + 1, len - i + 1)
            sfy[r2,B[r]]++
        }
    }
}
END {
    for (i in sfx) print i, 0, sfx[i]
    for (i in sfy) print i, sfy[i]
}
' <"$1" | tr '\034' ' ' >"$tmpdir/suffixes.all"

  # Stage 2: keep the MAXAFFIX most frequent suffixes, most frequent first.
  sort -rn -k 3 "$tmpdir/suffixes.all" |
    $AWK '$3 >= 1' |
    head -n "$MAXAFFIX" >"$tmpdir/suffixes.top"

  # Stage 3: read the suffix table from stdin ("-"), then the word list.
  # The line number of a suffix in the table becomes its flag number.
  # Prints "root" or "root flag" lines and writes the .aff file.
  $AWK '
function potential_roots() {
    # potential roots with most frequent suffixes
    for (word in W) if (W[word] == 1) {
        len = length(word)
        for (i = 2; i < len; i++) {
            root = substr(word, 1, i)
            suff = substr(word, i + 1, len - i + 1)
            if ((W[root] != "") && (sfxfr[suff] > 100)) C[root]++
            if (sfz[suff]) {
                l = split(sfz[suff], a)
                # NOTE(review): sfyfr is only ever stored under
                # [suffix, strip_char] keys, so sfyfr[root a[k]] looks like
                # it can never be set and this test never succeeds;
                # sfyfr[suff, a[k]] may have been intended - confirm.
                for (k = 1; k <= l; k++) if ((W[root a[k]] != "") && (sfyfr[root a[k]] > 100)) {
                    C[root a[k]]++
                }
            }
        }
    }

    # calculate roots
    for (word in W) if (W[word] == 1) {
        len = length(word)
        z = 0
        # choose most frequent root (maybe the original word)
        max = C[word]
        maxword = word
        maxsuff = 0
        for (i = 2; i < len; i++) {
            root = substr(word, 1, i)
            suff = substr(word, i + 1, len - i + 1)
            if ((sfx[suff] != "") && (C[root] > max)) {
                max = C[root]
                maxword = root
                maxsuff = sfx[suff]
            }
            if (sfz[suff] != "") {
                l = split(sfz[suff], a)
                for (k = 1; k <= l; k++) if (C[root a[k]] > max) {
                    max = C[root a[k]]
                    maxword = root a[k]
                    maxsuff = sfy[suff,a[k]]
                }
            }
        }
        if (max > 0) {
            if (maxsuff > 0)
                print maxword, maxsuff
            else
                print maxword
            A[maxword]++
            z = 1
        } else {
            # no counted root: fall back to a root already printed
            for (i = 2; i < len; i++) {
                root = substr(word, 1, i)
                suff = substr(word, i + 1, len - i + 1)
                if ((A[root] > 0) && sfx[suff] != "") {
                    print root, sfx[suff]
                    z = 1
                    break
                }
                if (sfz[suff]) {
                    l = split(sfz[suff], a)
                    # this break leaves only the k loop; the i loop goes on
                    for (k = 1; k <= l; k++) if (A[root a[k]] != "") {
                        print root a[k], sfy[suff,a[k]]
                        z = 1
                        break
                    }
                }
            }
        }
        if (z == 0) {
            # nothing matched: the word is its own dictionary entry
            print word
            A[word]++
        }
    }
    delete A
    delete C
}

# suffix table (stdin): field 2 is 0 for plain suffixes, else the character
# that the suffix replaces
FILENAME == "-" {
    if ($2 == 0) {
        sfx[$1] = NR
        sfxfr[$1] = $3
    } else {
        sfy[$1,$2] = NR
        sfyfr[$1,$2] = $3
        sfz[$1] = sfz[$1] " " $2
    }
    next
}

# word list: words are processed in classes sharing their first 3 characters
{
    cap = substr($1, 1, 3)
    if (cap != prev) {
        potential_roots()
        delete W
        print "Deleted class:", prev > "/dev/stderr"
    }
    prev = cap
    W[$1] = 1
}

END {
    potential_roots()
    # write out frequent suffixes
    out = FILENAME ".aff"
    print "FLAG num" >out
    for (i in sfx) if (sfx[i] > 0) {
        print "SFX", sfx[i], "Y 1" >out
        print "SFX", sfx[i], "0", i, "." >out
    }
    for (i in sfy) if (sfy[i] > 0) {
        print "SFX", sfy[i], "Y 1" >out
        split(i, c, "\034")
        print "SFX", sfy[i], c[2], c[1], c[2] >out
    }
}
' - "$1" <"$tmpdir/suffixes.top" >"$tmpdir/roots"

  # Stage 4: merge all flags of the same root into one "root/f1,f2,..." entry.
  sort -n -k 2 "$tmpdir/roots" |
    $AWK '
{
    if (A[$1] == "") A[$1] = $2
    else if ($2 != "") A[$1] = A[$1] "," $2
}
END {
    for (i in A) {
        if (A[i] == "") print i
        else print i "/" A[i]
    }
}
' | sort >"$tmpdir/entries"

  # The .dic file is the number of entries followed by the entries.  Both
  # parts are written by one redirection so there is a single writer.
  {
    $AWK 'END { print NR }' "$tmpdir/entries"
    cat "$tmpdir/entries"
  } >"$1.dic"
}

main "$@"