#ifndef lint static char Rcs_Id[] = "$Id$"; #endif /* * lookup.c - see if a word appears in the dictionary * * Pace Willisson, 1983 * * Copyright 1987, 1988, 1989, 1992, 1993, Geoff Kuenning, Granada Hills, CA * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All modifications to the source code must be clearly marked as * such. Binary redistributions based on modified source code * must be clearly marked as modified versions in the documentation * and/or other materials provided with the distribution. * 4. All advertising materials mentioning features or use of this software * must display the following acknowledgment: * This product includes software developed by Geoff Kuenning and * other unpaid contributors. * 5. The name of Geoff Kuenning may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Log$ * Revision 1.7 1999/09/29 23:33:32 justin * Updates to the underlying ispell-based code to support suggested corrections. * * Revision 1.6 1999/04/13 17:12:51 jeff * Applied "Darren O. Benham" spell check changes. * Fixed crash on Win32 with the new code. * * Revision 1.5 1999/01/07 01:07:48 paul * Fixed spell leaks. * * Revision 1.5 1999/01/07 01:07:48 paul * Fixed spell leaks. * * Revision 1.4 1998/12/29 14:55:33 eric * * I've doctored the ispell code pretty extensively here. It is now * warning-free on Win32. It also *works* on Win32 now, since I * replaced all the I/O calls with ANSI standard ones. * * Revision 1.3 1998/12/28 23:11:30 eric * * modified spell code and integration to build on Windows. * This is still a hack. * * Actually, it doesn't yet WORK on Windows. It just builds. * SpellCheckInit is failing for some reason. * * Revision 1.2 1998/12/28 22:16:22 eric * * These changes begin to incorporate the spell checker into AbiWord. Most * of this is a hack. * * 1. added other/spell to the -I list in config/abi_defs * 2. replaced other/spell/Makefile with one which is more like * our build system. * 3. added other/spell to other/Makefile so that the build will now * dive down and build the spell check library. * 4. added the AbiSpell library to the Makefiles in wp/main * 5. added a call to SpellCheckInit in wp/main/unix/UnixMain.cpp. * This call is a HACK and should be replaced with something * proper later. * 6. added code to fv_View.cpp as follows: * whenever you double-click on a word, the spell checker * verifies that word and prints its status to stdout. * * Caveats: * 1. This will break the Windows build. I'm going to work on fixing it * now. * 2. This only works if your dictionary is in /usr/lib/ispell/american.hash. * The dictionary location is currently hard-coded. This will be * fixed as well. * * Anyway, such as it is, it works. * * Revision 1.1 1998/12/28 18:04:43 davet * Spell checker code stripped from ispell. At this point, there are * two external routines... the Init routine, and a check-a-word routine * which returns a boolean value, and takes a 16 bit char string. * The code resembles the ispell code as much as possible still. * * Revision 1.42 1995/01/08 23:23:42 geoff * Support MSDOS_BINARY_OPEN when opening the hash file to read it in. * * Revision 1.41 1994/01/25 07:11:51 geoff * Get rid of all old RCS log lines in preparation for the 3.1 release. * */ #include #include #include #include "ispell.h" #include "msgs.h" int linit P ((char *)); #ifdef INDEXDUMP static void dumpindex P ((struct flagptr * indexp, int depth)); #endif /* INDEXDUMP */ static void clearindex P ((struct flagptr * indexp)); struct dent * ispell_lookup P ((ichar_t * word, int dotree)); static void initckch P ((char *)); int gnMaskBits = 64; static int inited = 0; int linit (hashname) char *hashname; /* name of the hash file (dictionary) */ { FILE* fpHash; register int i; register struct dent * dp; struct flagent * entry; struct flagptr * ind; int nextchar, x; int viazero; register ichar_t * cp; if (inited) return 0; if ((fpHash = fopen (hashname, "rb")) == NULL) { (void) fprintf (stderr, CANT_OPEN, hashname); return (-1); } hashsize = fread ((char *) &hashheader, 1, sizeof hashheader, fpHash); if (hashsize < (int)sizeof(hashheader)) { if (hashsize < 0) (void) fprintf (stderr, LOOKUP_C_CANT_READ, hashname); else if (hashsize == 0) (void) fprintf (stderr, LOOKUP_C_NULL_HASH, hashname); else (void) fprintf (stderr, LOOKUP_C_SHORT_HASH (hashname, hashsize, (int) sizeof hashheader)); return (-1); } else if (hashheader.magic != MAGIC) { (void) fprintf (stderr, LOOKUP_C_BAD_MAGIC (hashname, (unsigned int) MAGIC, (unsigned int) hashheader.magic)); return (-1); } else if (hashheader.magic2 != MAGIC) { (void) fprintf (stderr, LOOKUP_C_BAD_MAGIC2 (hashname, (unsigned int) MAGIC, (unsigned int) hashheader.magic2)); return (-1); } /* else if (hashheader.compileoptions != COMPILEOPTIONS*/ else if ( 1 != 1 || hashheader.maxstringchars != MAXSTRINGCHARS || hashheader.maxstringcharlen != MAXSTRINGCHARLEN) { (void) fprintf (stderr, LOOKUP_C_BAD_OPTIONS ((unsigned int) hashheader.compileoptions, hashheader.maxstringchars, hashheader.maxstringcharlen, (unsigned int) COMPILEOPTIONS, MAXSTRINGCHARS, MAXSTRINGCHARLEN)); return (-1); } #if 0 /* DELETE_ME */ if (nodictflag) { /* * Dictionary is not needed - create an empty dummy table. We * actually have to have one entry since the hash * algorithm involves a divide by the table size * (actually modulo, but zero is still unacceptable). * So we create an empty entry. */ hashsize = 1; /* This prevents divides by zero */ hashtbl = (struct dent *) calloc (1, sizeof (struct dent)); if (hashtbl == NULL) { (void) fprintf (stderr, LOOKUP_C_NO_HASH_SPACE); return (-1); } hashtbl[0].word = NULL; hashtbl[0].next = NULL; hashtbl[0].flagfield &= ~(USED | KEEP); /* The flag bits don't matter, but calloc cleared them. */ hashstrings = (char *) malloc ((unsigned) hashheader.lstringsize); } else #endif /* DELETE_ME */ { hashtbl = (struct dent *) calloc ((unsigned) hashheader.tblsize, sizeof (struct dent)); hashsize = hashheader.tblsize; hashstrings = (char *) malloc ((unsigned) hashheader.stringsize); } numsflags = hashheader.stblsize; numpflags = hashheader.ptblsize; sflaglist = (struct flagent *) malloc ((numsflags + numpflags) * sizeof (struct flagent)); if (hashtbl == NULL || hashstrings == NULL || sflaglist == NULL) { (void) fprintf (stderr, LOOKUP_C_NO_HASH_SPACE); return (-1); } pflaglist = sflaglist + numsflags; { if( fread ( hashstrings, 1, (unsigned)hashheader.stringsize, fpHash) != ((size_t)(hashheader.stringsize)) ) { (void) fprintf (stderr, LOOKUP_C_BAD_FORMAT); (void) fprintf (stderr, "stringsize err\n" ); return (-1); } if ( hashheader.compileoptions & 0x04 ) { if( fread ((char *) hashtbl, 1, (unsigned)hashheader.tblsize * sizeof(struct dent), fpHash) != ((size_t) (hashheader.tblsize * sizeof (struct dent)))) { (void) fprintf (stderr, LOOKUP_C_BAD_FORMAT); return (-1); } } else { for( x=0; x= 0; dp++) { if (dp->word == (char *) -1) dp->word = NULL; else dp->word = &hashstrings [ (int)(dp->word) ]; if (dp->next == (struct dent *) -1) dp->next = NULL; else dp->next = &hashtbl [ (int)(dp->next) ]; } } for (i = numsflags + numpflags, entry = sflaglist; --i >= 0; entry++) { if (entry->stripl) entry->strip = (ichar_t *) &hashstrings[(int) entry->strip]; else entry->strip = NULL; if (entry->affl) entry->affix = (ichar_t *) &hashstrings[(int) entry->affix]; else entry->affix = NULL; } /* ** Warning - 'entry' and 'i' are reset in the body of the loop ** below. Don't try to optimize it by (e.g.) moving the decrement ** of i into the loop condition. */ for (i = numsflags, entry = sflaglist; i > 0; i--, entry++) { if (entry->affl == 0) { cp = NULL; ind = &sflagindex[0]; viazero = 1; } else { cp = entry->affix + entry->affl - 1; ind = &sflagindex[*cp]; viazero = 0; while (ind->numents == 0 && ind->pu.fp != NULL) { if (cp == entry->affix) { ind = &ind->pu.fp[0]; viazero = 1; } else { ind = &ind->pu.fp[*--cp]; viazero = 0; } } } if (ind->numents == 0) ind->pu.ent = entry; ind->numents++; /* ** If this index entry has more than MAXSEARCH flags in ** it, we will split it into subentries to reduce the ** searching. However, the split doesn't make sense in ** two cases: (a) if we are already at the end of the ** current affix, or (b) if all the entries in the list ** have identical affixes. Since the list is sorted, (b) ** is true if the first and last affixes in the list ** are identical. */ if (!viazero && ind->numents >= MAXSEARCH && icharcmp (entry->affix, ind->pu.ent->affix) != 0) { /* Sneaky trick: back up and reprocess */ entry = ind->pu.ent - 1; /* -1 is for entry++ in loop */ i = numsflags - (entry - sflaglist); ind->pu.fp = (struct flagptr *) calloc ((unsigned) (SET_SIZE + hashheader.nstrchars), sizeof (struct flagptr)); if (ind->pu.fp == NULL) { (void) fprintf (stderr, LOOKUP_C_NO_LANG_SPACE); return (-1); } ind->numents = 0; } } /* ** Warning - 'entry' and 'i' are reset in the body of the loop ** below. Don't try to optimize it by (e.g.) moving the decrement ** of i into the loop condition. */ for (i = numpflags, entry = pflaglist; i > 0; i--, entry++) { if (entry->affl == 0) { cp = NULL; ind = &pflagindex[0]; viazero = 1; } else { cp = entry->affix; ind = &pflagindex[*cp++]; viazero = 0; while (ind->numents == 0 && ind->pu.fp != NULL) { if (*cp == 0) { ind = &ind->pu.fp[0]; viazero = 1; } else { ind = &ind->pu.fp[*cp++]; viazero = 0; } } } if (ind->numents == 0) ind->pu.ent = entry; ind->numents++; /* ** If this index entry has more than MAXSEARCH flags in ** it, we will split it into subentries to reduce the ** searching. However, the split doesn't make sense in ** two cases: (a) if we are already at the end of the ** current affix, or (b) if all the entries in the list ** have identical affixes. Since the list is sorted, (b) ** is true if the first and last affixes in the list ** are identical. */ if (!viazero && ind->numents >= MAXSEARCH && icharcmp (entry->affix, ind->pu.ent->affix) != 0) { /* Sneaky trick: back up and reprocess */ entry = ind->pu.ent - 1; /* -1 is for entry++ in loop */ i = numpflags - (entry - pflaglist); ind->pu.fp = (struct flagptr *) calloc (SET_SIZE + hashheader.nstrchars, sizeof (struct flagptr)); if (ind->pu.fp == NULL) { (void) fprintf (stderr, LOOKUP_C_NO_LANG_SPACE); return (-1); } ind->numents = 0; } } #ifdef INDEXDUMP (void) fprintf (stderr, "Prefix index table:\n"); dumpindex (pflagindex, 0); (void) fprintf (stderr, "Suffix index table:\n"); dumpindex (sflagindex, 0); #endif if (hashheader.nstrchartype == 0) chartypes = NULL; else { chartypes = (struct strchartype *) malloc (hashheader.nstrchartype * sizeof (struct strchartype)); if (chartypes == NULL) { (void) fprintf (stderr, LOOKUP_C_NO_LANG_SPACE); return (-1); } for (i = 0, nextchar = hashheader.strtypestart; i < hashheader.nstrchartype; i++) { chartypes[i].name = &hashstrings[nextchar]; nextchar += strlen (chartypes[i].name) + 1; chartypes[i].deformatter = &hashstrings[nextchar]; nextchar += strlen (chartypes[i].deformatter) + 1; chartypes[i].suffixes = &hashstrings[nextchar]; while (hashstrings[nextchar] != '\0') nextchar += strlen (&hashstrings[nextchar]) + 1; nextchar++; } } inited = 1; initckch(NULL); return (0); } #define FREEP(p) do { if (p) free(p); } while (0) static void initckch (wchars) char * wchars; /* Characters in -w option, if any */ { register ichar_t c; char num[4]; for (c = 0; c < (ichar_t) (SET_SIZE + hashheader.nstrchars); ++c) { if (iswordch (c)) { if (!mylower (c)) { Try[Trynum] = c; ++Trynum; } } else if (isboundarych (c)) { Try[Trynum] = c; ++Trynum; } } if (wchars != NULL) { while (Trynum < SET_SIZE && *wchars != '\0') { if (*wchars != 'n' && *wchars != '\\') { c = *wchars; ++wchars; } else { ++wchars; num[0] = '\0'; num[1] = '\0'; num[2] = '\0'; num[3] = '\0'; if (isdigit (wchars[0])) { num[0] = wchars[0]; if (isdigit (wchars[1])) { num[1] = wchars[1]; if (isdigit (wchars[2])) num[2] = wchars[2]; } } if (wchars[-1] == 'n') { wchars += strlen (num); c = atoi (num); } else { wchars += strlen (num); c = 0; if (num[0]) c = num[0] - '0'; if (num[1]) { c <<= 3; c += num[1] - '0'; } if (num[2]) { c <<= 3; c += num[2] - '0'; } } } /* c &= NOPARITY;*/ if (!hashheader.wordchars[c]) { hashheader.wordchars[c] = 1; hashheader.sortorder[c] = hashheader.sortval++; Try[Trynum] = c; ++Trynum; } } } } void lcleanup(void) { clearindex (pflagindex); clearindex (sflagindex); FREEP(hashtbl); FREEP(hashstrings); FREEP(sflaglist); FREEP(chartypes); } static void clearindex (indexp) register struct flagptr * indexp; { register int i; for (i = 0; i < SET_SIZE + hashheader.nstrchars; i++, indexp++) { if (indexp->numents == 0 && indexp->pu.fp != NULL) { clearindex(indexp->pu.fp); free(indexp->pu.fp); } } } #ifdef INDEXDUMP static void dumpindex (indexp, depth) register struct flagptr * indexp; register int depth; { register int i; int j; int k; char stripbuf[INPUTWORDLEN + 4 * MAXAFFIXLEN + 4]; for (i = 0; i < SET_SIZE + hashheader.nstrchars; i++, indexp++) { if (indexp->numents == 0 && indexp->pu.fp != NULL) { for (j = depth; --j >= 0; ) (void) putc (' ', stderr); if (i >= ' ' && i <= '~') (void) putc (i, stderr); else (void) fprintf (stderr, "0x%x", i); (void) putc ('\n', stderr); dumpindex (indexp->pu.fp, depth + 1); } else if (indexp->numents) { for (j = depth; --j >= 0; ) (void) putc (' ', stderr); if (i >= ' ' && i <= '~') (void) putc (i, stderr); else (void) fprintf (stderr, "0x%x", i); (void) fprintf (stderr, " -> %d entries\n", indexp->numents); for (k = 0; k < indexp->numents; k++) { for (j = depth; --j >= 0; ) (void) putc (' ', stderr); if (indexp->pu.ent[k].stripl) { (void) ichartostr (stripbuf, indexp->pu.ent[k].strip, sizeof stripbuf, 1); (void) fprintf (stderr, " entry %d (-%s,%s)\n", &indexp->pu.ent[k] - sflaglist, stripbuf, indexp->pu.ent[k].affl ? ichartosstr (indexp->pu.ent[k].affix, 1) : "-"); } else (void) fprintf (stderr, " entry %d (%s)\n", &indexp->pu.ent[k] - sflaglist, ichartosstr (indexp->pu.ent[k].affix, 1)); } } } } #endif /* n is length of s */ struct dent * ispell_lookup (s, dotree) register ichar_t * s; int dotree; { register struct dent * dp; register char * s1; char schar[INPUTWORDLEN + MAXAFFIXLEN]; dp = &hashtbl[hash (s, hashsize)]; if (ichartostr (schar, s, sizeof schar, 1)) (void) fprintf (stderr, WORD_TOO_LONG (schar)); for ( ; dp != NULL; dp = dp->next) { /* quick strcmp, but only for equality */ s1 = dp->word; if (s1 && s1[0] == schar[0] && strcmp (s1 + 1, schar + 1) == 0) return dp; #ifndef NO_CAPITALIZATION_SUPPORT while (dp->flagfield & MOREVARIANTS) /* Skip variations */ dp = dp->next; #endif } #if 0 /* NO PERSONAL DICTIONARY LOOKUP NOW 12/98 */ if (dotree) { dp = treelookup (s); return dp; } else #endif /* NO PERSONAL DICTIONARY LOOKUP NOW 12/98 */ return NULL; }