/* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* enchant * Copyright (C) 2003, 2004 Dom Lachowicz * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02110-1301, USA. * * In addition, as a special exception, Dom Lachowicz * gives permission to link the code of this program with * non-LGPL Spelling Provider libraries (eg: a MSFT Office * spell checker backend) and distribute linked combinations including * the two. You must obey the GNU Lesser General Public License in all * respects for all of the code used other than said providers. If you modify * this file, you may extend this exception to your version of the * file, but you are not obligated to do so. If you do not wish to * do so, delete this exception statement from your version. */ /** * * This file implements personal word list (PWL) dictionaries in the * type EnchantPWL. * * Under the hood, a PWL is stored as a Trie. Checking strings for * correctness and making suggestions is done by traversing the Trie * while allowing a certain number of miss-steps. Due to the prefix * compression of the Trie, this allows all strings in the PWL within * a given edit distance of the target word to be enumerated quite * efficiently. * * Ideas for the future: * * - Order the suggestions first by edit distance, then by * soundex key, to put the most similar sounding suggestions * at the front of the list. Would need a "soundex" that is * general enough to handle languages other than English. * * - iterative deepending to find suggestions, rather than a straight * search to depth three. * */ #include #include #include #include #include #include #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN #include #include #endif #include #include #include "enchant-provider.h" #include "pwl.h" #if defined(_MSC_VER) #pragma warning(disable: 4996) /* The POSIX name for this item is deprecated. Instead, use the ISO C++ conformant name. */ #endif #if defined(HAVE_FLOCK) || defined(HAVE_LOCKF) #include #include #endif /* HAVE_FLOCK || HAVE_LOCKF */ #define ENCHANT_PWL_MAX_ERRORS 3 #define ENCHANT_PWL_MAX_SUGGS 15 /* A PWL dictionary is stored as a Trie-like data structure EnchantTrie. * The EnchantTrie datatype is completely recursive - all child nodes * are simply EnchantTrie pointers. This means that all functions * that potentially modify a trie need to return the modified trie, * as additional memory may have been allocated. * * The empty trie is simply the null pointer. If a trie contains * a single string, it is recorded in the "value" attribute and * "subtries" is set to NULL. When two or more strings are contained, * "value" is NULL and "subtries" is a GHashTable mapping the first * character of each string to the subtrie containing the remainder of * that string. * * All strings stored in the Trie are assumed to be in UTF format. * Branching is done on unicode characters, not individual bytes. */ typedef struct str_enchant_trie EnchantTrie; struct str_enchant_trie { char* value; /* final string found under this node */ GHashTable* subtries; /* Maps characters to subtries */ }; struct str_enchant_pwl { EnchantTrie* trie; char * filename; time_t file_changed; GHashTable *words_in_trie; }; /* Special Trie node indicating the end of a string */ static EnchantTrie n_EOSTrie; static EnchantTrie* EOSTrie = &n_EOSTrie; /* mode for searching trie */ typedef enum enum_matcher_mode EnchantTrieMatcherMode; enum enum_matcher_mode { case_sensitive, case_insensitive }; /* The EnchantTrieMatcher structure maintains the state necessary to * search for matching strings within an EnchantTrie. It includes a * callback function which will be called with each matching string * as it is found. The arguments to this function are: * * - a freshly-allocated copy of the matching string, which must * be freed by external code * - the EnchantTrieMatcher object, giving the context of the match * (e.g. number of errors) */ typedef struct str_enchant_trie_matcher EnchantTrieMatcher; struct str_enchant_trie_matcher { int num_errors; /* Num errors encountered so far. */ int max_errors; /* Max errors before search should terminate */ char* word; /* Word being searched for */ ssize_t word_pos; /* Current position in the word */ char* path; /* Path taken through the trie so far */ ssize_t path_len; /* Length of allocated path string */ ssize_t path_pos; /* Current end pos of path string */ EnchantTrieMatcherMode mode; void (*cbfunc)(char*,EnchantTrieMatcher*); /* callback func */ void* cbdata; /* Private data for use by callback func */ }; /* To allow the list of suggestions to be built up an item at a time, * its state is maintained in an EnchantSuggList object. */ typedef struct str_enchant_sugg_list { char** suggs; int* sugg_errs; size_t n_suggs; } EnchantSuggList; /* * Function Prototypes */ static void enchant_pwl_add_to_trie(EnchantPWL *pwl, const char *const word, size_t len); static void enchant_pwl_refresh_from_file(EnchantPWL* pwl); static void enchant_pwl_check_cb(char* match,EnchantTrieMatcher* matcher); static void enchant_pwl_suggest_cb(char* match,EnchantTrieMatcher* matcher); static EnchantTrie* enchant_trie_init(void); static void enchant_trie_free(EnchantTrie* trie); static void enchant_trie_free_cb(void*,void*,void*); static EnchantTrie* enchant_trie_insert(EnchantTrie* trie,const char *const word); static void enchant_trie_remove(EnchantTrie* trie,const char *const word); static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matcher); static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcherV); static EnchantTrieMatcher* enchant_trie_matcher_init(const char* const word, size_t len, int maxerrs, EnchantTrieMatcherMode mode, void(*cbfunc)(char*,EnchantTrieMatcher*), void* cbdata); static void enchant_trie_matcher_free(EnchantTrieMatcher* matcher); static void enchant_trie_matcher_pushpath(EnchantTrieMatcher* matcher,char* newchars); static void enchant_trie_matcher_poppath(EnchantTrieMatcher* matcher,int num); static int edit_dist(const char* word1, const char* word2); static void enchant_lock_file (FILE * f) { #if defined(HAVE_FLOCK) flock (fileno (f), LOCK_EX); #elif defined(HAVE_LOCKF) lockf (fileno (f), F_LOCK, 0); #elif defined(_WIN32) OVERLAPPED overlapped; overlapped.Offset = 0; overlapped.OffsetHigh = 0; overlapped.hEvent = NULL; if (!LockFileEx ((HANDLE) _get_osfhandle (fileno (f)), LOCKFILE_EXCLUSIVE_LOCK, 0, 0, 0x80000000, &overlapped)) g_warning ("Could not lock file\n"); #else /* TODO: UNIX fcntl. This race condition probably isn't too bad. */ #endif /* HAVE_FLOCK */ } static void enchant_unlock_file (FILE * f) { #if defined(HAVE_FLOCK) flock (fileno (f), LOCK_UN); #elif defined(HAVE_LOCKF) lockf (fileno (f), F_ULOCK, 0); #elif defined(_WIN32) OVERLAPPED overlapped; overlapped.Offset = 0; overlapped.OffsetHigh = 0; overlapped.hEvent = NULL; if (!UnlockFileEx ((HANDLE) _get_osfhandle (fileno (f)), 0, 0, 0x80000000, &overlapped)) g_warning ("Could not unlock file\n"); #else /* TODO: UNIX fcntl. This race condition probably isn't too bad. */ #endif /* HAVE_FLOCK */ } /** * enchant_pwl_init * * Returns: a new PWL object used to store/check/suggest words. */ EnchantPWL* enchant_pwl_init(void) { EnchantPWL *pwl; pwl = g_new0(EnchantPWL, 1); pwl->words_in_trie = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); return pwl; } #ifndef BUFSIZ #define BUFSIZ 1024 #endif /** * enchant_pwl_init_with_file * * Returns: a new PWL object used to store/check/suggest words * or NULL if the file cannot be opened or created */ EnchantPWL* enchant_pwl_init_with_file(const char * file) { FILE* fd; EnchantPWL *pwl; g_return_val_if_fail (file != NULL, NULL); fd = enchant_fopen(file, "ab+"); if(fd == NULL) { return NULL; } fclose(fd); pwl = enchant_pwl_init(); pwl->filename = g_strdup(file); pwl->file_changed = 0; enchant_pwl_refresh_from_file(pwl); return pwl; } static void enchant_pwl_refresh_from_file(EnchantPWL* pwl) { char buffer[BUFSIZ]; char* line; size_t line_number = 1; FILE *f; struct stat stats; if(!pwl->filename) return; if(g_stat(pwl->filename, &stats)!=0) return; /*presumably I won't be able to open the file either*/ if(pwl->file_changed == stats.st_mtime) return; /*nothing changed since last read*/ enchant_trie_free(pwl->trie); pwl->trie = NULL; g_hash_table_destroy (pwl->words_in_trie); pwl->words_in_trie = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); f = enchant_fopen(pwl->filename, "r"); if (!f) return; pwl->file_changed = stats.st_mtime; enchant_lock_file (f); for (;NULL != (fgets (buffer, sizeof (buffer), f));++line_number) { const gunichar BOM = 0xfeff; size_t l; line = buffer; if(line_number == 1 && BOM == g_utf8_get_char(line)) line = g_utf8_next_char(line); l = strlen(line)-1; if (line[l]=='\n') line[l] = '\0'; else if(!feof(f)) /* ignore lines longer than BUFSIZ. */ { g_warning ("Line too long (ignored) in %s at line:%u\n", pwl->filename, line_number); while (NULL != (fgets (buffer, sizeof (buffer), f))) { if (line[strlen(buffer)-1]=='\n') break; } continue; } if( line[0] && line[0] != '#') { if(g_utf8_validate(line, -1, NULL)) enchant_pwl_add_to_trie(pwl, line, strlen(line)); else g_warning ("Bad UTF-8 sequence in %s at line:%u\n", pwl->filename, line_number); } } enchant_unlock_file (f); fclose (f); } void enchant_pwl_free(EnchantPWL *pwl) { enchant_trie_free(pwl->trie); g_free(pwl->filename); g_hash_table_destroy (pwl->words_in_trie); g_free(pwl); } static void enchant_pwl_add_to_trie(EnchantPWL *pwl, const char *const word, size_t len) { char * normalized_word; normalized_word = g_utf8_normalize (word, len, G_NORMALIZE_NFD); if(NULL != g_hash_table_lookup (pwl->words_in_trie, normalized_word)) { g_free (normalized_word); return; } g_hash_table_insert (pwl->words_in_trie, normalized_word, g_strndup(word,len)); pwl->trie = enchant_trie_insert(pwl->trie, normalized_word); } static void enchant_pwl_remove_from_trie(EnchantPWL *pwl, const char *const word, size_t len) { char * normalized_word = g_utf8_normalize (word, len, G_NORMALIZE_NFD); if( g_hash_table_remove (pwl->words_in_trie, normalized_word) ) { enchant_trie_remove(pwl->trie, normalized_word); if(pwl->trie && pwl->trie->subtries == NULL && pwl->trie->value == NULL) pwl->trie = NULL; /* make trie empty if has no content */ } g_free(normalized_word); } void enchant_pwl_add(EnchantPWL *pwl, const char *const word, size_t len) { enchant_pwl_refresh_from_file(pwl); enchant_pwl_add_to_trie(pwl, word, len); if (pwl->filename != NULL) { FILE *f; f = enchant_fopen(pwl->filename, "a"); if (f) { struct stat stats; enchant_lock_file (f); if(g_stat(pwl->filename, &stats)==0) pwl->file_changed = stats.st_mtime; /* we write the new line first since we can't guarantee that the file was terminated by a new line before and we are just appending to the end of the file */ fwrite ("\n", sizeof(char), 1, f); fwrite (word, sizeof(char), len, f); enchant_unlock_file (f); fclose (f); } } } void enchant_pwl_remove(EnchantPWL *pwl, const char *const word, size_t len) { if(enchant_pwl_check(pwl, word, len) == 1) return; enchant_pwl_refresh_from_file(pwl); enchant_pwl_remove_from_trie(pwl, word, len); if (pwl->filename) { char * contents; size_t length; FILE *f; if(!g_file_get_contents(pwl->filename, &contents, &length, NULL)) return; f = enchant_fopen(pwl->filename, "wb"); /*binary because g_file_get_contents reads binary*/ if (f) { const gunichar BOM = 0xfeff; char * filestart, *searchstart, *needle; char * key; struct stat stats; enchant_lock_file (f); key = g_strndup(word, len); if(BOM == g_utf8_get_char(contents)) { filestart = g_utf8_next_char(contents); fwrite (contents, sizeof(char), filestart-contents, f); } else filestart = contents; searchstart = filestart; for(;;) { /*find word*/ needle = strstr(searchstart, key); if(needle == NULL) { fwrite (searchstart, sizeof(char), length - (searchstart - contents), f); break; } else { char* foundend = needle+len; if((needle == filestart || contents[needle-contents-1] == '\n' || contents[needle-contents-1] == '\r') && (foundend == contents + length || *foundend == '\n' || *foundend == '\r')) { fwrite (searchstart, sizeof(char), needle - searchstart, f); searchstart = foundend; while (*searchstart == '\n' || *searchstart == '\r') ++searchstart; } else { fwrite (searchstart, sizeof(char), needle - searchstart+1, f); searchstart = needle+1; } } } g_free(key); if(g_stat(pwl->filename, &stats)==0) pwl->file_changed = stats.st_mtime; enchant_unlock_file (f); fclose (f); } g_free(contents); } } static int enchant_pwl_contains(EnchantPWL *pwl, const char *const word, size_t len) { EnchantTrieMatcher* matcher; int count = 0; matcher = enchant_trie_matcher_init(word,len,0,case_sensitive,enchant_pwl_check_cb, &count); enchant_trie_find_matches(pwl->trie,matcher); enchant_trie_matcher_free(matcher); return (count == 0 ? 0 : 1); } static int enchant_is_all_caps(const char*const word, size_t len) { const char* it; int hasCap = 0; g_return_val_if_fail (word && *word, 0); for(it = word; it < word + len; it = g_utf8_next_char(it)) { GUnicodeType type = g_unichar_type(g_utf8_get_char(it)); switch(type) { case G_UNICODE_UPPERCASE_LETTER: hasCap = 1; break; case G_UNICODE_TITLECASE_LETTER: case G_UNICODE_LOWERCASE_LETTER: return 0; case G_UNICODE_CONTROL: case G_UNICODE_FORMAT: case G_UNICODE_UNASSIGNED: case G_UNICODE_PRIVATE_USE: case G_UNICODE_SURROGATE: case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: case G_UNICODE_COMBINING_MARK: case G_UNICODE_ENCLOSING_MARK: case G_UNICODE_NON_SPACING_MARK: case G_UNICODE_DECIMAL_NUMBER: case G_UNICODE_LETTER_NUMBER: case G_UNICODE_OTHER_NUMBER: case G_UNICODE_CONNECT_PUNCTUATION: case G_UNICODE_DASH_PUNCTUATION: case G_UNICODE_CLOSE_PUNCTUATION: case G_UNICODE_FINAL_PUNCTUATION: case G_UNICODE_INITIAL_PUNCTUATION: case G_UNICODE_OTHER_PUNCTUATION: case G_UNICODE_OPEN_PUNCTUATION: case G_UNICODE_CURRENCY_SYMBOL: case G_UNICODE_MODIFIER_SYMBOL: case G_UNICODE_MATH_SYMBOL: case G_UNICODE_OTHER_SYMBOL: case G_UNICODE_LINE_SEPARATOR: case G_UNICODE_PARAGRAPH_SEPARATOR: case G_UNICODE_SPACE_SEPARATOR: default: break; } } return hasCap; } static int enchant_is_title_case(const char*const word, size_t len) { gunichar ch; GUnicodeType type; const char* it = word; g_return_val_if_fail (word && *word, 0); ch = g_utf8_get_char(it); type = g_unichar_type(ch); if(type != G_UNICODE_UPPERCASE_LETTER && type != G_UNICODE_TITLECASE_LETTER) return 0; if(ch != g_unichar_totitle(ch) ) return 0; for(it = g_utf8_next_char(it); it < word + len; it = g_utf8_next_char(it)) { type = g_unichar_type(g_utf8_get_char(it)); if(type == G_UNICODE_UPPERCASE_LETTER || type == G_UNICODE_TITLECASE_LETTER) return 0; } return 1; } static gchar* enchant_utf8_strtitle(const gchar*str, gssize len) { gunichar title_case_char; gchar* result; gchar* upperStr, * upperTail, * lowerTail; gchar title_case_utf8[7]; gint utf8len; upperStr = g_utf8_strup(str, len); /* for locale sensitive casing */ title_case_char = g_unichar_totitle(g_utf8_get_char(upperStr)); utf8len = g_unichar_to_utf8(title_case_char, title_case_utf8); title_case_utf8[utf8len] = '\0'; upperTail = g_utf8_next_char(upperStr); lowerTail = g_utf8_strdown(upperTail, -1); result = g_strconcat(title_case_utf8, lowerTail, NULL); g_free(upperStr); g_free(lowerTail); return result; } int enchant_pwl_check(EnchantPWL *pwl, const char *const word, size_t len) { int exists = 0; int isAllCaps = 0; enchant_pwl_refresh_from_file(pwl); exists = enchant_pwl_contains(pwl, word, len); if(exists) return 0; if(enchant_is_title_case(word, len) || (isAllCaps = enchant_is_all_caps(word, len))) { char * lower_case_word = g_utf8_strdown(word, len); exists = enchant_pwl_contains(pwl, lower_case_word, strlen(lower_case_word)); g_free(lower_case_word); if(exists) return 0; if(isAllCaps) { char * title_case_word = enchant_utf8_strtitle(word, len); exists = enchant_pwl_contains(pwl, title_case_word, strlen(title_case_word)); g_free(title_case_word); if(exists) return 0; } } return 1; /* not found */ } /* matcher callback when a match is found*/ static void enchant_pwl_check_cb(char* match,EnchantTrieMatcher* matcher) { g_free(match); (*((int*)(matcher->cbdata)))++; } static void enchant_pwl_case_and_denormalize_suggestions(EnchantPWL *pwl, const char *const word, size_t len, EnchantSuggList* suggs_list) { size_t i; gchar* (*utf8_case_convert_function)(const gchar*str, gssize len); if(enchant_is_title_case(word, len)) utf8_case_convert_function = enchant_utf8_strtitle; else if (enchant_is_all_caps(word, len)) utf8_case_convert_function = g_utf8_strup; else utf8_case_convert_function = NULL; for(i = 0; i < suggs_list->n_suggs; ++i) { gchar* cased_suggestion; gchar* suggestion; size_t suggestion_len; suggestion = g_hash_table_lookup (pwl->words_in_trie, suggs_list->suggs[i]); suggestion_len = strlen(suggestion); if(utf8_case_convert_function && !enchant_is_all_caps(suggestion, suggestion_len)) { cased_suggestion = utf8_case_convert_function(suggestion, suggestion_len); } else { cased_suggestion = g_strndup(suggestion, suggestion_len); } g_free(suggs_list->suggs[i]); suggs_list->suggs[i] = cased_suggestion; } } static int best_distance(const char*const*const suggs, const char *const word, size_t len) { int best_dist; const char*const* sugg_it; char* normalized_word; normalized_word = g_utf8_normalize (word, len, G_NORMALIZE_NFD); best_dist = g_utf8_strlen(normalized_word, -1); if(suggs) { for(sugg_it = suggs; *sugg_it; ++sugg_it) { char* normalized_sugg; int dist; normalized_sugg = g_utf8_normalize (*sugg_it, -1, G_NORMALIZE_NFD); dist = edit_dist(normalized_word, normalized_sugg); g_free(normalized_sugg); if (dist < best_dist) best_dist = dist; } } g_free(normalized_word); return best_dist; } /* gives the best set of suggestions from pwl that are at least as good as the * given suggs (if suggs == NULL just best from pwl) */ char** enchant_pwl_suggest(EnchantPWL *pwl,const char *const word, size_t len, const char*const*const suggs, size_t* out_n_suggs) { EnchantTrieMatcher* matcher; EnchantSuggList sugg_list; int max_dist; max_dist = suggs? best_distance(suggs, word, len) : ENCHANT_PWL_MAX_ERRORS; if(max_dist > ENCHANT_PWL_MAX_ERRORS) max_dist = ENCHANT_PWL_MAX_ERRORS; enchant_pwl_refresh_from_file(pwl); sugg_list.suggs = g_new0(char*,ENCHANT_PWL_MAX_SUGGS+1); sugg_list.sugg_errs = g_new0(int,ENCHANT_PWL_MAX_SUGGS); sugg_list.n_suggs = 0; matcher = enchant_trie_matcher_init(word,len, max_dist, case_insensitive, enchant_pwl_suggest_cb, &sugg_list); enchant_trie_find_matches(pwl->trie,matcher); enchant_trie_matcher_free(matcher); g_free(sugg_list.sugg_errs); sugg_list.suggs[sugg_list.n_suggs] = NULL; (*out_n_suggs) = sugg_list.n_suggs; enchant_pwl_case_and_denormalize_suggestions(pwl, word, len, &sugg_list); return sugg_list.suggs; } /* matcher callback when a match is found*/ static void enchant_pwl_suggest_cb(char* match,EnchantTrieMatcher* matcher) { EnchantSuggList* sugg_list; size_t loc, i; int changes = 0; /* num words added to list */ sugg_list = (EnchantSuggList*)(matcher->cbdata); /* only get best errors so adapt */ if(matcher->num_errors < matcher->max_errors) matcher->max_errors = matcher->num_errors; /* Find appropriate location in the array, if any */ /* In future, this could be done using binary search... */ for(loc=0; loc < sugg_list->n_suggs; loc++) { /* Better than an existing suggestion, so stop */ if(sugg_list->sugg_errs[loc] > matcher->num_errors) { break; } /* Already in the list with better score, just return */ if(strcmp(match,sugg_list->suggs[loc])==0) { g_free(match); return; } } /* If it's not going to fit, just throw it away */ if(loc >= ENCHANT_PWL_MAX_SUGGS) { g_free(match); return; } changes++; /* Remove all elements with worse score */ for(i=loc; i < sugg_list->n_suggs; i++){ g_free(sugg_list->suggs[i]); changes--; } sugg_list->suggs[loc] = match; sugg_list->sugg_errs[loc] = matcher->num_errors; sugg_list->n_suggs = sugg_list->n_suggs + changes; } void enchant_pwl_free_string_list(EnchantPWL *pwl, char** string_list) { g_strfreev(string_list); } static EnchantTrie* enchant_trie_init(void) { EnchantTrie* trie; trie = g_new(EnchantTrie,1); trie->value = NULL; trie->subtries = NULL; return trie; } static void enchant_trie_free(EnchantTrie* trie) { /* Dont ever free NULL, or the EOSTrie pointer */ if(trie == NULL || trie == EOSTrie) { return; } /* Because we have not set a destroy function for the hashtable * (to make code cleaner below), we need to explicitly free all * subtries with a callback function. */ if (trie->subtries != NULL) { g_hash_table_foreach(trie->subtries,enchant_trie_free_cb,NULL); g_hash_table_destroy(trie->subtries); } if (trie->value != NULL) { g_free(trie->value); } g_free(trie); } static void enchant_trie_free_cb(void* key, void* value, void* data) { enchant_trie_free((EnchantTrie*) value); } static EnchantTrie* enchant_trie_insert(EnchantTrie* trie,const char *const word) { char *tmpWord; ssize_t nxtCh = 0; EnchantTrie* subtrie; if (trie == NULL) { trie = enchant_trie_init(); } if (trie->value == NULL) { if (trie->subtries == NULL) { /* When single word, store in trie->value */ trie->value = g_strdup(word); } else { /* Store multiple words in subtries */ if (word[0] == '\0') { /* Mark end-of-string with special node */ tmpWord = g_strdup(""); g_hash_table_insert(trie->subtries, tmpWord,EOSTrie); } else { nxtCh = (ssize_t)(g_utf8_next_char(word)-word); tmpWord = g_strndup(word,nxtCh); subtrie = g_hash_table_lookup(trie->subtries, tmpWord); subtrie = enchant_trie_insert(subtrie, (word+nxtCh)); g_hash_table_insert(trie->subtries, tmpWord,subtrie); } } } else { /* Create new hastable for subtries, and reinsert */ trie->subtries = g_hash_table_new_full(g_str_hash, g_str_equal,g_free, NULL); tmpWord = trie->value; trie->value = NULL; enchant_trie_insert(trie,tmpWord); enchant_trie_insert(trie,word); g_free(tmpWord); } return trie; } #if !GLIB_CHECK_VERSION(2,14,0) static void grab_keys (gpointer key, gpointer value, gpointer user_data) { GList **l = user_data; *l = g_list_prepend (*l, key); } static GList* g_hash_table_get_keys (GHashTable *hash_table) { GList *l = NULL; g_hash_table_foreach (hash_table, grab_keys, &l); return l; } #endif static void enchant_trie_remove(EnchantTrie* trie,const char *const word) { char *tmpWord; ssize_t nxtCh = 0; EnchantTrie* subtrie; if (trie == NULL) return; if (trie->value == NULL) { if (trie->subtries != NULL) { /* Store multiple words in subtries */ if (word[0] == '\0') { /* Mark end-of-string with special node */ g_hash_table_remove(trie->subtries, ""); } else { nxtCh = (ssize_t)(g_utf8_next_char(word)-word); tmpWord = g_strndup(word,nxtCh); subtrie = g_hash_table_lookup(trie->subtries, tmpWord); enchant_trie_remove(subtrie, (word+nxtCh)); if(subtrie->subtries == NULL && subtrie->value == NULL) g_hash_table_remove(trie->subtries, tmpWord); g_free(tmpWord); } if(g_hash_table_size(trie->subtries) == 1) { char* key; GList* keys = g_hash_table_get_keys(trie->subtries); key = (char*) keys->data; subtrie = g_hash_table_lookup(trie->subtries, key); /* only remove trie nodes that have values by propogating these up */ if(subtrie->value) { trie->value = g_strconcat(key, subtrie->value, NULL); enchant_trie_free(subtrie); g_hash_table_destroy(trie->subtries); trie->subtries = NULL; } g_list_free(keys); } } } else { if(strcmp(trie->value, word) == 0) { g_free(trie->value); trie->value = NULL; } } } static EnchantTrie* enchant_trie_get_subtrie(EnchantTrie* trie, EnchantTrieMatcher* matcher, char** nxtChS) { EnchantTrie* subtrie; if(trie->subtries == NULL || *nxtChS == NULL) return NULL; subtrie = g_hash_table_lookup(trie->subtries,*nxtChS); if(subtrie == NULL && matcher->mode == case_insensitive) { char* nxtChSUp = g_utf8_strup(*nxtChS, -1); /* we ignore the title case scenario since that will give us an edit_distance of one which is acceptable since this mode is used for suggestions*/ g_free(*nxtChS); *nxtChS = nxtChSUp; subtrie = g_hash_table_lookup(trie->subtries,nxtChSUp); } return subtrie; } static void enchant_trie_find_matches(EnchantTrie* trie,EnchantTrieMatcher *matcher) { int errs = 0; ssize_t nxtChI = 0, oldPos = 0; char* nxtChS = NULL; EnchantTrie* subtrie = NULL; g_return_if_fail(matcher); /* Can't match in the empty trie */ if(trie == NULL) { return; } /* Bail out if over the error limits */ if(matcher->num_errors > matcher->max_errors){ return; } /* If the end of a string has been reached, no point recursing */ if (trie == EOSTrie) { size_t word_len = strlen(matcher->word); errs = matcher->num_errors; if((ssize_t)word_len > matcher->word_pos) { matcher->num_errors = errs + word_len - matcher->word_pos; } if (matcher->num_errors <= matcher->max_errors) { matcher->cbfunc(g_strdup(matcher->path),matcher); } matcher->num_errors = errs; return; } /* If there is a value, just check it, no recursion */ if (trie->value != NULL) { gchar* value; errs = matcher->num_errors; value = trie->value; if(matcher->mode == case_insensitive) { value = g_utf8_strdown(value, -1); } matcher->num_errors = errs + edit_dist(value, &(matcher->word[matcher->word_pos])); if(matcher->mode == case_insensitive) { g_free(value); } if (matcher->num_errors <= matcher->max_errors) { matcher->cbfunc(g_strconcat(matcher->path, trie->value,NULL), matcher); } matcher->num_errors = errs; return; } nxtChI = (ssize_t)(g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word); nxtChS = g_strndup(&matcher->word[matcher->word_pos], (nxtChI - matcher->word_pos)); /* Precisely match the first character, and recurse */ subtrie = enchant_trie_get_subtrie(trie, matcher, &nxtChS); if (subtrie != NULL) { enchant_trie_matcher_pushpath(matcher,nxtChS); oldPos = matcher->word_pos; matcher->word_pos = nxtChI; enchant_trie_find_matches(subtrie,matcher); matcher->word_pos = oldPos; enchant_trie_matcher_poppath(matcher,strlen(nxtChS)); } g_free(nxtChS); matcher->num_errors++; if (matcher->word[matcher->word_pos] != '\0') { /* Match on inserting word[0] */ oldPos = matcher->word_pos; matcher->word_pos = nxtChI; enchant_trie_find_matches(trie,matcher); matcher->word_pos = oldPos; } /* for each subtrie, match on delete or substitute word[0] or transpose word[0] and word[1] */ g_hash_table_foreach(trie->subtries, enchant_trie_find_matches_cb, matcher); matcher->num_errors--; } static void enchant_trie_find_matches_cb(void* keyV,void* subtrieV,void* matcherV) { char* key, *key2; EnchantTrie* subtrie, *subtrie2; EnchantTrieMatcher* matcher; ssize_t nxtChI, oldPos; key = (char*) keyV; subtrie = (EnchantTrie*) subtrieV; matcher = (EnchantTrieMatcher*) matcherV; nxtChI = (ssize_t) (g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word); /* Dont handle actual matches, that's already done */ if (strncmp(key,&matcher->word[matcher->word_pos],nxtChI-matcher->word_pos) == 0) { return; } enchant_trie_matcher_pushpath(matcher,key); /* Match on deleting word[0] */ enchant_trie_find_matches(subtrie,matcher); /* Match on substituting word[0] */ oldPos = matcher->word_pos; matcher->word_pos = nxtChI; enchant_trie_find_matches(subtrie,matcher); enchant_trie_matcher_poppath(matcher,strlen(key)); /* Match on transposing word[0] and word[1] */ key2 = g_strndup(&matcher->word[oldPos],nxtChI-oldPos); subtrie2 = enchant_trie_get_subtrie(subtrie, matcher, &key2); if(subtrie2 != NULL) { nxtChI = (ssize_t) (g_utf8_next_char(&matcher->word[matcher->word_pos]) - matcher->word); if (strncmp(key,&matcher->word[matcher->word_pos],nxtChI-matcher->word_pos) == 0) { matcher->word_pos = nxtChI; enchant_trie_matcher_pushpath(matcher,key); enchant_trie_matcher_pushpath(matcher,key2); enchant_trie_find_matches(subtrie2,matcher); enchant_trie_matcher_poppath(matcher,strlen(key2)); enchant_trie_matcher_poppath(matcher,strlen(key)); } } g_free(key2); matcher->word_pos = oldPos; } static EnchantTrieMatcher* enchant_trie_matcher_init(const char* const word, size_t len, int maxerrs, EnchantTrieMatcherMode mode, void(*cbfunc)(char*,EnchantTrieMatcher*), void* cbdata) { EnchantTrieMatcher* matcher; char * normalized_word, * pattern; normalized_word = g_utf8_normalize (word, len, G_NORMALIZE_NFD); len = strlen(normalized_word); if(mode == case_insensitive) { pattern = g_utf8_strdown (normalized_word, len); g_free(normalized_word); } else pattern = normalized_word; matcher = g_new(EnchantTrieMatcher,1); matcher->num_errors = 0; matcher->max_errors = maxerrs; matcher->word = pattern; matcher->word_pos = 0; matcher->path = g_new0(char,len+maxerrs+1); matcher->path[0] = '\0'; matcher->path_len = len+maxerrs+1; matcher->path_pos = 0; matcher->mode = mode; matcher->cbfunc = cbfunc; matcher->cbdata = cbdata; return matcher; } static void enchant_trie_matcher_free(EnchantTrieMatcher* matcher) { g_free(matcher->word); g_free(matcher->path); g_free(matcher); } static void enchant_trie_matcher_pushpath(EnchantTrieMatcher* matcher,char* newchars) { ssize_t len, i; len = strlen(newchars); if(matcher->path_pos + len >= matcher->path_len) { matcher->path_len = matcher->path_len + len + 10; matcher->path = g_renew(char,matcher->path,matcher->path_len); } for(i = 0; i < len; i++) { matcher->path[matcher->path_pos + i] = newchars[i]; } matcher->path_pos = matcher->path_pos + len; matcher->path[matcher->path_pos] = '\0'; } static void enchant_trie_matcher_poppath(EnchantTrieMatcher* matcher,int num) { g_return_if_fail(matcher->path_pos >= 0); matcher->path_pos = matcher->path_pos - num; if(matcher->path_pos < 0) { matcher->path_pos = 0; } matcher->path[matcher->path_pos] = '\0'; } static int edit_dist(const char* utf8word1, const char* utf8word2) { gunichar * word1, * word2; glong len1, len2, cost, i, j; int v1, v2, v3, v4; int* table; word1 = g_utf8_to_ucs4_fast(utf8word1, -1, &len1); word2 = g_utf8_to_ucs4_fast(utf8word2, -1, &len2); table = g_new0(int,(len1+1)*(len2+1)); /* Initialise outer rows of table */ for (i=0; i < len1 + 1; i++) { table[i*(len2+1)] = i; } for (j=0; j < len2 + 1; j++) { table[j] = j; } /* Fill in table in dynamic programming style */ for (i=1; i < len1+1; i++){ for(j=1; j < len2+1; j++) { if(word1[i-1] == word2[j-1]) { cost = 0; } else { cost = 1; } v1 = table[(i-1)*(len2+1)+j] + 1; v2 = table[i*(len2+1)+(j-1)] + 1; v3 = table[(i-1)*(len2+1)+(j-1)] + cost; if(i > 1 && j > 1 && word1[i-1] == word2[j-2] && word1[i-2] == word2[j-1]) { v4 = table[(i-2)*(len2+1)+(j-2)] + cost; if(v4 < v1) v1 = v4; } if (v1 < v2 && v1 < v3) { cost = v1; } else if (v2 < v3) { cost = v2; } else { cost = v3; } table[i*(len2+1)+j] = cost; } } cost = table[len1*(len2+1) + len2]; g_free(word1); g_free(word2); g_free(table); return cost; }