/* Fast fuzzy searching among messages. Copyright (C) 2006, 2008 Free Software Foundation, Inc. Written by Bruno Haible , 2006. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #ifdef HAVE_CONFIG_H # include #endif /* Specification. */ #include "msgl-fsearch.h" #include #include #include "xalloc.h" #include "po-charset.h" /* Fuzzy searching of L strings in a large set of N messages (assuming they have all the same small size) takes O(L * N) when using repeated fstrcmp() calls. When for example L = 800 and N = 69000, this is slow. So we preprocess the set of N messages, yielding a data structure that allows to select the similar messages for any given string, and apply fstrcmp() only to the search result. This allows to reduce the time to something between O(L * 1) and O(L * N) - depending on the structure of the strings. In the example with L = 800 and N = 69000, memory use increases by a factor of 2 and the time decreases from 9068 s to 230 s. The data structure is a hash table mapping each n-gram (here with n=4) occurring in the message strings to the list of messages that contains it. For example, if the message list is [0] "close" [1] "losers" the index is a hash table mapping "clos" -> { 0 } "lose" -> { 0, 1 } "oser" -> { 1 } "sers" -> { 1 } Searching the similar messages to, say, "closest", is done by looking up all its 4-grams in the hash table and summing up the results: "clos" -> { 0 } "lose" -> { 0, 1 } "oses" -> {} "sest" -> {} => total: { 2x0, 1x1 } The list is sorted according to decreasing frequency: { 0, 1, ... } and only the first few messages from this frequency list are passed to fstrcmp(). The n-gram similarity and the fstrcmp() similarity are quite different metrics. For example, "close" and "c l o s e" have no n-grams in common (even for n as small as n = 2); however, fstrcmp() will find that they are quite similar (= 0.71). Conversely, "AAAA BBBB ... ZZZZ" and "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is only 0.02. Also, repeated n-grams don't have the same effect on the two similarity measures. But all this doesn't matter much in practice. We chose n = 4 because for alphabetic languages, with n = 3 the occurrence lists are likely too long. (Well, with ideographic languages like Chinese, n = 3 might actually be quite good. Anyway, n = 4 is not bad in this case either.) The units are characters in the current encoding. Not just bytes, because 4 consecutive bytes in UTF-8 or GB18030 don't mean much. */ /* Each message is represented by its index in the message list. */ typedef unsigned int index_ty; /* An index list has its allocated size and length tacked at the front. The indices are sorted in ascending order. */ typedef index_ty *index_list_ty; #define IL_ALLOCATED 0 #define IL_LENGTH 1 /* Create a new index list containing only a given index. */ static inline index_list_ty new_index (index_ty idx) { index_ty *list = XNMALLOC (2 + 1, index_ty); list[IL_ALLOCATED] = 1; list[IL_LENGTH] = 1; list[2] = idx; return list; } /* Add a given index, greater or equal than all previous indices, to an index list. Return the new index list, if it had to be reallocated, or NULL if it didn't change. */ static inline index_list_ty addlast_index (index_list_ty list, index_ty idx) { index_list_ty result; size_t length = list[IL_LENGTH]; /* Look whether it should be inserted. */ if (length > 0 && list[2 + (length - 1)] == idx) return NULL; /* Now make room for one more list element. */ result = NULL; if (length == list[IL_ALLOCATED]) { size_t new_allocated = 2 * length - (length >> 6); list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty)); list[IL_ALLOCATED] = new_allocated; result = list; } list[2 + length] = idx; list[IL_LENGTH] = length + 1; return result; } /* Add a given index to an index list. Return the new index list, if it had to be reallocated, or NULL if it didn't change. */ static inline index_list_ty add_index (index_list_ty list, index_ty idx) { index_list_ty result; size_t length = list[IL_LENGTH]; /* Look where it should be inserted. */ size_t lo = 0; size_t hi = length; while (lo < hi) { /* Here we know that idx must be inserted at a position >= lo, <= hi. */ size_t mid = (lo + hi) / 2; /* lo <= mid < hi */ index_ty val = list[2 + mid]; if (val < idx) lo = mid + 1; else if (val > idx) hi = mid; else return NULL; } /* Now make room for one more list element. */ result = NULL; if (length == list[IL_ALLOCATED]) { size_t new_allocated = 2 * length - (length >> 6); list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty)); list[IL_ALLOCATED] = new_allocated; result = list; } list[IL_LENGTH] = length + 1; for (; length > hi; length--) list[2 + length] = list[1 + length]; list[2 + length] = idx; return result; } /* We use 4-grams, therefore strings with less than 4 characters cannot be handled through the 4-grams table and need to be handled specially. Since every character occupies at most 4 bytes (see po-charset.c), this means the size of such short strings is bounded by: */ #define SHORT_STRING_MAX_CHARACTERS (4 - 1) #define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4) /* Such short strings are handled by direct comparison with all messages of appropriate size. Note that for two strings of length 0 <= l1 <= l2, fstrcmp() is <= 2 * l1 / (l1 + l2). Since we are only interested in fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l limit the search to lengths l' in the range l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1) Thus we need the list of the short strings up to length: */ #define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 / FUZZY_THRESHOLD - 1)) /* A fuzzy index contains a hash table mapping all n-grams to their occurrences list. */ struct message_fuzzy_index_ty { message_ty **messages; character_iterator_t iterator; hash_table gram4; size_t firstfew; message_list_ty *short_messages[SHORT_MSG_MAX + 1]; }; /* Allocate a fuzzy index corresponding to a given list of messages. The list of messages and the msgctxt and msgid fields of the messages inside it must not be modified while the returned fuzzy index is in use. */ message_fuzzy_index_ty * message_fuzzy_index_alloc (const message_list_ty *mlp, const char *canon_charset) { message_fuzzy_index_ty *findex = XMALLOC (message_fuzzy_index_ty); size_t count = mlp->nitems; size_t j; size_t l; findex->messages = mlp->item; findex->iterator = po_charset_character_iterator (canon_charset); /* Setup hash table. */ if (hash_init (&findex->gram4, 10 * count) < 0) xalloc_die (); for (j = 0; j < count; j++) { message_ty *mp = mlp->item[j]; if (mp->msgstr != NULL && mp->msgstr[0] != '\0') { const char *str = mp->msgid; /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */ const char *p0 = str; if (*p0 != '\0') { const char *p1 = p0 + findex->iterator (p0); if (*p1 != '\0') { const char *p2 = p1 + findex->iterator (p1); if (*p2 != '\0') { const char *p3 = p2 + findex->iterator (p2); if (*p3 != '\0') { const char *p4 = p3 + findex->iterator (p3); for (;;) { /* The segment from p0 to p4 is a 4-gram of characters. Add a hash table entry that maps it to the index j, or extend the existing hash table entry accordingly. */ void *found; if (hash_find_entry (&findex->gram4, p0, p4 - p0, &found) == 0) { index_list_ty list = (index_list_ty) found; list = addlast_index (list, j); if (list != NULL) hash_set_value (&findex->gram4, p0, p4 - p0, list); } else hash_insert_entry (&findex->gram4, p0, p4 - p0, new_index (j)); /* Advance. */ if (*p4 == '\0') break; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = p4 + findex->iterator (p4); } } } } } } } /* Shrink memory used by the hash table. */ { void *iter; const void *key; size_t keylen; void **valuep; iter = NULL; while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep) == 0) { index_list_ty list = (index_list_ty) *valuep; index_ty length = list[IL_LENGTH]; if (length < list[IL_ALLOCATED]) { list[IL_ALLOCATED] = length; *valuep = xrealloc (list, (2 + length) * sizeof (index_ty)); } } } findex->firstfew = (int) sqrt ((double) count); if (findex->firstfew < 10) findex->firstfew = 10; /* Setup lists of short messages. */ for (l = 0; l <= SHORT_MSG_MAX; l++) findex->short_messages[l] = message_list_alloc (false); for (j = 0; j < count; j++) { message_ty *mp = mlp->item[j]; if (mp->msgstr != NULL && mp->msgstr[0] != '\0') { const char *str = mp->msgid; size_t len = strlen (str); if (len <= SHORT_MSG_MAX) message_list_append (findex->short_messages[len], mp); } } /* Shrink memory used by the lists of short messages. */ for (l = 0; l <= SHORT_MSG_MAX; l++) { message_list_ty *mlp = findex->short_messages[l]; if (mlp->nitems < mlp->nitems_max) { mlp->nitems_max = mlp->nitems; mlp->item = (message_ty **) xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *)); } } return findex; } /* An index with multiplicity. */ struct mult_index { index_ty index; unsigned int count; }; /* A list of indices with multiplicity. The indices are sorted in ascending order. */ struct mult_index_list { struct mult_index *item; size_t nitems; size_t nitems_max; /* Work area. */ struct mult_index *item2; size_t nitems2_max; }; /* Initialize an empty list of indices with multiplicity. */ static inline void mult_index_list_init (struct mult_index_list *accu) { accu->nitems = 0; accu->nitems_max = 0; accu->item = NULL; accu->nitems2_max = 0; accu->item2 = NULL; } /* Add an index list to a list of indices with multiplicity. */ static inline void mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list) { size_t len1 = accu->nitems; size_t len2 = list[IL_LENGTH]; size_t need = len1 + len2; struct mult_index *ptr1; struct mult_index *ptr1_end; index_ty *ptr2; index_ty *ptr2_end; struct mult_index *destptr; /* Make the work area large enough. */ if (accu->nitems2_max < need) { size_t new_max = 2 * accu->nitems2_max + 1; if (new_max < need) new_max = need; if (accu->item2 != NULL) free (accu->item2); accu->item2 = XNMALLOC (new_max, struct mult_index); accu->nitems2_max = new_max; } /* Make a linear pass through accu and list simultaneously. */ ptr1 = accu->item; ptr1_end = ptr1 + len1; ptr2 = list + 2; ptr2_end = ptr2 + len2; destptr = accu->item2; while (ptr1 < ptr1_end && ptr2 < ptr2_end) { if (ptr1->index < *ptr2) { *destptr = *ptr1; ptr1++; } else if (ptr1->index > *ptr2) { destptr->index = *ptr2; destptr->count = 1; ptr2++; } else /* ptr1->index == list[2 + i2] */ { destptr->index = ptr1->index; destptr->count = ptr1->count + 1; ptr1++; ptr2++; } destptr++; } while (ptr1 < ptr1_end) { *destptr = *ptr1; ptr1++; destptr++; } while (ptr2 < ptr2_end) { destptr->index = *ptr2; destptr->count = 1; ptr2++; destptr++; } /* Swap accu->item and accu->item2. */ { struct mult_index *dest = accu->item2; size_t dest_max = accu->nitems2_max; accu->item2 = accu->item; accu->nitems2_max = accu->nitems_max; accu->item = dest; accu->nitems = destptr - dest; accu->nitems_max = dest_max; } } /* Compares two indices with multiplicity, according to their multiplicity. */ static int mult_index_compare (const void *p1, const void *p2) { const struct mult_index *ptr1 = (const struct mult_index *) p1; const struct mult_index *ptr2 = (const struct mult_index *) p2; if (ptr1->count < ptr2->count) return 1; if (ptr1->count > ptr2->count) return -1; /* For reproduceable results, also take into account the indices. */ if (ptr1->index > ptr2->index) return 1; if (ptr1->index < ptr2->index) return -1; return 0; } /* Sort a list of indices with multiplicity according to decreasing multiplicity. */ static inline void mult_index_list_sort (struct mult_index_list *accu) { if (accu->nitems > 1) qsort (accu->item, accu->nitems, sizeof (struct mult_index), mult_index_compare); } /* Frees a list of indices with multiplicity. */ static inline void mult_index_list_free (struct mult_index_list *accu) { if (accu->item != NULL) free (accu->item); if (accu->item2 != NULL) free (accu->item2); } /* Find a good match for the given msgctxt and msgid in the given fuzzy index. The match does not need to be optimal. Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND. LOWER_BOUND must be >= FUZZY_THRESHOLD. If HEURISTIC is true, only the few best messages among the list - according to a certain heuristic - are considered. If HEURISTIC is false, all messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered, like in message_list_search_fuzzy (except that in ambiguous cases where several best matches exist, message_list_search_fuzzy chooses the one with the smallest index whereas message_fuzzy_index_search makes a better choice). */ message_ty * message_fuzzy_index_search (message_fuzzy_index_ty *findex, const char *msgctxt, const char *msgid, double lower_bound, bool heuristic) { const char *str = msgid; /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */ const char *p0 = str; if (*p0 != '\0') { const char *p1 = p0 + findex->iterator (p0); if (*p1 != '\0') { const char *p2 = p1 + findex->iterator (p1); if (*p2 != '\0') { const char *p3 = p2 + findex->iterator (p2); if (*p3 != '\0') { const char *p4 = p3 + findex->iterator (p3); struct mult_index_list accu; mult_index_list_init (&accu); for (;;) { /* The segment from p0 to p4 is a 4-gram of characters. Get the hash table entry containing a list of indices, and add it to the accu. */ void *found; if (hash_find_entry (&findex->gram4, p0, p4 - p0, &found) == 0) { index_list_ty list = (index_list_ty) found; mult_index_list_accumulate (&accu, list); } /* Advance. */ if (*p4 == '\0') break; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = p4 + findex->iterator (p4); } /* Sort in decreasing count order. */ mult_index_list_sort (&accu); /* Iterate over this sorted list, and maximize the fuzzy_search_goal_function() result. If HEURISTIC is true, take only the first few messages. If HEURISTIC is false, consider all messages - to match the behaviour of message_list_search_fuzzy -, but process them in the order of the sorted list. This increases the chances that the later calls to fstrcmp_bounded() (via fuzzy_search_goal_function()) terminate quickly, thanks to the best_weight which will be quite high already after the first few messages. */ { size_t count; struct mult_index *ptr; message_ty *best_mp; double best_weight; count = accu.nitems; if (heuristic) { if (count > findex->firstfew) count = findex->firstfew; } best_weight = lower_bound; best_mp = NULL; for (ptr = accu.item; count > 0; ptr++, count--) { message_ty *mp = findex->messages[ptr->index]; double weight = fuzzy_search_goal_function (mp, msgctxt, msgid, best_weight); if (weight > best_weight) { best_weight = weight; best_mp = mp; } } mult_index_list_free (&accu); return best_mp; } } } } } /* The string had less than 4 characters. */ { size_t l = strlen (str); size_t lmin, lmax; message_ty *best_mp; double best_weight; if (!(l <= SHORT_STRING_MAX_BYTES)) abort (); /* Walk through those short messages which have an appropriate length. See the comment before SHORT_MSG_MAX. */ lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1)); lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1)); if (!(lmax <= SHORT_MSG_MAX)) abort (); best_weight = lower_bound; best_mp = NULL; for (l = lmin; l <= lmax; l++) { message_list_ty *mlp = findex->short_messages[l]; size_t j; for (j = 0; j < mlp->nitems; j++) { message_ty *mp = mlp->item[j]; double weight = fuzzy_search_goal_function (mp, msgctxt, msgid, best_weight); if (weight > best_weight) { best_weight = weight; best_mp = mp; } } } return best_mp; } } /* Free a fuzzy index. */ void message_fuzzy_index_free (message_fuzzy_index_ty *findex) { size_t l; void *iter; const void *key; size_t keylen; void *data; /* Free the short lists. */ for (l = 0; l <= SHORT_MSG_MAX; l++) message_list_free (findex->short_messages[l], 1); /* Free the index lists occurring as values in the hash tables. */ iter = NULL; while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0) free ((index_list_ty *) data); /* Free the hash table itself. */ hash_destroy (&findex->gram4); free (findex); }