25, 'h2' => 18, 'h3' => 15, 'h4' => 12, 'h5' => 9, 'h6' => 6, 'u' => 3, 'b' => 3, 'i' => 3, 'strong' => 3, 'em' => 3, 'a' => 10); // Strip off all ignored tags to speed up processing, but insert space before/after // them to keep word boundaries. $text = str_replace(array('<', '>'), array(' <', '> '), $text); $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>'); // Split HTML tags from plain text. $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE); // Note: PHP ensures the array consists of alternating delimiters and literals // and begins and ends with a literal (inserting $null as required). $tag = FALSE; // Odd/even counter. Tag or no tag. $score = 1; // Starting score per word $accum = ' '; // Accumulator for cleaned up data $tagstack = array(); // Stack with open tags $tagwords = 0; // Counter for consecutive words $focus = 1; // Focus state $results = array(0 => array()); // Accumulator for words for index foreach ($split as $value) { if ($tag) { // Increase or decrease score per word based on tag list($tagname) = explode(' ', $value, 2); $tagname = $textlib->strtolower($tagname); // Closing or opening tag? if ($tagname[0] == '/') { $tagname = substr($tagname, 1); // If we encounter unexpected tags, reset score to avoid incorrect boosting. if (!count($tagstack) || $tagstack[0] != $tagname) { $tagstack = array(); $score = 1; } else { // Remove from tag stack and decrement score $score = max(1, $score - $tags[array_shift($tagstack)]); } } else { if (isset($tagstack[0]) && $tagstack[0] == $tagname) { // None of the tags we look for make sense when nested identically. // If they are, it's probably broken HTML. $tagstack = array(); $score = 1; } else { // Add to open tag stack and increment score array_unshift($tagstack, $tagname); $score += $tags[$tagname]; } } // A tag change occurred, reset counter. $tagwords = 0; } else { // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values if ($value != '') { $words = tokenise_split($value, $stop_words, $overlap_cjk, $join_numbers); foreach ($words as $word) { // Add word to accumulator $accum .= $word .' '; $num = is_numeric($word); // Check word length if ($num || $textlib->strlen($word) >= MINIMUM_WORD_SIZE) { // Normalize numbers if ($num && $join_numbers) { $word = (int)ltrim($word, '-0'); } if (!isset($results[0][$word])) { $results[0][$word] = 0; } $results[0][$word] += $score * $focus; // Focus is a decaying value in terms of the amount of unique words up to this point. // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words. $focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015)); } $tagwords++; // Too many words inside a single tag probably mean a tag was accidentally left open. if (count($tagstack) && $tagwords >= 15) { $tagstack = array(); $score = 1; } } } } $tag = !$tag; } $res = array(); if (isset($results[0]) && count($results[0]) > 0) { $res = $results[0]; arsort($res, SORT_NUMERIC); } return $res; } /// /// Some helper functions (should be considered private) /// /** * Splits a string into tokens */ function tokenise_split($text, $stop_words, $overlap_cjk, $join_numbers) { static $last = NULL; static $lastsplit = NULL; if ($last == $text) { return $lastsplit; } // Process words $text = tokenise_simplify($text, $overlap_cjk, $join_numbers); $words = explode(' ', $text); // Limit word length to 50 array_walk($words, 'tokenise_truncate_word'); // We have stop words, apply them if (is_array($stop_words) && !empty($stop_words)) { // Normalise them $simp_stop_words = explode(' ', tokenise_simplify(implode(' ', $stop_words), $overlap_cjk, $join_numbers)); // Extract from the list $words = array_diff($words, $simp_stop_words); } // Save last keyword result $last = $text; $lastsplit = $words; return $words; } /** * Simplifies a string according to indexing rules. */ function tokenise_simplify($text, $overlap_cjk, $join_numbers) { $textlib = textlib_get_instance(); // Decode entities to UTF-8 $text = $textlib->entities_to_utf8($text, true); // Lowercase $text = $textlib->strtolower($text); // Simple CJK handling if ($overlap_cjk) { $text = preg_replace_callback('/['. PREG_CLASS_CJK .']+/u', 'tokenise_expand_cjk', $text); } // To improve searching for numerical data such as dates, IP addresses // or version numbers, we consider a group of numerical characters // separated only by punctuation characters to be one piece. // This also means that searching for e.g. '20/03/1984' also returns // results with '20-03-1984' in them. // Readable regexp: ([number]+)[punctuation]+(?=[number]) if ($join_numbers) { $text = preg_replace('/(['. PREG_CLASS_NUMBERS .']+)['. PREG_CLASS_PUNCTUATION .']+(?=['. PREG_CLASS_NUMBERS .'])/u', '\1', $text); } else { // Keep all the detected numbers+punctuation in a safe place in order to restore them later preg_match_all('/['. PREG_CLASS_NUMBERS .']+['. PREG_CLASS_PUNCTUATION . PREG_CLASS_NUMBERS .']+/u', $text, $foundseqs); $storedseqs = array(); foreach (array_unique($foundseqs[0]) as $ntkey => $value) { $prefix = (string)(count($storedseqs) + 1); $storedseqs[START_DELIM.$prefix.CENTER_DELIM.$ntkey.END_DELIM] = $value; } if (!empty($storedseqs)) { $text = str_replace($storedseqs, array_keys($storedseqs), $text); } } // The dot, underscore and dash are simply removed. This allows meaningful // search behaviour with acronyms and URLs. $text = preg_replace('/[._-]+/', '', $text); // With the exception of the rules above, we consider all punctuation, // marks, spacers, etc, to be a word boundary. $text = preg_replace('/['. PREG_CLASS_SEARCH_EXCLUDE .']+/u', ' ', $text); // Restore, if not joining numbers, recover the original strings if (!$join_numbers) { if (!empty($storedseqs)) { $text = str_replace(array_keys($storedseqs), $storedseqs, $text); } } return $text; } /** * Basic CJK tokeniser. Simply splits a string into consecutive, overlapping * sequences of characters (MINIMUM_WORD_SIZE long). */ function tokenise_expand_cjk($matches) { $textlib = textlib_get_instance(); $str = $matches[0]; $l = $textlib->strlen($str); // Passthrough short words if ($l <= MINIMUM_WORD_SIZE) { return ' '. $str .' '; } $tokens = ' '; // FIFO queue of characters $chars = array(); // Begin loop for ($i = 0; $i < $l; ++$i) { // Grab next character $current = $textlib->substr($str, 0, 1); $str = substr($str, strlen($current)); $chars[] = $current; if ($i >= MINIMUM_WORD_SIZE - 1) { $tokens .= implode('', $chars) .' '; array_shift($chars); } } return $tokens; } /** * Helper function for array_walk in search_index_split. * Truncates one string (token) to MAXIMUM_WORD_SIZE */ function tokenise_truncate_word(&$text) { $textlib = textlib_get_instance(); $text = $textlib->substr($text, 0, MAXIMUM_WORD_SIZE); } ?>