factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Lexes an HTML string into a flat list of tokens using PHP's DOM extension.
     *
     * The string is normalized, optionally has stray less-than signs armored
     * (when Core.AggressivelyFixLt is enabled), is wrapped via wrapHTML(), and
     * is then parsed by DOMDocument with parser warnings routed to
     * muteErrorHandler(). Tokenization starts at the first div inside the
     * first body of the parsed document.
     *
     * @param $html    String of HTML to tokenize.
     * @param $config  Configuration object; read for Core.AggressivelyFixLt and
     *                 passed through to normalize() and wrapHTML().
     * @param $context Context object, passed through to normalize() and
     *                 wrapHTML().
     * @returns Array-list of tokens produced by tokenizeDOM().
     */
    public function tokenizeHTML($html, $config, &$context) {

        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core', 'AggressivelyFixLt')) {
            // a character that cannot begin a tag, end tag, or declaration
            $char = '[^a-z!\/]';
            // an HTML comment, terminated either properly or by the end of
            // the input (unterminated comment)
            $comment = "/<!--(.*?)(-->|\z)/is";
            // shield comment contents so the bracket fix below leaves them alone
            $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackArmorCommentEntities'), $html);
            // replace the stray bracket with the less-than entity; replacing
            // it with a literal bracket would make this step a no-op
            $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // route libxml's parse warnings to muteErrorHandler() for the
        // duration of the load, then put the previous handler back
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $tokens = array();
        $this->tokenizeDOM(
            $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0)-> //   <body>
                  getElementsByTagName('div')->item(0)    //     <div>
            , $tokens);
        return $tokens;
    }

    /**
     * Recursive function that tokenizes a node, putting it into an accumulator.
     *
     * @param $node     DOMNode to be tokenized.
     * @param $tokens   Array-list of already tokenized tokens.
     * @param $collect  Says whether or start and close are collected, set to
     *                  false at first recursion because it's the implicit DIV
     *                  tag you're dealing with.
     * @returns Tokens of node appended to previously passed tokens.
     */
    protected function tokenizeDOM($node, &$tokens, $collect = false) {

        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of