or "img" for . The values for these entries * are associative arrays where the keys are attribute names for those * tags, such as "href" for . Finally, the values for * those arrays are URLs found in those tags and attributes throughout * the text. * * Parameters: * text the UTF-8 text to scan * * Return values: * an associative array where keys are tags and values are an * associative array where keys are attributes and values are * an array of URLs. * * See: * http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page */ function extract_html_urls( $text ) { $match_elements = array( // HTML array('element'=>'a', 'attribute'=>'href'), // 2.0 array('element'=>'a', 'attribute'=>'urn'), // 2.0 array('element'=>'base', 'attribute'=>'href'), // 2.0 array('element'=>'form', 'attribute'=>'action'), // 2.0 array('element'=>'img', 'attribute'=>'src'), // 2.0 array('element'=>'link', 'attribute'=>'href'), // 2.0 array('element'=>'applet', 'attribute'=>'code'), // 3.2 array('element'=>'applet', 'attribute'=>'codebase'), // 3.2 array('element'=>'area', 'attribute'=>'href'), // 3.2 array('element'=>'body', 'attribute'=>'background'), // 3.2 array('element'=>'img', 'attribute'=>'usemap'), // 3.2 array('element'=>'input', 'attribute'=>'src'), // 3.2 array('element'=>'applet', 'attribute'=>'archive'), // 4.01 array('element'=>'applet', 'attribute'=>'object'), // 4.01 array('element'=>'blockquote', 'attribute'=>'cite'), // 4.01 array('element'=>'del', 'attribute'=>'cite'), // 4.01 array('element'=>'frame', 'attribute'=>'longdesc'), // 4.01 array('element'=>'frame', 'attribute'=>'src'), // 4.01 array('element'=>'head', 'attribute'=>'profile'), // 4.01 array('element'=>'iframe', 'attribute'=>'longdesc'), // 4.01 array('element'=>'iframe', 'attribute'=>'src'), // 4.01 array('element'=>'img', 'attribute'=>'longdesc'), // 4.01 array('element'=>'input', 'attribute'=>'usemap'), // 4.01 array('element'=>'ins', 'attribute'=>'cite'), // 4.01 array('element'=>'object', 'attribute'=>'archive'), // 4.01 array('element'=>'object', 'attribute'=>'classid'), // 4.01 array('element'=>'object', 'attribute'=>'codebase'), // 4.01 array('element'=>'object', 'attribute'=>'data'), // 4.01 array('element'=>'object', 'attribute'=>'usemap'), // 4.01 array('element'=>'q', 'attribute'=>'cite'), // 4.01 array('element'=>'script', 'attribute'=>'src'), // 4.01 array('element'=>'audio', 'attribute'=>'src'), // 5.0 array('element'=>'command', 'attribute'=>'icon'), // 5.0 array('element'=>'embed', 'attribute'=>'src'), // 5.0 array('element'=>'event-source','attribute'=>'src'), // 5.0 array('element'=>'html', 'attribute'=>'manifest'), // 5.0 array('element'=>'source', 'attribute'=>'src'), // 5.0 array('element'=>'video', 'attribute'=>'src'), // 5.0 array('element'=>'video', 'attribute'=>'poster'), // 5.0 array('element'=>'bgsound', 'attribute'=>'src'), // Extension array('element'=>'body', 'attribute'=>'credits'), // Extension array('element'=>'body', 'attribute'=>'instructions'), // Extension array('element'=>'body', 'attribute'=>'logo'), // Extension array('element'=>'div', 'attribute'=>'href'), // Extension array('element'=>'div', 'attribute'=>'src'), // Extension array('element'=>'embed', 'attribute'=>'code'), // Extension array('element'=>'embed', 'attribute'=>'pluginspage'), // Extension array('element'=>'html', 'attribute'=>'background'), // Extension array('element'=>'ilayer', 'attribute'=>'src'), // Extension array('element'=>'img', 'attribute'=>'dynsrc'), // Extension array('element'=>'img', 'attribute'=>'lowsrc'), // Extension array('element'=>'input', 'attribute'=>'dynsrc'), // Extension array('element'=>'input', 'attribute'=>'lowsrc'), // Extension array('element'=>'table', 'attribute'=>'background'), // Extension array('element'=>'td', 'attribute'=>'background'), // Extension array('element'=>'th', 'attribute'=>'background'), // Extension array('element'=>'layer', 'attribute'=>'src'), // Extension array('element'=>'xml', 'attribute'=>'src'), // Extension array('element'=>'button', 'attribute'=>'action'), // Forms 2.0 array('element'=>'datalist', 'attribute'=>'data'), // Forms 2.0 array('element'=>'form', 'attribute'=>'data'), // Forms 2.0 array('element'=>'input', 'attribute'=>'action'), // Forms 2.0 array('element'=>'select', 'attribute'=>'data'), // Forms 2.0 // XHTML array('element'=>'html', 'attribute'=>'xmlns'), // WML array('element'=>'access', 'attribute'=>'path'), // 1.3 array('element'=>'card', 'attribute'=>'onenterforward'), // 1.3 array('element'=>'card', 'attribute'=>'onenterbackward'),// 1.3 array('element'=>'card', 'attribute'=>'ontimer'), // 1.3 array('element'=>'go', 'attribute'=>'href'), // 1.3 array('element'=>'option', 'attribute'=>'onpick'), // 1.3 array('element'=>'template', 'attribute'=>'onenterforward'), // 1.3 array('element'=>'template', 'attribute'=>'onenterbackward'),// 1.3 array('element'=>'template', 'attribute'=>'ontimer'), // 1.3 array('element'=>'wml', 'attribute'=>'xmlns'), // 2.0 ); $match_metas = array( 'content-base', 'content-location', 'referer', 'location', 'refresh', ); // Extract all elements if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) ) return array( ); $elements = $matches[1]; $value_pattern = '=(("([^"]*)")|([^\s]*))'; // Match elements and attributes foreach ( $match_elements as $match_element ) { $name = $match_element['element']; $attr = $match_element['attribute']; $pattern = '/^' . $name . '\s.*' . $attr . $value_pattern . '/iu'; if ( $name == 'object' ) $split_pattern = '/\s*/u'; // Space-separated URL list else if ( $name == 'archive' ) $split_pattern = '/,\s*/u'; // Comma-separated URL list else unset( $split_pattern ); // Single URL foreach ( $elements as $element ) { if ( !preg_match( $pattern, $element, $match ) ) continue; $m = empty($match[3]) ? (!empty($match[4])?$match[4]:'') : $match[3]; if ( !isset( $split_pattern ) ) $urls[$name][$attr][] = $m; else { $msplit = preg_split( $split_pattern, $m ); foreach ( $msplit as $ms ) $urls[$name][$attr][] = $ms; } } } // Match meta http-equiv elements foreach ( $match_metas as $match_meta ) { $attr_pattern = '/http-equiv="?' . $match_meta . '"?/iu'; $content_pattern = '/content' . $value_pattern . '/iu'; $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu'; foreach ( $elements as $element ) { if ( !preg_match( '/^meta/iu', $element ) || !preg_match( $attr_pattern, $element ) || !preg_match( $content_pattern, $element, $match ) ) continue; $m = empty($match[3]) ? $match[4] : $match[3]; if ( $match_meta != 'refresh' ) $urls['meta']['http-equiv'][] = $m; else if ( preg_match( $refresh_pattern, $m, $match ) ) $urls['meta']['http-equiv'][] = $match[2]; } } // Match style attributes $urls['style'] = array( ); $style_pattern = '/style' . $value_pattern . '/iu'; foreach ( $elements as $element ) { if ( !preg_match( $style_pattern, $element, $match ) ) continue; $m = empty($match[3]) ? $match[4] : $match[3]; $style_urls = extract_css_urls( $m ); if ( !empty( $style_urls ) ) $urls['style'] = array_merge_recursive( $urls['style'], $style_urls ); } // Match style bodies if ( preg_match_all( '/]*>(.*?)<\/style>/siu', $text, $style_bodies ) ) { foreach ( $style_bodies[1] as $style_body ) { $style_urls = extract_css_urls( $style_body ); if ( !empty( $style_urls ) ) $urls['style'] = array_merge_recursive( $urls['style'], $style_urls ); } } if ( empty($urls['style']) ) unset( $urls['style'] ); return $urls; } /** * Extract URLs from UTF-8 CSS text. * * URLs within @import statements and url() property functions are extracted * and returned in an associative array of arrays. Array keys indicate * the use context for the URL, including: * * "import" * "property" * * Each value in the associative array is an array of URLs. * * Parameters: * text the UTF-8 text to scan * * Return values: * an associative array of arrays of URLs. * * See: * http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file */ function extract_css_urls( $text ) { $urls = array( ); $url_pattern = '(([^\\\\\'", \(\)]*(\\\\.)?)+)'; $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)'; $pattern = '/(' . '(@import\s*[\'"]' . $url_pattern . '[\'"])' . '|(@import\s*' . $urlfunc_pattern . ')' . '|(' . $urlfunc_pattern . ')' . ')/iu'; if ( !preg_match_all( $pattern, $text, $matches ) ) return $urls; // @import '...' // @import "..." foreach ( $matches[3] as $match ) if ( !empty($match) ) $urls['import'][] = preg_replace( '/\\\\(.)/u', '\\1', $match ); // @import url(...) // @import url('...') // @import url("...") foreach ( $matches[7] as $match ) if ( !empty($match) ) $urls['import'][] = preg_replace( '/\\\\(.)/u', '\\1', $match ); // url(...) // url('...') // url("...") foreach ( $matches[11] as $match ) if ( !empty($match) ) $urls['property'][] = preg_replace( '/\\\\(.)/u', '\\1', $match ); return $urls; }