'), $link_matches);
if($link_matches[0]) {
$html_link = $link_matches[0];
}
}
$content_from_link[] = array(
'content' => $html_content,
'title' => $html_title,
'link' => $html_link
);
}
}
}
libxml_use_internal_errors($old_errors_value);
} else {
$content_from_link[] = array(
'content' => $this->clean_html($html),
'title' => '',
'link' => ''
);
}
}
}
//Dom renvoie de l'utf-8. Mais certains caractères windows peuvent être présents dans les pages...
if($charset != "utf-8"){
foreach ($content_from_link as $key=>$content) {
foreach ($content as $key_content=>$value_content) {
$content_from_link[$key][$key_content] = utf8_decode(encoding_normalize::clean_cp1252($value_content,'utf-8'));
}
}
}
return $content_from_link;
}
/**
 * Reduces a URL to its root: "scheme://host[:port]/".
 *
 * Components that parse_url() does not report are simply left out instead of
 * being read blindly (which raised "undefined index" notices), and an explicit
 * port is kept so that the root still designates the same server.
 *
 * @param string $url URL to reduce
 * @return string Root URL, always terminated by "/" (just "/" when no scheme/host can be parsed)
 */
protected function find_root_url($url) {
    $parts = parse_url($url);
    if(!is_array($parts)) {
        // parse_url() returns false on seriously malformed URLs
        $parts = array();
    }
    $root = '';
    if(!empty($parts['scheme'])) {
        $root .= $parts['scheme'].'://';
    }
    if(isset($parts['host'])) {
        $root .= $parts['host'];
        if(isset($parts['port'])) {
            // Without the port, links rebuilt from the root would target another service
            $root .= ':'.$parts['port'];
        }
    }
    return $root.'/';
}
/**
 * Builds the link stored on an item, starting from the monitored page URL.
 *
 * - Empty $link_for_construct: the page URL is returned with a
 *   "#<timestamp of the Date header><random number>" fragment appended.
 * - $link_for_construct starting with http:// or https://: returned as is.
 * - Otherwise the first href attribute found in $link_for_construct is extracted.
 *   An absolute http(s) href is returned as is; any other href is appended to the
 *   page URL (or to its root when the 'use_root_url' parameter is set), avoiding a
 *   doubled "/" at the junction.
 * - When no href can be found, $link_for_construct itself is appended to the page URL.
 *
 * @param string $link URL of the monitored page
 * @param string $link_for_construct Link, or markup containing an href attribute (optional)
 * @return string
 */
protected function get_constructed_remote_link($link, $link_for_construct = '') {
    if(!isset($this->parameters['use_root_url'])) $this->parameters['use_root_url']=0;

    if(!$link_for_construct) {
        // A missing/unparsable Date header contributes an empty string, as before
        $timestamp = isset($this->content_headers['Date']) ? strtotime($this->content_headers['Date']) : false;
        return $link.'#'.$timestamp.rand(0,1000);
    }

    if(substr($link_for_construct, 0, 7) == 'http://' || substr($link_for_construct, 0, 8) == 'https://') {
        return $link_for_construct;
    }

    $matches = array();
    preg_match("/href=(\"[^\"]+\"|'[^']+'|[^<>\s]+)/i", ' '.$link_for_construct.' ', $matches);
    if(empty($matches[1])) {
        // No href attribute: treat the value as a plain suffix of the page URL
        return $link.$link_for_construct;
    }

    $match_link = $matches[1];
    if(strlen($match_link) >= 2 && $match_link[0] === "'" && substr($match_link, -1) === "'") {
        // The pattern also accepts href='...': drop the surrounding single quotes,
        // otherwise they end up inside the constructed URL
        $match_link = substr($match_link, 1, -1);
    } else {
        $match_link = str_replace('"', '', $match_link);
    }

    if(substr($match_link, 0, 7) == 'http://' || substr($match_link, 0, 8) == 'https://') {
        return $match_link;
    }

    if($this->parameters['use_root_url']) {
        $link = $this->find_root_url($link);
    }
    if(substr($link, -1) === '/' && substr($match_link, 0, 1) === '/') {
        // Both sides carry a slash: keep only one
        return $link.substr($match_link, 1);
    }
    return $link.$match_link;
}
/**
 * Compares what is currently extracted from $link with the stored content and
 * builds one item data array per new or changed piece of content.
 *
 * Two strategies, depending on the 'xpath_expressions' parameter:
 * - expressions defined: every extracted entry whose content was not present in
 *   the stored content becomes an item;
 * - no expression: the first stored entry is diffed against the first extracted
 *   entry with xdiff, producing either one item per change
 *   ('mode_creation_items' == 'by_change') or a single item holding all changes.
 *
 * When the content differs from the stored one, the new content and its hash are
 * saved through save_content().
 *
 * @param string $link URL of the monitored page
 * @return array|false List of item data arrays (empty when nothing changed); false when
 *                     nothing was extracted from the page, or when a diff is required
 *                     but the xdiff extension is not loaded (nothing is saved then)
 */
protected function get_items_datas($link){
    $content_from_link = $this->get_content_from_link($link);
    if(!is_array($content_from_link) || !count($content_from_link)) {
        return false;
    }
    $content_hash_from_link = md5(serialize($content_from_link));

    $datas = array();
    $stored_data = $this->get_stored_data();
    // Strict comparison: loosely compared md5 strings such as "0e1234..." are
    // evaluated as numbers and can be reported equal although they differ
    $stored_hash = isset($stored_data['content_hash']) ? (string) $stored_data['content_hash'] : '';
    if($content_hash_from_link === $stored_hash) {
        return $datas;
    }

    $content_from_base = isset($stored_data['content']) ? unserialize($stored_data['content']) : false;
    $items = array();

    if(isset($this->parameters['xpath_expressions']) && is_array($this->parameters['xpath_expressions']) && count($this->parameters['xpath_expressions'])) {
        // Index the stored contents by hash for an exact, constant-time lookup
        $hash_from_base = array();
        if(is_array($content_from_base)) {
            foreach ($content_from_base as $content) {
                $hash_from_base[md5($content['content'])] = true;
            }
        }
        foreach ($content_from_link as $content) {
            if(!isset($hash_from_base[md5($content['content'])])) {
                $items[] = array(
                    'content' => $content['content'],
                    'title' => ($content['title'] ? $content['title'] : $this->get_title()),
                    'link' => $this->get_constructed_remote_link($link, $content['link'])
                );
            }
        }
    } else {
        if(!extension_loaded('xdiff')) {
            return false;
        }
        // Nothing usable stored yet: diff against an empty string
        $base_content = '';
        if(is_array($content_from_base) && isset($content_from_base[0]['content'])) {
            $base_content = (string) $content_from_base[0]['content'];
        }
        $link_content = isset($content_from_link[0]['content']) ? (string) $content_from_link[0]['content'] : '';
        if($base_content !== $link_content) {
            $xdiff_string = xdiff_string_diff($base_content, $link_content);
            $xdiff_change = $this->get_xdiff_change($xdiff_string);
            if(isset($this->parameters['mode_creation_items']) && $this->parameters['mode_creation_items'] == 'by_change') {
                foreach ($xdiff_change as $change) {
                    $items[] = array(
                        'content' => implode('', $change),
                        'title' => $this->get_title(),
                        'link' => $this->get_constructed_remote_link($link)
                    );
                }
            } else {
                $item_content = '';
                foreach ($xdiff_change as $change) {
                    $item_content .= implode('', $change);
                }
                $items[] = array(
                    'content' => $item_content,
                    'title' => $this->get_title(),
                    'link' => $this->get_constructed_remote_link($link)
                );
            }
        }
    }

    // Same date for every item; without a usable Date header, fall back to the
    // current time instead of letting date() format "false" as 1970-01-01
    $timestamp = isset($this->content_headers['Date']) ? strtotime($this->content_headers['Date']) : false;
    if($timestamp === false) {
        $timestamp = time();
    }
    $publication_date = date('Y-m-d H:i:s', $timestamp);

    foreach ($items as $item) {
        $datas[] = array(
            "type" => "monitoring_website",
            "title" => $item['title'],
            "summary" => $item['content'],
            "content" => '',
            "url" => $item['link'],
            "publication_date" => $publication_date,
            "logo_url" => '',
            "descriptors" => "",
            "tags" => ''
        );
    }

    $this->content = $content_from_link;
    $this->content_hash = $content_hash_from_link;
    $this->save_content(isset($stored_data['is_first']) ? $stored_data['is_first'] : null);
    return $datas;
}
/**
 * Lists the selectors usable with this datasource.
 *
 * @return array Selector identifier => label read from the global $msg table
 */
public function get_available_selectors(){
    global $msg;
    $selectors = array();
    $selectors["docwatch_selector_monitoring_website"] = $msg['dsi_docwatch_selector_monitoring_website'];
    return $selectors;
}
/**
 * Returns the form fragment for one XPath expression entry.
 *
 * NOTE(review): in this copy of the file the string literals below contain only a
 * line break / an empty string, and $parameter_name, $expression, $msg and $charset
 * are never read. Presumably the markup they were interpolated into has been lost;
 * confirm against the original template before changing this method.
 *
 * @param string $parameter_name Key of $this->parameters the entry belongs to (not read in the visible code)
 * @param int|string $i Key of the entry; an extra fragment is appended when it equals 0
 * @param string $expression XPath expression of the entry (not read in the visible code)
 * @return string
 */
protected function get_expression_xpath_content($parameter_name, $i, $expression) {
global $msg, $charset;
$form = "
";
// Extra fragment appended for the first entry only
if($i == 0) {
$form .= "";
}
return $form;
}
/**
 * Builds the form section listing the XPath expressions stored under
 * $this->parameters[$parameter_name].
 *
 * One fragment is produced per stored expression through
 * get_expression_xpath_content(); when nothing is stored, a single fragment for an
 * empty expression at position 0 is produced instead.
 *
 * NOTE(review): the opening/closing string literals only contain line breaks or are
 * empty in this copy of the file; presumably the surrounding markup has been lost.
 *
 * @param string $parameter_name Key of $this->parameters holding the expressions
 * @return string
 */
protected function get_xpath_expressions_form($parameter_name) {
global $msg,$charset;
// Default the parameter so the is_array() test below never reads an undefined key
if(!isset($this->parameters[$parameter_name])) $this->parameters[$parameter_name]='';
$form = "
";
if(is_array($this->parameters[$parameter_name]) && count($this->parameters[$parameter_name])) {
// One fragment per stored expression, keyed by its position in the parameter array
foreach ($this->parameters[$parameter_name] as $i=>$expression) {
$form .= "
".$this->get_expression_xpath_content($parameter_name, $i, $expression)."
";
}
$form .= "";
} else {
// Nothing stored: a single entry for an empty expression at position 0
$form .= "
".$this->get_expression_xpath_content($parameter_name, 0, '')."
";
}
$form .= "";
return $form;
}
/**
 * Returns the notice shown when the xdiff PHP extension is not loaded.
 *
 * @return string Empty string when xdiff is loaded; otherwise the HTML-escaped
 *                'dsi_docwatch_datasource_monitoring_website_xdiff' message
 *                surrounded by line breaks
 */
protected function get_xdiff_informations(){
    global $msg, $charset;
    if(extension_loaded('xdiff')) {
        return "";
    }
    $warning = htmlentities($msg['dsi_docwatch_datasource_monitoring_website_xdiff'],ENT_QUOTES,$charset);
    return "
".$warning."
";
}
/**
 * Returns the datasource configuration form: the parent form content followed by
 * the website-monitoring specific part.
 *
 * The specific part concatenates the three XPath expression sections
 * ('xpath_expressions', 'xpath_expressions_for_title', 'xpath_expressions_for_link'),
 * the "xdiff only" message, the xdiff availability notice and two radio buttons
 * named docwatch_datasource_monitoring_website_use_root_url (values 0 and 1).
 *
 * NOTE(review): several lines of the string below start with a bare
 * "parameters[...]" fragment, without the markup and the "$this->" that should
 * precede it; as shown here the concatenation is not valid PHP. The markup appears
 * to have been stripped from this copy of the file: restore it from the original
 * template before editing this method.
 *
 * @return string
 */
public function get_form_content(){
global $msg,$charset;
$form = parent::get_form_content();
// Default both parameters so the form can be built for a datasource that was never saved
if(!isset($this->parameters['mode_creation_items'])) $this->parameters['mode_creation_items']='';
if(!isset($this->parameters['use_root_url'])) $this->parameters['use_root_url']=0;
$form .= "
parameters['mode_creation_items'] ? "style='display:none;'" : "").">
".$this->get_xpath_expressions_form('xpath_expressions')."
".$this->get_xpath_expressions_form('xpath_expressions_for_title')."
".$this->get_xpath_expressions_form('xpath_expressions_for_link')."
parameters['mode_creation_items'] ? "style='display:none;'" : "").">
".htmlentities($msg['dsi_docwatch_datasource_monitoring_website_xdiff_only'],ENT_QUOTES,$charset)."
".$this->get_xdiff_informations()."
".$msg['39']."parameters['use_root_url'] ? "":"checked='checked'")." type='radio' data-dojo-type='dijit/form/RadioButton' name='docwatch_datasource_monitoring_website_use_root_url' value='0' />
".$msg['40']."parameters['use_root_url'] ? "checked='checked'":"")." type='radio' data-dojo-type='dijit/form/RadioButton' name='docwatch_datasource_monitoring_website_use_root_url' value='1' />
";
return $form;
}
/**
 * Copies the submitted form values (available as globals) into $this->parameters,
 * then lets the parent class process its own fields.
 *
 * The three XPath expression lists are rebuilt from scratch: empty entries are
 * dropped and slashes are stripped from the kept ones. A list that was not
 * submitted as an array ends up empty.
 */
public function set_from_form() {
    global $docwatch_datasource_monitoring_website_mode_creation_items;
    global $docwatch_datasource_monitoring_website_xpath_expressions;
    global $docwatch_datasource_monitoring_website_xpath_expressions_for_title;
    global $docwatch_datasource_monitoring_website_xpath_expressions_for_link;
    global $docwatch_datasource_monitoring_website_use_root_url;

    $this->parameters['mode_creation_items'] = stripslashes($docwatch_datasource_monitoring_website_mode_creation_items);

    // Parameter name => submitted list, processed in the order they are stored
    $submitted_expressions = array(
        'xpath_expressions' => $docwatch_datasource_monitoring_website_xpath_expressions,
        'xpath_expressions_for_title' => $docwatch_datasource_monitoring_website_xpath_expressions_for_title,
        'xpath_expressions_for_link' => $docwatch_datasource_monitoring_website_xpath_expressions_for_link
    );
    foreach ($submitted_expressions as $parameter_name => $expressions) {
        $this->parameters[$parameter_name] = array();
        if(!is_array($expressions)) {
            continue;
        }
        foreach ($expressions as $expression) {
            if($expression) {
                $this->parameters[$parameter_name][] = stripslashes($expression);
            }
        }
    }

    $this->parameters['use_root_url'] = $docwatch_datasource_monitoring_website_use_root_url;
    parent::set_from_form();
}
} // end of docwatch_datasource_monitoring_website