'), $link_matches);
if($link_matches[0]) {
$html_link = $link_matches[0];
}
}
$content_from_link[] = array(
'content' => $html_content,
'title' => $html_title,
'link' => $html_link
);
}
}
libxml_use_internal_errors($old_errors_value);
} else {
$content_from_link[] = array(
'content' => $this->clean_html($html),
'title' => '',
'link' => ''
);
}
}
}
//Dom renvoie de l'utf-8. Mais certains caractères windows peuvent être présents dans les pages...
if($charset != "utf-8"){
foreach ($content_from_link as $key=>$content) {
foreach ($content as $key_content=>$value_content) {
$content_from_link[$key][$key_content] = utf8_decode(encoding_normalize::clean_cp1252($value_content,'utf-8'));
}
}
}
return $content_from_link;
}
/**
 * Reduces a URL to its root: "scheme://host[:port]/".
 *
 * The scheme prefix is omitted when the URL has none. An explicit port is kept,
 * so that links rebuilt from the root still target the same server.
 *
 * @param string $url URL to reduce.
 * @return string Root URL, always ending with "/". When $url cannot be parsed
 *                or has no host, the result degrades to "/" (or "scheme:///").
 */
protected function find_root_url($url) {
    $parts = parse_url($url);
    if (!is_array($parts)) {
        // parse_url() returns false on seriously malformed URLs.
        return '/';
    }
    $root = '';
    if (!empty($parts['scheme'])) {
        $root .= $parts['scheme'].'://';
    }
    if (isset($parts['host'])) {
        $root .= $parts['host'];
    }
    if (isset($parts['port'])) {
        // Without the port, "http://host:8080/page" would be rebuilt as "http://host/".
        $root .= ':'.$parts['port'];
    }
    return $root.'/';
}
/**
 * Builds the URL attached to an item created from the monitored page.
 *
 * - Without $link_for_construct, $link is returned with a "#<timestamp><random>"
 *   fragment appended (timestamp taken from the "Date" response header).
 * - When $link_for_construct is itself an http(s) URL, it is returned as is.
 * - Otherwise an href attribute is looked for in $link_for_construct; an absolute
 *   http(s) href is returned as is, any other href is appended either to the root
 *   of $link (parameter "use_root_url") or to $link itself.
 * - When no href is found, $link_for_construct is appended to $link verbatim.
 *
 * @param string $link               URL of the monitored page.
 * @param string $link_for_construct Absolute URL, HTML fragment containing an href, or relative path.
 * @return string The constructed link.
 */
protected function get_constructed_remote_link($link, $link_for_construct = '') {
    if (!$link_for_construct) {
        // strtotime() yields false (rendered as an empty string) when the header is unusable.
        $timestamp = isset($this->content_headers['Date']) ? strtotime($this->content_headers['Date']) : false;
        return $link.'#'.$timestamp.rand(0,1000);
    }
    if (preg_match('#^https?://#', $link_for_construct)) {
        return $link_for_construct;
    }
    $matches = array();
    preg_match("/\s+(?:[^\"'>]+|\"[^\"]*\"|'[^']*')*href=(\"[^\"]+\"|'[^']+'|[^<>\s]+)/i", ' '.$link_for_construct.' ', $matches);
    if (empty($matches[1])) {
        // No href attribute: treat the value as a path to append.
        return $link.$link_for_construct;
    }
    // The pattern accepts both "..." and '...' values, so both kinds of quotes must go.
    $match_link = trim(str_replace('"', '', $matches[1]), "'");
    if (preg_match('#^https?://#', $match_link)) {
        return $match_link;
    }
    // Everything up to and including the first "/" is dropped before appending.
    // NOTE(review): this also removes a leading directory ("dir/page" becomes "page") — confirm this is intended.
    $first_slash = strpos($match_link, '/');
    if ($first_slash !== false) {
        $match_link = substr($match_link, $first_slash + 1);
    }
    if (!empty($this->parameters['use_root_url'])) {
        return $this->find_root_url($link).$match_link;
    }
    return $link.$match_link;
}
/**
 * Compares the content fetched from $link with the snapshot stored for this
 * datasource and builds one item per detected change.
 *
 * With XPath expressions configured, every fetched fragment whose content hash is
 * not in the stored snapshot becomes an item. Without them, the first fragment of
 * both versions is diffed with xdiff and the changes become either one item each
 * (mode "by_change") or a single merged item. The stored snapshot is then replaced
 * by the fetched content.
 *
 * @param string $link URL of the monitored page.
 * @return array|false List of item data arrays (empty when nothing changed or when the
 *                     first snapshot was just stored); false when no content could be
 *                     fetched or when xdiff is required but not loaded.
 */
protected function get_items_datas($link){
    $items = array();
    // Initialised up front so that "no change" returns an empty list instead of an undefined variable.
    $datas = array();
    $content_from_link = $this->get_content_from_link($link);
    if (!is_array($content_from_link) || !count($content_from_link)) {
        return false;
    }
    $content_hash_from_link = md5(serialize($content_from_link));

    $query = "select datasource_monitoring_website_upload_date, datasource_monitoring_website_content, datasource_monitoring_website_content_hash
from docwatch_datasource_monitoring_website where datasource_monitoring_website_num_datasource = ".$this->id;
    $result = pmb_mysql_query($query);
    if (!$result || !pmb_mysql_num_rows($result)) {
        // No stored snapshot: save the fetched content, there is nothing to compare against.
        $this->content = $content_from_link;
        $this->content_hash = $content_hash_from_link;
        $this->save_content(true);
        return $datas;
    }

    $row = pmb_mysql_fetch_object($result);
    // Strict comparison: with "!=" two digests of the form "0e<digits>" compare equal as numbers.
    if ($content_hash_from_link === $row->datasource_monitoring_website_content_hash) {
        return $datas;
    }

    // The serialized snapshot comes from this datasource's own table row.
    $content_from_base = unserialize($row->datasource_monitoring_website_content);

    if (!empty($this->parameters['xpath_expressions']) && is_array($this->parameters['xpath_expressions'])) {
        $hash_from_base = array();
        if (is_array($content_from_base)) {
            foreach ($content_from_base as $content) {
                $hash_from_base[] = md5($content['content']);
            }
        }
        foreach ($content_from_link as $content) {
            if (!in_array(md5($content['content']), $hash_from_base, true)) {
                $items[] = array(
                    'content' => $content['content'],
                    'title' => ($content['title'] ? $content['title'] : $this->get_title()),
                    'link' => $this->get_constructed_remote_link($link, $content['link'])
                );
            }
        }
    } else {
        if (!extension_loaded('xdiff')) {
            return false;
        }
        // A missing or unreadable snapshot is treated as empty content.
        $content_before = (is_array($content_from_base) && isset($content_from_base[0]['content'])) ? (string) $content_from_base[0]['content'] : '';
        $content_after = isset($content_from_link[0]['content']) ? (string) $content_from_link[0]['content'] : '';
        if ($content_before !== $content_after) {
            $xdiff_string = xdiff_string_diff($content_before, $content_after);
            $xdiff_change = $this->get_xdiff_change($xdiff_string);
            if ($this->parameters['mode_creation_items'] == 'by_change') {
                foreach ($xdiff_change as $change) {
                    $items[] = array(
                        'content' => implode('', $change),
                        'title' => $this->get_title(),
                        'link' => $this->get_constructed_remote_link($link)
                    );
                }
            } else {
                $item_content = '';
                foreach ($xdiff_change as $change) {
                    $item_content .= implode('', $change);
                }
                $items[] = array(
                    'content' => $item_content,
                    'title' => $this->get_title(),
                    'link' => $this->get_constructed_remote_link($link)
                );
            }
        }
    }

    if (count($items)) {
        // Same publication date for every item: the "Date" header of the fetched page.
        $publication_date = date('Y-m-d H:i:s', strtotime($this->content_headers['Date']));
        foreach ($items as $item) {
            $data = array();
            $data["type"] = "monitoring_website";
            $data["title"] = $item['title'];
            $data["summary"] = $item['content'];
            $data["content"] = '';
            $data["url"] = $item['link'];
            $data["publication_date"] = $publication_date;
            $data["logo_url"] = '';
            $data["descriptors"] = "";
            $data["tags"] = '';
            $datas[] = $data;
        }
    }

    // Replace the stored snapshot by the content just fetched.
    $this->content = $content_from_link;
    $this->content_hash = $content_hash_from_link;
    $this->save_content(false);
    return $datas;
}
/**
 * Lists the selectors this datasource can be fed by.
 *
 * @return array Map of selector identifier => label read from the global $msg table.
 */
public function get_available_selectors(){
    global $msg;
    $selectors = array();
    $selectors["docwatch_selector_monitoring_website"] = $msg['dsi_docwatch_selector_monitoring_website'];
    return $selectors;
}
/**
 * Returns the form fragment for one XPath expression row.
 *
 * @param string $parameter_name Name of the parameter the row belongs to.
 * @param mixed  $i              Row index; index 0 gets an extra (currently empty) suffix.
 * @param string $expression     Expression shown in the row.
 * @return string Form fragment.
 */
protected function get_expression_xpath_content($parameter_name, $i, $expression) {
    global $msg;
    $first_row_suffix = "";
    $fragment = "
";
    // Loose comparison on purpose: the index is compared the same way as before.
    if ($i == 0) {
        $fragment .= $first_row_suffix;
    }
    return $fragment;
}
/**
 * Builds the form section listing the XPath expressions stored under one parameter.
 *
 * One row is rendered per stored expression; when none is stored, a single empty
 * row with index 0 is rendered instead.
 *
 * @param string $parameter_name Key of $this->parameters holding the expressions.
 * @return string Form section.
 */
protected function get_xpath_expressions_form($parameter_name) {
    global $msg,$charset;
    $separator = "
";
    $stored = $this->parameters[$parameter_name];
    // Fall back to one blank row when nothing usable is stored.
    $rows = array(0 => '');
    if (is_array($stored) && count($stored)) {
        $rows = $stored;
    }
    $section = $separator;
    foreach ($rows as $index => $value) {
        $section .= $separator.$this->get_expression_xpath_content($parameter_name, $index, $value).$separator;
    }
    return $section;
}
/**
 * Builds the settings form of this datasource: the parent form, followed by the
 * three XPath expression sections (content, title, link) and two radio buttons
 * bound to the "use_root_url" parameter (values 0 and 1).
 *
 * @return string Form markup.
 */
public function get_form_content(){
global $msg,$charset;
$form = parent::get_form_content();
// NOTE(review): the two radio-button lines below look truncated. The text between
// $msg[...] and "parameters[" (presumably an opening input tag followed by the start
// of a "(this->parameters[...] ? ... : ...)" expression) is missing, so the statement
// does not parse as shown. Confirm against the original file before editing.
// The first radio (value 0) is checked when use_root_url is falsy, the second
// (value 1) when it is truthy; $msg['39'] / $msg['40'] are their labels.
$form .= "
".$this->get_xpath_expressions_form('xpath_expressions')."
".$this->get_xpath_expressions_form('xpath_expressions_for_title')."
".$this->get_xpath_expressions_form('xpath_expressions_for_link')."
".$msg['39']."parameters['use_root_url'] ? "":"checked='checked'")." type='radio' data-dojo-type='dijit/form/RadioButton' name='docwatch_datasource_monitoring_website_use_root_url' value='0' />
".$msg['40']."parameters['use_root_url'] ? "checked='checked'":"")." type='radio' data-dojo-type='dijit/form/RadioButton' name='docwatch_datasource_monitoring_website_use_root_url' value='1' />
";
return $form;
}
/**
 * Copies the submitted form values into $this->parameters, then lets the parent
 * class handle its own fields.
 *
 * The three XPath expression lists are rebuilt from scratch: empty entries are
 * dropped and the remaining ones are run through stripslashes(). A list that was
 * not submitted (or is not an array) results in an empty list.
 *
 * @return void
 */
public function set_from_form() {
    global $docwatch_datasource_monitoring_website_mode_creation_items;
    global $docwatch_datasource_monitoring_website_xpath_expressions;
    global $docwatch_datasource_monitoring_website_xpath_expressions_for_title;
    global $docwatch_datasource_monitoring_website_xpath_expressions_for_link;
    global $docwatch_datasource_monitoring_website_use_root_url;

    $this->parameters['mode_creation_items'] = stripslashes($docwatch_datasource_monitoring_website_mode_creation_items);

    // Parameter name => submitted list; handled identically for all three.
    $submitted_lists = array(
        'xpath_expressions' => $docwatch_datasource_monitoring_website_xpath_expressions,
        'xpath_expressions_for_title' => $docwatch_datasource_monitoring_website_xpath_expressions_for_title,
        'xpath_expressions_for_link' => $docwatch_datasource_monitoring_website_xpath_expressions_for_link
    );
    foreach ($submitted_lists as $parameter_name => $submitted) {
        $this->parameters[$parameter_name] = array();
        if (!is_array($submitted)) {
            // Field absent from the request: keep the empty list instead of iterating over null.
            continue;
        }
        foreach ($submitted as $expression) {
            if ($expression) {
                $this->parameters[$parameter_name][] = stripslashes($expression);
            }
        }
    }

    $this->parameters['use_root_url'] = $docwatch_datasource_monitoring_website_use_root_url;
    parent::set_from_form();
}
} // end of docwatch_datasource_monitoring_website