fichier = $filename; }

	/**
	 * Extracts the text to index from the archive.
	 *
	 * Walks every entry of the zip archive and keeps only the entries whose
	 * immediate parent directory is named "X". Each such entry is written to
	 * a temporary file, read back through readgzfile(), parsed with xml_dom,
	 * and the CONTENT attribute of every
	 * alto/Layout/Page/PrintSpace/TextBlock/TextLine/String node is appended
	 * to the result, each value preceded by a single space.
	 *
	 * @param string $filename Path of the zip archive to read.
	 * @return string The concatenated text; an empty string when the archive
	 *                cannot be opened or contains no matching content.
	 */
	public function get_text($filename){
		// Initialised up front so the method always returns a string and the
		// ".=" below never operates on an undefined variable.
		$texte_final = '';

		$this->zip = zip_open($filename);
		// zip_open() returns an integer error code on failure, which is
		// truthy; only a real resource handle may be passed to zip_read().
		if (!is_resource($this->zip)) {
			return $texte_final;
		}

		// zip_read() returns false (or an error code) once entries run out.
		while (is_resource($zip_entry = zip_read($this->zip))) {
			// Name of the directory that directly contains the entry.
			$tab = explode("/", dirname(zip_entry_name($zip_entry)));
			$type_images_doc_num = $tab[count($tab) - 1];
			if ($type_images_doc_num != "X") {
				continue;
			}
			if (!zip_entry_open($this->zip, $zip_entry, "r")) {
				continue;
			}
			$xmlGz = zip_entry_read($zip_entry, zip_entry_filesize($zip_entry));
			// Release the entry as soon as its bytes are in memory.
			zip_entry_close($zip_entry);
			if ($xmlGz === false) {
				continue;
			}

			// readgzfile() only works on a file path, hence the temp file.
			$tmpfile = tempnam("/tmp", "ocr");
			if ($tmpfile === false) {
				continue;
			}
			if (@file_put_contents($tmpfile, $xmlGz) === false) {
				@unlink($tmpfile);
				continue;
			}
			// readgzfile() writes to the output stream; capture it in a buffer.
			ob_start();
			readgzfile($tmpfile);
			$xml = ob_get_clean();
			// The temporary file is no longer needed: do not leave it behind.
			@unlink($tmpfile);

			$xml_dom = new xml_dom($xml, "iso-8859-1");
			$textBlocs = @$xml_dom->get_nodes("alto/Layout/Page/PrintSpace/TextBlock");
			if (!$textBlocs) {
				continue;
			}
			foreach ($textBlocs as $textBloc) {
				$textlines = $xml_dom->get_nodes("TextLine", $textBloc);
				foreach ($textlines as $textline) {
					$strings = $xml_dom->get_nodes("String", $textline);
					foreach ($strings as $string) {
						$attrs = $xml_dom->get_attributes($string);
						foreach ($attrs as $attr => $value) {
							if ($attr == 'CONTENT') {
								$texte_final .= " ".$value;
							}
						}
					}
				}
			}
		}

		// Every entry has been consumed, so the handle can be released.
		zip_close($this->zip);

		return $texte_final;
	}
}
?>