normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    protected $_norms = array();

    /**
     * Handle of the '.fdx' file (stored fields index).
     * Stays null until addStoredFields() is called for the first time.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdxFile = null;

    /**
     * Handle of the '.fdt' file (stored fields data).
     * Stays null until addStoredFields() is called for the first time.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdtFile = null;

    /**
     * Object constructor.
     *
     * Remembers the directory the segment files are created in and the
     * segment name used as the base name of every segment file.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        $this->_name      = $name;
        $this->_directory = $directory;
    }

    /**
     * Register a document field within the segment.
     *
     * An unknown field name gets the next free field number. For a known
     * name the 'isIndexed' and 'storeTermVector' flags are OR-ed into the
     * already registered field info.
     *
     * @param Zend_Search_Lucene_Field $field
     * @return integer  actual field number
     */
    public function addField(Zend_Search_Lucene_Field $field)
    {
        $fieldName = $field->name;

        if (isset($this->_fields[$fieldName])) {
            // Known field: merge flags into the registered info object
            $registered = $this->_fields[$fieldName];
            $registered->isIndexed       = $registered->isIndexed       | $field->isIndexed;
            $registered->storeTermVector = $registered->storeTermVector | $field->storeTermVector;

            return $registered->number;
        }

        // New field: its number is the count of fields registered so far
        $newNumber = count($this->_fields);
        $this->_fields[$fieldName] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
                                                                            $field->isIndexed,
                                                                            $newNumber,
                                                                            $field->storeTermVector);

        return $newNumber;
    }

    /**
     * Register a field described by a FieldInfo object.
     *
     * The passed object is not stored itself: an unknown name gets a fresh
     * FieldInfo carrying the next free field number, a known name has the
     * 'isIndexed' and 'storeTermVector' flags OR-ed into its existing info.
     *
     * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
     * @return integer  actual field number
     */
    public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
    {
        $fieldName = $fieldInfo->name;

        if (isset($this->_fields[$fieldName])) {
            // Known field: merge flags into the registered info object
            $registered = $this->_fields[$fieldName];
            $registered->isIndexed       = $registered->isIndexed       | $fieldInfo->isIndexed;
            $registered->storeTermVector = $registered->storeTermVector | $fieldInfo->storeTermVector;

            return $registered->number;
        }

        // New field: its number is the count of fields registered so far
        $newNumber = count($this->_fields);
        $this->_fields[$fieldName] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
                                                                            $fieldInfo->isIndexed,
                                                                            $newNumber,
                                                                            $fieldInfo->storeTermVector);

        return $newNumber;
    }

    /**
     * Returns the registered FieldInfo objects, keyed by field name.
     *
     * @return array
     */
    public function getFieldInfos()
    {
        return $this->_fields;
    }

    /**
     * Write the stored fields of one document and count the document.
     *
     * The '.fdx' and '.fdt' files are created on the first call. Every call
     * appends the current '.fdt' position to '.fdx' and then one record per
     * stored field to '.fdt'.
     *
     * @param array $storedFields array of Zend_Search_Lucene_Field objects
     */
    public function addStoredFields($storedFields)
    {
        if (!isset($this->_fdxFile)) {
            $fdxName = $this->_name . '.fdx';
            $fdtName = $this->_name . '.fdt';

            $this->_fdxFile = $this->_directory->createFile($fdxName);
            $this->_fdtFile = $this->_directory->createFile($fdtName);

            $this->_files[] = $fdxName;
            $this->_files[] = $fdtName;
        }

        $fdtFile = $this->_fdtFile;

        // Index entry: where this document's field data starts
        $this->_fdxFile->writeLong($fdtFile->tell());
        $fdtFile->writeVInt(count($storedFields));

        foreach ($storedFields as $storedField) {
            $fdtFile->writeVInt($this->_fields[$storedField->name]->number);

            // Bit 0x01 - tokenized, bit 0x02 - binary.
            // Bit 0x04 (compressed, ZLIB) is never set here.
            $flags = 0x00;
            if ($storedField->isTokenized) {
                $flags |= 0x01;
            }
            if ($storedField->isBinary) {
                $flags |= 0x02;
            }
            $fdtFile->writeByte($flags);

            if ($storedField->isBinary) {
                // Binary value: explicit byte length followed by raw bytes
                $fdtFile->writeVInt(strlen($storedField->value));
                $fdtFile->writeBytes($storedField->value);
            } else {
                $fdtFile->writeString($storedField->getUtf8Value());
            }
        }

        $this->_docCount++;
    }

    /**
     * Number of documents added to this segment so far.
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }

    /**
     * Segment name, as passed to the constructor.
     *
     * @return string
     */
    public function getName()
    {
        return $this->_name;
    }

    /**
     * Dump Field Info (.fnm) segment file together with the norms (.nrm) file.
     *
     * '.fnm' receives the field count and then name + flag byte per field;
     * '.nrm' receives the 'NRM' header, a format byte and the norm vector of
     * every indexed field, in field order.
     */
    protected function _dumpFNM()
    {
        $fnmName = $this->_name . '.fnm';
        $nrmName = $this->_name . '.nrm';

        $fnmFile = $this->_directory->createFile($fnmName);
        $fnmFile->writeVInt(count($this->_fields));

        $nrmFile = $this->_directory->createFile($nrmName);
        // Header
        $nrmFile->writeBytes('NRM');
        // Format specifier
        $nrmFile->writeByte((int)0xFF);

        foreach ($this->_fields as $fieldInfo) {
            $fnmFile->writeString($fieldInfo->name);

            // 0x04 (term positions stored with the term vectors) and
            // 0x08 (term offsets stored with the term vectors) are not supported yet
            $fieldBits = ($fieldInfo->isIndexed       ? 0x01 : 0x00)
                       | ($fieldInfo->storeTermVector ? 0x02 : 0x00);
            $fnmFile->writeByte($fieldBits);

            if (!$fieldInfo->isIndexed) {
                continue;
            }

            // Norms of all fields go into the single '.nrm' file. The pre-2.1
            // index mode (one '<segment>.f<fieldNumber>' file per field) is
            // not used.
            $nrmFile->writeBytes($this->_norms[$fieldInfo->name]);
        }

        $this->_files[] = $fnmName;
        $this->_files[] = $nrmName;
    }


    /**
     * Term Dictionary ('.tis') file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Term Dictionary index ('.tii') file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tiiFile = null;

    /**
     * Frequencies ('.frq') file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * Positions ('.prx') file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Number of terms written to the dictionary
     *
     * @var integer
     */
    private $_termCount;


    /**
     * Term most recently written to the '.tis' file
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevTerm;

    /**
     * Term info most recently written to the '.tis' file
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevTermInfo;

    /**
     * Term most recently written to the '.tii' file
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevIndexTerm;

    /**
     * Term info most recently written to the '.tii' file
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevIndexTermInfo;

    /**
     * '.tis' file position recorded with the last index entry
     *
     * @var integer
     */
    private $_lastIndexPosition;

    /**
     * Create dictionary, frequency and positions files and write necessary headers
     *
     * Also resets the "previous term" state and the term counter used by
     * addTerm().
     */
    public function initializeDictionaryFiles()
    {
        // Term dictionary: format marker, placeholder for the term count
        // (patched by closeDictionaryFiles()), then the three interval settings
        $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
        $tisFile = $this->_tisFile;
        $tisFile->writeInt((int)0xFFFFFFFD);
        $tisFile->writeLong(0 /* dummy data for terms count */);
        $tisFile->writeInt(self::$indexInterval);
        $tisFile->writeInt(self::$skipInterval);
        $tisFile->writeInt(self::$maxSkipLevels);

        // Term dictionary index: same header layout
        $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
        $tiiFile = $this->_tiiFile;
        $tiiFile->writeInt((int)0xFFFFFFFD);
        $tiiFile->writeLong(0 /* dummy data for terms count */);
        $tiiFile->writeInt(self::$indexInterval);
        $tiiFile->writeInt(self::$skipInterval);
        $tiiFile->writeInt(self::$maxSkipLevels);

        // Initial (empty term) entry of the dictionary index
        $tiiFile->writeVInt(0);                  // prefix length
        $tiiFile->writeString('');               // suffix
        $tiiFile->writeInt((int)0xFFFFFFFF);     // field number
        $tiiFile->writeByte((int)0x0F);
        $tiiFile->writeVInt(0);                  // DocFreq
        $tiiFile->writeVInt(0);                  // FreqDelta
        $tiiFile->writeVInt(0);                  // ProxDelta
        $tiiFile->writeVInt(24);                 // IndexDelta

        $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
        $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');

        foreach (array('.tis', '.tii', '.frq', '.prx') as $extension) {
            $this->_files[] = $this->_name . $extension;
        }

        // Nothing has been written yet
        $this->_termCount         = 0;
        $this->_lastIndexPosition = 24;
        $this->_prevTerm          = null;
        $this->_prevTermInfo      = null;
        $this->_prevIndexTerm     = null;
        $this->_prevIndexTermInfo = null;
    }

    /**
     * Add term
     *
     * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ...
)
     *
     * Appends the term's postings to the frequency and positions files,
     * writes the term to the term dictionary and, for every
     * self::$indexInterval-th term, also to the term dictionary index.
     *
     * @param Zend_Search_Lucene_Index_Term $termEntry
     * @param array $termDocs
     */
    public function addTerm($termEntry, $termDocs)
    {
        // Start positions of this term's data, recorded in its TermInfo
        $freqPointer = $this->_frqFile->tell();
        $proxPointer = $this->_prxFile->tell();

        $prevDoc = 0;
        foreach ($termDocs as $docId => $termPositions) {
            // The doc delta is shifted left by one bit; the low bit set means
            // "no explicit frequency value follows"
            $docDelta = ($docId - $prevDoc)*2;
            $prevDoc  = $docId;

            $termFreq = count($termPositions);
            if ($termFreq > 1) {
                $this->_frqFile->writeVInt($docDelta);
                $this->_frqFile->writeVInt($termFreq);
            } else {
                $this->_frqFile->writeVInt($docDelta + 1);
            }

            // Positions are stored as deltas from the previous position
            $prevPosition = 0;
            foreach ($termPositions as $position) {
                $this->_prxFile->writeVInt($position - $prevPosition);
                $prevPosition = $position;
            }
        }

        $docFreq = count($termDocs);
        if ($docFreq >= self::$skipInterval) {
            /**
             * @todo Write Skip Data to a freq file.
             * It's not used now, but make index more optimal
             */
            $skipOffset = $this->_frqFile->tell() - $freqPointer;
        } else {
            $skipOffset = 0;
        }

        // The dictionary stores the field number instead of the field name
        $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
                                                  $this->_fields[$termEntry->field]->number);
        $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq,
                                                          $freqPointer, $proxPointer, $skipOffset);

        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

        if (($this->_termCount + 1) % self::$indexInterval == 0) {
            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

            // Index entries end with the '.tis' position delta
            $indexPosition = $this->_tisFile->tell();
            $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
            $this->_lastIndexPosition = $indexPosition;
        }
        $this->_termCount++;
    }

    /**
     * Close dictionary
     *
     * Replaces the term count placeholders written by
     * initializeDictionaryFiles() with the real values. Offset 4 is used for
     * both files because the placeholder directly follows the first header
     * value (assumes writeInt() emits 4 bytes - TODO confirm against the
     * storage class).
     */
    public function closeDictionaryFiles()
    {
        $this->_tisFile->seek(4);
        $this->_tisFile->writeLong($this->_termCount);

        // addTerm() emits an index entry each time the running term number is
        // a multiple of self::$indexInterval, i.e. floor(termCount / interval)
        // entries; "+ 1" counts the initial empty-term entry written by
        // initializeDictionaryFiles(). The subtraction keeps the division exact
        // so the result stays an integer.
        $indexTermCount = ($this->_termCount - $this->_termCount % self::$indexInterval)
                        / self::$indexInterval
                        + 1;

        $this->_tiiFile->seek(4);
        $this->_tiiFile->writeLong($indexTermCount);
    }


    /**
     * Dump Term Dictionary segment file entry.
* Used to write entry to .tis or .tii files
     *
     * The term text is written as "shared prefix length + suffix" relative to
     * $prevTerm when both terms belong to the same field, pointers are
     * written as deltas relative to $prevTermInfo when it is set. Both
     * "previous" arguments are passed by reference and are updated to the
     * entry just written.
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     * @param Zend_Search_Lucene_Index_Term $prevTerm
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
     */
    protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
    {
        // Defaults: nothing shared, the whole text is the suffix
        $prefixChars = 0;
        $suffix      = $term->text;

        if (isset($prevTerm) && $prevTerm->field == $term->field) {
            $previousText = $prevTerm->text;
            $currentText  = $term->text;

            // Count the identical leading bytes
            $byteLimit    = min(strlen($previousText), strlen($currentText));
            $matchedBytes = 0;
            while ($matchedBytes < $byteLimit
                   && $previousText[$matchedBytes] == $currentText[$matchedBytes]) {
                $matchedBytes++;
            }

            // Convert matched bytes into whole UTF-8 characters
            $prefixBytes = 0;
            while ($prefixBytes < $matchedBytes) {
                $leadByte  = ord($currentText[$prefixBytes]);
                $charBytes = 1;
                if (($leadByte & 0xC0) == 0xC0) {
                    // Lead byte of a multi-byte sequence
                    $charBytes++;
                    if ($leadByte & 0x20) {
                        $charBytes++;
                        if ($leadByte & 0x10) {
                            $charBytes++;
                        }
                    }
                }

                if ($prefixBytes + $charBytes > $matchedBytes) {
                    // Character crosses the matched bytes boundary - it is
                    // not part of the shared prefix
                    break;
                }

                $prefixChars++;
                $prefixBytes += $charBytes;
            }

            $suffix = substr($currentText, $prefixBytes);
        }

        // Prefix length (in characters)
        $dicFile->writeVInt($prefixChars);
        // Suffix
        $dicFile->writeString($suffix);

        // Field number
        $dicFile->writeVInt($term->field);
        // DocFreq (the count of documents which contain the term)
        $dicFile->writeVInt($termInfo->docFreq);

        $prevTerm = $term;

        if (isset($prevTermInfo)) {
            // FreqDelta and ProxDelta relative to the previous entry
            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
        } else {
            // First entry: absolute pointers
            $dicFile->writeVInt($termInfo->freqPointer);
            $dicFile->writeVInt($termInfo->proxPointer);
        }

        // SkipOffset is written only when it is non-zero; addTerm() sets it
        // for terms whose document count is >= self::$skipInterval
        if ($termInfo->skipOffset != 0) {
            $dicFile->writeVInt($termInfo->skipOffset);
        }

        $prevTermInfo = $termInfo;
    }


    /**
     * Generate compound index file
     *
     * Writes a directory of (data offset, file name) entries for every
     * registered segment file, then appends each file's content, patches its
     * offset entry and deletes the separate file.
     */
    protected function _generateCFS()
    {
        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
        $cfsFile->writeVInt(count($this->_files));

        // First pass: directory with zeroed offsets; remember where each
        // offset field lives so it can be patched later
        $offsetSlots = array();
        foreach ($this->_files as $fileName) {
            $offsetSlots[$fileName] = $cfsFile->tell();
            $cfsFile->writeLong(0); // write dummy data
            $cfsFile->writeString($fileName);
        }

        // Second pass: copy file contents
        foreach ($this->_files as $fileName) {
            // Current end of the compound file is where this file's data starts
            $dataStart = $cfsFile->tell();

            // Patch the directory entry, then return to the end of file
            $cfsFile->seek($offsetSlots[$fileName]);
            $cfsFile->writeLong($dataStart);
            $cfsFile->seek($dataStart);

            $dataFile = $this->_directory->getFileObject($fileName);

            // Copy in chunks of at most 128Kb
            for ($remaining = $this->_directory->fileLength($fileName);
                 $remaining > 0;
                 $remaining -= strlen($chunk)) {
                $chunk = $dataFile->readBytes(min($remaining, 131072 /*128Kb*/));
                $cfsFile->writeBytes($chunk);
            }

            $this->_directory->deleteFile($fileName);
        }
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo
     */
    abstract public function close();
}