block_search_softlock)){
    require_once 'Zend/Search/Lucene/SoftLockManager.php';
} else {
    require_once 'Zend/Search/Lucene/LockManager.php';
}

/**
 * Read access to a single index segment: field infos, term dictionary,
 * term frequencies/positions, norms and the deleted documents list.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_SegmentInfo
{
    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    private $_docCount;

    /**
     * Segment name
     *
     * @var string
     */
    private $_name;

    /**
     * Term Dictionary Index
     *
     * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
     * of performance considerations)
     * [0] -> $termFieldNum
     * [1] -> $termValue
     *
     * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
     *
     * @var array
     */
    private $_termDictionary;

    /**
     * Term Dictionary Index TermInfos
     *
     * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
     * of performance considerations)
     * [0] -> $docFreq
     * [1] -> $freqPointer
     * [2] -> $proxPointer
     * [3] -> $skipOffset
     * [4] -> $indexPointer
     *
     * @var array
     */
    private $_termDictionaryInfos;

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
     *
     * @var array
     */
    private $_fields;

    /**
     * Field positions in a dictionary.
     * (Term dictionary contains fields ordered by names)
     *
     * @var array
     */
    private $_fieldsDicPositions;

    /**
     * Associative array where the key is the file name and the value is data offset
     * in a compound segment file (.cfs).
     *
     * @var array
     */
    private $_segFiles;

    /**
     * Associative array where the key is the file name and the value is file size
     * within a compound segment file (.cfs).
     *
     * @var array
     */
    private $_segFileSizes;

    /**
     * Delete file generation number
     *
     * -1 means 'there is no delete file'
     *  0 means pre-2.1 format delete file
     *  X specifies used delete file
     *
     * @var integer
     */
    private $_delGen;

    /**
     * Segment has single norms file
     *
     * If true then one .nrm file is used for all fields
     * Otherwise .fN files are used
     *
     * @var boolean
     */
    private $_hasSingleNormFile;

    /**
     * Use compound segment file (*.cfs) to collect all other segment files
     * (excluding .del files)
     *
     * @var boolean
     */
    private $_isCompound;

    /**
     * Directory the segment files are read from / written to.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    private $_directory;

    /**
     * Normalization factors.
     * An array fieldNum => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    private $_norms = array();

    /**
     * List of deleted documents.
     * bitset if bitset extension is loaded or array (docId => 1) otherwise.
     * null means that the segment has no deletions.
     *
     * @var mixed
     */
    private $_deleted = null;

    /**
     * $this->_deleted update flag
     *
     * @var boolean
     */
    private $_deletedDirty = false;


    /**
     * Zend_Search_Lucene_Index_SegmentInfo constructor
     *
     * Reads the compound file table (if the segment is compound), the field
     * infos (.fnm) and the deleted documents list (.del) of the segment.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string  $name
     * @param integer $docCount
     * @param integer $delGen            -1: no delete file, 0: pre-2.1 delete file, X: delete file generation
     * @param boolean $hasSingleNormFile
     * @param boolean $isCompound        null means "detect by trying to open the .cfs file"
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name, $docCount, $delGen = 0, $hasSingleNormFile = false, $isCompound = null)
    {
        $this->_directory         = $directory;
        $this->_name              = $name;
        $this->_docCount          = $docCount;
        $this->_hasSingleNormFile = $hasSingleNormFile;
        $this->_delGen            = $delGen;
        $this->_termDictionary    = null;

        if (!is_null($isCompound)) {
            $this->_isCompound = $isCompound;
        } else {
            // It's a pre-2.1 segment
            // detect if it uses compound file
            $this->_isCompound = true;

            try {
                // Try to open compound file
                $this->_directory->getFileObject($name . '.cfs');
            } catch (Zend_Search_Lucene_Exception $e) {
                if (strpos($e->getMessage(), 'is not readable') !== false) {
                    // Compound file is not found or is not readable
                    $this->_isCompound = false;
                } else {
                    throw $e;
                }
            }
        }

        $this->_segFiles     = array();
        $this->_segFileSizes = array();
        if ($this->_isCompound) {
            $cfsFile = $this->_directory->getFileObject($name . '.cfs');
            $segFilesCount = $cfsFile->readVInt();

            for ($count = 0; $count < $segFilesCount; $count++) {
                $dataOffset = $cfsFile->readLong();
                if ($count != 0) {
                    // Size of the previous entry is the distance between two consecutive offsets
                    $this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles);
                }
                $fileName = $cfsFile->readString();
                $this->_segFiles[$fileName] = $dataOffset;
            }
            if ($count != 0) {
                // The last entry lasts up to the end of the compound file
                $this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . '.cfs') - $dataOffset;
            }
        }

        $fnmFile = $this->openCompoundFile('.fnm');
        $fieldsCount = $fnmFile->readVInt();
        $fieldNames = array();
        $fieldNums  = array();
        $this->_fields = array();
        for ($count = 0; $count < $fieldsCount; $count++) {
            $fieldName = $fnmFile->readString();
            $fieldBits = $fnmFile->readByte();
            $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
                                                                            $fieldBits & 1,
                                                                            $count,
                                                                            $fieldBits & 2 );
            if ($fieldBits & 0x10) {
                // norms are omitted for the indexed field
                $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
            }

            $fieldNums[$count]  = $count;
            $fieldNames[$count] = $fieldName;
        }
        // Sort field numbers by field name to get field positions used by the term dictionary
        array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
        $this->_fieldsDicPositions = array_flip($fieldNums);

        if ($this->_delGen == -1) {
            // There is no delete file for this segment
            // Do nothing
        } else if ($this->_delGen == 0) {
            // It's a segment with pre-2.1 format delete file
            // Try to find delete file
            try {
                // '.del' files always stored in a separate file
                // Segment compound is not used
                $delFile = $this->_directory->getFileObject($this->_name . '.del');

                $this->_deleted = $this->_readDeletedBitVector($delFile, $delFile->readInt());
            } catch (Zend_Search_Exception $e) {
                if (strpos($e->getMessage(), 'is not readable') === false ) {
                    throw $e;
                }
                // There is no delete file
                // Do nothing
            }
        } else {
            // It's 2.1+ format delete file
            $delFile = $this->_directory->getFileObject($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');

            $format = $delFile->readInt();

            if ($format == (int)0xFFFFFFFF) {
                /**
                 * @todo Implement support of DGaps delete file format.
                 * See Lucene file format for details - http://lucene.apache.org/java/docs/fileformats.html#Deleted%20Documents
                 */
                throw new Zend_Search_Lucene_Exception('DGaps delete file format is not supported. Optimize index to use it with Zend_Search_Lucene');
            }

            // $format is actually the bit vector size
            $this->_deleted = $this->_readDeletedBitVector($delFile, $format);
        }
    }

    /**
     * Reads the remainder of a delete file (everything after its first integer)
     * and converts it into the internal deleted documents representation.
     *
     * @param object  $delFile    file object positioned right after the first integer of the delete file
     * @param integer $vectorSize value of the first integer of the delete file (bit vector size in bits)
     * @return mixed  raw byte string if the bitset extension is loaded, array (docId => 1) otherwise
     */
    private function _readDeletedBitVector($delFile, $vectorSize)
    {
        $byteCount = (int) ceil($vectorSize/8);
        $bitCount  = $delFile->readInt();

        if ($bitCount == 0) {
            $delBytes = '';
        } else {
            $delBytes = $delFile->readBytes($byteCount);
        }

        if (extension_loaded('bitset')) {
            return $delBytes;
        }

        $deleted = array();
        // Iterate over the bytes actually read: $delBytes is empty when no bits are set
        $bytesRead = strlen($delBytes);
        for ($count = 0; $count < $bytesRead; $count++) {
            $byte = ord($delBytes[$count]);
            for ($bit = 0; $bit < 8; $bit++) {
                if ($byte & (1<<$bit)) {
                    $deleted[$count*8 + $bit] = 1;
                }
            }
        }

        return $deleted;
    }

    /**
     * Opens index file stored within compound index file
     *
     * For non-compound segments the separate file is opened. For compound
     * segments the .cfs file object is returned, positioned at the start of
     * the requested file data.
     *
     * @param string $extension
     * @param boolean $shareHandler
     * @throws Zend_Search_Lucene_Exception
     * @return Zend_Search_Lucene_Storage_File
     */
    public function openCompoundFile($extension, $shareHandler = true)
    {
        $filename = $this->_name . $extension;

        if (!$this->_isCompound) {
            return $this->_directory->getFileObject($filename, $shareHandler);
        }

        if( !isset($this->_segFiles[$filename]) ) {
            throw new Zend_Search_Lucene_Exception('Segment compound file doesn\'t contain '
                                       . $filename . ' file.' );
        }

        $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
        $file->seek($this->_segFiles[$filename]);
        return $file;
    }

    /**
     * Get compound file length
     *
     * A separate file in the directory takes precedence over an entry of the
     * compound file.
     *
     * @param string $extension
     * @throws Zend_Search_Lucene_Exception
     * @return integer
     */
    public function compoundFileLength($extension)
    {
        $filename = $this->_name . $extension;

        // Try to get common file first
        if ($this->_directory->fileExists($filename)) {
            return $this->_directory->fileLength($filename);
        }

        if( !isset($this->_segFileSizes[$filename]) ) {
            throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                       . $filename . ' file.' );
        }

        return $this->_segFileSizes[$filename];
    }

    /**
     * Returns field index or -1 if field is not found
     *
     * @param string $fieldName
     * @return integer
     */
    public function getFieldNum($fieldName)
    {
        foreach( $this->_fields as $field ) {
            if( $field->name == $fieldName ) {
                return $field->number;
            }
        }

        return -1;
    }

    /**
     * Returns field info for specified field
     *
     * @param integer $fieldNum
     * @return Zend_Search_Lucene_Index_FieldInfo
     */
    public function getField($fieldNum)
    {
        return $this->_fields[$fieldNum];
    }

    /**
     * Returns array of fields (fieldName => fieldName).
     * if $indexed parameter is true, then returns only indexed fields.
     *
     * @param boolean $indexed
     * @return array
     */
    public function getFields($indexed = false)
    {
        $result = array();
        foreach( $this->_fields as $field ) {
            if( (!$indexed) || $field->isIndexed ) {
                $result[ $field->name ] = $field->name;
            }
        }
        return $result;
    }

    /**
     * Returns array of FieldInfo objects.
     *
     * @return array
     */
    public function getFieldInfos()
    {
        return $this->_fields;
    }

    /**
     * Returns actual deletions file generation number.
     *
     * @return integer
     */
    public function getDelGen()
    {
        return $this->_delGen;
    }

    /**
     * Returns the total number of documents in this segment (including deleted documents).
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }

    /**
     * Returns number of deleted documents.
     *
     * @return integer
     */
    private function _deletedCount()
    {
        if ($this->_deleted === null) {
            return 0;
        }

        if (extension_loaded('bitset')) {
            return count(bitset_to_array($this->_deleted));
        } else {
            return count($this->_deleted);
        }
    }

    /**
     * Returns the total number of non-deleted documents in this segment.
     *
     * @return integer
     */
    public function numDocs()
    {
        if ($this->hasDeletions()) {
            return $this->_docCount - $this->_deletedCount();
        } else {
            return $this->_docCount;
        }
    }

    /**
     * Get field position in a fields dictionary
     *
     * @param integer $fieldNum
     * @return integer
     */
    private function _getFieldPosition($fieldNum)
    {
        // Treat values which are not in a translation table as a 'direct value'
        return isset($this->_fieldsDicPositions[$fieldNum]) ?
                           $this->_fieldsDicPositions[$fieldNum] : $fieldNum;
    }

    /**
     * Return segment name
     *
     * @return string
     */
    public function getName()
    {
        return $this->_name;
    }


    /**
     * TermInfo cache (termKey => TermInfo or null), ordered from least to most recently used
     *
     * Size is 1024.
     * Numbers are used instead of class constants because of performance considerations
     *
     * @var array
     */
    private $_termInfoCache = array();

    /**
     * Drops the oldest cache entries until 768 entries are left.
     */
    private function _cleanUpTermInfoCache()
    {
        // Clean oldest term infos
        foreach ($this->_termInfoCache as $key => $termInfo) {
            unset($this->_termInfoCache[$key]);

            // leave 768 last used term infos
            if (count($this->_termInfoCache) <= 768) {
                break;
            }
        }
    }

    /**
     * Stores a lookup result in the TermInfo cache and trims the cache
     * when it reaches its size limit.
     *
     * @param string $termKey
     * @param Zend_Search_Lucene_Index_TermInfo|null $termInfo
     */
    private function _cacheTermInfo($termKey, $termInfo)
    {
        $this->_termInfoCache[$termKey] = $termInfo;

        // '>=' (not '==') so that the limit can never be stepped over
        if (count($this->_termInfoCache) >= 1024) {
            $this->_cleanUpTermInfoCache();
        }
    }

    /**
     * Load terms dictionary index
     *
     * Uses the serialized copy (.sti file) if it exists and can be
     * unserialized; otherwise parses the .tii file and writes a new .sti file.
     *
     * @throws Zend_Search_Lucene_Exception
     */
    private function _loadDictionaryIndex()
    {
        // Check, if index is already serialized
        if ($this->_directory->fileExists($this->_name . '.sti')) {
            // Load serialized dictionary index data
            $stiFile = $this->_directory->getFileObject($this->_name . '.sti');
            $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti'));

            // Load dictionary index data
            // NOTE(review): unserialize() is applied to file content as is; this is only safe
            // while the index directory is not writable by untrusted parties.
            if (($unserializedData = @unserialize($stiFileData)) !== false) {
                list($this->_termDictionary, $this->_termDictionaryInfos) = $unserializedData;
                return;
            }
        }

        // Load data from .tii file and generate .sti file

        // Prefetch dictionary index data
        $tiiFile = $this->openCompoundFile('.tii');
        $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));

        // Load dictionary index data
        list($this->_termDictionary, $this->_termDictionaryInfos) =
                    Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);

        $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
        $stiFile = $this->_directory->createFile($this->_name . '.sti');
        $stiFile->writeBytes($stiFileData);
    }

    /**
     * Scans terms dictionary and returns term info
     *
     * Returns null if the term field or the term itself is not present in the segment.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @throws Zend_Search_Lucene_Exception
     * @return Zend_Search_Lucene_Index_TermInfo|null
     */
    public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
    {
        $termKey = $term->key();
        if (isset($this->_termInfoCache[$termKey])) {
            $termInfo = $this->_termInfoCache[$termKey];

            // Move termInfo to the end of cache
            unset($this->_termInfoCache[$termKey]);
            $this->_termInfoCache[$termKey] = $termInfo;

            return $termInfo;
        }


        if ($this->_termDictionary === null) {
            $this->_loadDictionaryIndex();
        }

        $searchField = $this->getFieldNum($term->field);

        if ($searchField == -1) {
            return null;
        }
        $searchDicField = $this->_getFieldPosition($searchField);

        // search for appropriate value in dictionary
        $lowIndex = 0;
        $highIndex = count($this->_termDictionary)-1;
        while ($highIndex >= $lowIndex) {
            $mid = ($highIndex + $lowIndex) >> 1;
            $midTerm = $this->_termDictionary[$mid];

            $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
            $delta = $searchDicField - $fieldNum;
            if ($delta == 0) {
                $delta = strcmp($term->text, $midTerm[1] /* text */);
            }

            if ($delta < 0) {
                $highIndex = $mid-1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid+1;
            } else {
                // We got it!
                $a = $this->_termDictionaryInfos[$mid];
                $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

                // Put loaded termInfo into cache
                $this->_cacheTermInfo($termKey, $termInfo);

                return $termInfo;
            }
        }

        if ($highIndex == -1) {
            // Term is out of the dictionary range
            return null;
        }

        // Closest dictionary index entry preceding the searched term
        $prevPosition = $highIndex;
        $prevTerm     = $this->_termDictionary[$prevPosition];
        $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

        $tisFile = $this->openCompoundFile('.tis');
        $tiVersion = $tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
        }

        $termCount     = $tisFile->readLong();
        $indexInterval = $tisFile->readInt();
        $skipInterval  = $tisFile->readInt();
        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            // maxSkipLevels: value is not used, but it has to be read to pass the header
            $tisFile->readInt();
        }

        $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR);

        $termValue    = $prevTerm[1] /* text */;
        $termFieldNum = $prevTerm[0] /* field */;
        $docFreq      = $prevTermInfo[0] /* docFreq */;
        $freqPointer  = $prevTermInfo[1] /* freqPointer */;
        $proxPointer  = $prevTermInfo[2] /* proxPointer */;
        $skipOffset   = $prevTermInfo[3] /* skipOffset */;
        for ($count = $prevPosition*$indexInterval + 1;
             $count <= $termCount &&
             ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
              ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
               strcmp($termValue, $term->text) < 0) );
             $count++) {
            $termPrefixLength = $tisFile->readVInt();
            $termSuffix       = $tisFile->readString();
            $termFieldNum     = $tisFile->readVInt();
            $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

            $docFreq      = $tisFile->readVInt();
            $freqPointer += $tisFile->readVInt();
            $proxPointer += $tisFile->readVInt();
            if( $docFreq >= $skipInterval ) {
                $skipOffset = $tisFile->readVInt();
            } else {
                $skipOffset = 0;
            }
        }

        if ($termFieldNum == $searchField && $termValue == $term->text) {
            $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
        } else {
            $termInfo = null;
        }

        // Put loaded termInfo into cache
        $this->_cacheTermInfo($termKey, $termInfo);

        return $termInfo;
    }

    /**
     * Returns term freqs array.
     * Result array structure: array(docId => freq, ...)
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $shift value added to every document id in the result
     * @return array
     */
    public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0)
    {
        $termInfo = $this->getTermInfo($term);

        if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
            return array();
        }

        $frqFile = $this->openCompoundFile('.frq');
        $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
        $result = array();
        $docId = 0;

        for ($count = 0; $count < $termInfo->docFreq; $count++) {
            $docDelta = $frqFile->readVInt();
            if ($docDelta % 2 == 1) {
                // Odd value: frequency is 1 and is not stored separately
                $docId += ($docDelta-1)/2;
                $result[$shift + $docId] = 1;
            } else {
                $docId += $docDelta/2;
                $result[$shift + $docId] = $frqFile->readVInt();
            }
        }

        return $result;
    }

    /**
     * Returns term positions array.
     * Result array structure: array(docId => array(pos1, pos2, ...), ...)
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $shift value added to every document id in the result
     * @return array
     */
    public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0)
    {
        $termInfo = $this->getTermInfo($term);

        if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
            return array();
        }

        $frqFile = $this->openCompoundFile('.frq');
        $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
        $freqs = array();
        $docId = 0;

        for ($count = 0; $count < $termInfo->docFreq; $count++) {
            $docDelta = $frqFile->readVInt();
            if ($docDelta % 2 == 1) {
                // Odd value: frequency is 1 and is not stored separately
                $docId += ($docDelta-1)/2;
                $freqs[$docId] = 1;
            } else {
                $docId += $docDelta/2;
                $freqs[$docId] = $frqFile->readVInt();
            }
        }

        $result = array();
        $prxFile = $this->openCompoundFile('.prx');
        $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
        foreach ($freqs as $docId => $freq) {
            $termPosition = 0;
            $positions = array();

            for ($count = 0; $count < $freq; $count++ ) {
                // Positions are stored as deltas
                $termPosition += $prxFile->readVInt();
                $positions[] = $termPosition;
            }

            $result[$shift + $docId] = $positions;
        }

        return $result;
    }

    /**
     * Load normalization factors from an index file
     *
     * If the segment has a single norms file, then norms of all indexed
     * fields are loaded at once.
     *
     * @param integer $fieldNum
     * @throws Zend_Search_Lucene_Exception
     */
    private function _loadNorm($fieldNum)
    {
        if ($this->_hasSingleNormFile) {
            $normfFile = $this->openCompoundFile('.nrm');

            $header              = $normfFile->readBytes(3);
            $headerFormatVersion = $normfFile->readByte();

            if ($header != 'NRM'  ||  $headerFormatVersion != (int)0xFF) {
                throw new Zend_Search_Lucene_Exception('Wrong norms file format.');
            }

            foreach ($this->_fields as $num => $fieldInfo) {
                if ($fieldInfo->isIndexed) {
                    $this->_norms[$num] = $normfFile->readBytes($this->_docCount);
                }
            }
        } else {
            $fFile = $this->openCompoundFile('.f' . $fieldNum);
            $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
        }
    }

    /**
     * Returns normalization factor for specified documents
     *
     * Returns null if the field is not present in the segment or is not indexed.
     *
     * @param integer $id
     * @param string $fieldName
     * @return float|null
     */
    public function norm($id, $fieldName)
    {
        $fieldNum = $this->getFieldNum($fieldName);

        if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) {
            return null;
        }

        if (!isset($this->_norms[$fieldNum])) {
            $this->_loadNorm($fieldNum);
        }

        return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
    }

    /**
     * Returns norm vector, encoded in a byte string
     *
     * For an unknown or non-indexed field a vector filled with the encoded
     * default similarity lengthNorm($fieldName, 0) value is returned.
     *
     * @param string $fieldName
     * @return string
     */
    public function normVector($fieldName)
    {
        $fieldNum = $this->getFieldNum($fieldName);

        if ($fieldNum == -1  ||  !($this->_fields[$fieldNum]->isIndexed)) {
            $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();

            return str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
                              $this->_docCount);
        }

        if (!isset($this->_norms[$fieldNum])) {
            $this->_loadNorm($fieldNum);
        }

        return $this->_norms[$fieldNum];
    }


    /**
     * Returns true if any documents have been deleted from this index segment.
     *
     * @return boolean
     */
    public function hasDeletions()
    {
        return $this->_deleted !== null;
    }


    /**
     * Returns 1 if segment has single norms file, 0 otherwise.
     *
     * @return integer
     */
    public function hasSingleNormFile()
    {
        return $this->_hasSingleNormFile ? 1 : 0;
    }

    /**
     * Returns 1 if segment is stored using compound segment file, 0 otherwise.
     *
     * @return integer
     */
    public function isCompound()
    {
        return $this->_isCompound ? 1 : 0;
    }

    /**
     * Deletes a document from the index segment.
     * $id is an internal document id
     *
     * The change is kept in memory until writeChanges() is called.
     *
     * @param integer $id
     */
    public function delete($id)
    {
        $this->_deletedDirty = true;

        if (extension_loaded('bitset')) {
            if ($this->_deleted === null) {
                $this->_deleted = bitset_empty($id);
            }
            bitset_incl($this->_deleted, $id);
        } else {
            if ($this->_deleted === null) {
                $this->_deleted = array();
            }

            $this->_deleted[$id] = 1;
        }
    }

    /**
     * Checks, that document is deleted
     *
     * @param integer $id
     * @return boolean
     */
    public function isDeleted($id)
    {
        if ($this->_deleted === null) {
            return false;
        }

        if (extension_loaded('bitset')) {
            return bitset_in($this->_deleted, $id);
        } else {
            return isset($this->_deleted[$id]);
        }
    }


    /**
     * Write changes if it's necessary.
     *
     * Stores the deleted documents list into a new generation of the segment
     * delete file. Does nothing if no document has been deleted since the
     * segment was loaded or since the previous call.
     */
    public function writeChanges()
    {
        if (!$this->_deletedDirty) {
            return;
        }

        if (extension_loaded('bitset')) {
            $delBytes = $this->_deleted;
            $bitCount = count(bitset_to_array($delBytes));
        } else {
            $byteCount = (int) floor($this->_docCount/8) + 1;
            $delBytes = str_repeat(chr(0), $byteCount);
            for ($count = 0; $count < $byteCount; $count++) {
                $byte = 0;
                for ($bit = 0; $bit < 8; $bit++) {
                    if (isset($this->_deleted[$count*8 + $bit])) {
                        $byte |= (1<<$bit);
                    }
                }
                $delBytes[$count] = chr($byte);
            }
            $bitCount = count($this->_deleted);
        }


        // Get new generation number
        Zend_Search_Lucene_LockManager::obtainWriteLock($this->_directory);

        try {
            $delFileList = array();
            // Segment name is quoted to be matched literally
            $delFilePattern = '/^' . preg_quote($this->_name, '/') . '_([a-zA-Z0-9]+)\.del$/i';
            foreach ($this->_directory->fileList() as $file) {
                if ($file == $this->_name . '.del') {
                    // Matches <segment_name>.del file name
                    $delFileList[] = 0;
                } else if (preg_match($delFilePattern, $file, $matches)) {
                    // Matches <segment_name>_NNN.del file names
                    $delFileList[] = (int)base_convert($matches[1], 36, 10);
                }
            }

            if (count($delFileList) == 0) {
                // There is no deletions file for current segment in the directory
                // Set deletions file generation number to 1
                $this->_delGen = 1;
            } else {
                // There are some deletions files for current segment in the directory
                // Set deletions file generation number to the highest + 1
                $this->_delGen = max($delFileList) + 1;
            }

            $delFile = $this->_directory->createFile($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');
        } catch (Exception $e) {
            // Do not leave the index write-locked if the directory operations fail
            Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
            throw $e;
        }

        Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);

        $delFile->writeInt($this->_docCount);
        $delFile->writeInt($bitCount);
        $delFile->writeBytes($delBytes);

        $this->_deletedDirty = false;
    }


    /**
     * Term Dictionary File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Actual offset of the .tis file data
     *
     * @var integer
     */
    private $_tisFileOffset;

    /**
     * Frequencies File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * Actual offset of the .frq file data
     *
     * @var integer
     */
    private $_frqFileOffset;

    /**
     * Positions File object for stream like terms reading
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Actual offset of the .prx file data
     *
     * @var integer
     */
    private $_prxFileOffset;


    /**
     * Number of terms left to read from the terms stream
     *
     * @var integer
     */
    private $_termCount = 0;

    /**
     * Overall number of terms in term stream
     *
     * @var integer
     */
    private $_termNum = 0;

    /**
     * Segment index interval
     *
     * @var integer
     */
    private $_indexInterval;

    /**
     * Segment skip interval
     *
     * @var integer
     */
    private $_skipInterval;

    /**
     * Last TermInfo in a terms stream
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_lastTermInfo = null;

    /**
     * Last Term in a terms stream
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_lastTerm = null;

    /**
     * Map of the document IDs
     * Used to get new docID after removing deleted documents.
     * It's not very effective from memory usage point of view,
     * but much faster than other methods
     *
     * @var array|null
     */
    private $_docMap = null;

    /**
     * An array of all term positions in the documents.
     * Array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * Is set to null if term positions loading has to be skipped
     *
     * @var array|null
     */
    private $_lastTermPositions;


    /**
     * Terms scan mode
     *
     * Values:
     *
     * self::SM_TERMS_ONLY - terms are scanned, no additional info is retrieved
     * self::SM_FULL_INFO  - terms are scanned, frequency and position info is retrieved
     * self::SM_MERGE_INFO - terms are scanned, frequency and position info is retrieved
     *                       document numbers are compacted (shifted if segment has deleted documents)
     *
     * @var integer
     */
    private $_termsScanMode;

    /** Scan modes */
    const SM_TERMS_ONLY = 0;    // terms are scanned, no additional info is retrieved
    const SM_FULL_INFO  = 1;    // terms are scanned, frequency and position info is retrieved
    const SM_MERGE_INFO = 2;    // terms are scanned, frequency and position info is retrieved
                                // document numbers are compacted (shifted if segment contains deleted documents)

    /**
     * Reset terms stream
     *
     * $startId - id for the first document
     * $mode    - terms scan mode (one of the self::SM_* constants)
     *
     * Returns start document id for the next segment
     *
     * @param integer $startId
     * @param integer $mode
     * @throws Zend_Search_Lucene_Exception
     * @return integer
     */
    public function reset($startId = 0, $mode = self::SM_TERMS_ONLY)
    {
        $this->_tisFile = $this->openCompoundFile('.tis', false);
        $this->_tisFileOffset = $this->_tisFile->tell();

        $tiVersion = $this->_tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
        }

        $this->_termCount     =
              $this->_termNum = $this->_tisFile->readLong(); // Read terms count
        $this->_indexInterval = $this->_tisFile->readInt();  // Read Index interval
        $this->_skipInterval  = $this->_tisFile->readInt();  // Read skip interval
        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            // maxSkipLevels: value is not used, but it has to be read to pass the header
            $this->_tisFile->readInt();
        }

        $this->_frqFile = null;
        $this->_prxFile = null;
        $this->_docMap  = array();

        $this->_lastTerm          = new Zend_Search_Lucene_Index_Term('', -1);
        $this->_lastTermInfo      = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
        $this->_lastTermPositions = null;

        $this->_termsScanMode = $mode;

        switch ($mode) {
            case self::SM_TERMS_ONLY:
                // Do nothing
                break;

            case self::SM_FULL_INFO:
                // break intentionally omitted
            case self::SM_MERGE_INFO:
                $this->_frqFile = $this->openCompoundFile('.frq', false);
                $this->_frqFileOffset = $this->_frqFile->tell();

                $this->_prxFile = $this->openCompoundFile('.prx', false);
                $this->_prxFileOffset = $this->_prxFile->tell();

                for ($count = 0; $count < $this->_docCount; $count++) {
                    if (!$this->isDeleted($count)) {
                        // In merge mode ids are compacted: count($this->_docMap) is the number
                        // of non-deleted documents registered so far
                        $this->_docMap[$count] = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $count);
                    }
                }
                break;

            default:
                throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
        }


        $this->nextTerm();
        return $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $this->_docCount);
    }


    /**
     * Loads positions of the current term ($this->_lastTermInfo) from the
     * .frq and .prx stream files into $this->_lastTermPositions.
     *
     * Documents missing in $this->_docMap are skipped; document ids are
     * translated through $this->_docMap.
     */
    private function _loadLastTermPositions()
    {
        $this->_lastTermPositions = array();

        $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
        $freqs = array();
        $docId = 0;
        for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) {
            $docDelta = $this->_frqFile->readVInt();
            if( $docDelta % 2 == 1 ) {
                // Odd value: frequency is 1 and is not stored separately
                $docId += ($docDelta-1)/2;
                $freqs[ $docId ] = 1;
            } else {
                $docId += $docDelta/2;
                $freqs[ $docId ] = $this->_frqFile->readVInt();
            }
        }

        $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
        foreach ($freqs as $docId => $freq) {
            $termPosition = 0;
            $positions = array();

            for ($count = 0; $count < $freq; $count++ ) {
                // Positions are stored as deltas
                $termPosition += $this->_prxFile->readVInt();
                $positions[] = $termPosition;
            }

            if (isset($this->_docMap[$docId])) {
                $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
            }
        }
    }


    /**
     * Skip terms stream up to specified term prefix.
     *
     * Prefix contains fully specified field info and portion of searched term
     *
     * @param Zend_Search_Lucene_Index_Term $prefix
     * @throws Zend_Search_Lucene_Exception
     */
    public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
    {
        if ($this->_termDictionary === null) {
            $this->_loadDictionaryIndex();
        }

        $searchField = $this->getFieldNum($prefix->field);

        if ($searchField == -1) {
            /**
             * Field is not presented in this segment
             * Go to the end of dictionary
             */
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;

            $this->_lastTerm          = null;
            $this->_lastTermInfo      = null;
            $this->_lastTermPositions = null;

            return;
        }
        $searchDicField = $this->_getFieldPosition($searchField);

        // search for appropriate value in dictionary
        $lowIndex = 0;
        $highIndex = count($this->_termDictionary)-1;
        while ($highIndex >= $lowIndex) {
            $mid = ($highIndex + $lowIndex) >> 1;
            $midTerm = $this->_termDictionary[$mid];

            $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
            $delta = $searchDicField - $fieldNum;
            if ($delta == 0) {
                $delta = strcmp($prefix->text, $midTerm[1] /* text */);
            }

            if ($delta < 0) {
                $highIndex = $mid-1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid+1;
            } else {
                // We have reached term we are looking for
                break;
            }
        }

        if ($highIndex == -1) {
            // Term is out of the dictionary range
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;

            $this->_lastTerm          = null;
            $this->_lastTermInfo      = null;
            $this->_lastTermPositions = null;

            return;
        }

        $prevPosition = $highIndex;
        $prevTerm     = $this->_termDictionary[$prevPosition];
        $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

        if ($this->_tisFile === null) {
            // The end of terms stream is reached and terms dictionary file is closed
            // Perform mini-reset operation
            $this->_tisFile = $this->openCompoundFile('.tis', false);

            if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
                $this->_frqFile = $this->openCompoundFile('.frq', false);
                $this->_prxFile = $this->openCompoundFile('.prx', false);
            }
        }
        $this->_tisFile->seek($this->_tisFileOffset + $prevTermInfo[4], SEEK_SET);

        $this->_lastTerm     = new Zend_Search_Lucene_Index_Term($prevTerm[1] /* text */,
                                                                 ($prevTerm[0] == -1) ? '' : $this->_fields[$prevTerm[0] /* field */]->name);
        $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($prevTermInfo[0] /* docFreq */,
                                                                     $prevTermInfo[1] /* freqPointer */,
                                                                     $prevTermInfo[2] /* proxPointer */,
                                                                     $prevTermInfo[3] /* skipOffset */);
        $this->_termCount  =  $this->_termNum - $prevPosition*$this->_indexInterval;

        if ($highIndex == 0) {
            // skip start entry
            $this->nextTerm();
        } else if ($prefix->field == $this->_lastTerm->field  &&  $prefix->text  == $this->_lastTerm->text) {
            // We got exact match in the dictionary index

            if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
                $this->_loadLastTermPositions();
            }

            return;
        }

        // Search term matching specified prefix
        while ($this->_lastTerm !== null) {
            if ( strcmp($this->_lastTerm->field, $prefix->field) > 0  ||
                 ($prefix->field == $this->_lastTerm->field  &&  strcmp($this->_lastTerm->text, $prefix->text) >= 0) ) {
                // Current term matches or is greater than the pattern
                return;
            }

            $this->nextTerm();
        }
    }


    /**
     * Scans terms dictionary and returns next term
     *
     * Returns null and releases stream resources when the end of the terms
     * stream is reached.
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function nextTerm()
    {
        if ($this->_tisFile === null  ||  $this->_termCount == 0) {
            $this->_lastTerm          = null;
            $this->_lastTermInfo      = null;
            $this->_lastTermPositions = null;
            $this->_docMap            = null;

            // may be necessary for "empty" segment
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;

            return null;
        }

        $termPrefixLength = $this->_tisFile->readVInt();
        $termSuffix       = $this->_tisFile->readString();
        $termFieldNum     = $this->_tisFile->readVInt();
        $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $termPrefixLength) . $termSuffix;

        $this->_lastTerm = new Zend_Search_Lucene_Index_Term($termValue, $this->_fields[$termFieldNum]->name);

        $docFreq     = $this->_tisFile->readVInt();
        // Pointers are stored as deltas to the previous term pointers
        $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
        $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
        if ($docFreq >= $this->_skipInterval) {
            $skipOffset = $this->_tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }

        $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);


        if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
            $this->_loadLastTermPositions();
        }


        $this->_termCount--;
        if ($this->_termCount == 0) {
            $this->_tisFile = null;
            $this->_frqFile = null;
            $this->_prxFile = null;
        }

        return $this->_lastTerm;
    }

    /**
     * Close terms stream
     *
     * Should be used for resources clean up if stream is not read up to the end
     */
    public function closeTermsStream()
    {
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;

        $this->_docMap            = null;
    }


    /**
     * Returns term in current position
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function currentTerm()
    {
        return $this->_lastTerm;
    }


    /**
     * Returns an array of all term positions in the documents.
     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * @return array|null
     */
    public function currentTermPositions()
    {
        return $this->_lastTermPositions;
    }
}