$termValue * [1] -> $termFieldNum * * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos * * @var array */ private $_termDictionary; /** * Term Dictionary Index TermInfos * * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because * of performance considerations) * [0] -> $docFreq * [1] -> $freqPointer * [2] -> $proxPointer * [3] -> $skipOffset * [4] -> $indexPointer * * @var array */ private $_termDictionaryInfos; /** * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment * * @var array */ private $_fields; /** * Field positions in a dictionary. * (Term dictionary contains filelds ordered by names) * * @var array */ private $_fieldsDicPositions; /** * Associative array where the key is the file name and the value is data offset * in a compound segment file (.csf). * * @var array */ private $_segFiles; /** * Associative array where the key is the file name and the value is file size (.csf). * * @var array */ private $_segFileSizes; /** * Delete file generation number * * -2 means autodetect latest delete generation * -1 means 'there is no delete file' * 0 means pre-2.1 format delete file * X specifies used delete file * * @var integer */ private $_delGen; /** * Segment has single norms file * * If true then one .nrm file is used for all fields * Otherwise .fN files are used * * @var boolean */ private $_hasSingleNormFile; /** * Use compound segment file (*.cfs) to collect all other segment files * (excluding .del files) * * @var boolean */ private $_isCompound; /** * File system adapter. * * @var Zend_Search_Lucene_Storage_Directory_Filesystem */ private $_directory; /** * Normalization factors. * An array fieldName => normVector * normVector is a binary string. * Each byte corresponds to an indexed document in a segment and * encodes normalization factor (float value, encoded by * Zend_Search_Lucene_Search_Similarity::encodeNorm()) * * @var array */ private $_norms = array(); /** * List of deleted documents. * bitset if bitset extension is loaded or array otherwise. * * @var mixed */ private $_deleted = null; /** * $this->_deleted update flag * * @var boolean */ private $_deletedDirty = false; /** * True if segment uses shared doc store * * @var boolean */ private $_usesSharedDocStore; /* * Shared doc store options. * It's an assotiative array with the following items: * - 'offset' => $docStoreOffset The starting document in the shared doc store files where this segment's documents begin * - 'segment' => $docStoreSegment The name of the segment that has the shared doc store files. * - 'isCompound' => $docStoreIsCompoundFile True, if compound file format is used for the shared doc store files (.cfx file). */ private $_sharedDocStoreOptions; /** * Zend_Search_Lucene_Index_SegmentInfo constructor * * @param Zend_Search_Lucene_Storage_Directory $directory * @param string $name * @param integer $docCount * @param integer $delGen * @param array|null $docStoreOptions * @param boolean $hasSingleNormFile * @param boolean $isCompound */ public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name, $docCount, $delGen = 0, $docStoreOptions = null, $hasSingleNormFile = false, $isCompound = null) { $this->_directory = $directory; $this->_name = $name; $this->_docCount = $docCount; if ($docStoreOptions !== null) { $this->_usesSharedDocStore = true; $this->_sharedDocStoreOptions = $docStoreOptions; if ($docStoreOptions['isCompound']) { $cfxFile = $this->_directory->getFileObject($docStoreOptions['segment'] . '.cfx'); $cfxFilesCount = $cfxFile->readVInt(); $cfxFiles = array(); $cfxFileSizes = array(); for ($count = 0; $count < $cfxFilesCount; $count++) { $dataOffset = $cfxFile->readLong(); if ($count != 0) { $cfxFileSizes[$fileName] = $dataOffset - end($cfxFiles); } $fileName = $cfxFile->readString(); $cfxFiles[$fileName] = $dataOffset; } if ($count != 0) { $cfxFileSizes[$fileName] = $this->_directory->fileLength($docStoreOptions['segment'] . '.cfx') - $dataOffset; } $this->_sharedDocStoreOptions['files'] = $cfxFiles; $this->_sharedDocStoreOptions['fileSizes'] = $cfxFileSizes; } } $this->_hasSingleNormFile = $hasSingleNormFile; $this->_delGen = $delGen; $this->_termDictionary = null; if ($isCompound !== null) { $this->_isCompound = $isCompound; } else { // It's a pre-2.1 segment or isCompound is set to 'unknown' // Detect if segment uses compound file require_once 'Zend/Search/Lucene/Exception.php'; try { // Try to open compound file $this->_directory->getFileObject($name . '.cfs'); // Compound file is found $this->_isCompound = true; } catch (Zend_Search_Lucene_Exception $e) { if (strpos($e->getMessage(), 'is not readable') !== false) { // Compound file is not found or is not readable $this->_isCompound = false; } else { throw $e; } } } $this->_segFiles = array(); if ($this->_isCompound) { $cfsFile = $this->_directory->getFileObject($name . '.cfs'); $segFilesCount = $cfsFile->readVInt(); for ($count = 0; $count < $segFilesCount; $count++) { $dataOffset = $cfsFile->readLong(); if ($count != 0) { $this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles); } $fileName = $cfsFile->readString(); $this->_segFiles[$fileName] = $dataOffset; } if ($count != 0) { $this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . '.cfs') - $dataOffset; } } $fnmFile = $this->openCompoundFile('.fnm'); $fieldsCount = $fnmFile->readVInt(); $fieldNames = array(); $fieldNums = array(); $this->_fields = array(); for ($count=0; $count < $fieldsCount; $count++) { $fieldName = $fnmFile->readString(); $fieldBits = $fnmFile->readByte(); $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName, $fieldBits & 0x01 /* field is indexed */, $count, $fieldBits & 0x02 /* termvectors are stored */, $fieldBits & 0x10 /* norms are omitted */, $fieldBits & 0x20 /* payloads are stored */); if ($fieldBits & 0x10) { // norms are omitted for the indexed field $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount); } $fieldNums[$count] = $count; $fieldNames[$count] = $fieldName; } array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums); $this->_fieldsDicPositions = array_flip($fieldNums); if ($this->_delGen == -2) { // SegmentInfo constructor is invoked from index writer // Autodetect current delete file generation number $this->_delGen = $this->_detectLatestDelGen(); } // Load deletions $this->_deleted = $this->_loadDelFile(); } /** * Load detetions file * * Returns bitset or an array depending on bitset extension availability * * @return mixed * @throws Zend_Search_Lucene_Exception */ private function _loadDelFile() { if ($this->_delGen == -1) { // There is no delete file for this segment return null; } else if ($this->_delGen == 0) { // It's a segment with pre-2.1 format delete file // Try to load deletions file return $this->_loadPre21DelFile(); } else { // It's 2.1+ format deleteions file return $this->_load21DelFile(); } } /** * Load pre-2.1 detetions file * * Returns bitset or an array depending on bitset extension availability * * @return mixed * @throws Zend_Search_Lucene_Exception */ private function _loadPre21DelFile() { require_once 'Zend/Search/Lucene/Exception.php'; try { // '.del' files always stored in a separate file // Segment compound is not used $delFile = $this->_directory->getFileObject($this->_name . '.del'); $byteCount = $delFile->readInt(); $byteCount = ceil($byteCount/8); $bitCount = $delFile->readInt(); if ($bitCount == 0) { $delBytes = ''; } else { $delBytes = $delFile->readBytes($byteCount); } if (extension_loaded('bitset')) { return $delBytes; } else { $deletions = array(); for ($count = 0; $count < $byteCount; $count++) { $byte = ord($delBytes[$count]); for ($bit = 0; $bit < 8; $bit++) { if ($byte & (1<<$bit)) { $deletions[$count*8 + $bit] = 1; } } } return $deletions; } } catch(Zend_Search_Lucene_Exception $e) { if (strpos($e->getMessage(), 'is not readable') === false) { throw $e; } // There is no deletion file $this->_delGen = -1; return null; } } /** * Load 2.1+ format detetions file * * Returns bitset or an array depending on bitset extension availability * * @return mixed */ private function _load21DelFile() { $delFile = $this->_directory->getFileObject($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del'); $format = $delFile->readInt(); if ($format == (int)0xFFFFFFFF) { if (extension_loaded('bitset')) { $deletions = bitset_empty(); } else { $deletions = array(); } $byteCount = $delFile->readInt(); $bitCount = $delFile->readInt(); $delFileSize = $this->_directory->fileLength($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del'); $byteNum = 0; do { $dgap = $delFile->readVInt(); $nonZeroByte = $delFile->readByte(); $byteNum += $dgap; if (extension_loaded('bitset')) { for ($bit = 0; $bit < 8; $bit++) { if ($nonZeroByte & (1<<$bit)) { bitset_incl($deletions, $byteNum*8 + $bit); } } return $deletions; } else { for ($bit = 0; $bit < 8; $bit++) { if ($nonZeroByte & (1<<$bit)) { $deletions[$byteNum*8 + $bit] = 1; } } return (count($deletions) > 0) ? $deletions : null; } } while ($delFile->tell() < $delFileSize); } else { // $format is actually byte count $byteCount = ceil($format/8); $bitCount = $delFile->readInt(); if ($bitCount == 0) { $delBytes = ''; } else { $delBytes = $delFile->readBytes($byteCount); } if (extension_loaded('bitset')) { return $delBytes; } else { $deletions = array(); for ($count = 0; $count < $byteCount; $count++) { $byte = ord($delBytes[$count]); for ($bit = 0; $bit < 8; $bit++) { if ($byte & (1<<$bit)) { $deletions[$count*8 + $bit] = 1; } } } return (count($deletions) > 0) ? $deletions : null; } } } /** * Opens index file stoted within compound index file * * @param string $extension * @param boolean $shareHandler * @throws Zend_Search_Lucene_Exception * @return Zend_Search_Lucene_Storage_File */ public function openCompoundFile($extension, $shareHandler = true) { if (($extension == '.fdx' || $extension == '.fdt') && $this->_usesSharedDocStore) { $fdxFName = $this->_sharedDocStoreOptions['segment'] . '.fdx'; $fdtFName = $this->_sharedDocStoreOptions['segment'] . '.fdt'; if (!$this->_sharedDocStoreOptions['isCompound']) { $fdxFile = $this->_directory->getFileObject($fdxFName, $shareHandler); $fdxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR); if ($extension == '.fdx') { // '.fdx' file is requested return $fdxFile; } else { // '.fdt' file is requested $fdtStartOffset = $fdxFile->readLong(); $fdtFile = $this->_directory->getFileObject($fdtFName, $shareHandler); $fdtFile->seek($fdtStartOffset, SEEK_CUR); return $fdtFile; } } if( !isset($this->_sharedDocStoreOptions['files'][$fdxFName]) ) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain ' . $fdxFName . ' file.' ); } if( !isset($this->_sharedDocStoreOptions['files'][$fdtFName]) ) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain ' . $fdtFName . ' file.' ); } // Open shared docstore segment file $cfxFile = $this->_directory->getFileObject($this->_sharedDocStoreOptions['segment'] . '.cfx', $shareHandler); // Seek to the start of '.fdx' file within compound file $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdxFName]); // Seek to the start of current segment documents section $cfxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR); if ($extension == '.fdx') { // '.fdx' file is requested return $cfxFile; } else { // '.fdt' file is requested $fdtStartOffset = $cfxFile->readLong(); // Seek to the start of '.fdt' file within compound file $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdtFName]); // Seek to the start of current segment documents section $cfxFile->seek($fdtStartOffset, SEEK_CUR); return $fdtFile; } } $filename = $this->_name . $extension; if (!$this->_isCompound) { return $this->_directory->getFileObject($filename, $shareHandler); } if( !isset($this->_segFiles[$filename]) ) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Segment compound file doesn\'t contain ' . $filename . ' file.' ); } $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler); $file->seek($this->_segFiles[$filename]); return $file; } /** * Get compound file length * * @param string $extension * @return integer */ public function compoundFileLength($extension) { if (($extension == '.fdx' || $extension == '.fdt') && $this->_usesSharedDocStore) { $filename = $this->_sharedDocStoreOptions['segment'] . $extension; if (!$this->_sharedDocStoreOptions['isCompound']) { return $this->_directory->fileLength($filename); } if( !isset($this->_sharedDocStoreOptions['fileSizes'][$filename]) ) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Shared doc store compound file doesn\'t contain ' . $filename . ' file.' ); } return $this->_sharedDocStoreOptions['fileSizes'][$filename]; } $filename = $this->_name . $extension; // Try to get common file first if ($this->_directory->fileExists($filename)) { return $this->_directory->fileLength($filename); } if( !isset($this->_segFileSizes[$filename]) ) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain ' . $filename . ' file.' ); } return $this->_segFileSizes[$filename]; } /** * Returns field index or -1 if field is not found * * @param string $fieldName * @return integer */ public function getFieldNum($fieldName) { foreach( $this->_fields as $field ) { if( $field->name == $fieldName ) { return $field->number; } } return -1; } /** * Returns field info for specified field * * @param integer $fieldNum * @return Zend_Search_Lucene_Index_FieldInfo */ public function getField($fieldNum) { return $this->_fields[$fieldNum]; } /** * Returns array of fields. * if $indexed parameter is true, then returns only indexed fields. * * @param boolean $indexed * @return array */ public function getFields($indexed = false) { $result = array(); foreach( $this->_fields as $field ) { if( (!$indexed) || $field->isIndexed ) { $result[ $field->name ] = $field->name; } } return $result; } /** * Returns array of FieldInfo objects. * * @return array */ public function getFieldInfos() { return $this->_fields; } /** * Returns actual deletions file generation number. * * @return integer */ public function getDelGen() { return $this->_delGen; } /** * Returns the total number of documents in this segment (including deleted documents). * * @return integer */ public function count() { return $this->_docCount; } /** * Returns number of deleted documents. * * @return integer */ private function _deletedCount() { if ($this->_deleted === null) { return 0; } if (extension_loaded('bitset')) { return count(bitset_to_array($this->_deleted)); } else { return count($this->_deleted); } } /** * Returns the total number of non-deleted documents in this segment. * * @return integer */ public function numDocs() { if ($this->hasDeletions()) { return $this->_docCount - $this->_deletedCount(); } else { return $this->_docCount; } } /** * Get field position in a fields dictionary * * @param integer $fieldNum * @return integer */ private function _getFieldPosition($fieldNum) { // Treat values which are not in a translation table as a 'direct value' return isset($this->_fieldsDicPositions[$fieldNum]) ? $this->_fieldsDicPositions[$fieldNum] : $fieldNum; } /** * Return segment name * * @return string */ public function getName() { return $this->_name; } /** * TermInfo cache * * Size is 1024. * Numbers are used instead of class constants because of performance considerations * * @var array */ private $_termInfoCache = array(); private function _cleanUpTermInfoCache() { // Clean 256 term infos foreach ($this->_termInfoCache as $key => $termInfo) { unset($this->_termInfoCache[$key]); // leave 768 last used term infos if (count($this->_termInfoCache) == 768) { break; } } } /** * Load terms dictionary index * * @throws Zend_Search_Lucene_Exception */ private function _loadDictionaryIndex() { // Check, if index is already serialized if ($this->_directory->fileExists($this->_name . '.sti')) { // Load serialized dictionary index data $stiFile = $this->_directory->getFileObject($this->_name . '.sti'); $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti')); // Load dictionary index data if (($unserializedData = @unserialize($stiFileData)) !== false) { list($this->_termDictionary, $this->_termDictionaryInfos) = $unserializedData; return; } } // Load data from .tii file and generate .sti file // Prefetch dictionary index data $tiiFile = $this->openCompoundFile('.tii'); $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii')); /** Zend_Search_Lucene_Index_DictionaryLoader */ require_once 'Zend/Search/Lucene/Index/DictionaryLoader.php'; // Load dictionary index data list($this->_termDictionary, $this->_termDictionaryInfos) = Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData); $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos)); $stiFile = $this->_directory->createFile($this->_name . '.sti'); $stiFile->writeBytes($stiFileData); } /** * Scans terms dictionary and returns term info * * @param Zend_Search_Lucene_Index_Term $term * @return Zend_Search_Lucene_Index_TermInfo */ public function getTermInfo(Zend_Search_Lucene_Index_Term $term) { $termKey = $term->key(); if (isset($this->_termInfoCache[$termKey])) { $termInfo = $this->_termInfoCache[$termKey]; // Move termInfo to the end of cache unset($this->_termInfoCache[$termKey]); $this->_termInfoCache[$termKey] = $termInfo; return $termInfo; } if ($this->_termDictionary === null) { $this->_loadDictionaryIndex(); } $searchField = $this->getFieldNum($term->field); if ($searchField == -1) { return null; } $searchDicField = $this->_getFieldPosition($searchField); // search for appropriate value in dictionary $lowIndex = 0; $highIndex = count($this->_termDictionary)-1; while ($highIndex >= $lowIndex) { // $mid = ($highIndex - $lowIndex)/2; $mid = ($highIndex + $lowIndex) >> 1; $midTerm = $this->_termDictionary[$mid]; $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */); $delta = $searchDicField - $fieldNum; if ($delta == 0) { $delta = strcmp($term->text, $midTerm[1] /* text */); } if ($delta < 0) { $highIndex = $mid-1; } elseif ($delta > 0) { $lowIndex = $mid+1; } else { // return $this->_termDictionaryInfos[$mid]; // We got it! $a = $this->_termDictionaryInfos[$mid]; $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]); // Put loaded termInfo into cache $this->_termInfoCache[$termKey] = $termInfo; return $termInfo; } } if ($highIndex == -1) { // Term is out of the dictionary range return null; } $prevPosition = $highIndex; $prevTerm = $this->_termDictionary[$prevPosition]; $prevTermInfo = $this->_termDictionaryInfos[$prevPosition]; $tisFile = $this->openCompoundFile('.tis'); $tiVersion = $tisFile->readInt(); if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ && $tiVersion != (int)0xFFFFFFFD /* 2.1+ format */) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format'); } $termCount = $tisFile->readLong(); $indexInterval = $tisFile->readInt(); $skipInterval = $tisFile->readInt(); if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) { $maxSkipLevels = $tisFile->readInt(); } $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR); $termValue = $prevTerm[1] /* text */; $termFieldNum = $prevTerm[0] /* field */; $freqPointer = $prevTermInfo[1] /* freqPointer */; $proxPointer = $prevTermInfo[2] /* proxPointer */; for ($count = $prevPosition*$indexInterval + 1; $count <= $termCount && ( $this->_getFieldPosition($termFieldNum) < $searchDicField || ($this->_getFieldPosition($termFieldNum) == $searchDicField && strcmp($termValue, $term->text) < 0) ); $count++) { $termPrefixLength = $tisFile->readVInt(); $termSuffix = $tisFile->readString(); $termFieldNum = $tisFile->readVInt(); $termValue = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix; $docFreq = $tisFile->readVInt(); $freqPointer += $tisFile->readVInt(); $proxPointer += $tisFile->readVInt(); if( $docFreq >= $skipInterval ) { $skipOffset = $tisFile->readVInt(); } else { $skipOffset = 0; } } if ($termFieldNum == $searchField && $termValue == $term->text) { $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); } else { $termInfo = null; } // Put loaded termInfo into cache $this->_termInfoCache[$termKey] = $termInfo; if (count($this->_termInfoCache) == 1024) { $this->_cleanUpTermInfoCache(); } return $termInfo; } /** * Returns IDs of all the documents containing term. * * @param Zend_Search_Lucene_Index_Term $term * @param integer $shift * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter * @return array */ public function termDocs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null) { $termInfo = $this->getTermInfo($term); if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) { $docsFilter->segmentFilters[$this->_name] = array(); } return array(); } $frqFile = $this->openCompoundFile('.frq'); $frqFile->seek($termInfo->freqPointer,SEEK_CUR); $docId = 0; $result = array(); if ($docsFilter !== null) { if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.'); } if (isset($docsFilter->segmentFilters[$this->_name])) { // Filter already has some data for the current segment // Make short name for the filter (which doesn't need additional dereferencing) $filter = &$docsFilter->segmentFilters[$this->_name]; // Check if filter is not empty if (count($filter) == 0) { return array(); } if ($this->_docCount/count($filter) < self::FULL_SCAN_VS_FETCH_BOUNDARY) { // Perform fetching // --------------------------------------------------------------- $updatedFilterData = array(); for( $count=0; $count < $termInfo->docFreq; $count++ ) { $docDelta = $frqFile->readVInt(); if( $docDelta % 2 == 1 ) { $docId += ($docDelta-1)/2; } else { $docId += $docDelta/2; // read freq $frqFile->readVInt(); } if (isset($filter[$docId])) { $result[] = $shift + $docId; $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here } } $docsFilter->segmentFilters[$this->_name] = $updatedFilterData; // --------------------------------------------------------------- } else { // Perform full scan $updatedFilterData = array(); for( $count=0; $count < $termInfo->docFreq; $count++ ) { $docDelta = $frqFile->readVInt(); if( $docDelta % 2 == 1 ) { $docId += ($docDelta-1)/2; } else { $docId += $docDelta/2; // read freq $frqFile->readVInt(); } if (isset($filter[$docId])) { $result[] = $shift + $docId; $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here } } $docsFilter->segmentFilters[$this->_name] = $updatedFilterData; } } else { // Filter is present, but doesn't has data for the current segment yet $filterData = array(); for( $count=0; $count < $termInfo->docFreq; $count++ ) { $docDelta = $frqFile->readVInt(); if( $docDelta % 2 == 1 ) { $docId += ($docDelta-1)/2; } else { $docId += $docDelta/2; // read freq $frqFile->readVInt(); } $result[] = $shift + $docId; $filterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here } $docsFilter->segmentFilters[$this->_name] = $filterData; } } else { for( $count=0; $count < $termInfo->docFreq; $count++ ) { $docDelta = $frqFile->readVInt(); if( $docDelta % 2 == 1 ) { $docId += ($docDelta-1)/2; } else { $docId += $docDelta/2; // read freq $frqFile->readVInt(); } $result[] = $shift + $docId; } } return $result; } /** * Returns term freqs array. * Result array structure: array(docId => freq, ...) * * @param Zend_Search_Lucene_Index_Term $term * @param integer $shift * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter * @return Zend_Search_Lucene_Index_TermInfo */ public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null) { $termInfo = $this->getTermInfo($term); if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) { $docsFilter->segmentFilters[$this->_name] = array(); } return array(); } $frqFile = $this->openCompoundFile('.frq'); $frqFile->seek($termInfo->freqPointer,SEEK_CUR); $result = array(); $docId = 0; $result = array(); if ($docsFilter !== null) { if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.'); } if (isset($docsFilter->segmentFilters[$this->_name])) { // Filter already has some data for the current segment // Make short name for the filter (which doesn't need additional dereferencing) $filter = &$docsFilter->segmentFilters[$this->_name]; // Check if filter is not empty if (count($filter) == 0) { return array(); } if ($this->_docCount/count($filter) < self::FULL_SCAN_VS_FETCH_BOUNDARY) { // Perform fetching // --------------------------------------------------------------- $updatedFilterData = array(); for ($count = 0; $count < $termInfo->docFreq; $count++) { $docDelta = $frqFile->readVInt(); if ($docDelta % 2 == 1) { $docId += ($docDelta-1)/2; if (isset($filter[$docId])) { $result[$shift + $docId] = 1; $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here } } else { $docId += $docDelta/2; if (isset($filter[$docId])) { $result[$shift + $docId] = $frqFile->readVInt(); $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here } } } $docsFilter->segmentFilters[$this->_name] = $updatedFilterData; // --------------------------------------------------------------- } else { // Perform full scan $updatedFilterData = array(); for ($count = 0; $count < $termInfo->docFreq; $count++) { $docDelta = $frqFile->readVInt(); if ($docDelta % 2 == 1) { $docId += ($docDelta-1)/2; if (isset($filter[$docId])) { $result[$shift + $docId] = 1; $updatedFilterData[$docId] = 1; // 1 is just some constant value, so we don't need additional var dereference here } } else { $docId += $docDelta/2; if (isset($filter[$docId])) { $result[$shift + $docId] = $frqFile->readVInt(); $updatedFilterData[$docId] = 1; // 1 is just some constant value, so we don't need additional var dereference here } } } $docsFilter->segmentFilters[$this->_name] = $updatedFilterData; } } else { // Filter doesn't has data for current segment $filterData = array(); for ($count = 0; $count < $termInfo->docFreq; $count++) { $docDelta = $frqFile->readVInt(); if ($docDelta % 2 == 1) { $docId += ($docDelta-1)/2; $result[$shift + $docId] = 1; $filterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here } else { $docId += $docDelta/2; $result[$shift + $docId] = $frqFile->readVInt(); $filterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here } } $docsFilter->segmentFilters[$this->_name] = $filterData; } } else { for ($count = 0; $count < $termInfo->docFreq; $count++) { $docDelta = $frqFile->readVInt(); if ($docDelta % 2 == 1) { $docId += ($docDelta-1)/2; $result[$shift + $docId] = 1; } else { $docId += $docDelta/2; $result[$shift + $docId] = $frqFile->readVInt(); } } } return $result; } /** * Returns term positions array. * Result array structure: array(docId => array(pos1, pos2, ...), ...) * * @param Zend_Search_Lucene_Index_Term $term * @param integer $shift * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter * @return Zend_Search_Lucene_Index_TermInfo */ public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null) { $termInfo = $this->getTermInfo($term); if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) { $docsFilter->segmentFilters[$this->_name] = array(); } return array(); } $frqFile = $this->openCompoundFile('.frq'); $frqFile->seek($termInfo->freqPointer,SEEK_CUR); $docId = 0; $freqs = array(); if ($docsFilter !== null) { if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.'); } if (isset($docsFilter->segmentFilters[$this->_name])) { // Filter already has some data for the current segment // Make short name for the filter (which doesn't need additional dereferencing) $filter = &$docsFilter->segmentFilters[$this->_name]; // Check if filter is not empty if (count($filter) == 0) { return array(); } if ($this->_docCount/count($filter) < self::FULL_SCAN_VS_FETCH_BOUNDARY) { // Perform fetching // --------------------------------------------------------------- for ($count = 0; $count < $termInfo->docFreq; $count++) { $docDelta = $frqFile->readVInt(); if ($docDelta % 2 == 1) { $docId += ($docDelta-1)/2; $freqs[$docId] = 1; } else { $docId += $docDelta/2; $freqs[$docId] = $frqFile->readVInt(); } } $updatedFilterData = array(); $result = array(); $prxFile = $this->openCompoundFile('.prx'); $prxFile->seek($termInfo->proxPointer, SEEK_CUR); foreach ($freqs as $docId => $freq) { $termPosition = 0; $positions = array(); // we have to read .prx file to get right position for next doc // even filter doesn't match current document for ($count = 0; $count < $freq; $count++ ) { $termPosition += $prxFile->readVInt(); $positions[] = $termPosition; } // Include into updated filter and into result only if doc is matched by filter if (isset($filter[$docId])) { $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here $result[$shift + $docId] = $positions; } } $docsFilter->segmentFilters[$this->_name] = $updatedFilterData; // --------------------------------------------------------------- } else { // Perform full scan for ($count = 0; $count < $termInfo->docFreq; $count++) { $docDelta = $frqFile->readVInt(); if ($docDelta % 2 == 1) { $docId += ($docDelta-1)/2; $freqs[$docId] = 1; } else { $docId += $docDelta/2; $freqs[$docId] = $frqFile->readVInt(); } } $updatedFilterData = array(); $result = array(); $prxFile = $this->openCompoundFile('.prx'); $prxFile->seek($termInfo->proxPointer, SEEK_CUR); foreach ($freqs as $docId => $freq) { $termPosition = 0; $positions = array(); // we have to read .prx file to get right position for next doc // even filter doesn't match current document for ($count = 0; $count < $freq; $count++ ) { $termPosition += $prxFile->readVInt(); $positions[] = $termPosition; } // Include into updated filter and into result only if doc is matched by filter if (isset($filter[$docId])) { $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here $result[$shift + $docId] = $positions; } } $docsFilter->segmentFilters[$this->_name] = $updatedFilterData; } } else { // Filter doesn't has data for current segment for ($count = 0; $count < $termInfo->docFreq; $count++) { $docDelta = $frqFile->readVInt(); if ($docDelta % 2 == 1) { $docId += ($docDelta-1)/2; $freqs[$docId] = 1; } else { $docId += $docDelta/2; $freqs[$docId] = $frqFile->readVInt(); } } $filterData = array(); $result = array(); $prxFile = $this->openCompoundFile('.prx'); $prxFile->seek($termInfo->proxPointer, SEEK_CUR); foreach ($freqs as $docId => $freq) { $filterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here $termPosition = 0; $positions = array(); for ($count = 0; $count < $freq; $count++ ) { $termPosition += $prxFile->readVInt(); $positions[] = $termPosition; } $result[$shift + $docId] = $positions; } $docsFilter->segmentFilters[$this->_name] = $filterData; } } else { for ($count = 0; $count < $termInfo->docFreq; $count++) { $docDelta = $frqFile->readVInt(); if ($docDelta % 2 == 1) { $docId += ($docDelta-1)/2; $freqs[$docId] = 1; } else { $docId += $docDelta/2; $freqs[$docId] = $frqFile->readVInt(); } } $result = array(); $prxFile = $this->openCompoundFile('.prx'); $prxFile->seek($termInfo->proxPointer, SEEK_CUR); foreach ($freqs as $docId => $freq) { $termPosition = 0; $positions = array(); for ($count = 0; $count < $freq; $count++ ) { $termPosition += $prxFile->readVInt(); $positions[] = $termPosition; } $result[$shift + $docId] = $positions; } } return $result; } /** * Load normalizatin factors from an index file * * @param integer $fieldNum * @throws Zend_Search_Lucene_Exception */ private function _loadNorm($fieldNum) { if ($this->_hasSingleNormFile) { $normfFile = $this->openCompoundFile('.nrm'); $header = $normfFile->readBytes(3); $headerFormatVersion = $normfFile->readByte(); if ($header != 'NRM' || $headerFormatVersion != (int)0xFF) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Wrong norms file format.'); } foreach ($this->_fields as $fNum => $fieldInfo) { if ($fieldInfo->isIndexed) { $this->_norms[$fNum] = $normfFile->readBytes($this->_docCount); } } } else { $fFile = $this->openCompoundFile('.f' . $fieldNum); $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount); } } /** * Returns normalization factor for specified documents * * @param integer $id * @param string $fieldName * @return float */ public function norm($id, $fieldName) { $fieldNum = $this->getFieldNum($fieldName); if ( !($this->_fields[$fieldNum]->isIndexed) ) { return null; } if (!isset($this->_norms[$fieldNum])) { $this->_loadNorm($fieldNum); } return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) ); } /** * Returns norm vector, encoded in a byte string * * @param string $fieldName * @return string */ public function normVector($fieldName) { $fieldNum = $this->getFieldNum($fieldName); if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) { $similarity = Zend_Search_Lucene_Search_Similarity::getDefault(); return str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )), $this->_docCount); } if (!isset($this->_norms[$fieldNum])) { $this->_loadNorm($fieldNum); } return $this->_norms[$fieldNum]; } /** * Returns true if any documents have been deleted from this index segment. * * @return boolean */ public function hasDeletions() { return $this->_deleted !== null; } /** * Returns true if segment has single norms file. * * @return boolean */ public function hasSingleNormFile() { return $this->_hasSingleNormFile ? true : false; } /** * Returns true if segment is stored using compound segment file. * * @return boolean */ public function isCompound() { return $this->_isCompound; } /** * Deletes a document from the index segment. * $id is an internal document id * * @param integer */ public function delete($id) { $this->_deletedDirty = true; if (extension_loaded('bitset')) { if ($this->_deleted === null) { $this->_deleted = bitset_empty($id); } bitset_incl($this->_deleted, $id); } else { if ($this->_deleted === null) { $this->_deleted = array(); } $this->_deleted[$id] = 1; } } /** * Checks, that document is deleted * * @param integer * @return boolean */ public function isDeleted($id) { if ($this->_deleted === null) { return false; } if (extension_loaded('bitset')) { return bitset_in($this->_deleted, $id); } else { return isset($this->_deleted[$id]); } } /** * Detect latest delete generation * * Is actualy used from writeChanges() method or from the constructor if it's invoked from * Index writer. In both cases index write lock is already obtained, so we shouldn't care * about it * * @return integer */ private function _detectLatestDelGen() { $delFileList = array(); foreach ($this->_directory->fileList() as $file) { if ($file == $this->_name . '.del') { // Matches .del file name $delFileList[] = 0; } else if (preg_match('/^' . $this->_name . '_([a-zA-Z0-9]+)\.del$/i', $file, $matches)) { // Matches _NNN.del file names $delFileList[] = (int)base_convert($matches[1], 36, 10); } } if (count($delFileList) == 0) { // There is no deletions file for current segment in the directory // Set deletions file generation number to 1 return -1; } else { // There are some deletions files for current segment in the directory // Set deletions file generation number to the highest nuber return max($delFileList); } } /** * Write changes if it's necessary. * * This method must be invoked only from the Writer _updateSegments() method, * so index Write lock has to be already obtained. * * @internal * @throws Zend_Search_Lucene_Exceptions */ public function writeChanges() { // Get new generation number $latestDelGen = $this->_detectLatestDelGen(); if (!$this->_deletedDirty) { // There was no deletions by current process if ($latestDelGen == $this->_delGen) { // Delete file hasn't been updated by any concurrent process return; } else if ($latestDelGen > $this->_delGen) { // Delete file has been updated by some concurrent process // Reload deletions file $this->_delGen = $latestDelGen; $this->_deleted = $this->_loadDelFile(); return; } else { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Delete file processing workflow is corrupted for the segment \'' . $this->_name . '\'.'); } } if ($latestDelGen > $this->_delGen) { // Merge current deletions with latest deletions file $this->_delGen = $latestDelGen; $latestDelete = $this->_loadDelFile(); if (extension_loaded('bitset')) { $this->_deleted = bitset_union($this->_deleted, $latestDelete); } else { $this->_deleted += $latestDelete; } } if (extension_loaded('bitset')) { $delBytes = $this->_deleted; $bitCount = count(bitset_to_array($delBytes)); } else { $byteCount = floor($this->_docCount/8)+1; $delBytes = str_repeat(chr(0), $byteCount); for ($count = 0; $count < $byteCount; $count++) { $byte = 0; for ($bit = 0; $bit < 8; $bit++) { if (isset($this->_deleted[$count*8 + $bit])) { $byte |= (1<<$bit); } } $delBytes[$count] = chr($byte); } $bitCount = count($this->_deleted); } if ($this->_delGen == -1) { // Set delete file generation number to 1 $this->_delGen = 1; } else { // Increase delete file generation number by 1 $this->_delGen++; } $delFile = $this->_directory->createFile($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del'); $delFile->writeInt($this->_docCount); $delFile->writeInt($bitCount); $delFile->writeBytes($delBytes); $this->_deletedDirty = false; } /** * Term Dictionary File object for stream like terms reading * * @var Zend_Search_Lucene_Storage_File */ private $_tisFile = null; /** * Actual offset of the .tis file data * * @var integer */ private $_tisFileOffset; /** * Frequencies File object for stream like terms reading * * @var Zend_Search_Lucene_Storage_File */ private $_frqFile = null; /** * Actual offset of the .frq file data * * @var integer */ private $_frqFileOffset; /** * Positions File object for stream like terms reading * * @var Zend_Search_Lucene_Storage_File */ private $_prxFile = null; /** * Actual offset of the .prx file in the compound file * * @var integer */ private $_prxFileOffset; /** * Actual number of terms in term stream * * @var integer */ private $_termCount = 0; /** * Overall number of terms in term stream * * @var integer */ private $_termNum = 0; /** * Segment index interval * * @var integer */ private $_indexInterval; /** * Segment skip interval * * @var integer */ private $_skipInterval; /** * Last TermInfo in a terms stream * * @var Zend_Search_Lucene_Index_TermInfo */ private $_lastTermInfo = null; /** * Last Term in a terms stream * * @var Zend_Search_Lucene_Index_Term */ private $_lastTerm = null; /** * Map of the document IDs * Used to get new docID after removing deleted documents. * It's not very effective from memory usage point of view, * but much more faster, then other methods * * @var array|null */ private $_docMap = null; /** * An array of all term positions in the documents. * Array structure: array( docId => array( pos1, pos2, ...), ...) * * Is set to null if term positions loading has to be skipped * * @var array|null */ private $_lastTermPositions; /** * Terms scan mode * * Values: * * self::SM_TERMS_ONLY - terms are scanned, no additional info is retrieved * self::SM_FULL_INFO - terms are scanned, frequency and position info is retrieved * self::SM_MERGE_INFO - terms are scanned, frequency and position info is retrieved * document numbers are compacted (shifted if segment has deleted documents) * * @var integer */ private $_termsScanMode; /** Scan modes */ const SM_TERMS_ONLY = 0; // terms are scanned, no additional info is retrieved const SM_FULL_INFO = 1; // terms are scanned, frequency and position info is retrieved const SM_MERGE_INFO = 2; // terms are scanned, frequency and position info is retrieved // document numbers are compacted (shifted if segment contains deleted documents) /** * Reset terms stream * * $startId - id for the fist document * $compact - remove deleted documents * * Returns start document id for the next segment * * @param integer $startId * @param integer $mode * @throws Zend_Search_Lucene_Exception * @return integer */ public function resetTermsStream(/** $startId = 0, $mode = self::SM_TERMS_ONLY */) { /** * SegmentInfo->resetTermsStream() method actually takes two optional parameters: * $startId (default value is 0) * $mode (default value is self::SM_TERMS_ONLY) */ $argList = func_get_args(); if (count($argList) > 2) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Wrong number of arguments'); } else if (count($argList) == 2) { $startId = $argList[0]; $mode = $argList[1]; } else if (count($argList) == 1) { $startId = $argList[0]; $mode = self::SM_TERMS_ONLY; } else { $startId = 0; $mode = self::SM_TERMS_ONLY; } if ($this->_tisFile !== null) { $this->_tisFile = null; } $this->_tisFile = $this->openCompoundFile('.tis', false); $this->_tisFileOffset = $this->_tisFile->tell(); $tiVersion = $this->_tisFile->readInt(); if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ && $tiVersion != (int)0xFFFFFFFD /* 2.1+ format */) { require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format'); } $this->_termCount = $this->_termNum = $this->_tisFile->readLong(); // Read terms count $this->_indexInterval = $this->_tisFile->readInt(); // Read Index interval $this->_skipInterval = $this->_tisFile->readInt(); // Read skip interval if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) { $maxSkipLevels = $this->_tisFile->readInt(); } if ($this->_frqFile !== null) { $this->_frqFile = null; } if ($this->_prxFile !== null) { $this->_prxFile = null; } $this->_docMap = array(); $this->_lastTerm = new Zend_Search_Lucene_Index_Term('', -1); $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0); $this->_lastTermPositions = null; $this->_termsScanMode = $mode; switch ($mode) { case self::SM_TERMS_ONLY: // Do nothing break; case self::SM_FULL_INFO: // break intentionally omitted case self::SM_MERGE_INFO: $this->_frqFile = $this->openCompoundFile('.frq', false); $this->_frqFileOffset = $this->_frqFile->tell(); $this->_prxFile = $this->openCompoundFile('.prx', false); $this->_prxFileOffset = $this->_prxFile->tell(); for ($count = 0; $count < $this->_docCount; $count++) { if (!$this->isDeleted($count)) { $this->_docMap[$count] = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $count); } } break; default: require_once 'Zend/Search/Lucene/Exception.php'; throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.'); break; } $this->nextTerm(); return $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $this->_docCount); } /** * Skip terms stream up to specified term preffix. * * Prefix contains fully specified field info and portion of searched term * * @param Zend_Search_Lucene_Index_Term $prefix * @throws Zend_Search_Lucene_Exception */ public function skipTo(Zend_Search_Lucene_Index_Term $prefix) { if ($this->_termDictionary === null) { $this->_loadDictionaryIndex(); } $searchField = $this->getFieldNum($prefix->field); if ($searchField == -1) { /** * Field is not presented in this segment * Go to the end of dictionary */ $this->_tisFile = null; $this->_frqFile = null; $this->_prxFile = null; $this->_lastTerm = null; $this->_lastTermInfo = null; $this->_lastTermPositions = null; return; } $searchDicField = $this->_getFieldPosition($searchField); // search for appropriate value in dictionary $lowIndex = 0; $highIndex = count($this->_termDictionary)-1; while ($highIndex >= $lowIndex) { // $mid = ($highIndex - $lowIndex)/2; $mid = ($highIndex + $lowIndex) >> 1; $midTerm = $this->_termDictionary[$mid]; $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */); $delta = $searchDicField - $fieldNum; if ($delta == 0) { $delta = strcmp($prefix->text, $midTerm[1] /* text */); } if ($delta < 0) { $highIndex = $mid-1; } elseif ($delta > 0) { $lowIndex = $mid+1; } else { // We have reached term we are looking for break; } } if ($highIndex == -1) { // Term is out of the dictionary range $this->_tisFile = null; $this->_frqFile = null; $this->_prxFile = null; $this->_lastTerm = null; $this->_lastTermInfo = null; $this->_lastTermPositions = null; return; } $prevPosition = $highIndex; $prevTerm = $this->_termDictionary[$prevPosition]; $prevTermInfo = $this->_termDictionaryInfos[$prevPosition]; if ($this->_tisFile === null) { // The end of terms stream is reached and terms dictionary file is closed // Perform mini-reset operation $this->_tisFile = $this->openCompoundFile('.tis', false); if ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO) { $this->_frqFile = $this->openCompoundFile('.frq', false); $this->_prxFile = $this->openCompoundFile('.prx', false); } } $this->_tisFile->seek($this->_tisFileOffset + $prevTermInfo[4], SEEK_SET); $this->_lastTerm = new Zend_Search_Lucene_Index_Term($prevTerm[1] /* text */, ($prevTerm[0] == -1) ? '' : $this->_fields[$prevTerm[0] /* field */]->name); $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($prevTermInfo[0] /* docFreq */, $prevTermInfo[1] /* freqPointer */, $prevTermInfo[2] /* proxPointer */, $prevTermInfo[3] /* skipOffset */); $this->_termCount = $this->_termNum - $prevPosition*$this->_indexInterval; if ($highIndex == 0) { // skip start entry $this->nextTerm(); } else if ($prefix->field == $this->_lastTerm->field && $prefix->text == $this->_lastTerm->text) { // We got exact match in the dictionary index if ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO) { $this->_lastTermPositions = array(); $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET); $freqs = array(); $docId = 0; for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) { $docDelta = $this->_frqFile->readVInt(); if( $docDelta % 2 == 1 ) { $docId += ($docDelta-1)/2; $freqs[ $docId ] = 1; } else { $docId += $docDelta/2; $freqs[ $docId ] = $this->_frqFile->readVInt(); } } $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET); foreach ($freqs as $docId => $freq) { $termPosition = 0; $positions = array(); for ($count = 0; $count < $freq; $count++ ) { $termPosition += $this->_prxFile->readVInt(); $positions[] = $termPosition; } if (isset($this->_docMap[$docId])) { $this->_lastTermPositions[$this->_docMap[$docId]] = $positions; } } } return; } // Search term matching specified prefix while ($this->_lastTerm !== null) { if ( strcmp($this->_lastTerm->field, $prefix->field) > 0 || ($prefix->field == $this->_lastTerm->field && strcmp($this->_lastTerm->text, $prefix->text) >= 0) ) { // Current term matches or greate than the pattern return; } $this->nextTerm(); } } /** * Scans terms dictionary and returns next term * * @return Zend_Search_Lucene_Index_Term|null */ public function nextTerm() { if ($this->_tisFile === null || $this->_termCount == 0) { $this->_lastTerm = null; $this->_lastTermInfo = null; $this->_lastTermPositions = null; $this->_docMap = null; // may be necessary for "empty" segment $this->_tisFile = null; $this->_frqFile = null; $this->_prxFile = null; return null; } $termPrefixLength = $this->_tisFile->readVInt(); $termSuffix = $this->_tisFile->readString(); $termFieldNum = $this->_tisFile->readVInt(); $termValue = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $termPrefixLength) . $termSuffix; $this->_lastTerm = new Zend_Search_Lucene_Index_Term($termValue, $this->_fields[$termFieldNum]->name); $docFreq = $this->_tisFile->readVInt(); $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt(); $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt(); if ($docFreq >= $this->_skipInterval) { $skipOffset = $this->_tisFile->readVInt(); } else { $skipOffset = 0; } $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); if ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO) { $this->_lastTermPositions = array(); $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET); $freqs = array(); $docId = 0; for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) { $docDelta = $this->_frqFile->readVInt(); if( $docDelta % 2 == 1 ) { $docId += ($docDelta-1)/2; $freqs[ $docId ] = 1; } else { $docId += $docDelta/2; $freqs[ $docId ] = $this->_frqFile->readVInt(); } } $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET); foreach ($freqs as $docId => $freq) { $termPosition = 0; $positions = array(); for ($count = 0; $count < $freq; $count++ ) { $termPosition += $this->_prxFile->readVInt(); $positions[] = $termPosition; } if (isset($this->_docMap[$docId])) { $this->_lastTermPositions[$this->_docMap[$docId]] = $positions; } } } $this->_termCount--; if ($this->_termCount == 0) { $this->_tisFile = null; $this->_frqFile = null; $this->_prxFile = null; } return $this->_lastTerm; } /** * Close terms stream * * Should be used for resources clean up if stream is not read up to the end */ public function closeTermsStream() { $this->_tisFile = null; $this->_frqFile = null; $this->_prxFile = null; $this->_lastTerm = null; $this->_lastTermInfo = null; $this->_lastTermPositions = null; $this->_docMap = null; } /** * Returns term in current position * * @return Zend_Search_Lucene_Index_Term|null */ public function currentTerm() { return $this->_lastTerm; } /** * Returns an array of all term positions in the documents. * Return array structure: array( docId => array( pos1, pos2, ...), ...) * * @return array */ public function currentTermPositions() { return $this->_lastTermPositions; } }