mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits


17 Commits

Author SHA1 Message Date
Viktor Lofgren
edd453531e (index) Partition keyword lexicons by language 2025-09-04 17:24:48 +02:00
Viktor Lofgren
096496ada1 (refac) Fold ft-anchor-keywords into converting-process 2025-09-03 13:04:30 +02:00
Viktor Lofgren
8ca6209260 (refac) Fold ft-anchor-keywords into converting-process 2025-09-03 13:03:38 +02:00
Viktor Lofgren
673c65d3c9 (refac) Fold term-frequency-dict into language-processing 2025-09-03 12:59:10 +02:00
Viktor Lofgren
acb9ec7b15 (refac) Consistently use 'languageIsoCode' for the language field 2025-09-03 12:54:18 +02:00
Viktor Lofgren
47079e05db (index) Store language information in the index journal 2025-09-03 12:33:24 +02:00
Viktor Lofgren
c93056e77f (refac) Clean up index code 2025-09-03 09:51:57 +02:00
Viktor Lofgren
6f7530e807 (refac) Clean up index code 2025-09-02 18:53:58 +02:00
Viktor Lofgren
87ce4a1b52 (refac) Clean up index code 2025-09-02 17:52:38 +02:00
Viktor Lofgren
52194cbe7a (refac) Clean up index code 2025-09-02 17:44:42 +02:00
Viktor Lofgren
fd1ac03c78 (refac) Clean up index code 2025-09-02 17:30:19 +02:00
Viktor Lofgren
5e5b86efb4 (refac) Clean up index code 2025-09-02 17:24:30 +02:00
Viktor Lofgren
f332ec6191 (refac) Clean up index code 2025-09-02 13:13:10 +02:00
Viktor Lofgren
c25c1af437 (refac) Clean up index code 2025-09-02 13:04:05 +02:00
Viktor Lofgren
eb0c911b45 (refac) Clean up index code 2025-09-02 12:50:07 +02:00
Viktor Lofgren
1979870ce4 (refac) Merge index-forward, index-reverse, index/query into index
The project has too many submodules, and it's a bit of a headache to navigate.
2025-09-02 12:30:42 +02:00
Viktor Lofgren
0ba2ea38e1 (index) Move reverse index into a distinct package 2025-09-02 11:59:56 +02:00
184 changed files with 1586 additions and 2252 deletions

View File

@@ -22,7 +22,6 @@ dependencies {
implementation project(':code:processes:ping-process')
implementation project(':code:processes:new-domain-process')
implementation project(':code:processes:converting-process')
implementation project(':code:processes:index-constructor-process')
implementation project(':code:common:config')
implementation project(':code:common:model')
@@ -34,7 +33,7 @@ dependencies {
implementation project(':third-party:commons-codec')
implementation project(':code:libraries:message-queue')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:language-processing')
implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:live-capture:api')

View File

@@ -5,7 +5,6 @@ import com.google.inject.Singleton;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.ConverterMain;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.index.IndexConstructorMain;
import nu.marginalia.livecrawler.LiveCrawlerMain;
import nu.marginalia.loading.LoaderMain;
import nu.marginalia.ndp.NdpMain;
@@ -57,7 +56,7 @@ public class ProcessSpawnerService {
LIVE_CRAWLER(LiveCrawlerMain.class),
CONVERTER(ConverterMain.class),
LOADER(LoaderMain.class),
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
INDEX_CONSTRUCTOR("nu.marginalia.index.IndexConstructorMain"),
NDP(NdpMain.class),
EXPORT_TASKS(ExportTasksMain.class),
;
@@ -66,6 +65,9 @@ public class ProcessSpawnerService {
ProcessId(Class<? extends ProcessMainClass> mainClass) {
this.mainClass = mainClass.getName();
}
ProcessId(String mainClassFullName) {
this.mainClass = mainClassFullName;
}
List<String> envOpts() {
String variable = switch (this) {

View File

@@ -22,7 +22,6 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:index:query')
implementation project(':code:libraries:language-processing')
implementation libs.bundles.slf4j

View File

@@ -2,8 +2,8 @@ package nu.marginalia.api.searchquery;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimitType;
import java.util.ArrayList;
import java.util.List;

View File

@@ -9,7 +9,7 @@ import nu.marginalia.api.searchquery.model.results.debug.DebugFactor;
import nu.marginalia.api.searchquery.model.results.debug.DebugFactorGroup;
import nu.marginalia.api.searchquery.model.results.debug.DebugTermFactorGroup;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.model.EdgeUrl;
import java.util.ArrayList;

View File

@@ -2,8 +2,6 @@ package nu.marginalia.api.searchquery.model.query;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import javax.annotation.Nullable;
import java.util.List;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.query.limit;
package nu.marginalia.api.searchquery.model.query;
public enum QueryStrategy {
SENTENCE,

View File

@@ -2,8 +2,6 @@ package nu.marginalia.api.searchquery.model.query;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import javax.annotation.Nullable;
import java.util.List;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.query.limit;
package nu.marginalia.api.searchquery.model.query;
public record SpecificationLimit(SpecificationLimitType type, int value) {
public boolean isNone() {

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.query.limit;
package nu.marginalia.api.searchquery.model.query;
public enum SpecificationLimitType {
NONE,

View File

@@ -3,7 +3,7 @@ package nu.marginalia.index.client;
import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import org.junit.jupiter.api.Test;
import java.util.List;

View File

@@ -22,17 +22,13 @@ dependencies {
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:functions:search-query:api')
implementation project(':code:index:query')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':third-party:porterstemmer')
implementation project(':third-party:openzim')
implementation project(':third-party:commons-codec')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation libs.bundles.slf4j

View File

@@ -8,8 +8,8 @@ import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;

View File

@@ -1,7 +1,7 @@
package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.util.transform_list.TransformList;

View File

@@ -1,7 +1,7 @@
package nu.marginalia.functions.searchquery.query_parser.token;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
public sealed interface QueryToken {
String str();

View File

@@ -3,14 +3,9 @@ package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.Assertions;
@@ -217,6 +212,12 @@ public class QueryFactoryTest {
}
@Test
public void testExpansion10() {
var subquery = parseAndGetSpecs("when was captain james cook born");
System.out.println(subquery);
}
@Test
public void testContractionWordNum() {
var subquery = parseAndGetSpecs("glove 80");

View File

@@ -23,8 +23,12 @@ dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:skiplist')
implementation project(':code:libraries:native')
implementation project(':code:libraries:random-write-funnel')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:message-queue')
implementation project(':code:common:db')
implementation project(':code:common:config')
@@ -33,11 +37,9 @@ dependencies {
implementation project(':code:common:service')
implementation project(':code:processes:converting-process:model')
implementation project(':code:processes:process-mq-api')
implementation project(':code:functions:search-query:api')
implementation project(':code:index:index-forward')
implementation project(':code:index:index-reverse')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
@@ -75,7 +77,7 @@ dependencies {
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:libraries:language-processing')
testImplementation project(':code:libraries:braille-block-punch-cards')
testImplementation project(':code:libraries:test-helpers')
}

View File

@@ -1,39 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:native')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:processes:converting-process:model')
implementation libs.bundles.slf4j
implementation libs.prometheus
implementation libs.roaringbitmap
implementation libs.fastutil
implementation libs.trove
implementation libs.slop
testImplementation project(':code:libraries:test-helpers')
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@@ -1,33 +0,0 @@
package nu.marginalia.index.forward;
import java.nio.file.Path;
public class ForwardIndexFileNames {
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
return switch (identifier) {
case DOC_ID -> switch (version) {
case NEXT -> basePath.resolve("fwd-doc-id.dat.next");
case CURRENT -> basePath.resolve("fwd-doc-id.dat");
};
case DOC_DATA -> switch (version) {
case NEXT -> basePath.resolve("fwd-doc-data.dat.next");
case CURRENT -> basePath.resolve("fwd-doc-data.dat");
};
case SPANS_DATA -> switch (version) {
case NEXT -> basePath.resolve("fwd-spans.dat.next");
case CURRENT -> basePath.resolve("fwd-spans.dat");
};
};
}
public enum FileVersion {
CURRENT,
NEXT
}
public enum FileIdentifier {
DOC_DATA,
SPANS_DATA,
DOC_ID
}
}

View File

@@ -1,71 +0,0 @@
package nu.marginalia.index.forward.spans;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.sequence.VarintCodedSequence;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
@Deprecated
public class IndexSpansReaderCompressed implements AutoCloseable, IndexSpansReader {
private final FileChannel spansFileChannel;
public IndexSpansReaderCompressed(Path spansFile) throws IOException {
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
}
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
// Decode the size and offset from the encoded offset
long size = SpansCodec.decodeSize(encodedOffset);
long offset = SpansCodec.decodeStartOffset(encodedOffset);
// Allocate a buffer from the arena
var buffer = arena.allocate(size).asByteBuffer();
buffer.clear();
while (buffer.hasRemaining()) {
spansFileChannel.read(buffer, offset + buffer.position());
}
buffer.flip();
// Read the number of spans in the document
int count = buffer.get();
DocumentSpans ret = new DocumentSpans();
// Decode each span
while (count-- > 0) {
byte code = buffer.get();
short len = buffer.getShort();
ByteBuffer data = buffer.slice(buffer.position(), len);
ret.accept(code, new VarintCodedSequence(data));
// Reset the buffer position to the end of the span
buffer.position(buffer.position() + len);
}
return ret;
}
@Override
public DocumentSpans[] readSpans(Arena arena, IndexSearchBudget budget, long[] encodedOffsets) throws IOException {
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
for (int i = 0; i < encodedOffsets.length; i++) {
if (encodedOffsets[i] >= 0) {
ret[i] = readSpans(arena, encodedOffsets[i]);
}
}
return ret;
}
@Override
public void close() throws IOException {
spansFileChannel.close();
}
}

View File

@@ -1,21 +0,0 @@
# Forward Index
The forward index contains a mapping from document id to various forms of document metadata.
In practice, the forward index consists of two files, an `id` file and a `data` file.
The `id` file contains a sorted list of document ids, and the `data` file contains
a fixed-size metadata record for each document id, in the same order as the `id` file.
Each record contains a binary encoded [DocumentMetadata](../../common/model/java/nu/marginalia/model/idx/DocumentMetadata.java) object,
as well as a [HtmlFeatures](../../common/model/java/nu/marginalia/model/crawl/HtmlFeature.java) bitmask.
Unlike the reverse index, the forward index is not split into two tiers. The data is kept in the
same order as the source data, and the set of document IDs is assumed to be small enough to fit in memory,
so the index is relatively easy to construct.
## Central Classes
* [ForwardIndexConverter](java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java) constructs the index.
* [ForwardIndexReader](java/nu/marginalia/index/forward/ForwardIndexReader.java) interrogates the index.
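To make the id/data pairing concrete, here is a minimal sketch of the lookup scheme this README describes. The names are hypothetical, and plain arrays stand in for the memory-mapped files the real ForwardIndexReader operates on:

```java
import java.util.Arrays;

class ForwardLookupSketch {
    private final long[] ids;     // the `id` file: sorted document ids
    private final long[] data;    // the `data` file: one fixed-size record per id
    private final int recordSize; // longs per record

    ForwardLookupSketch(long[] ids, long[] data, int recordSize) {
        this.ids = ids;
        this.data = data;
        this.recordSize = recordSize;
    }

    /** Returns the metadata record for docId, or null if the document is unknown. */
    long[] get(long docId) {
        int idx = Arrays.binarySearch(ids, docId); // ids are sorted
        if (idx < 0)
            return null;
        // the data file shares the id file's ordering, so the record
        // sits at the same ordinal position
        return Arrays.copyOfRange(data, idx * recordSize, (idx + 1) * recordSize);
    }
}
```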

View File

@@ -2,11 +2,10 @@ package nu.marginalia.index.journal;
import nu.marginalia.slop.SlopTable;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.*;
public record IndexJournal(Path journalDir) {
@@ -47,4 +46,21 @@ public record IndexJournal(Path journalDir) {
return instances;
}
public Set<String> languages() {
try {
Set<String> languages = new HashSet<>();
for (var instance : pages()) {
try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) {
languages.addAll(instance.openLanguageIsoCode(slopTable).getDictionary());
}
}
return languages;
}
catch (IOException ex) {
throw new RuntimeException("Failed to read languages from index journal", ex);
}
}
}
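The new method is what later lets index construction discover which per-language lexicons to build (see the IndexConstructorMain changes further down). A minimal usage sketch, assuming a journal directory in the layout this changeset expects:

```java
import java.nio.file.Path;
import java.util.Set;

class JournalLanguagesDemo {
    public static void main(String[] args) {
        Path workDir = Path.of(args[0]); // the index construction area
        // findJournal returns an Optional, as used by IndexConstructorMain below
        Set<String> languages = IndexJournal.findJournal(workDir)
                .map(IndexJournal::languages)
                .orElseGet(Set::of);
        for (String isoCode : languages) {
            // one keyword lexicon (e.g. rev-words-en.dat) is built per code
            System.out.println("lexicon needed for language: " + isoCode);
        }
    }
}
```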

View File

@@ -6,17 +6,22 @@ import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.LongArrayColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.string.EnumColumn;
import nu.marginalia.slop.desc.StorageType;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
public record IndexJournalPage(Path baseDir, int page) {
public static IntColumn features = new IntColumn("features", StorageType.PLAIN);
public static IntColumn size = new IntColumn("size", StorageType.PLAIN);
public static LongColumn combinedId = new LongColumn("combinedId", StorageType.PLAIN);
public static LongColumn documentMeta = new LongColumn("documentMeta", StorageType.PLAIN);
public static EnumColumn languageIsoCode = new EnumColumn("languageIsoCode", StandardCharsets.US_ASCII, StorageType.PLAIN);
public static LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD);
public static ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
public static VarintCodedSequenceArrayColumn positions = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
@@ -24,6 +29,7 @@ public record IndexJournalPage(Path baseDir, int page) {
public static ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
public static VarintCodedSequenceArrayColumn spans = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public IndexJournalPage {
if (!baseDir.toFile().isDirectory()) {
throw new IllegalArgumentException("Invalid base directory: " + baseDir);
@@ -46,6 +52,9 @@ public record IndexJournalPage(Path baseDir, int page) {
return size.open(table);
}
public EnumColumn.Reader openLanguageIsoCode(SlopTable table) throws IOException {
return languageIsoCode.open(table);
}
public LongArrayColumn.Reader openTermIds(SlopTable table) throws IOException {
return termIds.open(table);

View File

@@ -8,6 +8,7 @@ import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.array.LongArrayColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.string.EnumColumn;
import java.io.IOException;
import java.nio.file.Files;
@@ -27,6 +28,7 @@ public class IndexJournalSlopWriter extends SlopTable {
private final VarintCodedSequenceArrayColumn.Writer spansWriter;
private final ByteArrayColumn.Writer spanCodesWriter;
private final EnumColumn.Writer languagesWriter;
public IndexJournalSlopWriter(Path dir, int page) throws IOException {
@@ -48,6 +50,8 @@ public class IndexJournalSlopWriter extends SlopTable {
spanCodesWriter = IndexJournalPage.spanCodes.create(this);
spansWriter = IndexJournalPage.spans.create(this);
languagesWriter = IndexJournalPage.languageIsoCode.create(this);
}
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection, KeywordHasher hasher) throws IOException {
@@ -56,6 +60,7 @@ public class IndexJournalSlopWriter extends SlopTable {
featuresWriter.put(keywordsProjection.htmlFeatures());
sizeWriter.put(keywordsProjection.length());
documentMetaWriter.put(keywordsProjection.documentMetadata());
languagesWriter.put(keywordsProjection.languageIsoCode());
// -- write keyword data --
@@ -85,6 +90,7 @@ public class IndexJournalSlopWriter extends SlopTable {
termIdsWriter.close();
termMetadataWriter.close();
termPositionsWriter.close();
languagesWriter.close();
spansWriter.close();
spanCodesWriter.close();
}

View File

@@ -23,13 +23,9 @@ dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:native')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:language-processing')
implementation project(':code:common:linkdb')
implementation project(':code:index')
implementation project(':code:index:query')
implementation project(':code:index:index-forward')
implementation project(':code:index:index-reverse')
implementation project(':third-party:commons-codec')
implementation project(':code:functions:search-query')
implementation project(':code:functions:search-query:api')

View File

@@ -9,19 +9,18 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.CombinedIndexReader;
import nu.marginalia.index.IndexQueryExecution;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.StatefulIndex;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.DomainRankingOverrides;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.reverse.FullReverseIndexReader;
import nu.marginalia.index.reverse.PrioReverseIndexReader;
import nu.marginalia.index.reverse.WordLexicon;
import nu.marginalia.index.reverse.query.IndexQuery;
import nu.marginalia.index.searchset.SearchSetAny;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.linkdb.docs.DocumentDbReader;
@@ -90,13 +89,13 @@ public class PerfTestMain {
),
new FullReverseIndexReader(
"full",
indexDir.resolve("ir/rev-words.dat"),
List.of(new WordLexicon("en", indexDir.resolve("ir/rev-words-en.dat"))),
indexDir.resolve("ir/rev-docs.dat"),
new PositionsFileReader(indexDir.resolve("ir/rev-positions.dat"))
indexDir.resolve("ir/rev-positions.dat")
),
new PrioReverseIndexReader(
"prio",
indexDir.resolve("ir/rev-prio-words.dat"),
List.of(new WordLexicon("en", indexDir.resolve("ir/rev-words-prio-en.dat"))),
indexDir.resolve("ir/rev-prio-docs.dat")
)
);
@@ -156,7 +155,7 @@ public class PerfTestMain {
allResults.subList(512, allResults.size()).clear();
}
var rankingData = rankingService.prepareRankingData(rankingContext, new CombinedDocIdList(allResults.toArray()), null);
var rankingData = rankingService.prepareRankingData(rankingContext, new CombinedDocIdList(allResults.toArray()));
int sum = 0;
@@ -167,9 +166,8 @@ public class PerfTestMain {
int iter;
for (iter = 0;; iter++) {
IndexSearchBudget budget = new IndexSearchBudget(10000);
long start = System.nanoTime();
sum2 += rankingService.rankResults(budget, rankingContext, rankingData, false).size();
sum2 += rankingService.rankResults(rankingContext, rankingData, false).size();
long end = System.nanoTime();
times.add((end - start)/1_000_000.);
@@ -219,7 +217,7 @@ public class PerfTestMain {
List<Double> times = new ArrayList<>();
int iter;
for (iter = 0;; iter++) {
var execution = new IndexQueryExecution(SearchContext.create(indexReader, new KeywordHasher.AsciiIsh(), parsedQuery, new SearchSetAny()), 1, rankingService, indexReader);
var execution = new IndexQueryExecution(indexReader, rankingService, SearchContext.create(indexReader, new KeywordHasher.AsciiIsh(), parsedQuery, new SearchSetAny()), 1);
long start = System.nanoTime();
execution.run();
long end = System.nanoTime();

View File

@@ -1,44 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:native')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:skiplist')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:random-write-funnel')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:processes:converting-process:model')
implementation project(':third-party:parquet-floor')
implementation project(':third-party:commons-codec')
implementation libs.bundles.slf4j
implementation libs.slop
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:libraries:test-helpers')
testImplementation project(':code:libraries:language-processing')
}

View File

@@ -1,248 +0,0 @@
package nu.marginalia.index;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.array.pool.BufferPool;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.query.*;
import nu.marginalia.index.query.filter.QueryFilterLetThrough;
import nu.marginalia.index.query.filter.QueryFilterNoPass;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.skiplist.SkipListConstants;
import nu.marginalia.skiplist.SkipListReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeoutException;
import java.util.function.Consumer;
public class FullReverseIndexReader {
private final LongArray words;
private final LongArray documents;
private final long wordsDataOffset;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final BTreeReader wordsBTreeReader;
private final String name;
private final PositionsFileReader positionsFileReader;
private final BufferPool dataPool;
public FullReverseIndexReader(String name,
Path words,
Path documents,
PositionsFileReader positionsFileReader) throws IOException {
this.name = name;
this.positionsFileReader = positionsFileReader;
if (!Files.exists(words) || !Files.exists(documents)) {
this.words = null;
this.documents = null;
this.wordsBTreeReader = null;
this.wordsDataOffset = -1;
this.dataPool = null;
return;
}
logger.info("Switching reverse index");
this.words = LongArrayFactory.mmapForReadingShared(words);
this.documents = LongArrayFactory.mmapForReadingShared(documents);
LinuxSystemCalls.madviseRandom(this.words.getMemorySegment());
LinuxSystemCalls.madviseRandom(this.documents.getMemorySegment());
dataPool = new BufferPool(documents, SkipListConstants.BLOCK_SIZE, (int) (Long.getLong("index.bufferPoolSize", 512*1024*1024L) / SkipListConstants.BLOCK_SIZE));
wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
if (getClass().desiredAssertionStatus()) {
if (Boolean.getBoolean("index-self-test")) {
Executors.newSingleThreadExecutor().execute(this::selfTest);
}
}
}
public void reset() {
dataPool.reset();
}
private void selfTest() {
logger.info("Running self test program");
long wordsDataSize = wordsBTreeReader.getHeader().numEntries() * 2L;
var wordsDataRange = words.range(wordsDataOffset, wordsDataOffset + wordsDataSize);
// ReverseIndexSelfTest.runSelfTest1(wordsDataRange, wordsDataSize);
// ReverseIndexSelfTest.runSelfTest2(wordsDataRange, documents);
// ReverseIndexSelfTest.runSelfTest3(wordsDataRange, wordsBTreeReader);
// ReverseIndexSelfTest.runSelfTest4(wordsDataRange, documents);
ReverseIndexSelfTest.runSelfTest5(wordsDataRange, wordsBTreeReader);
ReverseIndexSelfTest.runSelfTest6(wordsDataRange, documents);
}
public void eachDocRange(Consumer<LongArray> eachDocRange) {
long wordsDataSize = wordsBTreeReader.getHeader().numEntries() * 2L;
var wordsDataRange = words.range(wordsDataOffset, wordsDataOffset + wordsDataSize);
for (long i = 1; i < wordsDataRange.size(); i+=2) {
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
eachDocRange.accept(docsBTreeReader.data());
}
}
/** Calculate the offset of the word in the documents.
* If the return-value is negative, the term does not exist
* in the index.
*/
long wordOffset(long termId) {
long idx = wordsBTreeReader.findEntry(termId);
if (idx < 0)
return -1L;
return words.get(wordsDataOffset + idx + 1);
}
public EntrySource documents(long termId) {
if (null == words) {
logger.warn("Reverse index is not ready, dropping query");
return new EmptyEntrySource();
}
long offset = wordOffset(termId);
if (offset < 0) // No documents
return new EmptyEntrySource();
return new FullIndexEntrySource(name, getReader(offset), termId);
}
/** Create a filter step requiring the specified termId to exist in the documents */
public QueryFilterStepIf also(long termId, IndexSearchBudget budget) {
long offset = wordOffset(termId);
if (offset < 0) // No documents
return new QueryFilterNoPass();
return new ReverseIndexRetainFilter(getReader(offset), name, termId, budget);
}
/** Create a filter step requiring the specified termId to be absent from the documents */
public QueryFilterStepIf not(long termId, IndexSearchBudget budget) {
long offset = wordOffset(termId);
if (offset < 0) // No documents
return new QueryFilterLetThrough();
return new ReverseIndexRejectFilter(getReader(offset), budget);
}
/** Return the number of documents with the termId in the index */
public int numDocuments(long termId) {
long offset = wordOffset(termId);
if (offset < 0)
return 0;
return getReader(offset).estimateSize();
}
/** Create a BTreeReader for the document offset associated with a termId */
private SkipListReader getReader(long offset) {
return new SkipListReader(dataPool, offset);
}
public TermData[] getTermData(Arena arena,
IndexSearchBudget budget,
long[] termIds,
long[] docIds)
throws TimeoutException
{
long[] offsetsAll = new long[termIds.length * docIds.length];
for (int i = 0; i < termIds.length; i++) {
long termId = termIds[i];
long offset = wordOffset(termId);
if (offset < 0) {
// This is likely a bug in the code, but we can't throw an exception here
logger.debug("Missing offset for word {}", termId);
continue;
}
var reader = getReader(offset);
// Read the size and offset of the position data
var offsetsForTerm = reader.getValueOffsets(docIds);
System.arraycopy(offsetsForTerm, 0, offsetsAll, i * docIds.length, docIds.length);
}
return positionsFileReader.getTermData(arena, budget, offsetsAll);
}
public TermData[] getTermData(Arena arena,
long termId,
long[] docIds)
{
var ret = new TermData[docIds.length];
long offset = wordOffset(termId);
if (offset < 0) {
// This is likely a bug in the code, but we can't throw an exception here
logger.debug("Missing offset for word {}", termId);
return ret;
}
var reader = getReader(offset);
// Read the size and offset of the position data
var offsets = reader.getValueOffsets(docIds);
// FIXME this entire method chain only exists for a single unit test
// remove me!
try {
return positionsFileReader.getTermData(arena, new IndexSearchBudget(1000), offsets);
} catch (TimeoutException e) {
throw new RuntimeException(e);
}
}
public void close() {
try {
dataPool.close();
}
catch (Exception e) {
logger.warn("Error while closing bufferPool", e);
}
if (documents != null)
documents.close();
if (words != null)
words.close();
if (positionsFileReader != null) {
try {
positionsFileReader.close();
} catch (IOException e) {
logger.error("Failed to close positions file reader", e);
}
}
}
}

View File

@@ -1,116 +0,0 @@
package nu.marginalia.index;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.EntrySource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
public class PrioReverseIndexReader {
private final LongArray words;
private final long wordsDataOffset;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final BTreeReader wordsBTreeReader;
private final String name;
private final FileChannel documentsChannel;
public PrioReverseIndexReader(String name,
Path words,
Path documents) throws IOException {
this.name = name;
if (!Files.exists(words) || !Files.exists(documents)) {
this.words = null;
this.wordsBTreeReader = null;
this.documentsChannel = null;
this.wordsDataOffset = -1;
return;
}
logger.info("Switching reverse index");
this.words = LongArrayFactory.mmapForReadingShared(words);
LinuxSystemCalls.madviseRandom(this.words.getMemorySegment());
wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
documentsChannel = (FileChannel) Files.newByteChannel(documents);
}
/** Calculate the offset of the word in the documents.
* If the return-value is negative, the term does not exist
* in the index.
*/
long wordOffset(long termId) {
long idx = wordsBTreeReader.findEntry(termId);
if (idx < 0)
return -1L;
return words.get(wordsDataOffset + idx + 1);
}
public EntrySource documents(long termId) {
if (null == words) {
logger.warn("Reverse index is not ready, dropping query");
return new EmptyEntrySource();
}
long offset = wordOffset(termId);
if (offset < 0) // No documents
return new EmptyEntrySource();
return new PrioIndexEntrySource(name,
documentsChannel,
offset,
termId);
}
/** Return the number of documents with the termId in the index */
public int numDocuments(long termId) {
long offset = wordOffset(termId);
if (offset < 0) // No documents
return 0;
ByteBuffer buffer = ByteBuffer.allocate(4);
try {
documentsChannel.read(buffer, offset);
}
catch (IOException e) {
logger.error("Failed to read documents channel", e);
return 0;
}
return buffer.getInt(0) & 0x3FFF_FFFF;
}
public void close() {
try {
documentsChannel.close();
}
catch (IOException e) {
logger.error("Failed to close documents channel", e);
}
if (words != null)
words.close();
}
}

View File

@@ -1,33 +0,0 @@
package nu.marginalia.index;
import java.nio.file.Path;
public class ReverseIndexFullFileNames {
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
return switch (identifier) {
case WORDS -> switch (version) {
case NEXT -> basePath.resolve("rev-words.dat.next");
case CURRENT -> basePath.resolve("rev-words.dat");
};
case DOCS -> switch (version) {
case NEXT -> basePath.resolve("rev-docs.dat.next");
case CURRENT -> basePath.resolve("rev-docs.dat");
};
case POSITIONS -> switch (version) {
case NEXT -> basePath.resolve("rev-positions.dat.next");
case CURRENT -> basePath.resolve("rev-positions.dat");
};
};
}
public enum FileVersion {
CURRENT,
NEXT,
}
public enum FileIdentifier {
WORDS,
DOCS,
POSITIONS,
}
}

View File

@@ -1,11 +0,0 @@
package nu.marginalia.index;
import nu.marginalia.btree.model.BTreeBlockSize;
import nu.marginalia.btree.model.BTreeContext;
public class ReverseIndexParameters
{
public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_512);
public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
}

View File

@@ -1,28 +0,0 @@
package nu.marginalia.index;
import java.nio.file.Path;
public class ReverseIndexPrioFileNames {
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
return switch (identifier) {
case WORDS -> switch (version) {
case NEXT -> basePath.resolve("rev-prio-words.dat.next");
case CURRENT -> basePath.resolve("rev-prio-words.dat");
};
case DOCS -> switch (version) {
case NEXT -> basePath.resolve("rev-prio-docs.dat.next");
case CURRENT -> basePath.resolve("rev-prio-docs.dat");
};
};
}
public enum FileVersion {
CURRENT,
NEXT
}
public enum FileIdentifier {
WORDS,
DOCS,
}
}

View File

@@ -1,109 +0,0 @@
package nu.marginalia.index;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Random;
public class ReverseIndexSelfTest {
private static final Logger logger = LoggerFactory.getLogger(ReverseIndexSelfTest.class);
public static void runSelfTest1(LongArray wordsDataRange, long wordsDataSize) {
logger.info("Starting test 1");
if (!wordsDataRange.isSortedN(2, 0, wordsDataSize))
logger.error("Failed test 1: Words data is not sorted");
else
logger.info("Passed test 1");
}
public static void runSelfTest2(LongArray wordsDataRange, LongArray documents) {
logger.info("Starting test 2");
for (long i = 1; i < wordsDataRange.size(); i+=2) {
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
var header = docsBTreeReader.getHeader();
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
if (!docRange.isSortedN(2, 0, header.numEntries() * 2L)) {
logger.error("Failed test 2: numEntries={}, offset={}", header.numEntries(), header.dataOffsetLongs());
return;
}
}
logger.info("Passed test 2");
}
public static void runSelfTest3(LongArray wordsDataRange, BTreeReader reader) {
logger.info("Starting test 3");
for (long i = 0; i < wordsDataRange.size(); i+=2) {
if (reader.findEntry(wordsDataRange.get(i)) < 0) {
logger.error("Failed Test 3");
return;
}
}
logger.info("Passed test 3");
}
public static void runSelfTest4(LongArray wordsDataRange, LongArray documents) {
logger.info("Starting test 4");
for (long i = 1; i < wordsDataRange.size(); i+=2) {
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
var header = docsBTreeReader.getHeader();
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
for (int j = 0; j < docRange.size(); j+=2) {
if (docsBTreeReader.findEntry(docRange.get(j)) < 0) {
logger.info("Failed test 4");
return;
}
}
}
logger.info("Passed test 4");
}
public static void runSelfTest5(LongArray wordsDataRange, BTreeReader wordsBTreeReader) {
logger.info("Starting test 5");
LongOpenHashSet words = new LongOpenHashSet((int)wordsDataRange.size()/2);
for (int i = 0; i < wordsDataRange.size(); i+=2) {
words.add(wordsDataRange.get(i));
}
var random = new Random();
for (int i = 0; i < 100_000_000; i++) {
long v;
do {
v = random.nextLong();
} while (words.contains(v));
if (wordsBTreeReader.findEntry(v) >= 0) {
logger.error("Failed test 5 @ W{}", v);
return;
}
}
logger.info("Passed test 5");
}
public static void runSelfTest6(LongArray wordsDataRange, LongArray documents) {
logger.info("Starting test 6");
for (long i = 1; i < wordsDataRange.size(); i+=2) {
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
var header = docsBTreeReader.getHeader();
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
Long prev = null;
for (int j = 0; j < docRange.size(); j+=2) {
if (prev == null) {
prev = docRange.get(j);
continue;
}
long thisVal = prev + 1;
long nextVal = docRange.get(j);
while (thisVal < nextVal) {
if (docsBTreeReader.findEntry(thisVal) >= 0) {
logger.info("Failed test 6 @ W{}:D{}", wordsDataRange.get(i-1), thisVal);
return;
}
thisVal++;
}
prev = nextVal;
}
}
logger.info("Passed test 6");
}
}

View File

@@ -1,56 +0,0 @@
# Reverse Index
The reverse index contains a mapping from word to document id.
There are two tiers of this index.
* A priority index which only indexes terms that are flagged with priority flags<sup>1</sup>.
* A full index that indexes all terms.
The full index also provides access to term-level metadata, while the priority index is
a binary index that only offers information about which documents have a specific word.
The priority index is also compressed, while the full index at this point is not.
[1] See WordFlags in [common/model](../../common/model/) and
KeywordMetadata in [converting-process/ft-keyword-extraction](../../processes/converting-process/ft-keyword-extraction).
## Construction
The reverse index is constructed by first building a series of preindexes.
Preindexes consist of a Segment and a Documents object. The segment records
which word identifiers are present and how many of each, and the documents
object records in which documents each word can be found.
![Memory layout illustrations](./preindex.svg)
These would typically not fit in RAM, so the index journal is paged
and the preindexes are constructed small enough to fit in memory, and
then merged. Merging sorted arrays is a very fast operation that does
not require additional RAM.
![Illustration of successively merged preindex files](./merging.svg)
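The merge step itself is the classic two-pointer merge of sorted runs. An illustrative in-memory version follows; the real construction streams the merged data to disk rather than allocating it in RAM:

```java
// Illustrative only: merge two sorted preindex segments into one sorted run.
// Each input is scanned exactly once, which is why the operation is fast
// and needs no working memory beyond the output.
static long[] mergeSorted(long[] a, long[] b) {
    long[] out = new long[a.length + b.length];
    int i = 0, j = 0, k = 0;
    while (i < a.length && j < b.length)
        out[k++] = (a[i] <= b[j]) ? a[i++] : b[j++];
    while (i < a.length) out[k++] = a[i++];
    while (j < b.length) out[k++] = b[j++];
    return out;
}
```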
Once merged into one large preindex, indexes are added to the preindex data
to form a finalized reverse index.
![Illustration of the data layout of the finalized index](index.svg)
## Central Classes
Full index:
* [FullPreindex](java/nu/marginalia/index/construction/full/FullPreindex.java) intermediate reverse index state.
* [FullIndexConstructor](java/nu/marginalia/index/construction/full/FullIndexConstructor.java) constructs the index.
* [FullReverseIndexReader](java/nu/marginalia/index/FullReverseIndexReader.java) interrogates the index.
Prio index:
* [PrioPreindex](java/nu/marginalia/index/construction/prio/PrioPreindex.java) intermediate reverse index state.
* [PrioIndexConstructor](java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java) constructs the index.
* [PrioIndexReader](java/nu/marginalia/index/PrioReverseIndexReader.java) interrogates the index.
## See Also
* [index-journal](../index-journal)
* [index-forward](../index-forward)
* [libraries/btree](../../libraries/btree)
* [libraries/array](../../libraries/array)

View File

@@ -1,49 +0,0 @@
package nu.marginalia.index;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.*;
public class ReverseIndexDebugTest {
@Test
@Disabled // this is a debugging utility
public void debug() throws IOException {
long problemWord = -7909917549851025932L;
long problemDoc = 9079256848846028801L;
var words = LongArrayFactory.mmapForReadingConfined(Path.of("/home/vlofgren/Code/MarginaliaSearch/run/node-1/index/ir/rev-words.dat"));
var documents = LongArrayFactory.mmapForReadingConfined(Path.of("/home/vlofgren/Code/MarginaliaSearch/run/node-1/index/ir/rev-docs.dat"));
var wordsBTreeReader = new BTreeReader(words, ReverseIndexParameters.wordsBTreeContext, 0);
var wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
long wordOffset = wordsBTreeReader.findEntry(problemWord);
assertTrue(wordOffset >= 0);
var docsReader = new BTreeReader(documents, ReverseIndexParameters.prioDocsBTreeContext, wordOffset);
// We find problemDoc even though it doesn't exist in the document range
long docOffset = docsReader.findEntry(problemDoc);
assertTrue(docOffset < 0);
// We know it doesn't exist because when we check, we can't find it,
// either by iterating...
var dataRange = docsReader.data();
System.out.println(dataRange.size());
for (int i = 0; i < dataRange.size(); i+=2) {
assertNotEquals(problemDoc, dataRange.get(i));
}
// or by binary searching
assertTrue(dataRange.binarySearchN(2, problemDoc, 0, dataRange.size()) < 0);
}
}

View File

(binary image diff: 21 KiB before, 21 KiB after)

View File

@@ -1,25 +1,24 @@
package nu.marginalia.index.index;
package nu.marginalia.index;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.query.SpecificationLimitType;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.index.model.TermMetadataList;
import nu.marginalia.index.reverse.FullReverseIndexReader;
import nu.marginalia.index.reverse.IndexLanguageContext;
import nu.marginalia.index.reverse.PrioReverseIndexReader;
import nu.marginalia.index.reverse.query.IndexQuery;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.index.reverse.query.filter.QueryFilterStepIf;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import org.slf4j.Logger;
@@ -28,7 +27,6 @@ import org.slf4j.LoggerFactory;
import java.lang.foreign.Arena;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.TimeUnit;
@@ -56,24 +54,33 @@ public class CombinedIndexReader {
this.reverseIndexPriorityReader = reverseIndexPriorityReader;
}
public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) {
return new IndexQueryBuilderImpl(reverseIndexFullReader, query);
public IndexLanguageContext createLanguageContext(String languageIsoCode) {
return new IndexLanguageContext(languageIsoCode,
reverseIndexFullReader.getWordLexicon(languageIsoCode),
reverseIndexPriorityReader.getWordLexicon(languageIsoCode)
);
}
public QueryFilterStepIf hasWordFull(long termId, IndexSearchBudget budget) {
return reverseIndexFullReader.also(termId, budget);
public IndexQueryBuilder newQueryBuilder(IndexLanguageContext context, IndexQuery query) {
return new IndexQueryBuilder(reverseIndexFullReader, context, query);
}
public QueryFilterStepIf hasWordFull(IndexLanguageContext languageContext, long termId, IndexSearchBudget budget) {
return reverseIndexFullReader.also(languageContext, termId, budget);
}
/** Creates a query builder for terms in the priority index */
public IndexQueryBuilder findPriorityWord(long wordId) {
return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId), true))
.withSourceTerms(wordId);
public IndexQueryBuilder findPriorityWord(IndexLanguageContext languageContext, long wordId) {
IndexQuery query = new IndexQuery(reverseIndexPriorityReader.documents(languageContext, wordId), true);
return newQueryBuilder(languageContext, query).withSourceTerms(wordId);
}
/** Creates a query builder for terms in the full index */
public IndexQueryBuilder findFullWord(long wordId) {
return newQueryBuilder(new IndexQuery(reverseIndexFullReader.documents(wordId), false))
.withSourceTerms(wordId);
public IndexQueryBuilder findFullWord(IndexLanguageContext languageContext, long wordId) {
IndexQuery query = new IndexQuery(reverseIndexFullReader.documents(languageContext, wordId), false);
return newQueryBuilder(languageContext, query).withSourceTerms(wordId);
}
/** Creates a parameter matching filter step for the provided parameters */
@@ -82,8 +89,8 @@ public class CombinedIndexReader {
}
/** Returns the number of occurrences of the word in the full index */
public int numHits(long word) {
return reverseIndexFullReader.numDocuments(word);
public int numHits(IndexLanguageContext languageContext, long word) {
return reverseIndexFullReader.numDocuments(languageContext, word);
}
/** Reset caches and buffers */
@@ -98,9 +105,15 @@ public class CombinedIndexReader {
return Collections.emptyList();
}
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
final IndexLanguageContext languageContext = context.languageContext;
final long[] termPriority = context.sortedDistinctIncludes((a,b) -> {
return Long.compare(
numHits(languageContext, a),
numHits(languageContext, b)
);
});
final long[] termPriority = context.sortedDistinctIncludes(this::compareKeywords);
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(context.compiledQueryIds);
// Remove any paths that do not contain all prioritized terms, as this means
@@ -120,18 +133,18 @@ public class CombinedIndexReader {
return 0;
});
var head = findFullWord(elements.getLong(0));
var head = findFullWord(languageContext, elements.getLong(0));
for (int i = 1; i < elements.size(); i++) {
head.addInclusionFilter(hasWordFull(elements.getLong(i), context.budget));
head.addInclusionFilter(hasWordFull(languageContext, elements.getLong(i), context.budget));
}
queryHeads.add(head);
// If there are few paths, we can afford to check the priority index as well
if (paths.size() < 4) {
var prioHead = findPriorityWord(elements.getLong(0));
var prioHead = findPriorityWord(languageContext, elements.getLong(0));
for (int i = 1; i < elements.size(); i++) {
prioHead.addInclusionFilter(hasWordFull(elements.getLong(i), context.budget));
prioHead.addInclusionFilter(hasWordFull(languageContext, elements.getLong(i), context.budget));
}
queryHeads.add(prioHead);
}
@@ -165,37 +178,20 @@ public class CombinedIndexReader {
return permittedTerms::containsAll;
}
private int compareKeywords(long a, long b) {
return Long.compare(
numHits(a),
numHits(b)
);
}
/** Returns the number of occurrences of the word in the priority index */
public int numHitsPrio(long word) {
return reverseIndexPriorityReader.numDocuments(word);
public int numHitsPrio(IndexLanguageContext languageContext, long word) {
return reverseIndexPriorityReader.numDocuments(languageContext, word);
}
/** Retrieves the term metadata for the specified word for the provided documents */
public TermMetadataList[] getTermMetadata(Arena arena,
IndexLanguageContext languageContext,
IndexSearchBudget budget,
long[] wordIds,
CombinedDocIdList docIds)
throws TimeoutException
{
TermData[] combinedTermData = reverseIndexFullReader.getTermData(arena, budget, wordIds, docIds.array());
TermMetadataList[] ret = new TermMetadataList[wordIds.length];
for (int i = 0; i < wordIds.length; i++) {
ret[i] = new TermMetadataList(Arrays.copyOfRange(combinedTermData, i*docIds.size(), (i+1)*docIds.size()));
}
return ret;
}
public TermMetadataList getTermMetadata(Arena arena,
long wordId,
CombinedDocIdList docIds)
{
return new TermMetadataList(reverseIndexFullReader.getTermData(arena, wordId, docIds.array()));
return reverseIndexFullReader.getTermData(arena, languageContext, budget, wordIds, docIds);
}
/** Retrieves the document metadata for the specified document */
@@ -220,12 +216,7 @@ public class CombinedIndexReader {
/** Retrieves the document spans for the specified documents */
public DocumentSpans[] getDocumentSpans(Arena arena, IndexSearchBudget budget, CombinedDocIdList docIds) throws TimeoutException {
long[] decodedIDs = docIds.array();
for (int i = 0; i < decodedIDs.length; i++) {
decodedIDs[i] = UrlIdCodec.removeRank(decodedIDs[i]);
}
return forwardIndexReader.getDocumentSpans(arena, budget, decodedIDs);
return forwardIndexReader.getDocumentSpans(arena, budget, docIds);
}
/** Close the indexes (this is not done immediately)

View File

@@ -3,16 +3,18 @@ package nu.marginalia.index;
import com.google.inject.Guice;
import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.config.IndexFileName;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.reverse.construction.full.FullIndexConstructor;
import nu.marginalia.index.reverse.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.searchset.DomainRankings;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mqapi.ProcessInboxNames;
import nu.marginalia.mqapi.index.CreateIndexRequest;
import nu.marginalia.mqapi.index.IndexName;
import nu.marginalia.process.ProcessConfiguration;
import nu.marginalia.process.ProcessConfigurationModule;
import nu.marginalia.process.ProcessMainClass;
@@ -25,11 +27,9 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;
public class IndexConstructorMain extends ProcessMainClass {
private final FileStorageService fileStorageService;
private final ProcessHeartbeatImpl heartbeat;
@@ -37,7 +37,7 @@ public class IndexConstructorMain extends ProcessMainClass {
private static final Logger logger = LoggerFactory.getLogger(IndexConstructorMain.class);
public static void main(String[] args) throws Exception {
static void main(String[] args) throws Exception {
Instructions<CreateIndexRequest> instructions = null;
try {
new org.mariadb.jdbc.Driver();
@@ -74,20 +74,20 @@ public class IndexConstructorMain extends ProcessMainClass {
ProcessConfiguration processConfiguration,
DomainRankings domainRankings) {
super(messageQueueFactory, processConfiguration, GsonFactory.get(), INDEX_CONSTRUCTOR_INBOX);
super(messageQueueFactory, processConfiguration, GsonFactory.get(), ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX);
this.fileStorageService = fileStorageService;
this.heartbeat = heartbeat;
this.domainRankings = domainRankings;
}
private void run(CreateIndexRequest instructions) throws SQLException, IOException {
private void run(CreateIndexRequest instructions) throws IOException {
heartbeat.start();
switch (instructions.indexName()) {
case FORWARD -> createForwardIndex();
case REVERSE_FULL -> createFullReverseIndex();
case REVERSE_PRIO -> createPrioReverseIndex();
case IndexName.FORWARD -> createForwardIndex();
case IndexName.REVERSE_FULL -> createFullReverseIndex();
case IndexName.REVERSE_PRIO -> createPrioReverseIndex();
}
heartbeat.shutDown();
@@ -95,50 +95,74 @@ public class IndexConstructorMain extends ProcessMainClass {
private void createFullReverseIndex() throws IOException {
Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFileDocs = findNextFile(new IndexFileName.FullDocs());
Path outputFilePositions = findNextFile(new IndexFileName.FullPositions());
Files.deleteIfExists(outputFileDocs);
Files.deleteIfExists(outputFilePositions);
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path tmpDir = workDir.resolve("tmp");
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
this::addRankToIdEncoding,
tmpDir);
Set<String> languageIsoCodes = IndexJournal.findJournal(workDir)
.map(IndexJournal::languages)
.orElseGet(Set::of);
constructor.createReverseIndex(heartbeat, "createReverseIndexFull", workDir);
for (String languageIsoCode : languageIsoCodes) {
Path outputFileWords = findNextFile(new IndexFileName.FullWords(languageIsoCode));
FullIndexConstructor constructor = new FullIndexConstructor(
languageIsoCode,
outputFileDocs,
outputFileWords,
outputFilePositions,
this::addRankToIdEncoding,
tmpDir);
String processName = "createReverseIndexFull[%s]".formatted(languageIsoCode);
constructor.createReverseIndex(heartbeat, processName, workDir);
}
}
private void createPrioReverseIndex() throws IOException {
Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
Path outputFileDocs = findNextFile(new IndexFileName.PrioDocs());
Files.deleteIfExists(outputFileDocs);
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path tmpDir = workDir.resolve("tmp");
var constructor = new PrioIndexConstructor(
outputFileDocs,
outputFileWords,
this::addRankToIdEncoding,
tmpDir);
Set<String> languageIsoCodes = IndexJournal.findJournal(workDir)
.map(IndexJournal::languages)
.orElseGet(Set::of);
constructor.createReverseIndex(heartbeat, "createReverseIndexPrio", workDir);
for (String languageIsoCode : languageIsoCodes) {
Path outputFileWords = findNextFile(new IndexFileName.PrioWords(languageIsoCode));
Files.deleteIfExists(outputFileWords);
PrioIndexConstructor constructor = new PrioIndexConstructor(
languageIsoCode,
outputFileDocs,
outputFileWords,
this::addRankToIdEncoding,
tmpDir);
String processName = "createReverseIndexPrio[%s]".formatted(languageIsoCode);
constructor.createReverseIndex(heartbeat, processName, workDir);
}
}
private void createForwardIndex() throws IOException {
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileDocsId = findNextFile(new IndexFileName.ForwardDocIds());
Path outputFileDocsData = findNextFile(new IndexFileName.ForwardDocData());
Path outputFileSpansData = findNextFile(new IndexFileName.ForwardSpansData());
ForwardIndexConverter converter = new ForwardIndexConverter(heartbeat,
outputFileDocsId,
@@ -151,6 +175,10 @@ public class IndexConstructorMain extends ProcessMainClass {
converter.convert();
}
private Path findNextFile(IndexFileName fileName) {
return IndexFileName.resolve(IndexLocations.getCurrentIndex(fileStorageService), fileName, IndexFileName.Version.NEXT);
}
/** Append the domain's ranking to the high bits of a document ID
* to ensure documents are sorted in order of rank within the index.
*/
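For orientation, a minimal sketch of what this kind of encoding can look like. The 6-bit rank width and the bit positions below are illustrative assumptions, not the actual UrlIdCodec layout:
// Illustrative only: pack a domain rank into the uppermost bits of a document id,
// so that sorting the combined longs orders documents by rank first, then by id.
// Field widths are assumed; the authoritative layout lives in UrlIdCodec.
static long addRankToIdEncoding(long documentId, int rank) {
    return ((long) (rank & 0x3F) << 57) | documentId;
}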

View File

@@ -4,7 +4,7 @@ import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import com.google.inject.Singleton;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.searchset.DomainRankings;
import nu.marginalia.storage.FileStorageService;
public class IndexConstructorModule extends AbstractModule {

View File

@@ -3,27 +3,34 @@ package nu.marginalia.index;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.config.IndexFileName;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.reverse.FullReverseIndexReader;
import nu.marginalia.index.reverse.PrioReverseIndexReader;
import nu.marginalia.index.reverse.WordLexicon;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.storage.FileStorageService;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
@Singleton
public class IndexFactory {
private final FileStorageService fileStorageService;
private final Path liveStorage;
private final LanguageConfiguration languageConfiguration;
@Inject
public IndexFactory(FileStorageService fileStorageService) {
public IndexFactory(FileStorageService fileStorageService, LanguageConfiguration languageConfiguration) {
this.fileStorageService = fileStorageService;
this.liveStorage = IndexLocations.getCurrentIndex(fileStorageService);
this.languageConfiguration = languageConfiguration;
}
public CombinedIndexReader getCombinedIndexReader() throws IOException {
@@ -39,47 +46,78 @@ public class IndexFactory {
}
public FullReverseIndexReader getReverseIndexReader() throws IOException {
Path docsFile = getCurrentPath(new IndexFileName.FullDocs());
Path positionsFile = getCurrentPath(new IndexFileName.FullPositions());
List<WordLexicon> wordLexicons = new ArrayList<>();
for (LanguageDefinition languageDefinition : languageConfiguration.languages()) {
String languageIsoCode = languageDefinition.isoCode();
Path wordsFile = getCurrentPath(new IndexFileName.FullWords(languageIsoCode));
if (Files.exists(wordsFile)) {
wordLexicons.add(new WordLexicon(languageIsoCode, wordsFile));
}
}
return new FullReverseIndexReader("full",
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT),
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT),
new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT))
wordLexicons,
docsFile,
positionsFile
);
}
public PrioReverseIndexReader getReverseIndexPrioReader() throws IOException {
return new PrioReverseIndexReader("prio",
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT)
);
List<WordLexicon> wordLexicons = new ArrayList<>();
for (LanguageDefinition languageDefinition : languageConfiguration.languages()) {
String languageIsoCode = languageDefinition.isoCode();
Path wordsFile = getCurrentPath(new IndexFileName.PrioWords(languageIsoCode));
if (Files.exists(wordsFile)) {
wordLexicons.add(new WordLexicon(languageIsoCode, wordsFile));
}
}
Path docsFile = getCurrentPath(new IndexFileName.PrioDocs());
return new PrioReverseIndexReader("prio", wordLexicons, docsFile);
}
public ForwardIndexReader getForwardIndexReader() throws IOException {
return new ForwardIndexReader(
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.CURRENT),
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT),
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.CURRENT)
);
Path docIdsFile = getCurrentPath(new IndexFileName.ForwardDocIds());
Path docDataFile = getCurrentPath(new IndexFileName.ForwardDocData());
Path spansFile = getCurrentPath(new IndexFileName.ForwardSpansData());
return new ForwardIndexReader(docIdsFile, docDataFile, spansFile);
}
private Path getCurrentPath(IndexFileName fileName) {
return IndexFileName.resolve(liveStorage, fileName, IndexFileName.Version.CURRENT);
}
/** Switches the current index to the next index */
public void switchFiles() throws IOException {
for (var file : ReverseIndexFullFileNames.FileIdentifier.values()) {
for (var file : IndexFileName.forwardIndexFiles()) {
switchFile(
ReverseIndexFullFileNames.resolve(liveStorage, file, ReverseIndexFullFileNames.FileVersion.NEXT),
ReverseIndexFullFileNames.resolve(liveStorage, file, ReverseIndexFullFileNames.FileVersion.CURRENT)
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.NEXT),
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.CURRENT)
);
}
for (var file : ReverseIndexPrioFileNames.FileIdentifier.values()) {
for (IndexFileName file : IndexFileName.revPrioIndexFiles(languageConfiguration)) {
switchFile(
ReverseIndexPrioFileNames.resolve(liveStorage, file, ReverseIndexPrioFileNames.FileVersion.NEXT),
ReverseIndexPrioFileNames.resolve(liveStorage, file, ReverseIndexPrioFileNames.FileVersion.CURRENT)
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.NEXT),
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.CURRENT)
);
}
for (var file : ForwardIndexFileNames.FileIdentifier.values()) {
for (IndexFileName file : IndexFileName.revFullIndexFiles(languageConfiguration)) {
switchFile(
ForwardIndexFileNames.resolve(liveStorage, file, ForwardIndexFileNames.FileVersion.NEXT),
ForwardIndexFileNames.resolve(liveStorage, file, ForwardIndexFileNames.FileVersion.CURRENT)
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.NEXT),
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.CURRENT)
);
}
}
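The switchFile helper is not part of this diff; presumably it promotes each NEXT file over its CURRENT counterpart, along these lines (an assumption, though the StandardCopyOption import above points the same way):
private void switchFile(Path from, Path to) throws IOException {
    // Atomically replace the live file with the freshly constructed one, if present
    if (Files.exists(from))
        Files.move(from, to, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
}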

View File

@@ -10,7 +10,6 @@ import nu.marginalia.api.searchquery.IndexApiGrpc;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.searchset.SearchSet;
@@ -95,13 +94,20 @@ public class IndexGrpcService
.time(() -> {
// Perform the search
try {
if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return List.of();
}
var rankingContext = SearchContext.create(statefulIndex.get(), hasher, request, getSearchSet(request));
return new IndexQueryExecution(rankingContext, nodeId, rankingService, statefulIndex.get()).run();
CombinedIndexReader indexReader = statefulIndex.get();
SearchContext rankingContext =
SearchContext.create(indexReader, hasher, request, getSearchSet(request));
IndexQueryExecution queryExecution =
new IndexQueryExecution(indexReader, rankingService, rankingContext, nodeId);
return queryExecution.run();
}
catch (Exception ex) {
logger.error("Error in handling request", ex);
@@ -128,6 +134,9 @@ public class IndexGrpcService
}
}
/** Keywords are translated to a numeric format via a 64-bit hash algorithm,
* which varies depending on the language.
*/
private KeywordHasher findHasher(RpcIndexQuery request) {
KeywordHasher hasher = keywordHasherByLangIso.get(request.getLangIsoCode());
if (hasher != null)
@@ -149,9 +158,12 @@ public class IndexGrpcService
return List.of();
}
var currentIndex = statefulIndex.get();
CombinedIndexReader currentIndex = statefulIndex.get();
return new IndexQueryExecution(SearchContext.create(currentIndex, keywordHasherByLangIso.get("en"), specsSet, getSearchSet(specsSet)), 1, rankingService, statefulIndex.get()).run();
SearchContext context = SearchContext.create(currentIndex,
keywordHasherByLangIso.get("en"), specsSet, getSearchSet(specsSet));
return new IndexQueryExecution(currentIndex, rankingService, context, 1).run();
}
catch (Exception ex) {
logger.error("Error in handling request", ex);

View File

@@ -1,13 +1,14 @@
package nu.marginalia.index.index;
package nu.marginalia.index;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.reverse.FullReverseIndexReader;
import nu.marginalia.index.reverse.IndexLanguageContext;
import nu.marginalia.index.reverse.query.IndexQuery;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.index.reverse.query.filter.QueryFilterStepIf;
public class IndexQueryBuilderImpl implements IndexQueryBuilder {
public class IndexQueryBuilder {
private final IndexLanguageContext context;
private final IndexQuery query;
private final FullReverseIndexReader reverseIndexFullReader;
@@ -19,8 +20,9 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
* */
private final TLongHashSet alreadyConsideredTerms = new TLongHashSet();
IndexQueryBuilderImpl(FullReverseIndexReader reverseIndexFullReader, IndexQuery query)
IndexQueryBuilder(FullReverseIndexReader reverseIndexFullReader, IndexLanguageContext context, IndexQuery query)
{
this.context = context;
this.query = query;
this.reverseIndexFullReader = reverseIndexFullReader;
}
@@ -34,7 +36,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
public IndexQueryBuilder also(long termId, IndexSearchBudget budget) {
if (alreadyConsideredTerms.add(termId)) {
query.addInclusionFilter(reverseIndexFullReader.also(termId, budget));
query.addInclusionFilter(reverseIndexFullReader.also(context, termId, budget));
}
return this;
@@ -42,7 +44,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
public IndexQueryBuilder not(long termId, IndexSearchBudget budget) {
query.addInclusionFilter(reverseIndexFullReader.not(termId, budget));
query.addInclusionFilter(reverseIndexFullReader.not(context, termId, budget));
return this;
}
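A hedged usage sketch of the reworked builder, showing how the language context now rides along with every term filter (construction of the underlying IndexQuery is assumed):
IndexQueryBuilder builder = new IndexQueryBuilder(reverseIndexFullReader, languageContext, query);
builder.also(requiredTermId, budget)   // documents must contain this term
       .not(excludedTermId, budget);   // documents must not contain this term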

View File

@@ -3,12 +3,11 @@ package nu.marginalia.index;
import io.prometheus.client.Gauge;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.reverse.query.IndexQuery;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.skiplist.SkipListConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -33,7 +32,10 @@ public class IndexQueryExecution {
// operations per lookup and again optimize tail latency
private static final int lookupBatchSize = SkipListConstants.BLOCK_SIZE / 16;
private static final ExecutorService threadPool = new ThreadPoolExecutor(indexValuationThreads, Integer.MAX_VALUE, 60L, TimeUnit.SECONDS, new SynchronousQueue<>());
private static final ExecutorService threadPool =
new ThreadPoolExecutor(indexValuationThreads, 256,
60L, TimeUnit.SECONDS, new SynchronousQueue<>());
private static final Logger log = LoggerFactory.getLogger(IndexQueryExecution.class);
private final String nodeName;
@@ -82,19 +84,19 @@ public class IndexQueryExecution {
public IndexQueryExecution(SearchContext rankingContext,
int serviceNode,
public IndexQueryExecution(CombinedIndexReader currentIndex,
IndexResultRankingService rankingService,
CombinedIndexReader currentIndex) {
SearchContext rankingContext,
int serviceNode) {
this.nodeName = Integer.toString(serviceNode);
this.rankingService = rankingService;
this.rankingContext = rankingContext;
resultHeap = new ResultPriorityQueue(rankingContext.fetchSize);
budget = rankingContext.budget;
limitByDomain = rankingContext.limitByDomain;
limitTotal = rankingContext.limitTotal;
this.rankingContext = rankingContext;
queries = currentIndex.createQueries(rankingContext);
@@ -198,7 +200,7 @@ public class IndexQueryExecution {
if (docIds == null) continue;
long st = System.nanoTime();
var preparedData = rankingService.prepareRankingData(rankingContext, docIds, budget);
var preparedData = rankingService.prepareRankingData(rankingContext, docIds);
long et = System.nanoTime();
metric_index_prep_time_s
.labels(nodeName)
@@ -226,7 +228,7 @@ public class IndexQueryExecution {
try (rankingData) {
long st = System.nanoTime();
resultHeap.addAll(rankingService.rankResults(budget, rankingContext, rankingData, false));
resultHeap.addAll(rankingService.rankResults(rankingContext, rankingData, false));
long et = System.nanoTime();
metric_index_rank_time_s

View File

@@ -25,7 +25,7 @@ public class ResultPriorityQueue implements Iterable<SearchResultItem> {
this.queue = MinMaxPriorityQueue.<SearchResultItem>orderedBy(Comparator.naturalOrder()).maximumSize(limit).create();
}
public Iterator<SearchResultItem> iterator() {
public @NotNull Iterator<SearchResultItem> iterator() {
return queue.iterator();
}

View File

@@ -1,8 +1,7 @@
package nu.marginalia.index.index;
package nu.marginalia.index;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.index.IndexFactory;
import nu.marginalia.service.control.ServiceEventLog;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.forward;
package nu.marginalia.index.config;
public class ForwardIndexParameters {
public static final int ENTRY_SIZE = 3;

View File

@@ -0,0 +1,97 @@
package nu.marginalia.index.config;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.LanguageDefinition;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
public sealed interface IndexFileName {
enum Version {
CURRENT, NEXT
}
record FullWords(String languageIsoCode) implements IndexFileName {}
record FullDocs() implements IndexFileName {}
record FullPositions() implements IndexFileName {}
record PrioWords(String languageIsoCode) implements IndexFileName {}
record PrioDocs() implements IndexFileName {}
record ForwardDocIds() implements IndexFileName { }
record ForwardDocData() implements IndexFileName { }
record ForwardSpansData() implements IndexFileName { }
static List<IndexFileName> revFullIndexFiles(LanguageConfiguration languageConfiguration) {
List<IndexFileName> ret = new ArrayList<>();
ret.add(new FullDocs());
ret.add(new FullPositions());
for (LanguageDefinition ld : languageConfiguration.languages()) {
ret.add(new FullWords(ld.isoCode()));
}
return ret;
}
static List<IndexFileName> revPrioIndexFiles(LanguageConfiguration languageConfiguration) {
List<IndexFileName> ret = new ArrayList<>();
ret.add(new PrioDocs());
for (LanguageDefinition ld : languageConfiguration.languages()) {
ret.add(new PrioWords(ld.isoCode()));
}
return ret;
}
static List<IndexFileName> forwardIndexFiles() {
return List.of(
new ForwardDocData(),
new ForwardDocIds(),
new ForwardSpansData()
);
}
static Path resolve(Path basePath, IndexFileName fileName, Version version) {
return switch (fileName) {
case FullWords(String isoCode) -> switch (version) {
case CURRENT -> basePath.resolve("rev-words-%s.dat".formatted(isoCode));
case NEXT -> basePath.resolve("rev-words-%s.dat.next".formatted(isoCode));
};
case FullDocs() -> switch (version) {
case CURRENT -> basePath.resolve("rev-docs.dat");
case NEXT -> basePath.resolve("rev-docs.dat.next");
};
case FullPositions() -> switch (version) {
case CURRENT -> basePath.resolve("rev-positions.dat");
case NEXT -> basePath.resolve("rev-positions.dat.next");
};
case PrioWords(String languageIsoCode) -> switch (version) {
case CURRENT -> basePath.resolve("rev-prio-words-%s.dat".formatted(languageIsoCode));
case NEXT -> basePath.resolve("rev-prio-words-%s.dat.next".formatted(languageIsoCode));
};
case PrioDocs() -> switch (version) {
case CURRENT -> basePath.resolve("rev-prio-docs.dat");
case NEXT -> basePath.resolve("rev-prio-docs.dat.next");
};
case ForwardDocIds() -> switch (version) {
case CURRENT -> basePath.resolve("fwd-doc-ids.dat");
case NEXT -> basePath.resolve("fwd-doc-ids.dat.next");
};
case ForwardDocData() -> switch (version) {
case CURRENT -> basePath.resolve("fwd-doc-data.dat");
case NEXT -> basePath.resolve("fwd-doc-data.dat.next");
};
case ForwardSpansData() -> switch (version) {
case CURRENT -> basePath.resolve("fwd-spans.dat");
case NEXT -> basePath.resolve("fwd-spans.dat.next");
};
};
}
}
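To make the naming scheme concrete, resolving a language-partitioned lexicon file works like this (the base path is hypothetical; the file names follow the resolve method above):
Path base = Path.of("/index/live");  // hypothetical index directory
IndexFileName.resolve(base, new IndexFileName.FullWords("en"), IndexFileName.Version.CURRENT);
// -> /index/live/rev-words-en.dat
IndexFileName.resolve(base, new IndexFileName.FullWords("en"), IndexFileName.Version.NEXT);
// -> /index/live/rev-words-en.dat.next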

View File

@@ -0,0 +1,9 @@
package nu.marginalia.index.config;
import nu.marginalia.btree.model.BTreeBlockSize;
import nu.marginalia.btree.model.BTreeContext;
public class ReverseIndexParameters
{
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
}

View File

@@ -6,7 +6,8 @@ import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.forward.spans.IndexSpansReader;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -17,7 +18,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeoutException;
import static nu.marginalia.index.forward.ForwardIndexParameters.*;
import static nu.marginalia.index.config.ForwardIndexParameters.*;
/** Reads the forward index.
* <p/>
@@ -140,10 +141,12 @@ public class ForwardIndexReader {
return (int) offset;
}
public DocumentSpans[] getDocumentSpans(Arena arena, IndexSearchBudget budget, long[] docIds) throws TimeoutException {
long[] offsets = new long[docIds.length];
for (int i = 0; i < docIds.length; i++) {
long offset = idxForDoc(docIds[i]);
public DocumentSpans[] getDocumentSpans(Arena arena, IndexSearchBudget budget, CombinedDocIdList combinedIds) throws TimeoutException {
long[] offsets = new long[combinedIds.size()];
for (int i = 0; i < offsets.length; i++) {
long docId = UrlIdCodec.removeRank(combinedIds.at(i));
long offset = idxForDoc(docId);
if (offset >= 0) {
offsets[i] = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);
}
@@ -157,7 +160,7 @@ public class ForwardIndexReader {
}
catch (IOException ex) {
logger.error("Failed to read spans for docIds", ex);
return new DocumentSpans[docIds.length];
return new DocumentSpans[offsets.length];
}
}

View File

@@ -3,10 +3,10 @@ package nu.marginalia.index.forward.construction;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexParameters;
import nu.marginalia.index.config.ForwardIndexParameters;
import nu.marginalia.index.forward.spans.IndexSpansWriter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.searchset.DomainRankings;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.process.control.ProcessHeartbeat;

View File

@@ -1,6 +1,6 @@
package nu.marginalia.index.forward.spans;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import java.io.IOException;
import java.lang.foreign.Arena;
@@ -8,17 +8,11 @@ import java.nio.file.Path;
import java.util.concurrent.TimeoutException;
public interface IndexSpansReader extends AutoCloseable {
@Deprecated
DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException;
DocumentSpans[] readSpans(Arena arena, IndexSearchBudget budget, long[] encodedOffsets) throws TimeoutException, IOException;
static IndexSpansReader open(Path fileName) throws IOException {
int version = SpansCodec.parseSpanFilesFooter(fileName);
if (version == SpansCodec.SpansCodecVersion.COMPRESSED.ordinal()) {
return new IndexSpansReaderCompressed(fileName);
}
else if (version == SpansCodec.SpansCodecVersion.PLAIN.ordinal()) {
if (version == SpansCodec.SpansCodecVersion.PLAIN.ordinal()) {
return new IndexSpansReaderPlain(fileName);
}
else {

View File

@@ -1,7 +1,7 @@
package nu.marginalia.index.forward.spans;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.uring.UringFileReader;
import java.io.IOException;
@@ -32,13 +32,41 @@ public class IndexSpansReaderPlain implements IndexSpansReader {
}
@Override
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
// for testing, slow
try {
return readSpans(arena, new IndexSearchBudget(1000), new long[] { encodedOffset})[0];
} catch (TimeoutException e) {
throw new IOException(e);
public DocumentSpans[] readSpans(Arena arena, IndexSearchBudget budget, long[] encodedOffsets) throws TimeoutException {
int readCnt = 0;
for (long offset : encodedOffsets) {
if (offset < 0) continue;
readCnt ++;
}
if (readCnt == 0) {
return new DocumentSpans[encodedOffsets.length];
}
long[] offsets = new long[readCnt];
int[] sizes = new int[readCnt];
for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
if (encodedOffsets[idx] < 0)
continue;
long offset = encodedOffsets[idx];
offsets[j] = SpansCodec.decodeStartOffset(offset);
sizes[j] = SpansCodec.decodeSize(offset);
j++;
}
List<MemorySegment> buffers = uringReader.readUnaligned(arena, budget.timeLeft(), offsets, sizes, 4096);
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
if (encodedOffsets[idx] < 0)
continue;
ret[idx] = decode(buffers.get(j++));
}
return ret;
}
public DocumentSpans decode(MemorySegment ms) {
@@ -64,46 +92,6 @@ public class IndexSpansReaderPlain implements IndexSpansReader {
return ret;
}
@Override
public DocumentSpans[] readSpans(Arena arena, IndexSearchBudget budget, long[] encodedOffsets) throws TimeoutException {
int readCnt = 0;
for (long offset : encodedOffsets) {
if (offset < 0)
continue;
readCnt ++;
}
if (readCnt == 0) {
return new DocumentSpans[encodedOffsets.length];
}
long[] offsets = new long[readCnt];
int[] sizes = new int[readCnt];
for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
if (encodedOffsets[idx] < 0)
continue;
long offset = encodedOffsets[idx];
offsets[j] = SpansCodec.decodeStartOffset(offset);
sizes[j] = SpansCodec.decodeSize(offset);
j++;
}
List<MemorySegment> buffers = uringReader.readUnaligned(arena, budget.timeLeft(), offsets, sizes, 4096);
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
if (encodedOffsets[idx] < 0)
continue;
ret[idx] = decode(buffers.get(j++));
}
return ret;
}
@Override
public void close() throws IOException {
uringReader.close();

View File

@@ -12,7 +12,7 @@ public class SpansCodec {
public enum SpansCodecVersion {
@Deprecated
COMPRESSED,
DEPRECATED_1, // This must not be removed, the ordinal is used to encode the version
PLAIN
}
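The spans readers above unpack each encoded offset with SpansCodec.decodeStartOffset and SpansCodec.decodeSize; a sketch of that style of packing, with made-up field widths (the authoritative layout is in SpansCodec):
// Assumed split for illustration: high 40 bits hold the file offset,
// low 24 bits hold the payload size. SpansCodec's actual widths may differ.
static long encode(long startOffset, int size) {
    return (startOffset << 24) | (size & 0xFF_FFFF);
}
static long decodeStartOffset(long encoded) { return encoded >>> 24; }
static int decodeSize(long encoded) { return (int) (encoded & 0xFF_FFFF); }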

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.results.model.ids;
package nu.marginalia.index.model;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.array.page.LongQueryBuffer;
@@ -8,7 +8,7 @@ import java.util.stream.LongStream;
/** A list of document ids, with their ranking bits still remaining.
*
* @see nu.marginalia.index.results.model.ids.DocIdList
* @see DocIdList
* @see nu.marginalia.model.id.UrlIdCodec
* */
public final class CombinedDocIdList {

View File

@@ -1,14 +1,13 @@
package nu.marginalia.index.results.model.ids;
package nu.marginalia.index.model;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import java.util.Arrays;
import java.util.Objects;
import java.util.stream.LongStream;
/** A list of document ids, with their ranking bits removed.
*
* @see nu.marginalia.index.results.model.ids.CombinedDocIdList
* @see CombinedDocIdList
* @see nu.marginalia.model.id.UrlIdCodec
* */
public final class DocIdList {

View File

@@ -1,9 +1,8 @@
package nu.marginalia.index.results.model;
package nu.marginalia.index.model;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;

View File

@@ -1,8 +1,8 @@
package nu.marginalia.index.model;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimitType;
import nu.marginalia.index.searchset.SearchSet;
import java.util.Objects;

View File

@@ -12,15 +12,14 @@ import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.index.CombinedIndexReader;
import nu.marginalia.index.reverse.IndexLanguageContext;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.language.keywords.KeywordHasher;
import org.slf4j.Logger;
@@ -75,9 +74,12 @@ public class SearchContext {
public final LongList termIdsExcludes;
public final LongList termIdsPriority;
public final IndexLanguageContext languageContext;
public static SearchContext create(CombinedIndexReader currentIndex,
KeywordHasher keywordHasher,
SearchSpecification specsSet, SearchSet searchSet) {
SearchSpecification specsSet,
SearchSet searchSet) {
var queryParams = new QueryParams(specsSet.quality, specsSet.year, specsSet.size, specsSet.rank, searchSet, specsSet.queryStrategy);
var rankingParams = specsSet.rankingParams;
@@ -85,6 +87,7 @@ public class SearchContext {
return new SearchContext(
keywordHasher,
"en", // FIXME: This path currently only supports english
currentIndex,
specsSet.query.compiledQuery,
queryParams,
@@ -111,6 +114,7 @@ public class SearchContext {
return new SearchContext(
keywordHasher,
request.getLangIsoCode(),
currentIndex,
query.compiledQuery,
queryParams,
@@ -121,6 +125,7 @@ public class SearchContext {
public SearchContext(
KeywordHasher keywordHasher,
String langIsoCode,
CombinedIndexReader currentIndex,
String queryExpression,
QueryParams queryParams,
@@ -129,6 +134,7 @@ public class SearchContext {
RpcQueryLimits limits)
{
this.docCount = currentIndex.totalDocCount();
this.languageContext = currentIndex.createLanguageContext(langIsoCode);
this.budget = new IndexSearchBudget(Math.max(limits.getTimeoutMs()/2, limits.getTimeoutMs()-50));
this.searchQuery = query;
@@ -150,8 +156,8 @@ public class SearchContext {
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
long id = compiledQueryIds.at(idx);
full[idx] = currentIndex.numHits(id);
prio[idx] = currentIndex.numHitsPrio(id);
full[idx] = currentIndex.numHits(this.languageContext, id);
prio[idx] = currentIndex.numHitsPrio(this.languageContext, id);
if (compiledQuery.at(idx).contains("_")) {
ngramsMask.set(idx);

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.results.model.ids;
package nu.marginalia.index.model;
import it.unimi.dsi.fastutil.longs.LongArrayList;

View File

@@ -1,6 +1,6 @@
package nu.marginalia.index.results.model.ids;
package nu.marginalia.index.model;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.reverse.positions.TermData;
import nu.marginalia.sequence.CodedSequence;
import javax.annotation.Nullable;

View File

@@ -10,20 +10,18 @@ import nu.marginalia.api.searchquery.*;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
import nu.marginalia.index.CombinedIndexReader;
import nu.marginalia.index.StatefulIndex;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.index.model.TermMetadataList;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.sequence.CodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.lang.foreign.Arena;
import java.sql.SQLException;
import java.util.ArrayList;
@@ -51,8 +49,8 @@ public class IndexResultRankingService {
this.domainRankingOverrides = domainRankingOverrides;
}
public RankingData prepareRankingData(SearchContext rankingContext, CombinedDocIdList resultIds, @Nullable IndexSearchBudget budget) throws TimeoutException {
return new RankingData(rankingContext, resultIds, budget);
public RankingData prepareRankingData(SearchContext rankingContext, CombinedDocIdList resultIds) throws TimeoutException {
return new RankingData(rankingContext, resultIds);
}
public final class RankingData implements AutoCloseable {
@@ -66,7 +64,7 @@ public class IndexResultRankingService {
private AtomicBoolean closed = new AtomicBoolean(false);
int pos = -1;
public RankingData(SearchContext rankingContext, CombinedDocIdList resultIds, @Nullable IndexSearchBudget budget) throws TimeoutException {
public RankingData(SearchContext rankingContext, CombinedDocIdList resultIds) throws TimeoutException {
this.resultIds = resultIds;
this.arena = Arena.ofShared();
@@ -83,8 +81,8 @@ public class IndexResultRankingService {
// Perform expensive I/O operations
try {
this.termsForDocs = currentIndex.getTermMetadata(arena, budget, rankingContext.termIdsAll.array, resultIds);
this.documentSpans = currentIndex.getDocumentSpans(arena, budget, resultIds);
this.termsForDocs = currentIndex.getTermMetadata(arena, rankingContext.languageContext, rankingContext.budget, rankingContext.termIdsAll.array, resultIds);
this.documentSpans = currentIndex.getDocumentSpans(arena, rankingContext.budget, resultIds);
}
catch (TimeoutException|RuntimeException ex) {
arena.close();
@@ -133,7 +131,6 @@ public class IndexResultRankingService {
}
public List<SearchResultItem> rankResults(
IndexSearchBudget budget,
SearchContext rankingContext,
RankingData rankingData,
boolean exportDebugData)
@@ -145,7 +142,7 @@ public class IndexResultRankingService {
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
// term data arrays as well
while (rankingData.next() && budget.hasTimeLeft()) {
while (rankingData.next() && rankingContext.budget.hasTimeLeft()) {
// Ignore documents that don't match the mandatory constraints
if (!rankingContext.phraseConstraints.testMandatory(rankingData.positions())) {
@@ -211,10 +208,9 @@ public class IndexResultRankingService {
}
resultsList.clear();
IndexSearchBudget budget = new IndexSearchBudget(10000);
try (var data = prepareRankingData(searchContext, new CombinedDocIdList(combinedIdsList), null)) {
try (var data = prepareRankingData(searchContext, new CombinedDocIdList(combinedIdsList))) {
resultsList.addAll(this.rankResults(
budget,
searchContext,
data,
true)

View File

@@ -6,15 +6,15 @@ import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
import nu.marginalia.index.CombinedIndexReader;
import nu.marginalia.index.StatefulIndex;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.PhraseConstraintGroupList;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;

View File

@@ -1,23 +0,0 @@
package nu.marginalia.index.results.model;
import gnu.trove.map.hash.TObjectLongHashMap;
import nu.marginalia.index.results.model.ids.TermIdList;
public class QuerySearchTerms {
private final TObjectLongHashMap<String> termToId;
public final TermIdList termIdsAll;
public final PhraseConstraintGroupList phraseConstraints;
public QuerySearchTerms(TObjectLongHashMap<String> termToId,
TermIdList termIdsAll,
PhraseConstraintGroupList phraseConstraints) {
this.termToId = termToId;
this.termIdsAll = termIdsAll;
this.phraseConstraints = phraseConstraints;
}
public long getIdForTerm(String searchTerm) {
return termToId.get(searchTerm);
}
}

View File

@@ -1,66 +0,0 @@
package nu.marginalia.index.results.model;
import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.sequence.CodedSequence;
import javax.annotation.Nullable;
public class TermMetadataForCombinedDocumentIds {
private final Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta;
public TermMetadataForCombinedDocumentIds(Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta) {
this.termdocToMeta = termdocToMeta;
}
public byte getTermMetadata(long termId, long combinedId) {
var metaByCombinedId = termdocToMeta.get(termId);
if (metaByCombinedId == null) {
return 0;
}
return metaByCombinedId.get(combinedId).flags();
}
@Nullable
public CodedSequence getPositions(long termId, long combinedId) {
var metaByCombinedId = termdocToMeta.get(termId);
if (metaByCombinedId == null) {
return null;
}
return metaByCombinedId.get(combinedId).positions();
}
public boolean hasTermMeta(long termId, long combinedId) {
var metaByCombinedId = termdocToMeta.get(termId);
if (metaByCombinedId == null) {
return false;
}
return metaByCombinedId.data().containsKey(combinedId);
}
public record DocumentsWithMetadata(Long2ObjectOpenHashMap<TermData> data) {
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, TermMetadataList metadata) {
this(new Long2ObjectOpenHashMap<>(combinedDocIdsAll.size()));
long[] ids = combinedDocIdsAll.array();
TermData[] data = metadata.array();
for (int i = 0; i < combinedDocIdsAll.size(); i++) {
if (data[i] != null) {
this.data.put(ids[i], data[i]);
}
}
}
public TermData get(long combinedId) {
return data.get(combinedId);
}
}
}

View File

@@ -1,15 +1,12 @@
package nu.marginalia.index;
package nu.marginalia.index.reverse;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.reverse.query.EntrySource;
import nu.marginalia.skiplist.SkipListReader;
public class FullIndexEntrySource implements EntrySource {
private final String name;
int pos;
int endOffset;
private final SkipListReader reader;
private final long wordId;
@@ -19,18 +16,11 @@ public class FullIndexEntrySource implements EntrySource {
this.name = name;
this.reader = reader;
this.wordId = wordId;
pos = 0;
}
@Override
public void skip(int n) {
pos += n;
}
@Override
public void read(LongQueryBuffer buffer) {
reader.getData(buffer);
reader.getKeys(buffer);
}
@Override

View File

@@ -0,0 +1,227 @@
package nu.marginalia.index.reverse;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.array.pool.BufferPool;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.model.TermMetadataList;
import nu.marginalia.index.reverse.positions.PositionsFileReader;
import nu.marginalia.index.reverse.positions.TermData;
import nu.marginalia.index.reverse.query.*;
import nu.marginalia.index.reverse.query.filter.QueryFilterLetThrough;
import nu.marginalia.index.reverse.query.filter.QueryFilterNoPass;
import nu.marginalia.index.reverse.query.filter.QueryFilterStepIf;
import nu.marginalia.skiplist.SkipListConstants;
import nu.marginalia.skiplist.SkipListReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.concurrent.TimeoutException;
import java.util.stream.Collectors;
public class FullReverseIndexReader {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Map<String, WordLexicon> wordLexiconMap;
private final LongArray documents;
private final PositionsFileReader positionsFileReader;
private final BufferPool dataPool;
private final String name;
public FullReverseIndexReader(String name,
Collection<WordLexicon> wordLexicons,
Path documents,
Path positionsFile)
throws IOException
{
this.name = name;
if (!Files.exists(documents)) {
this.documents = null;
this.dataPool = null;
this.positionsFileReader = null;
this.wordLexiconMap = Map.of();
wordLexicons.forEach(WordLexicon::close);
return;
}
this.wordLexiconMap = wordLexicons.stream().collect(Collectors.toUnmodifiableMap(lexicon -> lexicon.languageIsoCode, v->v));
this.positionsFileReader = new PositionsFileReader(positionsFile);
logger.info("Switching reverse index");
this.documents = LongArrayFactory.mmapForReadingShared(documents);
LinuxSystemCalls.madviseRandom(this.documents.getMemorySegment());
dataPool = new BufferPool(documents, SkipListConstants.BLOCK_SIZE,
(int) (Long.getLong("index.bufferPoolSize", 512*1024*1024L) / SkipListConstants.BLOCK_SIZE)
);
}
public void reset() {
dataPool.reset();
}
public EntrySource documents(IndexLanguageContext languageContext, long termId) {
if (null == languageContext.wordLexiconFull) {
logger.warn("Reverse index is not ready, dropping query");
return new EmptyEntrySource();
}
long offset = languageContext.wordLexiconFull.wordOffset(termId);
if (offset < 0) // No documents
return new EmptyEntrySource();
return new FullIndexEntrySource(name, getReader(offset), termId);
}
/** Create a filter step requiring the specified termId to exist in the documents */
public QueryFilterStepIf also(IndexLanguageContext languageContext, long termId, IndexSearchBudget budget) {
var lexicon = languageContext.wordLexiconFull;
if (null == lexicon)
return new QueryFilterNoPass();
long offset = lexicon.wordOffset(termId);
if (offset < 0) // No documents
return new QueryFilterNoPass();
return new ReverseIndexRetainFilter(getReader(offset), name, termId, budget);
}
/** Create a filter step requiring the specified termId to be absent from the documents */
public QueryFilterStepIf not(IndexLanguageContext languageContext, long termId, IndexSearchBudget budget) {
var lexicon = languageContext.wordLexiconFull;
if (null == lexicon)
return new QueryFilterLetThrough();
long offset = lexicon.wordOffset(termId);
if (offset < 0) // No documents
return new QueryFilterLetThrough();
return new ReverseIndexRejectFilter(getReader(offset), budget);
}
/** Return the number of documents with the termId in the index */
public int numDocuments(IndexLanguageContext languageContext, long termId) {
var lexicon = languageContext.wordLexiconFull;
if (null == lexicon)
return 0;
long offset = lexicon.wordOffset(termId);
if (offset < 0)
return 0;
return getReader(offset).estimateSize();
}
/** Create a BTreeReader for the document offset associated with a termId */
private SkipListReader getReader(long offset) {
return new SkipListReader(dataPool, offset);
}
/** Get term metadata for each document. Returns an array of TermMetadataList with the same
* length and order as termIds; each list has the same length and order as docIds.
*
* @throws TimeoutException if the read could not be queued in a timely manner;
* (the read itself may still exceed the budgeted time)
*/
public TermMetadataList[] getTermData(Arena arena,
IndexLanguageContext languageContext,
IndexSearchBudget budget,
long[] termIds,
CombinedDocIdList docIds)
throws TimeoutException
{
// Gather all termdata to be retrieved into a single array,
// to help cluster related disk accesses and get better I/O performance
WordLexicon lexicon = languageContext.wordLexiconFull;
if (null == lexicon) {
TermMetadataList[] ret = new TermMetadataList[termIds.length];
for (int i = 0; i < termIds.length; i++) {
ret[i] = new TermMetadataList(new TermData[docIds.size()]);
}
return ret;
}
long[] offsetsAll = new long[termIds.length * docIds.size()];
for (int i = 0; i < termIds.length; i++) {
long termId = termIds[i];
long offset = lexicon.wordOffset(termId);
if (offset < 0) {
// This is likely a bug in the code, but we can't throw an exception here.
logger.debug("Missing offset for word {}", termId);
// We'll pass zero offsets to positionsFileReader.getTermData(), which will be
// interpreted as an instruction to ignore these positions.
continue;
}
// Read the size and offset of the position data
long[] offsetsForTerm = getReader(offset).getValueOffsets(docIds.array());
// Add to the big array of term data offsets
System.arraycopy(offsetsForTerm, 0, offsetsAll, i * docIds.size(), docIds.size());
}
// Perform the read
TermData[] termDataCombined = positionsFileReader.getTermData(arena, budget, offsetsAll);
// Break the result data into separate arrays by termId again
TermMetadataList[] ret = new TermMetadataList[termIds.length];
for (int i = 0; i < termIds.length; i++) {
ret[i] = new TermMetadataList(
Arrays.copyOfRange(termDataCombined, i*docIds.size(), (i+1)*docIds.size())
);
}
return ret;
}
public void close() {
try {
dataPool.close();
}
catch (Exception e) {
logger.warn("Error while closing bufferPool", e);
}
if (documents != null)
documents.close();
wordLexiconMap.values().forEach(WordLexicon::close);
if (positionsFileReader != null) {
try {
positionsFileReader.close();
} catch (IOException e) {
logger.error("Failed to close positions file reader", e);
}
}
}
@Nullable
public WordLexicon getWordLexicon(String languageIsoCode) {
return wordLexiconMap.get(languageIsoCode);
}
}
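For clarity, the flattened offsetsAll array in getTermData above is term-major: the entry for term i and document j sits at i * docIds.size() + j. For example:
// termIds = {t0, t1}, docIds = {d0, d1, d2}
// offsetsAll = [ t0@d0, t0@d1, t0@d2, t1@d0, t1@d1, t1@d2 ]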

View File

@@ -0,0 +1,19 @@
package nu.marginalia.index.reverse;
import javax.annotation.Nullable;
public class IndexLanguageContext {
public final String languageIsoCode;
@Nullable
final WordLexicon wordLexiconFull;
@Nullable
final WordLexicon wordLexiconPrio;
public IndexLanguageContext(String languageIsoCode, WordLexicon wordLexiconFull, WordLexicon wordLexiconPrio) {
this.languageIsoCode = languageIsoCode;
this.wordLexiconFull = wordLexiconFull;
this.wordLexiconPrio = wordLexiconPrio;
}
}
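Both reverse index readers expose getWordLexicon(String), and SearchContext calls currentIndex.createLanguageContext(langIsoCode); a plausible sketch of how CombinedIndexReader assembles this context (the method body is not part of this diff, and the field names are assumptions):
public IndexLanguageContext createLanguageContext(String languageIsoCode) {
    return new IndexLanguageContext(languageIsoCode,
            reverseIndexFullReader.getWordLexicon(languageIsoCode),
            reverseIndexPrioReader.getWordLexicon(languageIsoCode));
}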

View File

@@ -1,7 +1,7 @@
package nu.marginalia.index;
package nu.marginalia.index.reverse;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.reverse.query.EntrySource;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.sequence.io.BitReader;
@@ -55,17 +55,13 @@ public class PrioIndexEntrySource implements EntrySource {
}
}
@Override
public void skip(int n) {
throw new UnsupportedOperationException("Not implemented");
}
@Override
@SuppressWarnings("preview")
public void read(LongQueryBuffer buffer) {
var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
outputBuffer.clear();
// FYI: The encoding end of this compression algorithm is at PrioDocIdsTransformer
while (outputBuffer.hasRemaining() && readItems++ < numItems) {
int rank;
int domainId;

View File

@@ -0,0 +1,102 @@
package nu.marginalia.index.reverse;
import nu.marginalia.index.reverse.query.EmptyEntrySource;
import nu.marginalia.index.reverse.query.EntrySource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class PrioReverseIndexReader {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final String name;
private final Map<String, WordLexicon> wordLexiconMap;
private final FileChannel documentsChannel;
public PrioReverseIndexReader(String name,
List<WordLexicon> wordLexicons,
Path documents) throws IOException {
this.name = name;
if (!Files.exists(documents)) {
this.documentsChannel = null;
this.wordLexiconMap = Map.of();
return;
}
wordLexiconMap = wordLexicons.stream().collect(Collectors.toUnmodifiableMap(lexicon -> lexicon.languageIsoCode, v -> v));
documentsChannel = (FileChannel) Files.newByteChannel(documents);
logger.info("Switching reverse index");
}
public EntrySource documents(IndexLanguageContext languageContext, long termId) {
if (languageContext.wordLexiconPrio == null) {
logger.warn("Reverse index is not ready, dropping query");
return new EmptyEntrySource();
}
long offset = languageContext.wordLexiconPrio.wordOffset(termId);
if (offset < 0) // No documents
return new EmptyEntrySource();
return new PrioIndexEntrySource(name,
documentsChannel,
offset,
termId);
}
/**
* Return the number of documents with the termId in the index
*/
public int numDocuments(IndexLanguageContext languageContext, long termId) {
var lexicon = languageContext.wordLexiconPrio;
if (null == lexicon)
return 0;
long offset = lexicon.wordOffset(termId);
if (offset < 0) // No documents
return 0;
ByteBuffer buffer = ByteBuffer.allocate(4);
try {
documentsChannel.read(buffer, offset);
} catch (IOException e) {
logger.error("Failed to read documents channel", e);
return 0;
}
return buffer.getInt(0) & 0x3FFF_FFFF;
}
public void close() {
try {
documentsChannel.close();
} catch (IOException e) {
logger.error("Failed to close documents channel", e);
}
wordLexiconMap.values().forEach(WordLexicon::close);
}
@Nullable
public WordLexicon getWordLexicon(String languageIsoCode) {
return wordLexiconMap.get(languageIsoCode);
}
}

View File

@@ -0,0 +1,46 @@
package nu.marginalia.index.reverse;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.index.config.ReverseIndexParameters;
import java.io.IOException;
import java.nio.file.Path;
public class WordLexicon {
public final String languageIsoCode;
private final LongArray words;
private final BTreeReader wordsBTreeReader;
private final long wordsDataOffset;
public WordLexicon(String languageIsoCode, Path fileName) throws IOException {
this.languageIsoCode = languageIsoCode;
this.words = LongArrayFactory.mmapForReadingShared(fileName);
LinuxSystemCalls.madviseRandom(this.words.getMemorySegment());
this.wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
this.wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
}
/** Calculate the offset of the word in the documents.
* If the return value is negative, the term does not exist
* in the index.
*/
public long wordOffset(long termId) {
long idx = wordsBTreeReader.findEntry(termId);
if (idx < 0)
return -1L;
return words.get(wordsDataOffset + idx + 1);
}
public void close() {
words.close();
}
}
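A short usage sketch of the per-language lexicon (the file name and term id are made up):
WordLexicon lexicon = new WordLexicon("en", Path.of("rev-words-en.dat"));
long offset = lexicon.wordOffset(termId);
if (offset < 0) {
    // the term does not occur in the English index
} else {
    // offset points into the documents file for this term
}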

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.reverse.construction;
import nu.marginalia.array.algo.LongArrayTransformations;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.reverse.construction;
public interface DocIdRewriter {
long rewriteDocId(long docId);

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.reverse.construction;
import nu.marginalia.array.algo.LongArrayTransformations;
import nu.marginalia.btree.model.BTreeContext;

View File

@@ -1,6 +1,6 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.reverse.construction;
import nu.marginalia.index.positions.PositionCodec;
import nu.marginalia.index.reverse.positions.PositionCodec;
import java.io.IOException;
import java.nio.ByteBuffer;
@@ -28,6 +28,7 @@ public class PositionsFileConstructor implements AutoCloseable {
this.file = file;
channel = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
channel.position(channel.size());
}
/** Represents a block of positions lists. Each writer thread should hold on to

View File

@@ -1,9 +1,9 @@
package nu.marginalia.index.construction.full;
package nu.marginalia.index.reverse.construction.full;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.index.reverse.construction.DocIdRewriter;
import nu.marginalia.index.reverse.construction.PositionsFileConstructor;
import nu.marginalia.process.control.ProcessHeartbeat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -22,17 +22,20 @@ public class FullIndexConstructor {
         FINISHED
     }
 
+    private final String languageIsoCode;
     private final Path outputFileDocs;
     private final Path outputFileWords;
     private final Path outputFilePositions;
     private final DocIdRewriter docIdRewriter;
     private final Path tmpDir;
 
-    public FullIndexConstructor(Path outputFileDocs,
+    public FullIndexConstructor(String languageIsoCode,
+                                Path outputFileDocs,
                                 Path outputFileWords,
                                 Path outputFilePositions,
                                 DocIdRewriter docIdRewriter,
                                 Path tmpDir) {
+        this.languageIsoCode = languageIsoCode;
         this.outputFileDocs = outputFileDocs;
         this.outputFileWords = outputFileWords;
         this.outputFilePositions = outputFilePositions;
@@ -80,7 +83,7 @@ public class FullIndexConstructor {
     private FullPreindexReference construct(IndexJournalPage journalInstance, PositionsFileConstructor positionsFileConstructor) {
         try {
             return FullPreindex
-                    .constructPreindex(journalInstance, positionsFileConstructor, docIdRewriter, tmpDir)
+                    .constructPreindex(journalInstance, languageIsoCode, positionsFileConstructor, docIdRewriter, tmpDir)
                     .closeToReference();
         }
         catch (IOException e) {
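
With the language code threaded through, one FullIndexConstructor instance now builds a single language partition, and the preindexes it constructs only see journal entries in that language. A hypothetical driver loop, assuming the shared-docs/per-language-words layout suggested by the WordLexicon above (the paths, the language list, and the identity rewriter are illustrative, not the project's actual wiring):

    // Hypothetical wiring; only the constructor signature comes from the diff.
    for (String lang : java.util.List.of("en", "sv")) {
        var constructor = new FullIndexConstructor(
                lang,
                outputDir.resolve("rev-docs.dat"),               // assumed shared across passes
                outputDir.resolve("rev-words-" + lang + ".dat"), // assumed one lexicon per language
                outputDir.resolve("rev-positions.dat"),          // assumed shared across passes
                docId -> docId,                                  // identity DocIdRewriter
                tmpDir);
        // ... run the construction over the index journal for this language
    }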

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction.full;
+package nu.marginalia.index.reverse.construction.full;
 
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.algo.LongArrayTransformations;
@@ -7,7 +7,7 @@ import nu.marginalia.skiplist.SkipListWriter;
 import java.io.IOException;
 import java.nio.file.Path;
 
-/** Constructs the BTrees in a reverse index */
+/** Constructs the skiplists in a reverse index */
 public class FullIndexSkipListTransformer implements LongArrayTransformations.LongIOTransformer, AutoCloseable {
     private final SkipListWriter writer;
     private final LongArray documentsArray;

View File

@@ -1,13 +1,13 @@
-package nu.marginalia.index.construction.full;
+package nu.marginalia.index.reverse.construction.full;
 
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.btree.BTreeWriter;
-import nu.marginalia.index.ReverseIndexParameters;
-import nu.marginalia.index.construction.CountToOffsetTransformer;
-import nu.marginalia.index.construction.DocIdRewriter;
-import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.config.ReverseIndexParameters;
 import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.index.reverse.construction.CountToOffsetTransformer;
+import nu.marginalia.index.reverse.construction.DocIdRewriter;
+import nu.marginalia.index.reverse.construction.PositionsFileConstructor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -42,6 +42,7 @@ public class FullPreindex {
      * will have randomly assigned names.
      */
     public static FullPreindex constructPreindex(IndexJournalPage journalInstance,
+                                                 String languageIsoCode,
                                                  PositionsFileConstructor positionsFileConstructor,
                                                  DocIdRewriter docIdRewriter,
                                                  Path workDir) throws IOException
@@ -50,8 +51,8 @@ public class FullPreindex {
         Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
         Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
 
-        var segments = FullPreindexWordSegments.construct(journalInstance, segmentWordsFile, segmentCountsFile);
-        var docs = FullPreindexDocuments.construct(docsFile, workDir, journalInstance, docIdRewriter, positionsFileConstructor, segments);
+        var segments = FullPreindexWordSegments.construct(journalInstance, languageIsoCode, segmentWordsFile, segmentCountsFile);
+        var docs = FullPreindexDocuments.construct(docsFile, workDir, journalInstance, languageIsoCode, docIdRewriter, positionsFileConstructor, segments);
 
         return new FullPreindex(segments, docs);
     }
@@ -75,7 +76,6 @@ public class FullPreindex {
     public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
         var offsets = segments.counts;
 
-        Files.deleteIfExists(outputFileDocs);
         Files.deleteIfExists(outputFileWords);
 
         // Estimate the size of the docs index data

View File

@@ -1,12 +1,18 @@
-package nu.marginalia.index.construction.full;
+package nu.marginalia.index.reverse.construction.full;
 
 import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
-import nu.marginalia.index.construction.DocIdRewriter;
-import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.index.reverse.construction.DocIdRewriter;
+import nu.marginalia.index.reverse.construction.PositionsFileConstructor;
 import nu.marginalia.rwf.RandomFileAssembler;
+import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
 import nu.marginalia.slop.SlopTable;
+import nu.marginalia.slop.column.array.ByteArrayColumn;
+import nu.marginalia.slop.column.array.LongArrayColumn;
+import nu.marginalia.slop.column.primitive.LongColumn;
+import nu.marginalia.slop.column.string.EnumColumn;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -37,51 +43,39 @@ public class FullPreindexDocuments {
             Path docsFile,
             Path workDir,
             IndexJournalPage journalInstance,
+            String languageIsoCode,
             DocIdRewriter docIdRewriter,
             PositionsFileConstructor positionsFileConstructor,
             FullPreindexWordSegments segments) throws IOException {
         FullPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
 
-        createUnsortedDocsFile(docsFile, workDir, journalInstance, segments, docIdRewriter);
-
-        LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
-        sortDocsFile(docsFileMap, segments);
-
-        return new FullPreindexDocuments(docsFileMap, docsFile);
-    }
-
-    public LongArray slice(long start, long end) {
-        return documents.range(start, end);
-    }
-
-    public long size() {
-        return documents.size();
-    }
-
-    private static void createUnsortedDocsFile(Path docsFile,
-                                               Path workDir,
-                                               IndexJournalPage instance,
-                                               FullPreindexWordSegments segments,
-                                               DocIdRewriter docIdRewriter) throws IOException {
         long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
 
-        final ByteBuffer tempBuffer = ByteBuffer.allocate(1024*1024*100);
-
-        try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
-             var slopTable = new SlopTable(instance.baseDir(), instance.page()))
+        try (RandomFileAssembler assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
+             SlopTable slopTable = new SlopTable(journalInstance.baseDir(), journalInstance.page()))
         {
-            var docIds = instance.openCombinedId(slopTable);
-            var termIds = instance.openTermIds(slopTable);
-            var termMeta = instance.openTermMetadata(slopTable);
-            var positions = instance.openTermPositions(slopTable);
+            LongColumn.Reader docIds = journalInstance.openCombinedId(slopTable);
+            LongArrayColumn.Reader termIds = journalInstance.openTermIds(slopTable);
+            ByteArrayColumn.Reader termMeta = journalInstance.openTermMetadata(slopTable);
+            VarintCodedSequenceArrayColumn.Reader positions = journalInstance.openTermPositions(slopTable);
 
-            var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
+            EnumColumn.Reader languageIsoCodes = journalInstance.openLanguageIsoCode(slopTable);
+            final int desiredLanguageOrdinal = languageIsoCodes.getDictionary().indexOf(languageIsoCode);
+
+            Long2LongOpenHashMap offsetMap = segments.asMap(RECORD_SIZE_LONGS);
             offsetMap.defaultReturnValue(0);
 
-            var positionsBlock = positionsFileConstructor.getBlock();
+            PositionsFileConstructor.PositionsFileBlock positionsBlock = positionsFileConstructor.getBlock();
 
+            while (languageIsoCodes.hasRemaining()) {
+                if (languageIsoCodes.getOrdinal() == desiredLanguageOrdinal) {
+                    slopTable.prealignAll(languageIsoCodes);
+                }
+                else continue;
+
-            while (docIds.hasRemaining()) {
                 long docId = docIds.get();
                 long rankEncodedId = docIdRewriter.rewriteDocId(docId);
@@ -102,15 +96,15 @@ public class FullPreindexDocuments {
                     assembly.put(offset + 1, encodedPosOffset);
                 }
             }
 
-            positionsBlock.commit();
+            slopTable.alignAll(languageIsoCodes);
+
+            positionsBlock.commit();
             assembly.write(docsFile);
         }
-    }
 
-    private static void sortDocsFile(LongArray docsFileMap, FullPreindexWordSegments segments) {
-        var iter = segments.iterator(RECORD_SIZE_LONGS);
+        LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
 
+        FullPreindexWordSegments.SegmentIterator iter = segments.iterator(RECORD_SIZE_LONGS);
         while (iter.next()) {
             long iterStart = iter.startOffset;
@@ -118,6 +112,16 @@ public class FullPreindexDocuments {
             docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd);
         }
 
+        return new FullPreindexDocuments(docsFileMap, docsFile);
+    }
+
+    public LongArray slice(long start, long end) {
+        return documents.range(start, end);
+    }
+
+    public long size() {
+        return documents.size();
+    }
+
     public void delete() throws IOException {
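
The rewritten loop reads the language column one row at a time and only pulls the other column cursors forward (prealignAll) when the row matches the wanted language; rows in other languages are skipped without decoding their doc ids, term ids, or positions, and alignAll leaves every cursor at a consistent end position. A toy model of that cursor discipline, with plain arrays standing in for Slop columns (all names here are hypothetical; only the pattern mirrors the code above):

    // Toy model of per-row filtering with lazily-aligned parallel cursors.
    class PrealignToyModel {
        public static void main(String[] args) {
            int[] languages = {0, 1, 0, 1};      // one language ordinal per row
            long[] docIds   = {10, 11, 12, 13};  // an independently-cursored column
            int desired = 1;

            int docCursor = 0;
            for (int row = 0; row < languages.length; row++) {
                if (languages[row] != desired)
                    continue;                    // skip row; docIds cursor untouched
                docCursor = row;                 // "prealign": jump the other column here
                System.out.println("index docId " + docIds[docCursor]);
            }
            docCursor = languages.length;        // "align": end with consistent cursors
        }
    }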

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction.full;
+package nu.marginalia.index.reverse.construction.full;
 
 import nu.marginalia.array.LongArrayFactory;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction.full;
+package nu.marginalia.index.reverse.construction.full;
 
 import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
@@ -7,6 +7,8 @@ import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.index.journal.IndexJournalPage;
 import nu.marginalia.slop.SlopTable;
+import nu.marginalia.slop.column.array.LongArrayColumn;
+import nu.marginalia.slop.column.string.EnumColumn;
 
 import java.io.IOException;
 import java.nio.file.Files;
@@ -53,6 +55,7 @@ public class FullPreindexWordSegments {
     }
 
     public static FullPreindexWordSegments construct(IndexJournalPage instance,
+                                                     String languageIsoCode,
                                                      Path wordIdsFile,
                                                      Path countsFile)
     throws IOException
@@ -61,13 +64,26 @@ public class FullPreindexWordSegments {
         countsMap.defaultReturnValue(0);
 
         try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) {
-            var termIds = instance.openTermIds(slopTable);
-
-            while (termIds.hasRemaining()) {
+            LongArrayColumn.Reader termIds = instance.openTermIds(slopTable);
+            EnumColumn.Reader languageIsoCodes = instance.openLanguageIsoCode(slopTable);
+
+            // Find out the integer representation of the enum value corresponding to the desired iso code,
+            // so that we don't have to do expensive string comparisons for each document in the journal
+            final int desiredLanguageOrdinal = languageIsoCodes.getDictionary().indexOf(languageIsoCode);
+
+            while (languageIsoCodes.hasRemaining()) {
+                if (languageIsoCodes.getOrdinal() == desiredLanguageOrdinal) {
+                    termIds.prealign(languageIsoCodes);
+                }
+                else continue;
+
                 long[] tids = termIds.get();
                 for (long termId : tids) {
                     countsMap.addTo(termId, 1);
                 }
             }
+
+            slopTable.alignAll(languageIsoCodes);
         }
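
The comment in this hunk spells out the motivation for the ordinal indirection: the language column is dictionary-encoded, so the wanted ISO code is resolved to its dictionary index once, and the per-row check becomes a single int comparison instead of a string comparison. The same idea in self-contained Java (illustrative only, not the Slop EnumColumn API):

    // Resolve the string once, compare ints in the hot loop (List is java.util.List).
    List<String> dictionary = List.of("en", "sv", "de"); // hypothetical enum dictionary
    int desired = dictionary.indexOf("sv");              // -1 if the language is absent

    int[] rowOrdinals = {0, 1, 1, 2};                    // per-row dictionary indices
    for (int ordinal : rowOrdinals) {
        if (ordinal == desired) {
            // process the row; no String.equals() on this path
        }
    }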

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction.prio;
+package nu.marginalia.index.reverse.construction.prio;
 
 import nu.marginalia.array.algo.LongArrayTransformations;
 import nu.marginalia.model.id.UrlIdCodec;

View File

@@ -1,8 +1,8 @@
-package nu.marginalia.index.construction.prio;
+package nu.marginalia.index.reverse.construction.prio;
 
-import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.journal.IndexJournal;
 import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.index.reverse.construction.DocIdRewriter;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -21,15 +21,18 @@ public class PrioIndexConstructor {
         FINISHED
     }
 
+    private final String languageIsoCode;
     private final Path outputFileDocs;
     private final Path outputFileWords;
     private final DocIdRewriter docIdRewriter;
     private final Path tmpDir;
 
-    public PrioIndexConstructor(Path outputFileDocs,
+    public PrioIndexConstructor(String languageIsoCode,
+                                Path outputFileDocs,
                                 Path outputFileWords,
                                 DocIdRewriter docIdRewriter,
                                 Path tmpDir) {
+        this.languageIsoCode = languageIsoCode;
         this.outputFileDocs = outputFileDocs;
         this.outputFileWords = outputFileWords;
         this.docIdRewriter = docIdRewriter;
@@ -75,7 +78,7 @@ public class PrioIndexConstructor {
     private PrioPreindexReference construct(IndexJournalPage journalInstance) {
         try {
             return PrioPreindex
-                    .constructPreindex(journalInstance, docIdRewriter, tmpDir)
+                    .constructPreindex(journalInstance, languageIsoCode, docIdRewriter, tmpDir)
                     .closeToReference();
         }
         catch (IOException ex) {

View File

@@ -1,12 +1,12 @@
-package nu.marginalia.index.construction.prio;
+package nu.marginalia.index.reverse.construction.prio;
 
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.btree.BTreeWriter;
-import nu.marginalia.index.ReverseIndexParameters;
-import nu.marginalia.index.construction.CountToOffsetTransformer;
-import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.config.ReverseIndexParameters;
 import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.index.reverse.construction.CountToOffsetTransformer;
+import nu.marginalia.index.reverse.construction.DocIdRewriter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -43,6 +43,7 @@ public class PrioPreindex {
      * will have randomly assigned names.
      */
     public static PrioPreindex constructPreindex(IndexJournalPage indexJournalPage,
+                                                 String languageIsoCode,
                                                  DocIdRewriter docIdRewriter,
                                                  Path workDir) throws IOException
     {
@@ -50,8 +51,8 @@ public class PrioPreindex {
         Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
         Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
 
-        var segments = PrioPreindexWordSegments.construct(indexJournalPage, segmentWordsFile, segmentCountsFile);
-        var docs = PrioPreindexDocuments.construct(docsFile, workDir, indexJournalPage, docIdRewriter, segments);
+        var segments = PrioPreindexWordSegments.construct(indexJournalPage, languageIsoCode, segmentWordsFile, segmentCountsFile);
+        var docs = PrioPreindexDocuments.construct(docsFile, workDir, indexJournalPage, languageIsoCode, docIdRewriter, segments);
 
         return new PrioPreindex(segments, docs);
     }
@@ -75,7 +76,6 @@ public class PrioPreindex {
     public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
         var offsets = segments.counts;
 
-        Files.deleteIfExists(outputFileDocs);
         Files.deleteIfExists(outputFileWords);
 
         // Estimate the size of the docs index data
@@ -83,9 +83,10 @@ public class PrioPreindex {
         // Write the docs file
         try (var intermediateDocChannel = documents.createDocumentsFileChannel();
-             var destFileChannel = (FileChannel) Files.newByteChannel(outputFileDocs, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE);
+             var destFileChannel = (FileChannel) Files.newByteChannel(outputFileDocs, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
              var transformer = new PrioDocIdsTransformer(destFileChannel, intermediateDocChannel)
         ) {
+            destFileChannel.position(destFileChannel.size());
             offsets.transformEachIO(0, offsets.size(), transformer);
         }
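
Note the pairing in the last hunk: CREATE_NEW (which fails if the file already exists) becomes CREATE, and the channel is positioned at its current size before writing. Together these turn finalization into an append, so the first language pass creates the docs file and later passes add their data after it. The idiom in isolation (standard java.nio; variable names follow the code above):

    // Append-open idiom: create if missing, keep existing bytes, write at the end.
    try (var destFileChannel = (FileChannel) Files.newByteChannel(
            outputFileDocs, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
        destFileChannel.position(destFileChannel.size()); // skip earlier passes' data
        // ... write this pass's docs data
    }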

View File

@@ -1,11 +1,16 @@
-package nu.marginalia.index.construction.prio;
+package nu.marginalia.index.reverse.construction.prio;
 
 import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
-import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.index.reverse.construction.DocIdRewriter;
 import nu.marginalia.rwf.RandomFileAssembler;
 import nu.marginalia.slop.SlopTable;
+import nu.marginalia.slop.column.array.ByteArrayColumn;
+import nu.marginalia.slop.column.array.LongArrayColumn;
+import nu.marginalia.slop.column.primitive.LongColumn;
+import nu.marginalia.slop.column.string.EnumColumn;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -35,46 +40,30 @@ public class PrioPreindexDocuments {
             Path docsFile,
             Path workDir,
             IndexJournalPage journalInstance,
-            DocIdRewriter docIdRewriter,
+            String languageIsoCode, DocIdRewriter docIdRewriter,
             PrioPreindexWordSegments segments) throws IOException {
 
-        createUnsortedDocsFile(docsFile, workDir, journalInstance, segments, docIdRewriter);
-
-        LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
-        sortDocsFile(docsFileMap, segments);
-
-        return new PrioPreindexDocuments(docsFileMap, docsFile);
-    }
-
-    public FileChannel createDocumentsFileChannel() throws IOException {
-        return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
-    }
-
-    public long size() {
-        return documents.size();
-    }
-
-    private static void createUnsortedDocsFile(Path docsFile,
-                                               Path workDir,
-                                               IndexJournalPage instance,
-                                               PrioPreindexWordSegments segments,
-                                               DocIdRewriter docIdRewriter) throws IOException {
         long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
 
         try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
-             var slopTable = new SlopTable(instance.baseDir(), instance.page()))
+             var slopTable = new SlopTable(journalInstance.baseDir(), journalInstance.page()))
         {
-            var docIds = instance.openCombinedId(slopTable);
-            var termIds = instance.openTermIds(slopTable);
-            var termMeta = instance.openTermMetadata(slopTable);
+            LongColumn.Reader docIds = journalInstance.openCombinedId(slopTable);
+            LongArrayColumn.Reader termIds = journalInstance.openTermIds(slopTable);
+            ByteArrayColumn.Reader termMeta = journalInstance.openTermMetadata(slopTable);
+            EnumColumn.Reader languageIsoCodes = journalInstance.openLanguageIsoCode(slopTable);
 
-            var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
+            Long2LongOpenHashMap offsetMap = segments.asMap(RECORD_SIZE_LONGS);
             offsetMap.defaultReturnValue(0);
 
+            final int desiredLanguageOrdinal = languageIsoCodes.getDictionary().indexOf(languageIsoCode);
+
+            while (languageIsoCodes.hasRemaining()) {
+                if (languageIsoCodes.getOrdinal() == desiredLanguageOrdinal) {
+                    slopTable.prealignAll(languageIsoCodes);
+                }
+                else continue;
+
-            while (docIds.hasRemaining()) {
                 long docId = docIds.get();
                 long rankEncodedId = docIdRewriter.rewriteDocId(docId);
@@ -91,14 +80,14 @@ public class PrioPreindexDocuments {
                     }
                 }
             }
 
+            slopTable.alignAll(languageIsoCodes);
 
             assembly.write(docsFile);
         }
-    }
 
-    private static void sortDocsFile(LongArray docsFileMap, PrioPreindexWordSegments segments) {
+        LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
 
-        var iter = segments.iterator(RECORD_SIZE_LONGS);
+        PrioPreindexWordSegments.SegmentIterator iter = segments.iterator(RECORD_SIZE_LONGS);
         while (iter.next()) {
             long iterStart = iter.startOffset;
@@ -106,6 +95,17 @@ public class PrioPreindexDocuments {
             docsFileMap.sort(iterStart, iterEnd);
         }
 
+        return new PrioPreindexDocuments(docsFileMap, docsFile);
+    }
+
+    public FileChannel createDocumentsFileChannel() throws IOException {
+        return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
+    }
+
+    public long size() {
+        return documents.size();
+    }
+
     public void delete() throws IOException {

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction.prio;
+package nu.marginalia.index.reverse.construction.prio;
 
 import nu.marginalia.array.LongArrayFactory;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction.prio;
+package nu.marginalia.index.reverse.construction.prio;
 
 import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
@@ -7,6 +7,9 @@ import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.index.journal.IndexJournalPage;
 import nu.marginalia.slop.SlopTable;
+import nu.marginalia.slop.column.array.ByteArrayColumn;
+import nu.marginalia.slop.column.array.LongArrayColumn;
+import nu.marginalia.slop.column.string.EnumColumn;
 
 import java.io.IOException;
 import java.nio.file.Files;
@@ -53,6 +56,7 @@ public class PrioPreindexWordSegments {
     }
 
     public static PrioPreindexWordSegments construct(IndexJournalPage instance,
+                                                     String languageIsoCode,
                                                      Path wordIdsFile,
                                                      Path countsFile)
     throws IOException
@@ -61,10 +65,18 @@ public class PrioPreindexWordSegments {
         countsMap.defaultReturnValue(0);
 
         try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) {
-            var termIds = instance.openTermIds(slopTable);
-            var termMetas = instance.openTermMetadata(slopTable);
+            LongArrayColumn.Reader termIds = instance.openTermIds(slopTable);
+            ByteArrayColumn.Reader termMetas = instance.openTermMetadata(slopTable);
+            EnumColumn.Reader languageIsoCodes = instance.openLanguageIsoCode(slopTable);
 
+            final int desiredLanguageOrdinal = languageIsoCodes.getDictionary().indexOf(languageIsoCode);
+
+            while (languageIsoCodes.hasRemaining()) {
+                if (languageIsoCodes.getOrdinal() == desiredLanguageOrdinal) {
+                    slopTable.prealignAll(languageIsoCodes);
+                }
+                else continue;
+
-            while (termIds.hasRemaining()) {
                 long[] data = termIds.get();
                 byte[] meta = termMetas.get();
@@ -74,6 +86,8 @@ public class PrioPreindexWordSegments {
                     }
                 }
             }
 
+            slopTable.alignAll(languageIsoCodes);
         }
 
         LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size());

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.index.positions;
+package nu.marginalia.index.reverse.positions;
 
 /** A utility class for encoding and decoding position data offsets,
  * the data is encoded by using the highest 16 bits to store the offset,
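
The javadoc is cut off by the diff context, but the technique it names is packing two fields into a single long. A generic sketch of such a codec (the 16/48 split and the roles of the fields are assumptions here, not necessarily PositionCodec's actual layout):

    // Illustrative bit-packing; PositionCodec's real field widths may differ.
    static long encode(long high16, long low48) {
        return (high16 << 48) | (low48 & 0x0000_FFFF_FFFF_FFFFL);
    }
    static long decodeHigh(long packed) { return packed >>> 48; }
    static long decodeLow(long packed)  { return packed & 0x0000_FFFF_FFFF_FFFFL; }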

View File

@@ -1,6 +1,6 @@
-package nu.marginalia.index.positions;
+package nu.marginalia.index.reverse.positions;
 
-import nu.marginalia.index.query.IndexSearchBudget;
+import nu.marginalia.index.reverse.query.IndexSearchBudget;
 import nu.marginalia.uring.UringFileReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.index.positions;
+package nu.marginalia.index.reverse.positions;
 
 import nu.marginalia.sequence.CodedSequence;
 import nu.marginalia.sequence.VarintCodedSequence;

View File

@@ -1,12 +1,9 @@
-package nu.marginalia.index.query;
+package nu.marginalia.index.reverse.query;
 
 import nu.marginalia.array.page.LongQueryBuffer;
 
 /** Dummy EntrySource that returns no entries. */
 public class EmptyEntrySource implements EntrySource {
-    @Override
-    public void skip(int n) {
-    }
 
     @Override
     public void read(LongQueryBuffer buffer) {
View File

@@ -1,14 +1,10 @@
-package nu.marginalia.index.query;
+package nu.marginalia.index.reverse.query;
 
 import nu.marginalia.array.page.LongQueryBuffer;
 
 /** An EntrySource is a source of entries for a query.
  */
 public interface EntrySource {
-    /** Skip n entries. */
-    @Deprecated
-    void skip(int n);
 
     /** Fill the buffer with entries, updating its data and length appropriately. */
     void read(LongQueryBuffer buffer);

View File

@@ -1,7 +1,7 @@
-package nu.marginalia.index.query;
+package nu.marginalia.index.reverse.query;
 
 import nu.marginalia.array.page.LongQueryBuffer;
-import nu.marginalia.index.query.filter.QueryFilterStepIf;
+import nu.marginalia.index.reverse.query.filter.QueryFilterStepIf;
 
 import java.util.ArrayList;
 import java.util.List;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.index.query;
+package nu.marginalia.index.reverse.query;
 
 /** An execution time budget for index search operations. */

View File

@@ -1,7 +1,7 @@
-package nu.marginalia.index.query;
+package nu.marginalia.index.reverse.query;
 
 import nu.marginalia.array.page.LongQueryBuffer;
-import nu.marginalia.index.query.filter.QueryFilterStepIf;
+import nu.marginalia.index.reverse.query.filter.QueryFilterStepIf;
 import nu.marginalia.skiplist.SkipListReader;
 
 public record ReverseIndexRejectFilter(SkipListReader range, IndexSearchBudget budget) implements QueryFilterStepIf {

View File

@@ -1,7 +1,7 @@
-package nu.marginalia.index.query;
+package nu.marginalia.index.reverse.query;
 
 import nu.marginalia.array.page.LongQueryBuffer;
-import nu.marginalia.index.query.filter.QueryFilterStepIf;
+import nu.marginalia.index.reverse.query.filter.QueryFilterStepIf;
 import nu.marginalia.skiplist.SkipListReader;
 
 public record ReverseIndexRetainFilter(SkipListReader range, String name, long wordId, IndexSearchBudget budget) implements QueryFilterStepIf {

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.index.query.filter;
+package nu.marginalia.index.reverse.query.filter;
 
 import nu.marginalia.array.page.LongQueryBuffer;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.index.query.filter;
+package nu.marginalia.index.reverse.query.filter;
 
 import nu.marginalia.array.page.LongQueryBuffer;

Some files were not shown because too many files have changed in this diff.